Compare commits
397 Commits
pcre2-10.3
...
amigaos
Author | SHA1 | Date |
---|---|---|
George Sokianos | 4a45482c9c | |
Philip Hazel | 8b133fa0ba | |
Philip Hazel | cc5e121c8e | |
Philip Hazel | 1343bdff8f | |
Philip Hazel | d90fb23878 | |
Ezekiel Warren | e47fc51584 | |
Zoltan Herczeg | b67d568201 | |
Zoltan Herczeg | 4851890ede | |
Amin Yahyaabadi | 3e52db5209 | |
Philip Hazel | 4804b00e8f | |
Philip Hazel | 7549fdca74 | |
Philip Hazel | 5271b533c4 | |
larinsv | 45af1203bd | |
Rémi Verschelde | 187b7ba050 | |
William A Rowe Jr | 06f34ba374 | |
GregThain | a334ea2a34 | |
Carlo Marcelo Arenas Belón | 15a82c3efd | |
Philip Hazel | 51a5fcdc1f | |
Philip Hazel | 104fe2fead | |
Philip Hazel | f65df06305 | |
pkeir | a13d7d4340 | |
Lucas Trzesniewski | c630e868ca | |
Joe Zhang | 77ce1ff528 | |
Philip Hazel | ff5402a378 | |
Philip Hazel | b52d055d1b | |
Carlo Marcelo Arenas Belón | a4ac97fea8 | |
Philip Hazel | fedf4d9d40 | |
Philip Hazel | 8ebf9efe7b | |
Carlo Marcelo Arenas Belón | 4edcf6ada5 | |
Philip Hazel | d0c7544e78 | |
Carlo Marcelo Arenas Belón | f28e82602d | |
Philip Hazel | 1bb2b97b29 | |
Lucas Trzesniewski | 3fec24a26f | |
Philip Hazel | 66b3cb34df | |
Philip Hazel | 29a43aa11d | |
Philip Hazel | 3103b8f20a | |
Philip Hazel | 13be26a5c2 | |
pagabuc | ba6a5f16d2 | |
Zoltan Herczeg | d07c967b3a | |
Carlo Marcelo Arenas Belón | 4279abbd7d | |
Philip Hazel | 8ff3ab27d5 | |
Zoltan Herczeg | e612e06b5d | |
Philip Hazel | 64c9baaaa4 | |
Carlo Marcelo Arenas Belón | 9c8abddc52 | |
Carlo Marcelo Arenas Belón | f11c26842d | |
Zoltan Herczeg | 4ca0530b9b | |
Zoltan Herczeg | 03654e751e | |
Zoltan Herczeg | d4fa336fbc | |
Zoltan Herczeg | 50a51cb7e6 | |
Philip Hazel | f7a7341726 | |
Philip Hazel | eef5740ff9 | |
Zoltan Herczeg | dea56d2df9 | |
Adam | 111cd470b5 | |
Philip Hazel | fdd9479108 | |
Philip Hazel | 419e3c68a3 | |
Zoltan Herczeg | e21345de97 | |
Philip Hazel | e85a81ebac | |
Philip Hazel | 504ff06fff | |
Philip Hazel | 360a84e80b | |
Zoltan Herczeg | 061e57695a | |
Philip Hazel | 7f7d3e8521 | |
Philip Hazel | bf35c0518c | |
Zoltan Herczeg | 68fbc1982e | |
Philip Hazel | 06d3a66065 | |
Philip Hazel | 87571b5af3 | |
Philip Hazel | 838cdac4dc | |
Philip Hazel | 628a804102 | |
Philip Hazel | ec091e2e44 | |
Philip Hazel | 636569a957 | |
Philip Hazel | 81d3729c66 | |
Zoltan Herczeg | f90542a209 | |
Carlo Marcelo Arenas Belón | 14dbc6e6ec | |
Philip Hazel | 80205ee2a0 | |
Jessica Clarke | 04ecb267c0 | |
Jessica Clarke | 534b4760e3 | |
Philip Hazel | 31fb2e58a1 | |
Zoltan Herczeg | 435140a0ac | |
Philip Hazel | c24047f15d | |
Zoltan Herczeg | e7457003cd | |
Philip Hazel | d888d36013 | |
Zoltan Herczeg | 6614b281bc | |
Zoltan Herczeg | afa4756d19 | |
Philip Hazel | 7713f33e46 | |
Michael Kaufmann | af2637ee5e | |
Philip Hazel | 98e7d70bc6 | |
Philip Hazel | 321b559ed4 | |
Philip Hazel | 16c8a84cce | |
Philip Hazel | 4514ddd2a2 | |
Philip Hazel | 944f0e10a1 | |
Philip Hazel | b29732063b | |
Philip Hazel | 92d7cf1dd0 | |
Philip Hazel | 1d432ee3cf | |
Philip Hazel | 194a15315a | |
Philip Hazel | 1c41a5b815 | |
Zoltan Herczeg | 4243515033 | |
Philip Hazel | 49b29f837d | |
Philip Hazel | 30abd0ac8d | |
Philip Hazel | 0246c6bf64 | |
Philip Hazel | 823d4ac956 | |
Philip Hazel | ba3d0edcbd | |
Philip Hazel | 4ef0c51d2b | |
Philip Hazel | 7ab2769728 | |
Philip Hazel | 2a294ddadb | |
Philip Hazel | cb854a912e | |
Philip Hazel | 16dccbcb13 | |
Carlo Marcelo Arenas Belón | ae4e6261e5 | |
Carlo Marcelo Arenas Belón | d24a1c9d31 | |
Carlo Marcelo Arenas Belón | 055b7ce4a9 | |
Philip Hazel | 4a8f5d104c | |
Carlo Marcelo Arenas Belón | 587b94277b | |
Philip Hazel | c8d31f1605 | |
Carlo Marcelo Arenas Belón | adf76faace | |
Zoltan Herczeg | d144199dfb | |
Carlo Marcelo Arenas Belón | eb42305f07 | |
Philip Hazel | 46890604a4 | |
Carlo Marcelo Arenas Belón | acc520924c | |
Philip Hazel | bc70a183fc | |
Carlo Marcelo Arenas Belón | dae475092d | |
Philip Hazel | 1ed34b9cb1 | |
Philip Hazel | f19e84674e | |
Carlo Marcelo Arenas Belón | 7db8784296 | |
Philip Hazel | 072717a61f | |
Philip Hazel | 35fee4193b | |
Philip Hazel | 3469b13b8e | |
Philip Hazel | 29c37f9aa3 | |
Carlo Marcelo Arenas Belón | 128c50360c | |
Philip Hazel | bf2c8cc564 | |
Philip Hazel | 87f32b9b39 | |
Philip Hazel | 7ed39af7cc | |
Carlo Marcelo Arenas Belón | 3b973ebf4b | |
Carlo Marcelo Arenas Belón | f5e4e10042 | |
Carlo Marcelo Arenas Belón | d46f1863be | |
Philip Hazel | c99f0738c5 | |
Philip Hazel | 794470b51d | |
PhilipHazel | 179c5d212c | |
Lucas Trzesniewski | ec0755b829 | |
Philip Hazel | 8d9e91228c | |
PhilipHazel | e7af7efaa1 | |
Zoltan Herczeg | 51ec2c9893 | |
Philip Hazel | 0612ed77c2 | |
Philip Hazel | 507e4dcf6f | |
Zoltan Herczeg | dc5f966635 | |
Philip Hazel | 8f3e11a355 | |
Philip Hazel | e2fde18833 | |
Philip Hazel | 857ac92372 | |
Philip Hazel | 31a46200fa | |
Philip Hazel | edcc076bd8 | |
Philip Hazel | c232286c6b | |
Philip Hazel | 21c26698b3 | |
Philip Hazel | eea410b33a | |
Philip Hazel | d5a61ee891 | |
Philip Hazel | 6c2fe9da99 | |
Philip Hazel | 5ff1daffa0 | |
Philip Hazel | f4beac6c1a | |
Philip Hazel | e1cd61c292 | |
Philip Hazel | 6ee9921a89 | |
Philip Hazel | b8c60ce272 | |
Philip Hazel | b61aa572f6 | |
Philip Hazel | 25bb9de6fc | |
Philip Hazel | e74a9b6932 | |
PhilipHazel | 30036e670f | |
Philip Hazel | a8c4ef7f20 | |
Philip Hazel | c2fc6cfa0a | |
Philip Hazel | 587e46b372 | |
Philip Hazel | d8267c20fd | |
Philip Hazel | 15b692fd82 | |
Philip Hazel | 4ccef1697a | |
Philip Hazel | 5c0d38b3a8 | |
Philip Hazel | 23c16e6ced | |
Philip Hazel | 876ba431b0 | |
Philip Hazel | f64fbed2e1 | |
Philip.Hazel | 2410fbe386 | |
Philip.Hazel | d70da76dfb | |
Zoltán Herczeg | a5389db88d | |
Zoltán Herczeg | 3d80cf5a25 | |
Zoltán Herczeg | 900921f83e | |
Zoltán Herczeg | 1951243b5d | |
Philip.Hazel | 1c3256349f | |
Philip.Hazel | cd45050ee4 | |
Philip.Hazel | a5d81d06f4 | |
Philip.Hazel | 85fc061dcf | |
Philip.Hazel | 080d7789eb | |
Zoltán Herczeg | 38dbea6200 | |
Philip.Hazel | 8c1df186ab | |
Zoltán Herczeg | 0dd0283b17 | |
Zoltán Herczeg | 19a1319c0a | |
Philip.Hazel | 2c4d3942e4 | |
Zoltán Herczeg | b6acebe497 | |
Philip.Hazel | 25029849c3 | |
Philip.Hazel | 4cfa216898 | |
Philip.Hazel | 91485e5d5a | |
Philip.Hazel | 6cb388d55b | |
Philip.Hazel | 8144ae04e9 | |
Philip.Hazel | 166e576f91 | |
Philip.Hazel | c246f53ae1 | |
Zoltán Herczeg | e5e1fab2db | |
Zoltán Herczeg | b730793117 | |
Zoltán Herczeg | 46158a811f | |
Philip.Hazel | 027c9375c0 | |
Philip.Hazel | 7eb23f423e | |
Philip.Hazel | 6a9900c53b | |
Philip.Hazel | 9e15c97b6d | |
Zoltán Herczeg | d19789c251 | |
Philip.Hazel | 000bbf2ea7 | |
Philip.Hazel | dc426be88e | |
Zoltán Herczeg | fb54d81528 | |
Zoltán Herczeg | 2451870e3c | |
Zoltán Herczeg | 37b76d8609 | |
Philip.Hazel | 92554d19aa | |
Philip.Hazel | 6d4936dc29 | |
Philip.Hazel | fff544a1e9 | |
Philip.Hazel | deffc391ce | |
Philip.Hazel | 81da2b97e3 | |
Zoltán Herczeg | 3bdc76e4f3 | |
Philip.Hazel | f8cbb1f58d | |
Philip.Hazel | 0cf247f558 | |
Philip.Hazel | a2f0fd01c7 | |
Philip.Hazel | 5652d41209 | |
Zoltán Herczeg | 384620a172 | |
Zoltán Herczeg | 3d317692ac | |
Philip.Hazel | 0ad89ab06d | |
Philip.Hazel | ed489f99ae | |
Philip.Hazel | 3faff02596 | |
Philip.Hazel | cffe1ca463 | |
Philip.Hazel | b55dba885a | |
Zoltán Herczeg | fda3221597 | |
Zoltán Herczeg | 0652de5597 | |
Philip.Hazel | e44976f929 | |
Zoltán Herczeg | e0c6029a62 | |
Philip.Hazel | 5dfe817b5e | |
Philip.Hazel | e73119cbfa | |
Philip.Hazel | 768c7fe67e | |
Zoltán Herczeg | 018044a54e | |
Philip.Hazel | 9ff7f342f8 | |
Philip.Hazel | 56c4bf9095 | |
Philip.Hazel | bf4ca900f3 | |
Philip.Hazel | b940ed7520 | |
Philip.Hazel | d4e4533240 | |
Philip.Hazel | ce558bbff1 | |
Philip.Hazel | 5ec5c45423 | |
Philip.Hazel | ca55d0be6b | |
Philip.Hazel | 8b3f8af535 | |
Zoltán Herczeg | cf670e3bb9 | |
Philip.Hazel | 28f92c8596 | |
Philip.Hazel | 9cebee7e75 | |
Philip.Hazel | c472f3f91a | |
Philip.Hazel | 59233b8079 | |
Philip.Hazel | f988433788 | |
Philip.Hazel | 8057c3c8b9 | |
Zoltán Herczeg | 953d4e9c95 | |
Zoltán Herczeg | 0d0d954bbd | |
Zoltán Herczeg | 21c40e638b | |
Zoltán Herczeg | 106d9d3a25 | |
Zoltán Herczeg | 325908279e | |
Philip.Hazel | 3155a6951f | |
Zoltán Herczeg | 305e273e99 | |
Philip.Hazel | 68f9c49517 | |
Philip.Hazel | 3be538015b | |
Philip.Hazel | 4e8f13cbd6 | |
Philip.Hazel | f50ee03f5d | |
Zoltán Herczeg | a3057bbecd | |
Philip.Hazel | 4a7dfab0ec | |
Zoltán Herczeg | d0666136c9 | |
Zoltán Herczeg | c39fb3a9e1 | |
Zoltán Herczeg | c21bd97754 | |
Philip.Hazel | eedd9d8e55 | |
Philip.Hazel | a57787b7cd | |
Philip.Hazel | 29c0d64158 | |
Zoltán Herczeg | 697cf5f602 | |
Zoltán Herczeg | d71dc302a5 | |
Zoltán Herczeg | ed8a3146b9 | |
Philip.Hazel | e2c8dc8c2e | |
Philip.Hazel | b040e2e1cd | |
Philip.Hazel | 3a6b4948d1 | |
Philip.Hazel | 9e960f5465 | |
Philip.Hazel | f3c658cf87 | |
Philip.Hazel | 9e8c98587f | |
Zoltán Herczeg | 0a6ca6d420 | |
Zoltán Herczeg | 09984bb0e4 | |
Philip.Hazel | e8d70e2459 | |
Philip.Hazel | 7171d86587 | |
Zoltán Herczeg | bf4cd8212f | |
Philip.Hazel | 03720de840 | |
Philip.Hazel | 5ba5230b82 | |
Philip.Hazel | eaf4572ff8 | |
Philip.Hazel | 6707614863 | |
Philip.Hazel | 279128cbde | |
Philip.Hazel | f006fa5e3c | |
Philip.Hazel | ac4ab7186d | |
Philip.Hazel | d170829b26 | |
Philip.Hazel | 777582d4de | |
Philip.Hazel | f3fd8b18cb | |
Philip.Hazel | 0a2033f0f7 | |
Zoltán Herczeg | 880aac5dda | |
Zoltán Herczeg | 2632526c67 | |
Zoltán Herczeg | f5286d8f56 | |
Philip.Hazel | add4db4c87 | |
Zoltán Herczeg | af45f41fbb | |
Philip.Hazel | 26fc863155 | |
Philip.Hazel | 3c869816ac | |
Zoltán Herczeg | 6f41a5a01a | |
Philip.Hazel | 9323fa32b2 | |
Philip.Hazel | 8855b0efe1 | |
Zoltán Herczeg | 1838261037 | |
Philip.Hazel | ae9208ab7b | |
Philip.Hazel | 7ecc9cdfaf | |
Zoltán Herczeg | f768448fd3 | |
Philip.Hazel | 90ae0ae01e | |
Philip.Hazel | 2a0faa2114 | |
Zoltán Herczeg | 97acc05f0c | |
Zoltán Herczeg | 70b0debf10 | |
Philip.Hazel | 3787601f81 | |
Zoltán Herczeg | e69a614430 | |
Philip.Hazel | e413f3147c | |
Philip.Hazel | d917899be5 | |
Philip.Hazel | 78fae97f6c | |
Philip.Hazel | bf15267c30 | |
Zoltán Herczeg | aae44b83f8 | |
Philip.Hazel | b48aa469d6 | |
Philip.Hazel | 27d40c8ad8 | |
Philip.Hazel | 7bbdc58513 | |
Philip.Hazel | 963b570fd0 | |
Philip.Hazel | 87bc092222 | |
Philip.Hazel | 0970ae4195 | |
Philip.Hazel | 45b219e6bc | |
Philip.Hazel | 1e5e9aaa70 | |
Zoltán Herczeg | 60df4c65d5 | |
Philip.Hazel | 71eb916d79 | |
Philip.Hazel | ce751bfc84 | |
Philip.Hazel | 59c7c5d100 | |
Philip.Hazel | 81ad92820a | |
Philip.Hazel | ec6191cd7f | |
Philip.Hazel | 630e4bb516 | |
Philip.Hazel | c0ed5a3ab3 | |
Philip.Hazel | b69460ece3 | |
Philip.Hazel | a33d61aada | |
Philip.Hazel | 7292c751a3 | |
Philip.Hazel | aff5a78056 | |
Philip.Hazel | 9319b5bb83 | |
Philip.Hazel | fe2df37c9f | |
Philip.Hazel | 24c62fc0d0 | |
Zoltán Herczeg | 82a4729e13 | |
Philip.Hazel | 3572634086 | |
Philip.Hazel | f7e21162fa | |
Philip.Hazel | c84a06c96e | |
Philip.Hazel | 344056baf8 | |
Philip.Hazel | c30815f5a1 | |
Zoltán Herczeg | f5b35e7943 | |
Zoltán Herczeg | c11b23e8cc | |
Philip.Hazel | 0d0ee67eb0 | |
Philip.Hazel | bca9888a2c | |
Philip.Hazel | 046c5cd21c | |
Philip.Hazel | 66811c6c73 | |
Philip.Hazel | 4677b1b0bb | |
Philip.Hazel | 620f3a1307 | |
Zoltán Herczeg | 691aca7a86 | |
Philip.Hazel | 4543001e85 | |
Philip.Hazel | f985a68ea5 | |
Philip.Hazel | 2e06fdcdc1 | |
Philip.Hazel | a5c601091e | |
Philip.Hazel | 4866bd3652 | |
Philip.Hazel | c0d0ee5365 | |
Philip.Hazel | 434e3f7468 | |
Philip.Hazel | d21f7daf9b | |
Zoltán Herczeg | 7f24a98cfb | |
Zoltán Herczeg | 7768756737 | |
Philip.Hazel | c6ee84317d | |
Philip.Hazel | a89423624d | |
Philip.Hazel | 175b4919f7 | |
Philip.Hazel | 8eb01ad8a9 | |
Philip.Hazel | e92f1d3b72 | |
Philip.Hazel | 9d00c46ff1 | |
Philip.Hazel | 9c53b6b11a | |
Philip.Hazel | da5155fed3 | |
Philip.Hazel | ef79b978a6 | |
Zoltán Herczeg | 3b2fa4dff2 | |
Philip.Hazel | 1ebc2c50cc | |
Philip.Hazel | ead78198d1 | |
Philip.Hazel | 0d1ab8515f | |
Philip.Hazel | 300bf6e2d6 | |
Philip.Hazel | 49f174ef78 | |
Philip.Hazel | 1f6b9097f4 | |
Philip.Hazel | f0c06ee212 | |
Philip.Hazel | 306f2b9c57 | |
Zoltán Herczeg | cc51779d88 | |
Philip.Hazel | dea540877b | |
Philip.Hazel | 16d47a9cb1 | |
Philip.Hazel | d5dc4e0c33 | |
Philip.Hazel | 4f31de2866 | |
Philip.Hazel | 5850cc5928 | |
Philip.Hazel | 16c046ce50 | |
Zoltán Herczeg | 2ad4329f83 | |
Philip.Hazel | 342c16ecd3 | |
Philip.Hazel | e118e60a68 | |
Philip.Hazel | a31c548210 | |
Zoltán Herczeg | 274efb8ded | |
Philip.Hazel | 16de9003e5 |
|
@ -0,0 +1,3 @@
|
|||
common --experimental_enable_bzlmod
|
||||
build --incompatible_enable_cc_toolchain_resolution
|
||||
build --incompatible_strict_action_env
|
|
@ -0,0 +1,77 @@
|
|||
|
||||
name: Build
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
name: Linux
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
alpine:
|
||||
name: alpine
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Autotools
|
||||
run: apk add --no-cache automake autoconf gcc libtool make musl-dev
|
||||
|
||||
- name: Autogen
|
||||
run: ./autogen.sh
|
||||
|
||||
- name: Configure
|
||||
run: ./configure --enable-jit --enable-pcre2-8 --enable-pcre2-16 --enable-pcre2-32
|
||||
|
||||
- name: Build
|
||||
run: make
|
||||
|
||||
- name: Test (main test script)
|
||||
run: ./RunTest
|
||||
|
||||
- name: Test (JIT test program)
|
||||
run: ./pcre2_jit_test
|
||||
|
||||
- name: Test (pcre2grep test script)
|
||||
run: ./RunGrepTest
|
||||
|
||||
windows:
|
||||
name: 32bit Windows
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Configure
|
||||
run: cmake -DPCRE2_SUPPORT_JIT=ON -DPCRE2_BUILD_PCRE2_16=ON -DPCRE2_BUILD_PCRE2_32=ON -B build -A Win32
|
||||
|
||||
- name: Build
|
||||
run: cmake --build build
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
cd build\Debug
|
||||
..\..\RunTest.bat
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ master ]
|
||||
pull_request:
|
||||
# The branches below must be a subset of the branches above
|
||||
branches: [ master ]
|
||||
schedule:
|
||||
- cron: '27 6 * * 4'
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'cpp', 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
|
@ -0,0 +1,55 @@
|
|||
name: Scorecards supply-chain security
|
||||
on:
|
||||
# Only the default branch is supported.
|
||||
branch_protection_rule:
|
||||
schedule:
|
||||
- cron: '23 17 * * 1'
|
||||
push:
|
||||
branches: [ master ]
|
||||
|
||||
# Declare default permissions as read only.
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
analysis:
|
||||
name: Scorecards analysis
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed to upload the results to code-scanning dashboard.
|
||||
security-events: write
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
steps:
|
||||
- name: "Checkout code"
|
||||
uses: actions/checkout@ec3a7ce113134d7a93b817d10a8272cb61118579 # v2.4.0
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: "Run analysis"
|
||||
uses: ossf/scorecard-action@c1aec4ac820532bab364f02a81873c555a0ba3a1 # v1.0.4
|
||||
with:
|
||||
results_file: results.sarif
|
||||
results_format: sarif
|
||||
# Read-only PAT token. To create it,
|
||||
# follow the steps in https://github.com/ossf/scorecard-action#pat-token-creation.
|
||||
repo_token: ${{ secrets.SCORECARD_READ_TOKEN }}
|
||||
# Publish the results to enable scorecard badges. For more details, see
|
||||
# https://github.com/ossf/scorecard-action#publishing-results.
|
||||
# For private repositories, `publish_results` will automatically be set to `false`,
|
||||
# regardless of the value entered here.
|
||||
publish_results: true
|
||||
|
||||
# Upload the results as artifacts (optional).
|
||||
- name: "Upload artifact"
|
||||
uses: actions/upload-artifact@82c141cc518b40d92cc801eee768e7aafc9c2fa2 # v2.3.1
|
||||
with:
|
||||
name: SARIF file
|
||||
path: results.sarif
|
||||
retention-days: 5
|
||||
|
||||
# Upload the results to GitHub's code scanning dashboard.
|
||||
- name: "Upload to code-scanning"
|
||||
uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # v1.0.26
|
||||
with:
|
||||
sarif_file: results.sarif
|
|
@ -0,0 +1,82 @@
|
|||
# Public .gitignore file for PCRE2
|
||||
|
||||
*.a
|
||||
*.lo
|
||||
*.la
|
||||
*.pc
|
||||
*.o
|
||||
*~
|
||||
*.lha
|
||||
|
||||
__pycache__
|
||||
.deps
|
||||
.libs
|
||||
|
||||
INSTALL
|
||||
Makefile
|
||||
Makefile.in
|
||||
RunGrepTest.log
|
||||
RunGrepTest.trs
|
||||
RunTest.log
|
||||
RunTest.trs
|
||||
|
||||
aclocal.m4
|
||||
ar-lib
|
||||
compile
|
||||
config.guess
|
||||
config.log
|
||||
config.status
|
||||
config.sub
|
||||
configure
|
||||
depcomp
|
||||
install-sh
|
||||
libtool
|
||||
ltmain.sh
|
||||
missing
|
||||
pcre2-config
|
||||
pcre2_dftables
|
||||
pcre2_jit_test
|
||||
pcre2_jit_test.log
|
||||
pcre2_jit_test.trs
|
||||
pcre2demo
|
||||
pcre2fuzzcheck
|
||||
pcre2grep
|
||||
pcre2test
|
||||
test-driver
|
||||
test-suite.log
|
||||
test3input
|
||||
test3output
|
||||
testNinput
|
||||
testNinputgrep
|
||||
teststderr
|
||||
teststderrM
|
||||
teststderrgrep
|
||||
teststdout
|
||||
teststdoutM
|
||||
testtemp1
|
||||
testtemp1grep
|
||||
testtemp2
|
||||
testtemp2grep
|
||||
testtry
|
||||
testtrygrep
|
||||
|
||||
m4/libtool.m4
|
||||
m4/ltoptions.m4
|
||||
m4/ltsugar.m4
|
||||
m4/ltversion.m4
|
||||
m4/lt~obsolete.m4
|
||||
|
||||
maint/ucptest
|
||||
maint/utf8
|
||||
|
||||
src/.deps
|
||||
src/.dirstamp
|
||||
src/config.h
|
||||
src/pcre2.h
|
||||
src/pcre2_chartables.c
|
||||
src/stamp-h1
|
||||
|
||||
/bazel-*
|
||||
|
||||
# End
|
||||
|
12
AUTHORS
12
AUTHORS
|
@ -2,13 +2,13 @@ THE MAIN PCRE2 LIBRARY CODE
|
|||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2019 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Emain domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2019 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Emain domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2019 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
####
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
|
||||
load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
|
||||
|
||||
copy_file(
|
||||
name = "config_h_generic",
|
||||
src = "src/config.h.generic",
|
||||
out = "src/config.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_h_generic",
|
||||
src = "src/pcre2.h.generic",
|
||||
out = "src/pcre2.h",
|
||||
)
|
||||
|
||||
copy_file(
|
||||
name = "pcre2_chartables_c",
|
||||
src = "src/pcre2_chartables.c.dist",
|
||||
out = "src/pcre2_chartables.c",
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "pcre2",
|
||||
srcs = [
|
||||
"src/pcre2_auto_possess.c",
|
||||
"src/pcre2_compile.c",
|
||||
"src/pcre2_config.c",
|
||||
"src/pcre2_context.c",
|
||||
"src/pcre2_convert.c",
|
||||
"src/pcre2_dfa_match.c",
|
||||
"src/pcre2_error.c",
|
||||
"src/pcre2_extuni.c",
|
||||
"src/pcre2_find_bracket.c",
|
||||
"src/pcre2_maketables.c",
|
||||
"src/pcre2_match.c",
|
||||
"src/pcre2_match_data.c",
|
||||
"src/pcre2_newline.c",
|
||||
"src/pcre2_ord2utf.c",
|
||||
"src/pcre2_pattern_info.c",
|
||||
"src/pcre2_script_run.c",
|
||||
"src/pcre2_serialize.c",
|
||||
"src/pcre2_string_utils.c",
|
||||
"src/pcre2_study.c",
|
||||
"src/pcre2_substitute.c",
|
||||
"src/pcre2_substring.c",
|
||||
"src/pcre2_tables.c",
|
||||
"src/pcre2_ucd.c",
|
||||
"src/pcre2_ucptables.c",
|
||||
"src/pcre2_valid_utf.c",
|
||||
"src/pcre2_xclass.c",
|
||||
":pcre2_chartables_c",
|
||||
],
|
||||
hdrs = glob(["src/*.h"]) + [
|
||||
":config_h_generic",
|
||||
":pcre2_h_generic",
|
||||
],
|
||||
defines = [
|
||||
"HAVE_CONFIG_H",
|
||||
"PCRE2_CODE_UNIT_WIDTH=8",
|
||||
"PCRE2_STATIC",
|
||||
],
|
||||
includes = ["src"],
|
||||
strip_include_prefix = "src",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "pcre2demo",
|
||||
srcs = ["src/pcre2demo.c"],
|
||||
visibility = ["//visibility:public"],
|
||||
deps = [":pcre2"],
|
||||
)
|
578
CMakeLists.txt
578
CMakeLists.txt
|
@ -1,6 +1,5 @@
|
|||
# CMakeLists.txt
|
||||
#
|
||||
#
|
||||
# This file enables PCRE2 to be built with the CMake configuration and build
|
||||
# tool. Download CMake in source or binary form from http://www.cmake.org/
|
||||
# Converted to support PCRE2 from the original PCRE file, August 2014.
|
||||
|
@ -85,19 +84,44 @@
|
|||
# 2018-11-14 PH removed unnecessary checks for stdint.h and inttypes.h
|
||||
# 2018-11-16 PH added PCRE2GREP_SUPPORT_CALLOUT_FORK support and tidied
|
||||
# 2019-02-16 PH hacked to avoid CMP0026 policy issue (see comments below)
|
||||
# 2020-03-16 PH renamed dftables as pcre2_dftables (as elsewhere)
|
||||
# 2020-03-24 PH changed CMAKE_MODULE_PATH definition to add, not replace
|
||||
# 2020-04-08 Carlo added function check for secure_getenv, fixed strerror
|
||||
# 2020-04-16 enh added check for __attribute__((uninitialized))
|
||||
# 2020-04-25 PH applied patches from Uwe Korn to support pkg-config and
|
||||
# library versioning.
|
||||
# 2020-04-25 Carlo added function check for mkostemp used in ProtExecAllocator
|
||||
# 2020-04-28 PH added function check for memfd_create based on Carlo's patch
|
||||
# 2020-05-25 PH added a check for Intel CET
|
||||
# 2020-12-03 PH altered the definition of pcre2test as suggested by Daniel
|
||||
# 2021-06-29 JWSB added the option to build static library with PIC.
|
||||
# 2021-07-05 JWSB modified such both the static and shared library can be
|
||||
# build in one go.
|
||||
# 2021-08-28 PH increased minimum version
|
||||
# 2021-08-28 PH added test for realpath()
|
||||
|
||||
PROJECT(PCRE2 C)
|
||||
|
||||
# Increased minimum to 2.8.0 to support newer add_test features.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
|
||||
# Increased minimum to 2.8.5 to support GNUInstallDirs.
|
||||
# Increased minimum to 3.1 to support imported targets.
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 3.1)
|
||||
|
||||
# Set policy CMP0026 to avoid warnings for the use of LOCATION in
|
||||
# GET_TARGET_PROPERTY. This should no longer be required.
|
||||
# CMAKE_POLICY(SET CMP0026 OLD)
|
||||
|
||||
SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # for FindReadline.cmake
|
||||
# With a recent cmake, you can provide a rootdir to look for non
|
||||
# standard installed library dependencies, but to do so, the policy
|
||||
# needs to be set to new (by uncommenting the following)
|
||||
# CMAKE_POLICY(SET CMP0074 NEW)
|
||||
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR}/src")
|
||||
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
|
||||
# on the command line.
|
||||
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
||||
LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
|
||||
|
||||
INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src)
|
||||
|
||||
# external packages
|
||||
FIND_PACKAGE( BZip2 )
|
||||
|
@ -107,29 +131,66 @@ FIND_PACKAGE( Editline )
|
|||
|
||||
# Configuration checks
|
||||
|
||||
INCLUDE(CheckIncludeFile)
|
||||
INCLUDE(CheckCSourceCompiles)
|
||||
INCLUDE(CheckFunctionExists)
|
||||
INCLUDE(CheckSymbolExists)
|
||||
INCLUDE(CheckIncludeFile)
|
||||
INCLUDE(CheckTypeSize)
|
||||
INCLUDE(GNUInstallDirs) # for CMAKE_INSTALL_LIBDIR
|
||||
|
||||
CHECK_INCLUDE_FILE(dirent.h HAVE_DIRENT_H)
|
||||
CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
|
||||
CHECK_INCLUDE_FILE(inttypes.h HAVE_INTTYPES_H)
|
||||
CHECK_INCLUDE_FILE(sys/stat.h HAVE_SYS_STAT_H)
|
||||
CHECK_INCLUDE_FILE(sys/types.h HAVE_SYS_TYPES_H)
|
||||
CHECK_INCLUDE_FILE(unistd.h HAVE_UNISTD_H)
|
||||
CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
|
||||
|
||||
CHECK_FUNCTION_EXISTS(bcopy HAVE_BCOPY)
|
||||
CHECK_FUNCTION_EXISTS(memmove HAVE_MEMMOVE)
|
||||
CHECK_FUNCTION_EXISTS(strerror HAVE_STRERROR)
|
||||
CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY)
|
||||
CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE)
|
||||
CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
|
||||
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
|
||||
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
int main(int c, char *v[]) { char buf[PATH_MAX]; realpath(v[1], buf); return 0; }"
|
||||
HAVE_REALPATH
|
||||
)
|
||||
|
||||
set(ORIG_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
|
||||
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror")
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"int main() { char buf[128] __attribute__((uninitialized)); (void)buf; return 0; }"
|
||||
HAVE_ATTRIBUTE_UNINITIALIZED
|
||||
)
|
||||
set(CMAKE_REQUIRED_FLAGS ${ORIG_CMAKE_REQUIRED_FLAGS})
|
||||
|
||||
# Check whether Intel CET is enabled, and if so, adjust compiler flags. This
|
||||
# code was written by PH, trying to imitate the logic from the autotools
|
||||
# configuration.
|
||||
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#ifndef __CET__
|
||||
#error CET is not enabled
|
||||
#endif
|
||||
int main() { return 0; }"
|
||||
INTEL_CET_ENABLED
|
||||
)
|
||||
|
||||
IF (INTEL_CET_ENABLED)
|
||||
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mshstk")
|
||||
ENDIF(INTEL_CET_ENABLED)
|
||||
|
||||
|
||||
|
||||
# User-configurable options
|
||||
#
|
||||
# Note: CMakeSetup displays these in alphabetical order, regardless of
|
||||
# the order we use here.
|
||||
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL
|
||||
"Build shared libraries instead of static ones.")
|
||||
SET(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries.")
|
||||
|
||||
OPTION(BUILD_STATIC_LIBS "Build static libraries." ON)
|
||||
|
||||
OPTION(PCRE2_BUILD_PCRE2_8 "Build 8 bit PCRE2 library" ON)
|
||||
|
||||
|
@ -137,6 +198,8 @@ OPTION(PCRE2_BUILD_PCRE2_16 "Build 16 bit PCRE2 library" OFF)
|
|||
|
||||
OPTION(PCRE2_BUILD_PCRE2_32 "Build 32 bit PCRE2 library" OFF)
|
||||
|
||||
OPTION(PCRE2_STATIC_PIC "Build the static library with the option position independent code enabled." OFF)
|
||||
|
||||
OPTION(PCRE2_DEBUG "Include debugging code" OFF)
|
||||
|
||||
OPTION(PCRE2_DISABLE_PERCENT_ZT "Disable the use of %zu and %td (rarely needed)" OFF)
|
||||
|
@ -177,8 +240,12 @@ SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL
|
|||
SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
|
||||
"Enable support for Just-in-time compiling.")
|
||||
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
|
||||
"Enable SELinux compatible execmem allocator in JIT.")
|
||||
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF CACHE BOOL
|
||||
"Enable SELinux compatible execmem allocator in JIT (experimental).")
|
||||
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC IGNORE)
|
||||
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
|
||||
SET(PCRE2GREP_SUPPORT_JIT ON CACHE BOOL
|
||||
"Enable use of Just-in-time compiling in pcre2grep.")
|
||||
|
@ -244,9 +311,19 @@ ENDIF(PCRE2_SUPPORT_LIBZ)
|
|||
IF(EDITLINE_FOUND)
|
||||
OPTION (PCRE2_SUPPORT_LIBEDIT "Enable support for linking pcre2test with libedit." OFF)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
IF(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
INCLUDE_DIRECTORIES(${EDITLINE_INCLUDE_DIR})
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ELSE(EDITLINE_FOUND)
|
||||
IF(PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" libedit not found, set EDITLINE_INCLUDE_DIR to a compatible header\n"
|
||||
" or set Editline_ROOT to a full libedit installed tree, as needed\n"
|
||||
" Might need to enable policy CMP0074 in CMakeLists.txt"
|
||||
)
|
||||
ENDIF(PCRE2_SUPPORT_LIBEDIT)
|
||||
ENDIF(EDITLINE_FOUND)
|
||||
|
||||
# readline lib
|
||||
IF(READLINE_FOUND)
|
||||
|
@ -258,9 +335,9 @@ ENDIF(PCRE2_SUPPORT_LIBREADLINE)
|
|||
|
||||
# Prepare build configuration
|
||||
|
||||
IF(NOT BUILD_SHARED_LIBS)
|
||||
SET(PCRE2_STATIC 1)
|
||||
ENDIF(NOT BUILD_SHARED_LIBS)
|
||||
IF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
MESSAGE(FATAL_ERROR "At least one of BUILD_SHARED_LIBS or BUILD_STATIC_LIBS must be enabled.")
|
||||
ENDIF(NOT BUILD_SHARED_LIBS AND NOT BUILD_STATIC_LIBS)
|
||||
|
||||
IF(NOT PCRE2_BUILD_PCRE2_8 AND NOT PCRE2_BUILD_PCRE2_16 AND NOT PCRE2_BUILD_PCRE2_32)
|
||||
MESSAGE(FATAL_ERROR "At least one of PCRE2_BUILD_PCRE2_8, PCRE2_BUILD_PCRE2_16 or PCRE2_BUILD_PCRE2_32 must be enabled")
|
||||
|
@ -284,7 +361,12 @@ IF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
|||
ENDIF(PCRE2_BUILD_PCRE2GREP AND NOT PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
IF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
MESSAGE(FATAL_ERROR "Only one of libreadline or libeditline can be specified")
|
||||
IF(READLINE_FOUND)
|
||||
MESSAGE(FATAL_ERROR
|
||||
" Only one of the readline compatible libraries can be enabled.\n"
|
||||
" Disable libreadline with -DPCRE2_SUPPORT_LIBREADLINE=OFF"
|
||||
)
|
||||
ENDIF(READLINE_FOUND)
|
||||
ENDIF(PCRE2_SUPPORT_LIBREADLINE AND PCRE2_SUPPORT_LIBEDIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||
|
@ -300,11 +382,29 @@ IF(PCRE2_SUPPORT_UNICODE)
|
|||
ENDIF(PCRE2_SUPPORT_UNICODE)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT)
|
||||
SET(SUPPORT_JIT 1)
|
||||
SET(SUPPORT_JIT 1)
|
||||
IF(UNIX)
|
||||
FIND_PACKAGE(Threads REQUIRED)
|
||||
IF(CMAKE_USE_PTHREADS_INIT)
|
||||
SET(REQUIRE_PTHREAD 1)
|
||||
ENDIF(CMAKE_USE_PTHREADS_INIT)
|
||||
ENDIF(UNIX)
|
||||
ENDIF(PCRE2_SUPPORT_JIT)
|
||||
|
||||
IF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
|
||||
SET(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)
|
||||
CHECK_SYMBOL_EXISTS(mkostemp stdlib.h REQUIRED)
|
||||
UNSET(CMAKE_REQUIRED_DEFINITIONS)
|
||||
IF(${REQUIRED})
|
||||
IF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
ADD_DEFINITIONS(-D_GNU_SOURCE)
|
||||
SET(SLJIT_PROT_EXECUTABLE_ALLOCATOR 1)
|
||||
ELSE(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
MESSAGE(FATAL_ERROR "Your configuration is not supported")
|
||||
ENDIF(${CMAKE_SYSTEM_NAME} MATCHES Linux|NetBSD)
|
||||
ELSE(${REQUIRED})
|
||||
SET(PCRE2_SUPPORT_JIT_SEALLOC OFF)
|
||||
ENDIF(${REQUIRED})
|
||||
ENDIF(PCRE2_SUPPORT_JIT_SEALLOC)
|
||||
|
||||
IF(PCRE2GREP_SUPPORT_JIT)
|
||||
|
@ -400,12 +500,13 @@ file(STRINGS ${PROJECT_SOURCE_DIR}/configure.ac
|
|||
LIMIT_COUNT 50 # Read only the first 50 lines of the file
|
||||
)
|
||||
|
||||
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date")
|
||||
set(SEARCHED_VARIABLES "pcre2_major" "pcre2_minor" "pcre2_prerelease" "pcre2_date"
|
||||
"libpcre2_posix_version" "libpcre2_8_version" "libpcre2_16_version" "libpcre2_32_version")
|
||||
foreach(configure_line ${configure_lines})
|
||||
foreach(_substitution_variable ${SEARCHED_VARIABLES})
|
||||
string(TOUPPER ${_substitution_variable} _substitution_variable_upper)
|
||||
if (NOT ${_substitution_variable_upper})
|
||||
string(REGEX MATCH "m4_define\\(${_substitution_variable}, \\[(.*)\\]" MACTHED_STRING ${configure_line})
|
||||
string(REGEX MATCH "m4_define\\(${_substitution_variable}, *\\[(.*)\\]" MATCHED_STRING ${configure_line})
|
||||
if (CMAKE_MATCH_1)
|
||||
set(${_substitution_variable_upper} ${CMAKE_MATCH_1})
|
||||
endif()
|
||||
|
@ -413,21 +514,83 @@ foreach(configure_line ${configure_lines})
|
|||
endforeach()
|
||||
endforeach()
|
||||
|
||||
macro(PARSE_LIB_VERSION VARIABLE_PREFIX)
|
||||
string(REPLACE ":" ";" ${VARIABLE_PREFIX}_VERSION_LIST ${${VARIABLE_PREFIX}_VERSION})
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 0 ${VARIABLE_PREFIX}_VERSION_CURRENT)
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 1 ${VARIABLE_PREFIX}_VERSION_REVISION)
|
||||
list(GET ${VARIABLE_PREFIX}_VERSION_LIST 2 ${VARIABLE_PREFIX}_VERSION_AGE)
|
||||
|
||||
math(EXPR ${VARIABLE_PREFIX}_SOVERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} - ${${VARIABLE_PREFIX}_VERSION_AGE}")
|
||||
math(EXPR ${VARIABLE_PREFIX}_MACHO_COMPATIBILITY_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
|
||||
math(EXPR ${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_VERSION_CURRENT} + 1")
|
||||
set(${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION "${${VARIABLE_PREFIX}_MACHO_CURRENT_VERSION}.${${VARIABLE_PREFIX}_VERSION_REVISION}}")
|
||||
set(${VARIABLE_PREFIX}_VERSION "${${VARIABLE_PREFIX}_SOVERSION}.${${VARIABLE_PREFIX}_VERSION_AGE}.${${VARIABLE_PREFIX}_VERSION_REVISION}")
|
||||
endmacro()
|
||||
|
||||
PARSE_LIB_VERSION(LIBPCRE2_POSIX)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_8)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_16)
|
||||
PARSE_LIB_VERSION(LIBPCRE2_32)
|
||||
|
||||
CONFIGURE_FILE(src/pcre2.h.in
|
||||
${PROJECT_BINARY_DIR}/pcre2.h
|
||||
@ONLY)
|
||||
|
||||
# What about pcre2-config and libpcre2.pc?
|
||||
# Make sure to not link debug libs
|
||||
# against release libs and vice versa
|
||||
IF(WIN32)
|
||||
SET(CMAKE_DEBUG_POSTFIX "d")
|
||||
ENDIF(WIN32)
|
||||
|
||||
# Generate pkg-config files
|
||||
|
||||
SET(PACKAGE_VERSION "${PCRE2_MAJOR}.${PCRE2_MINOR}")
|
||||
SET(prefix ${CMAKE_INSTALL_PREFIX})
|
||||
|
||||
SET(exec_prefix "\${prefix}")
|
||||
SET(libdir "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
|
||||
SET(includedir "\${prefix}/include")
|
||||
IF(WIN32 AND (CMAKE_BUILD_TYPE MATCHES Debug))
|
||||
SET(LIB_POSTFIX ${CMAKE_DEBUG_POSTFIX})
|
||||
ENDIF()
|
||||
CONFIGURE_FILE(libpcre2-posix.pc.in libpcre2-posix.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-posix.pc")
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_8)
|
||||
CONFIGURE_FILE(libpcre2-8.pc.in libpcre2-8.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-8.pc")
|
||||
SET(enable_pcre2_8 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_8 "no")
|
||||
ENDIF()
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
CONFIGURE_FILE(libpcre2-16.pc.in libpcre2-16.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-16.pc")
|
||||
SET(enable_pcre2_16 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_16 "no")
|
||||
ENDIF()
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
CONFIGURE_FILE(libpcre2-32.pc.in libpcre2-32.pc @ONLY)
|
||||
SET(pkg_config_files ${pkg_config_files} "${CMAKE_CURRENT_BINARY_DIR}/libpcre2-32.pc")
|
||||
SET(enable_pcre2_32 "yes")
|
||||
ELSE()
|
||||
SET(enable_pcre2_32 "no")
|
||||
ENDIF()
|
||||
|
||||
CONFIGURE_FILE(pcre2-config.in pcre2-config @ONLY)
|
||||
|
||||
# Character table generation
|
||||
|
||||
OPTION(PCRE2_REBUILD_CHARTABLES "Rebuild char tables" OFF)
|
||||
IF(PCRE2_REBUILD_CHARTABLES)
|
||||
ADD_EXECUTABLE(dftables src/dftables.c)
|
||||
ADD_EXECUTABLE(pcre2_dftables src/pcre2_dftables.c)
|
||||
ADD_CUSTOM_COMMAND(
|
||||
COMMENT "Generating character tables (pcre2_chartables.c) for current locale"
|
||||
DEPENDS dftables
|
||||
COMMAND dftables
|
||||
DEPENDS pcre2_dftables
|
||||
COMMAND pcre2_dftables
|
||||
ARGS ${PROJECT_BINARY_DIR}/pcre2_chartables.c
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/pcre2_chartables.c
|
||||
)
|
||||
|
@ -474,39 +637,37 @@ SET(PCRE2_SOURCES
|
|||
SET(PCRE2POSIX_HEADERS src/pcre2posix.h)
|
||||
SET(PCRE2POSIX_SOURCES src/pcre2posix.c)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2.rc pcre2.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2 coff info in mingw build)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES
|
||||
${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES
|
||||
${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND NOT PCRE2_STATIC)
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ADD_CUSTOM_COMMAND(OUTPUT ${PROJECT_SOURCE_DIR}/pcre2posix.o
|
||||
PRE-LINK
|
||||
COMMAND windres ARGS pcre2posix.rc pcre2posix.o
|
||||
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
|
||||
COMMENT Using pcre2posix coff info in mingw build)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} ${PROJECT_SOURCE_DIR}/pcre2posix.o)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC AND BUILD_SHARED_LIBS)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-posix.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-posixd.pdb ${dll_pdb_debug_files})
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
SET(PCRE2_SOURCES ${PCRE2_SOURCES} pcre2.rc)
|
||||
ENDIF(EXISTS ${PROJECT_SOURCE_DIR}/pcre2.rc)
|
||||
|
||||
IF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
SET(PCRE2POSIX_SOURCES ${PCRE2POSIX_SOURCES} pcre2posix.rc)
|
||||
ENDIF (EXISTS ${PROJECT_SOURCE_DIR}/pcre2posix.rc)
|
||||
ENDIF(MSVC AND BUILD_SHARED_LIBS)
|
||||
|
||||
# Fix static compilation with MSVC: https://bugs.exim.org/show_bug.cgi?id=1681
|
||||
# This code was taken from the CMake wiki, not from WebM.
|
||||
|
@ -529,71 +690,219 @@ IF(MSVC)
|
|||
ENDIF(MSVC)
|
||||
|
||||
SET(CMAKE_INCLUDE_CURRENT_DIR 1)
|
||||
# needed to make sure to not link debug libs
|
||||
# against release libs and vice versa
|
||||
IF(WIN32)
|
||||
SET(CMAKE_DEBUG_POSTFIX "d")
|
||||
ENDIF(WIN32)
|
||||
|
||||
SET(targets)
|
||||
|
||||
# 8-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_8)
|
||||
ADD_LIBRARY(pcre2-8 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-8
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
|
||||
SET(targets ${targets} pcre2-8)
|
||||
ADD_LIBRARY(pcre2-posix ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_PROPERTY(TARGET pcre2-posix
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8)
|
||||
SET(targets ${targets} pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix pcre2-8)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-8-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix-static STATIC ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION})
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-static pcre2-8-static)
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-posix-static PUBLIC PCRE2_STATIC)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET(targets ${targets} pcre2-posix-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8 pcre2-posix PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8-static)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static PROPERTIES OUTPUT_NAME pcre2-8)
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-static PROPERTIES OUTPUT_NAME pcre2-posix)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-static pcre2-posix-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-8-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-8-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_8_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_8_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_8_VERSION}
|
||||
SOVERSION ${LIBPCRE2_8_SOVERSION}
|
||||
OUTPUT_NAME pcre2-8)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-8-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix-shared SHARED ${PCRE2POSIX_HEADERS} ${PCRE2POSIX_SOURCES})
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-posix-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-posix-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=8
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_POSIX_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_POSIX_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_POSIX_VERSION}
|
||||
SOVERSION ${LIBPCRE2_POSIX_SOVERSION}
|
||||
OUTPUT_NAME pcre2-posix)
|
||||
TARGET_LINK_LIBRARIES(pcre2-posix-shared pcre2-8-shared)
|
||||
SET(targets ${targets} pcre2-posix-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-8.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-8d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-8-shared pcre2-posix-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-static)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-8 ALIAS pcre2-8-shared)
|
||||
ADD_LIBRARY(pcre2-posix ALIAS pcre2-posix-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_8)
|
||||
|
||||
# 16-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_16)
|
||||
ADD_LIBRARY(pcre2-16 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-16
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16)
|
||||
SET(targets ${targets} pcre2-16)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-16-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES OUTPUT_NAME pcre2-16)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-16-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-16-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=16
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_16_VERSION}
|
||||
SOVERSION ${LIBPCRE2_16_SOVERSION}
|
||||
OUTPUT_NAME pcre2-16)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-16-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-16-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-16.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-16d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-16-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-16 ALIAS pcre2-16-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_16)
|
||||
|
||||
# 32-bit library
|
||||
|
||||
IF(PCRE2_BUILD_PCRE2_32)
|
||||
ADD_LIBRARY(pcre2-32 ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
SET_PROPERTY(TARGET pcre2-32
|
||||
PROPERTY COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32)
|
||||
SET(targets ${targets} pcre2-32)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-static STATIC ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-static PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION})
|
||||
TARGET_COMPILE_DEFINITIONS(pcre2-32-static PUBLIC PCRE2_STATIC)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-static Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-static)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32 PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32-static)
|
||||
ELSE(MSVC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES OUTPUT_NAME pcre2-32)
|
||||
ENDIF(MSVC)
|
||||
IF(PCRE2_STATIC_PIC)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-static PROPERTIES POSITION_INDEPENDENT_CODE 1)
|
||||
ENDIF(PCRE2_STATIC_PIC)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
ADD_LIBRARY(pcre2-32-shared SHARED ${PCRE2_HEADERS} ${PCRE2_SOURCES} ${PROJECT_BINARY_DIR}/config.h)
|
||||
TARGET_INCLUDE_DIRECTORIES(pcre2-32-shared PUBLIC ${PROJECT_BINARY_DIR})
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES
|
||||
COMPILE_DEFINITIONS PCRE2_CODE_UNIT_WIDTH=32
|
||||
MACHO_COMPATIBILITY_VERSION "${LIBPCRE2_32_MACHO_COMPATIBILITY_VERSION}"
|
||||
MACHO_CURRENT_VERSION "${LIBPCRE2_32_MACHO_CURRENT_VERSION}"
|
||||
VERSION ${LIBPCRE2_32_VERSION}
|
||||
SOVERSION ${LIBPCRE2_32_SOVERSION}
|
||||
OUTPUT_NAME pcre2-32)
|
||||
IF(REQUIRE_PTHREAD)
|
||||
TARGET_LINK_LIBRARIES(pcre2-32-shared Threads::Threads)
|
||||
ENDIF(REQUIRE_PTHREAD)
|
||||
SET(targets ${targets} pcre2-32-shared)
|
||||
SET(dll_pdb_files ${PROJECT_BINARY_DIR}/pcre2-32.pdb ${dll_pdb_files})
|
||||
SET(dll_pdb_debug_files ${PROJECT_BINARY_DIR}/pcre2-32d.pdb ${dll_pdb_debug_files})
|
||||
|
||||
IF(MINGW)
|
||||
IF(NON_STANDARD_LIB_PREFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES PREFIX "")
|
||||
ENDIF(NON_STANDARD_LIB_PREFIX)
|
||||
IF(NON_STANDARD_LIB_SUFFIX)
|
||||
SET_TARGET_PROPERTIES(pcre2-32-shared PROPERTIES SUFFIX "-0.dll")
|
||||
ENDIF(NON_STANDARD_LIB_SUFFIX)
|
||||
ENDIF(MINGW)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-static)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
ADD_LIBRARY(pcre2-32 ALIAS pcre2-32-shared)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
ENDIF(PCRE2_BUILD_PCRE2_32)
|
||||
|
||||
# Executables
|
||||
|
@ -718,7 +1027,9 @@ if test \"$?\" != \"0\"; then exit 1; fi
|
|||
\@echo off
|
||||
setlocal
|
||||
SET srcdir=\"${winsrc}\"
|
||||
SET pcre2test=\"${winexe}\"
|
||||
# The next line was replaced by the following one after a user comment.
|
||||
# SET pcre2test=\"${winexe}\"
|
||||
SET pcre2test=\"${winbin}\\pcre2test.exe\"
|
||||
if not [%CMAKE_CONFIG_TYPE%]==[] SET pcre2test=\"${winbin}\\%CMAKE_CONFIG_TYPE%\\pcre2test.exe\"
|
||||
call %srcdir%\\RunTest.Bat
|
||||
if errorlevel 1 exit /b 1
|
||||
|
@ -754,42 +1065,44 @@ SET(CMAKE_INSTALL_ALWAYS 1)
|
|||
|
||||
INSTALL(TARGETS ${targets}
|
||||
RUNTIME DESTINATION bin
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib)
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
INSTALL(FILES ${pkg_config_files} DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
||||
INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/pcre2-config"
|
||||
DESTINATION bin
|
||||
# Set 0755 permissions
|
||||
PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
|
||||
|
||||
INSTALL(FILES ${PCRE2_HEADERS} ${PCRE2POSIX_HEADERS} DESTINATION include)
|
||||
|
||||
# CMake config files.
|
||||
set(PCRE2_CONFIG_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config.cmake.in)
|
||||
set(PCRE2_CONFIG_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config.cmake)
|
||||
configure_file(${PCRE2_CONFIG_IN} ${PCRE2_CONFIG_OUT} @ONLY)
|
||||
set(PCRE2_CONFIG_VERSION_IN ${CMAKE_CURRENT_SOURCE_DIR}/cmake/pcre2-config-version.cmake.in)
|
||||
set(PCRE2_CONFIG_VERSION_OUT ${CMAKE_CURRENT_BINARY_DIR}/cmake/pcre2-config-version.cmake)
|
||||
configure_file(${PCRE2_CONFIG_VERSION_IN} ${PCRE2_CONFIG_VERSION_OUT} @ONLY)
|
||||
install(FILES ${PCRE2_CONFIG_OUT} ${PCRE2_CONFIG_VERSION_OUT} DESTINATION cmake)
|
||||
|
||||
FILE(GLOB html ${PROJECT_SOURCE_DIR}/doc/html/*.html)
|
||||
FILE(GLOB man1 ${PROJECT_SOURCE_DIR}/doc/*.1)
|
||||
FILE(GLOB man3 ${PROJECT_SOURCE_DIR}/doc/*.3)
|
||||
|
||||
FOREACH(man ${man3})
|
||||
GET_FILENAME_COMPONENT(man_tmp ${man} NAME)
|
||||
SET(man3_new ${man3} ${man})
|
||||
ENDFOREACH(man ${man3})
|
||||
SET(man3 ${man3_new})
|
||||
|
||||
INSTALL(FILES ${man1} DESTINATION man/man1)
|
||||
INSTALL(FILES ${man3} DESTINATION man/man3)
|
||||
INSTALL(FILES ${html} DESTINATION share/doc/pcre2/html)
|
||||
|
||||
IF(MSVC AND INSTALL_MSVC_PDB)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posix.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${PROJECT_BINARY_DIR}/pcre2d.pdb
|
||||
${PROJECT_BINARY_DIR}/pcre2posixd.pdb
|
||||
DESTINATION bin
|
||||
CONFIGURATIONS Debug)
|
||||
INSTALL(FILES ${dll_pdb_files} DESTINATION bin CONFIGURATIONS RelWithDebInfo)
|
||||
INSTALL(FILES ${dll_pdb_debug_files} DESTINATION bin CONFIGURATIONS Debug)
|
||||
ENDIF(MSVC AND INSTALL_MSVC_PDB)
|
||||
|
||||
# Help, only for nice output
|
||||
IF(BUILD_SHARED_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ELSE(BUILD_SHARED_LIBS)
|
||||
IF(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS ON)
|
||||
ENDIF(BUILD_SHARED_LIBS)
|
||||
ELSE(BUILD_STATIC_LIBS)
|
||||
SET(BUILD_STATIC_LIBS OFF)
|
||||
ENDIF(BUILD_STATIC_LIBS)
|
||||
|
||||
IF(PCRE2_HEAP_MATCH_RECURSE)
|
||||
MESSAGE(WARNING "HEAP_MATCH_RECURSE is obsolete and does nothing.")
|
||||
|
@ -802,7 +1115,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
ENDIF(CMAKE_C_FLAGS)
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS "PCRE2 configuration summary:")
|
||||
MESSAGE(STATUS "PCRE2-${PCRE2_MAJOR}.${PCRE2_MINOR} configuration summary:")
|
||||
MESSAGE(STATUS "")
|
||||
MESSAGE(STATUS " Install prefix .................. : ${CMAKE_INSTALL_PREFIX}")
|
||||
MESSAGE(STATUS " C compiler ...................... : ${CMAKE_C_COMPILER}")
|
||||
|
@ -827,6 +1140,7 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
|
||||
MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")
|
||||
MESSAGE(STATUS " Build static libs ............... : ${BUILD_STATIC_LIBS}")
|
||||
MESSAGE(STATUS " with PIC enabled ............. : ${PCRE2_STATIC_PIC}")
|
||||
MESSAGE(STATUS " Build pcre2grep ................. : ${PCRE2_BUILD_PCRE2GREP}")
|
||||
MESSAGE(STATUS " Enable JIT in pcre2grep ......... : ${PCRE2GREP_SUPPORT_JIT}")
|
||||
MESSAGE(STATUS " Enable callouts in pcre2grep .... : ${PCRE2GREP_SUPPORT_CALLOUT}")
|
||||
|
@ -861,10 +1175,10 @@ IF(PCRE2_SHOW_REPORT)
|
|||
MESSAGE(STATUS " Use %zu and %td ..................: AUTO" )
|
||||
ENDIF(PCRE2_DISABLE_PERCENT_ZT)
|
||||
|
||||
IF(MINGW AND NOT PCRE2_STATIC)
|
||||
IF(MINGW AND BUILD_SHARED_LIBS)
|
||||
MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")
|
||||
MESSAGE(STATUS " Non-standard dll names (suffix) . : ${NON_STANDARD_LIB_SUFFIX}")
|
||||
ENDIF(MINGW AND NOT PCRE2_STATIC)
|
||||
ENDIF(MINGW AND BUILD_SHARED_LIBS)
|
||||
|
||||
IF(MSVC)
|
||||
MESSAGE(STATUS " Install MSVC .pdb files ..........: ${INSTALL_MSVC_PDB}")
|
||||
|
|
747
ChangeLog
747
ChangeLog
|
@ -1,5 +1,746 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
Change Log for PCRE2 - see also the Git log
|
||||
-------------------------------------------
|
||||
|
||||
|
||||
Version 10.41 xx-xxx-2022
|
||||
-------------------------
|
||||
|
||||
1. Add fflush() before and after a fork callout in pcre2grep to get its output
|
||||
to be the same on all systems. (THere were previously ordering differences in
|
||||
Alpine Linux).
|
||||
|
||||
2. Merged patch from @carenas (GitHub #110) for pthreads support in CMake.
|
||||
|
||||
3. SSF scorecards grumbled about possible overflow in an expression in
|
||||
pcre2test. It never would have overflowed in practice, but some casts have been
|
||||
added and at the some time there's been some tidying of fprints that output
|
||||
size_t values.
|
||||
|
||||
4. PR #94 showed up an unused enum in pcre2_convert.c, which is now removed.
|
||||
|
||||
5. Minor code re-arrangement to remove gcc warning about realloc() in
|
||||
pcre2test.
|
||||
|
||||
6. Change a number of int variables that hold buffer and line lengths in
|
||||
pcre2grep to PCRE2_SIZE (aka size_t).
|
||||
|
||||
7. Added an #ifdef to cut out a call to PRIV(jit_free) when JIT is not
|
||||
supported (even though that function would do nothing in that case) at the
|
||||
request of a user who doesn't even want to link with pcre_jit_compile.o. Also
|
||||
tidied up an untidy #ifdef arrangement in pcre2test.
|
||||
|
||||
8. Fixed an issue in the backtracking optimization of character repeats in
|
||||
JIT. Furthermore optimize star repetitions, not just plus repetitions.
|
||||
|
||||
9. Removed the use of an initial backtracking frames vector on the system stack
|
||||
in pcre2_match() so that it now always uses the heap. (In a multi-thread
|
||||
environment with very small stacks there had been an issue.) This also is
|
||||
tidier for JIT matching, which didn't need that vector. The heap vector is now
|
||||
remembered in the match data block and re-used if that block itself is re-used.
|
||||
It is freed with the match data block.
|
||||
|
||||
10. Adjusted the find_limits code in pcre2test to work with change 9 above.
|
||||
|
||||
11. Added find_limits_noheap to pcre2test, because the heap limits are now
|
||||
different in different environments and so cannot be included in the standard
|
||||
tests.
|
||||
|
||||
12. Created a test for pcre2_match() heap processing that is not part of the
|
||||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
1. Merged patch from @carenas (GitHub #35, 7db87842) to fix pcre2grep incorrect
|
||||
handling of multiple passes.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #36, dae47509) to fix portability issue
|
||||
in pcre2grep with buffered fseek(stdin).
|
||||
|
||||
3. Merged patch from @carenas (GitHub #37, acc520924) to fix tests when -S is
|
||||
not supported.
|
||||
|
||||
4. Revert an unintended change in JIT repeat detection.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #52, b037bfa1) to fix build on GNU Hurd.
|
||||
|
||||
6. Merged documentation and comments patches from @carenas (GitHub #47).
|
||||
|
||||
7. Merged patch from @carenas (GitHub #49) to remove obsolete JFriedl test code
|
||||
from pcre2grep.
|
||||
|
||||
8. Merged patch from @carenas (GitHub #48) to fix CMake install issue #46.
|
||||
|
||||
9. Merged patch from @carenas (GitHub #53) fixing NULL checks in matching and
|
||||
substituting.
|
||||
|
||||
10. Add null_subject and null_replacement modifiers to pcre2test.
|
||||
|
||||
11. Add check for NULL subject to POSIX regexec() function.
|
||||
|
||||
12. Add check for NULL replacement to pcre2_substitute().
|
||||
|
||||
13. For the subject arguments of pcre2_match(), pcre2_dfa_match(), and
|
||||
pcre2_substitute(), and the replacement argument of the latter, if the pointer
|
||||
is NULL and the length is zero, treat as an empty string. Apparently a number
|
||||
of applications treat NULL/0 in this way.
|
||||
|
||||
14. Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
15. Fix some minor issues raised by clang sanitize.
|
||||
|
||||
16. Very minor code speed up for maximizing character property matches.
|
||||
|
||||
17. A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
18. The Python scripts in the maint directory have been refactored. There are
|
||||
now three scripts that generate pcre2_ucd.c, pcre2_ucp.h, and pcre2_ucptables.c
|
||||
(which is #included by pcre2_tables.c). The data lists that used to be
|
||||
duplicated are now held in a single common Python module.
|
||||
|
||||
19. On CHERI, and thus Arm's Morello prototype, pointers are represented as
|
||||
hardware capabilities, which consist of both an integer address and additional
|
||||
metadata, meaning they are twice the size of the platform's size_t type, i.e.
|
||||
16 bytes on a 64-bit system. The ovector member of heapframe happens to only be
|
||||
8 byte aligned, and so computing frame_size ended up with a multiple of 8 but
|
||||
not 16. Whilst the first frame was always suitably aligned, this then
|
||||
misaligned the frame that follows, resulting in an alignment fault when storing
|
||||
a pointer to Fecode at the start of match. Patch to fix this issue by Jessica
|
||||
Clarke PR#72.
|
||||
|
||||
20. Added -LP and -LS listing options to pcre2test.
|
||||
|
||||
21. A user discovered that the library names in CMakeLists.txt for MSVC
|
||||
debugger (PDB) files were incorrect - perhaps never tried for PCRE2?
|
||||
|
||||
22. An item such as [Aa] is optimized into a caseless single character match.
|
||||
When this was quantified (e.g. [Aa]{2}) and was also the last literal item in a
|
||||
pattern, the optimizing "must be present for a match" character check was not
|
||||
being flagged as caseless, causing some matches that should have succeeded to
|
||||
fail.
|
||||
|
||||
23. Fixed a unicode property matching issue in JIT. The character was not
|
||||
fully read in caseless matching.
|
||||
|
||||
24. Fixed an issue affecting recursions in JIT caused by duplicated data
|
||||
transfers.
|
||||
|
||||
25. Merged patch from @carenas (GitHub #96) which fixes some problems with
|
||||
pcre2test and readline/readedit:
|
||||
|
||||
* Use the right header for libedit in FreeBSD with autoconf
|
||||
* Really allow libedit with cmake
|
||||
* Avoid using readline headers with libedit
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Merged patch from @carenas (GitHub #28):
|
||||
|
||||
Visual Studio 2013 includes support for %zu and %td, so let newer
|
||||
versions of it avoid the fallback, and while at it, make sure that
|
||||
the first check is for DISABLE_PERCENT_ZT so it will be always
|
||||
honoured if chosen.
|
||||
|
||||
prtdiff_t is signed, so use a signed type instead, and make sure
|
||||
that an appropriate width is chosen if pointers are 64bit wide and
|
||||
long is not (ex: Windows 64bit).
|
||||
|
||||
IMHO removing the cast (and therefore the possibilty of truncation)
|
||||
make the code cleaner and the fallback is likely portable enough
|
||||
with all 64-bit POSIX systems doing LP64 except for Windows.
|
||||
|
||||
3. Merged patch from @carenas (GitHub #29) to update to Unicode 14.0.0.
|
||||
|
||||
4. Merged patch from @carenas (GitHub #30):
|
||||
|
||||
* Cleanup: remove references to no longer used stdint.h
|
||||
|
||||
Since 19c50b9d (Unconditionally use inttypes.h instead of trying for stdint.h
|
||||
(simplification) and remove the now unnecessary inclusion in
|
||||
pcre2_internal.h., 2018-11-14), stdint.h is no longer used.
|
||||
|
||||
Remove checks for it in autotools and CMake and document better the expected
|
||||
build failures for systems that might have stdint.h (C99) and not inttypes.h
|
||||
(from POSIX), like old Windows.
|
||||
|
||||
* Cleanup: remove detection for inttypes.h which is a hard dependency
|
||||
|
||||
CMake checks for standard headers are not meant to be used for hard
|
||||
dependencies, so will prevent a possible fallback to work.
|
||||
|
||||
Alternatively, the header could be checked to make the configuration fail
|
||||
instead of breaking the build, but that was punted, as it was missing anyway
|
||||
from autotools.
|
||||
|
||||
5. Merged patch from @carenas (GitHub #32):
|
||||
|
||||
* jit: allow building with ancient MSVC versions
|
||||
|
||||
Visual Studio older than 2013 fails to build with JIT enabled, because it is
|
||||
unable to parse non C89 compatible syntax, with mixed declarations and code.
|
||||
While most recent compilers wouldn't even report this as a warning since it
|
||||
is valid C99, it could be also made visible by adding to gcc/clang the
|
||||
-Wdeclaration-after-statement flag at build time.
|
||||
|
||||
Move the code below the affected definitions.
|
||||
|
||||
* pcre2grep: avoid mixing declarations with code
|
||||
|
||||
Since d5a61ee8 (Patch to detect (and ignore) symlink loops in pcre2grep,
|
||||
2021-08-28), code will fail to build in a strict C89 compiler.
|
||||
|
||||
Reformat slightly to make it C89 compatible again.
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
1. Fix invalid single character repetition issues in JIT when the repetition
|
||||
is inside a capturing bracket and the bracket is preceded by character
|
||||
literals.
|
||||
|
||||
2. Installed revised CMake configuration files provided by Jan-Willem Blokland.
|
||||
This extends the CMake build system to build both static and shared libraries
|
||||
in one go, builds the static library with PIC, and exposes PCRE2 libraries
|
||||
using the CMake config files. JWB provided these notes:
|
||||
|
||||
- Introduced CMake variable BUILD_STATIC_LIBS to build the static library.
|
||||
|
||||
- Make a small modification to config-cmake.h.in by removing the PCRE2_STATIC
|
||||
variable. Added PCRE2_STATIC variable to the static build using the
|
||||
target_compile_definitions() function.
|
||||
|
||||
- Extended the CMake config files.
|
||||
|
||||
- Introduced CMake variable PCRE2_USE_STATIC_LIBS to easily switch between
|
||||
the static and shared libraries.
|
||||
|
||||
- Added the PCRE_STATIC variable to the target compile definitions for the
|
||||
import of the static library.
|
||||
|
||||
Building static and shared libraries using MSVC results in a name clash of
|
||||
the libraries. Both static and shared library builds create, for example, the
|
||||
file pcre2-8.lib. Therefore, I decided to change the static library names by
|
||||
adding "-static". For example, pcre2-8.lib has become pcre2-8-static.lib.
|
||||
[Comment by PH: this is MSVC-specific. It doesn't happen on Linux.]
|
||||
|
||||
3. Increased the minimum release number for CMake to 3.0.0 because older than
|
||||
2.8.12 is deprecated (it was set to 2.8.5) and causes warnings. Even 3.0.0 is
|
||||
quite old; it was released in 2014.
|
||||
|
||||
4. Implemented a modified version of Thomas Tempelmann's pcre2grep patch for
|
||||
detecting symlink loops. This is dependent on the availability of realpath(),
|
||||
which is now tested for in ./configure and CMakeLists.txt.
|
||||
|
||||
5. Implemented a modified version of Thomas Tempelmann's patch for faster
|
||||
case-independent "first code unit" searches for unanchored patterns in 8-bit
|
||||
mode in the interpreters. Instead of just remembering whether one case matched
|
||||
or not, it remembers the position of a previous match so as to avoid
|
||||
unnecessary repeated searching.
|
||||
|
||||
6. Perl now locks out \K in lookarounds, so PCRE2 now does the same by default.
|
||||
However, just in case anybody was relying on the old behaviour, there is an
|
||||
option called PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK that enables the old behaviour.
|
||||
An option has also been added to pcre2grep to enable this.
|
||||
|
||||
7. Re-enable a JIT optimization which was unintentionally disabled in 10.35.
|
||||
|
||||
8. There is a loop counter to catch excessively crazy patterns when checking
|
||||
the lengths of lookbehinds at compile time. This was incorrectly getting reset
|
||||
whenever a lookahead was processed, leading to some fuzzer-generated patterns
|
||||
taking a very long time to compile when (?|) was present in the pattern,
|
||||
because (?|) disables caching of group lengths.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
||||
1. Change RunGrepTest to use tr instead of sed when testing with binary
|
||||
zero bytes, because sed varies a lot from system to system and has problems
|
||||
with binary zeros. This is from Bugzilla #2681. Patch from Jeremie
|
||||
Courreges-Anglas via Nam Nguyen. This fixes RunGrepTest for OpenBSD. Later:
|
||||
it broke it for at least one version of Solaris, where tr can't handle binary
|
||||
zeros. However, that system had /usr/xpg4/bin/tr installed, which works OK, so
|
||||
RunGrepTest now checks for that command and uses it if found.
|
||||
|
||||
2. Compiling with gcc 10.2's -fanalyzer option showed up a hypothetical problem
|
||||
with a NULL dereference. I don't think this case could ever occur in practice,
|
||||
but I have put in a check in order to get rid of the compiler error.
|
||||
|
||||
3. An alternative patch for CMakeLists.txt because 10.36 #4 breaks CMake on
|
||||
Windows. Patch from email@cs-ware.de fixes bugzilla #2688.
|
||||
|
||||
4. Two bugs related to over-large numbers have been fixed so the behaviour is
|
||||
now the same as Perl.
|
||||
|
||||
(a) A pattern such as /\214748364/ gave an overflow error instead of being
|
||||
treated as the octal number \214 followed by literal digits.
|
||||
|
||||
(b) A sequence such as {65536 that has no terminating } so is not a
|
||||
quantifier was nevertheless complaining that a quantifier number was too big.
|
||||
|
||||
5. A run of autoconf suggested that configure.ac was out-of-date with respect
|
||||
to the lastest autoconf. Running autoupdate made some valid changes, some valid
|
||||
suggestions, and also some invalid changes, which were fixed by hand. Autoconf
|
||||
now runs clean and the resulting "configure" seems to work, so I hope nothing
|
||||
is broken. Later: the requirement for autoconf 2.70 broke some automatic test
|
||||
robots. It doesn't seem to be necessary: trying a reduction to 2.60.
|
||||
|
||||
6. The pattern /a\K.(?0)*/ when matched against "abac" by the interpreter gave
|
||||
the answer "bac", whereas Perl and JIT both yield "c". This was because the
|
||||
effect of \K was not propagating back from the full pattern recursion. Other
|
||||
recursions such as /(a\K.(?1)*)/ did not have this problem.
|
||||
|
||||
7. Restore single character repetition optimization in JIT. Currently fewer
|
||||
character repetitions are optimized than in 10.34.
|
||||
|
||||
8. When the names of the functions in the POSIX wrapper were changed to
|
||||
pcre2_regcomp() etc. (see change 10.33 #4 below), functions with the original
|
||||
names were left in the library so that pre-compiled programs would still work.
|
||||
However, this has proved troublesome when programs link with several libraries,
|
||||
some of which use PCRE2 via the POSIX interface while others use a native POSIX
|
||||
library. For this reason, the POSIX function names are removed in this release.
|
||||
The macros in pcre2posix.h should ensure that re-compiling fixes any programs
|
||||
that haven't been compiled since before 10.33.
|
||||
|
||||
|
||||
Version 10.36 04-December-2020
|
||||
------------------------------
|
||||
|
||||
1. Add CET_CFLAGS so that when Intel CET is enabled, pass -mshstk to
|
||||
compiler. This fixes https://bugs.exim.org/show_bug.cgi?id=2578. Patch for
|
||||
Makefile.am and configure.ac by H.J. Lu. Equivalent patch for CMakeLists.txt
|
||||
invented by PH.
|
||||
|
||||
2. Fix inifinite loop when a single byte newline is searched in JIT when
|
||||
invalid utf8 mode is enabled.
|
||||
|
||||
3. Updated CMakeLists.txt with patch from Wolfgang Stöggl (Bugzilla #2584):
|
||||
|
||||
- Include GNUInstallDirs and use ${CMAKE_INSTALL_LIBDIR} instead of hardcoded
|
||||
lib. This allows differentiation between lib and lib64.
|
||||
CMAKE_INSTALL_LIBDIR is used for installation of libraries and also for
|
||||
pkgconfig file generation.
|
||||
|
||||
- Add the version of PCRE2 to the configuration summary like ./configure
|
||||
does.
|
||||
|
||||
- Fix typo: MACTHED_STRING->MATCHED_STRING
|
||||
|
||||
4. Updated CMakeLists.txt with another patch from Wolfgang Stöggl (Bugzilla
|
||||
#2588):
|
||||
|
||||
- Add escaped double quotes around include directory in CMakeLists.txt to
|
||||
allow spaces in directory names.
|
||||
|
||||
- This fixes a cmake error, if the path of the pcre2 source contains a space.
|
||||
|
||||
5. Updated CMakeLists.txt with a patch from B. Scott Michel: CMake's
|
||||
documentation suggests using CHECK_SYMBOL_EXISTS over CHECK_FUNCTION_EXIST.
|
||||
Moreover, these functions come from specific header files, which need to be
|
||||
specified (and, thankfully, are the same on both the Linux and WinXX
|
||||
platforms.)
|
||||
|
||||
6. Added a (uint32_t) cast to prevent a compiler warning in pcre2_compile.c.
|
||||
|
||||
7. Applied a patch from Wolfgang Stöggl (Bugzilla #2600) to fix postfix for
|
||||
debug Windows builds using CMake. This also updated configure so that it
|
||||
generates *.pc files and pcre2-config with the same content, as in the past.
|
||||
|
||||
8. If a pattern ended with (?(VERSION=n.d where n is any number but d is just a
|
||||
single digit, the code unit beyond d was being read (i.e. there was a read
|
||||
buffer overflow). Fixes ClusterFuzz 23779.
|
||||
|
||||
9. After the rework in r1235, certain character ranges were incorrectly
|
||||
handled by an optimization in JIT. Furthermore a wrong offset was used to
|
||||
read a value from a buffer which could lead to memory overread.
|
||||
|
||||
10. Unnoticed for many years was the fact that delimiters other than / in the
|
||||
testinput1 and testinput4 files could cause incorrect behaviour when these
|
||||
files were processed by perltest.sh. There were several tests that used quotes
|
||||
as delimiters, and it was just luck that they didn't go wrong with perltest.sh.
|
||||
All the patterns in testinput1 and testinput4 now use / as their delimiter.
|
||||
This fixes Bugzilla #2641.
|
||||
|
||||
11. Perl has started to give an error for \K within lookarounds (though there
|
||||
are cases where it doesn't). PCRE2 still allows this, so the tests that include
|
||||
this case have been moved from test 1 to test 2.
|
||||
|
||||
12. Further to 10 above, pcre2test has been updated to detect and grumble if a
|
||||
delimiter other than / is used after #perltest.
|
||||
|
||||
13. Fixed a bug with PCRE2_MATCH_INVALID_UTF in 8-bit mode when PCRE2_CASELESS
|
||||
was set and PCRE2_NO_START_OPTIMIZE was not set. The optimization for finding
|
||||
the start of a match was not resetting correctly after a failed match on the
|
||||
first valid fragment of the subject, possibly causing incorrect "no match"
|
||||
returns on subsequent fragments. For example, the pattern /A/ failed to match
|
||||
the subject \xe5A. Fixes Bugzilla #2642.
|
||||
|
||||
14. Fixed a bug in character set matching when JIT is enabled and both unicode
|
||||
scripts and unicode classes are present at the same time.
|
||||
|
||||
15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
|
||||
|
||||
16. Refactored substitution processing in pcre2grep strings, both for the -O
|
||||
option and when dealing with callouts. There is now a single function that
|
||||
handles $ expansion in all cases (instead of multiple copies of almost
|
||||
identical code). This means that the same escape sequences are available
|
||||
everywhere, which was not previously the case. At the same time, the escape
|
||||
sequences $x{...} and $o{...} have been introduced, to allow for characters
|
||||
whose code points are greater than 255 in Unicode mode.
|
||||
|
||||
17. Applied the patch from Bugzilla #2628 to RunGrepTest. This does an explicit
|
||||
test for a version of sed that can handle binary zero, instead of assuming that
|
||||
any Linux version will work. Later: replaced $(...) by `...` because not all
|
||||
shells recognize the former.
|
||||
|
||||
18. Fixed a word boundary check bug in JIT when partial matching is enabled.
|
||||
|
||||
19. Fix ARM64 compilation warning in JIT. Patch by Carlo.
|
||||
|
||||
20. A bug in the RunTest script meant that if the first part of test 2 failed,
|
||||
the failure was not reported.
|
||||
|
||||
21. Test 2 was failing when run from a directory other than the source
|
||||
directory. This failure was previously missed in RunTest because of 20 above.
|
||||
Fixes added to both RunTest and RunTest.bat.
|
||||
|
||||
22. Patch to CMakeLists.txt from Daniel to fix problem with testing under
|
||||
Windows.
|
||||
|
||||
|
||||
Version 10.35 09-May-2020
|
||||
---------------------------
|
||||
|
||||
1. Use PCRE2_MATCH_EMPTY flag to detect empty matches in JIT.
|
||||
|
||||
2. Fix ARMv5 JIT improper handling of labels right after a constant pool.
|
||||
|
||||
3. A JIT bug is fixed which allowed to read the fields of the compiled
|
||||
pattern before its existence is checked.
|
||||
|
||||
4. Back in the PCRE1 day, capturing groups that contained recursive back
|
||||
references to themselves were made atomic (version 8.01, change 18) because
|
||||
after the end a repeated group, the captured substrings had their values from
|
||||
the final repetition, not from an earlier repetition that might be the
|
||||
destination of a backtrack. This feature was documented, and was carried over
|
||||
into PCRE2. However, it has now been realized that the major refactoring that
|
||||
was done for 10.30 has made this atomicizing unnecessary, and it is confusing
|
||||
when users are unaware of it, making some patterns appear not to be working as
|
||||
expected. Capture values of recursive back references in repeated groups are
|
||||
now correctly backtracked, so this unnecessary restriction has been removed.
|
||||
|
||||
5. Added PCRE2_SUBSTITUTE_LITERAL.
|
||||
|
||||
6. Avoid some VS compiler warnings.
|
||||
|
||||
7. Added PCRE2_SUBSTITUTE_MATCHED.
|
||||
|
||||
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
|
||||
regex engine. The Perl regex folks are aware of this usage and have made a note
|
||||
about it.
|
||||
|
||||
9. When an assertion is repeated, PCRE2 used to limit the maximum repetition to
|
||||
1, believing that repeating an assertion is pointless. However, if a positive
|
||||
assertion contains capturing groups, repetition can be useful. In any case, an
|
||||
assertion could always be wrapped in a repeated group. The only restriction
|
||||
that is now imposed is that an unlimited maximum is changed to one more than
|
||||
the minimum.
|
||||
|
||||
10. Fix *THEN verbs in lookahead assertions in JIT.
|
||||
|
||||
11. Added PCRE2_SUBSTITUTE_REPLACEMENT_ONLY.
|
||||
|
||||
12. The JIT stack should be freed when the low-level stack allocation fails.
|
||||
|
||||
13. In pcre2grep, if the final line in a scanned file is output but does not
|
||||
end with a newline sequence, add a newline according to the --newline setting.
|
||||
|
||||
14. (?(DEFINE)...) groups were not being handled correctly when checking for
|
||||
the fixed length of a lookbehind assertion. Such a group within a lookbehind
|
||||
should be skipped, as it does not contribute to the length of the group.
|
||||
Instead, the (DEFINE) group was being processed, and if at the end of the
|
||||
lookbehind, that end was not correctly recognized. Errors such as "lookbehind
|
||||
assertion is not fixed length" and also "internal error: bad code value in
|
||||
parsed_skip()" could result.
|
||||
|
||||
15. Put a limit of 1000 on recursive calls in pcre2_study() when searching
|
||||
nested groups for starting code units, in order to avoid stack overflow issues.
|
||||
If the limit is reached, it just gives up trying for this optimization.
|
||||
|
||||
16. The control verb chain list must always be restored when exiting from a
|
||||
recurse function in JIT.
|
||||
|
||||
17. Fix a crash which occurs when the character type of an invalid UTF
|
||||
character is decoded in JIT.
|
||||
|
||||
18. Changes in many areas of the code so that when Unicode is supported and
|
||||
PCRE2_UCP is set without PCRE2_UTF, Unicode character properties are used for
|
||||
upper/lower case computations on characters whose code points are greater than
|
||||
127.
|
||||
|
||||
19. The function for checking UTF-16 validity was returning an incorrect offset
|
||||
for the start of the error when a high surrogate was not followed by a valid
|
||||
low surrogate. This caused incorrect behaviour, for example when
|
||||
PCRE2_MATCH_INVALID_UTF was set and a match started immediately following the
|
||||
invalid high surrogate, such as /aa/ matching "\x{d800}aa".
|
||||
|
||||
20. If a DEFINE group immediately preceded a lookbehind assertion, the pattern
|
||||
could be mis-compiled and therefore not match correctly. This is the example
|
||||
that found this: /(?(DEFINE)(?<foo>bar))(?<![-a-z0-9])word/ which failed to
|
||||
match "word" because the "move back" value was set to zero.
|
||||
|
||||
21. Following a request from a user, some extensions and tidies to the
|
||||
character tables handling have been done:
|
||||
|
||||
(a) The dftables auxiliary program is renamed pcre2_dftables, but it is still
|
||||
not installed for public use.
|
||||
|
||||
(b) There is now a -b option for pcre2_dftables, which causes the tables to
|
||||
be written in binary. There is also a -help option.
|
||||
|
||||
(c) PCRE2_CONFIG_TABLES_LENGTH is added to pcre2_config() so that an
|
||||
application that wants to save tables in binary knows how long they are.
|
||||
|
||||
22. Changed setting of CMAKE_MODULE_PATH in CMakeLists.txt from SET to
|
||||
LIST(APPEND...) to allow a setting from the command line to be included.
|
||||
|
||||
23. Updated to Unicode 13.0.0.
|
||||
|
||||
24. CMake build now checks for secure_getenv() and strerror(). Patch by Carlo.
|
||||
|
||||
25. Avoid using [-1] as a suffix in pcre2test because it can provoke a compiler
|
||||
warning.
|
||||
|
||||
26. Added tests for __attribute__((uninitialized)) to both the configure and
|
||||
CMake build files, and then applied this attribute to the variable called
|
||||
stack_frames_vector[] in pcre2_match(). When implemented, this disables
|
||||
automatic initialization (a facility in clang), which can take time on big
|
||||
variables.
|
||||
|
||||
27. Updated CMakeLists.txt (patches by Uwe Korn) to add support for
|
||||
pcre2-config, the libpcre*.pc files, SOVERSION, VERSION and the
|
||||
MACHO_*_VERSIONS settings for CMake builds.
|
||||
|
||||
28. Another patch to CMakeLists.txt to check for mkostemp (configure already
|
||||
does). Patch by Carlo Marcelo Arenas Belon.
|
||||
|
||||
29. Check for the existence of memfd_create in both CMake and configure
|
||||
configurations. Patch by Carlo Marcelo Arenas Belon.
|
||||
|
||||
30. Restrict the configuration setting for the SELinux compatible execmem
|
||||
allocator (change 10.30/44) to Linux and NetBSD.
|
||||
|
||||
|
||||
Version 10.34 21-November-2019
|
||||
------------------------------
|
||||
|
||||
1. The maximum number of capturing subpatterns is 65535 (documented), but no
|
||||
check on this was ever implemented. This omission has been rectified; it fixes
|
||||
ClusterFuzz 14376.
|
||||
|
||||
2. Improved the invalid utf32 support of the JIT compiler. Now it correctly
|
||||
detects invalid characters in the 0xd800-0xdfff range.
|
||||
|
||||
3. Fix minor typo bug in JIT compile when \X is used in a non-UTF string.
|
||||
|
||||
4. Add support for matching in invalid UTF strings to the pcre2_match()
|
||||
interpreter, and integrate with the existing JIT support via the new
|
||||
PCRE2_MATCH_INVALID_UTF compile-time option.
|
||||
|
||||
5. Give more error detail for invalid UTF-8 when detected in pcre2grep.
|
||||
|
||||
6. Add support for invalid UTF-8 to pcre2grep.
|
||||
|
||||
7. Adjust the limit for "must have" code unit searching, in particular,
|
||||
increase it substantially for non-anchored patterns.
|
||||
|
||||
8. Allow (*ACCEPT) to be quantified, because an ungreedy quantifier with a zero
|
||||
minimum is potentially useful.
|
||||
|
||||
9. Some changes to the way the minimum subject length is handled:
|
||||
|
||||
* When PCRE2_NO_START_OPTIMIZE is set, no minimum length is computed;
|
||||
pcre2test now omits this item instead of showing a value of zero.
|
||||
|
||||
* An incorrect minimum length could be calculated for a pattern that
|
||||
contained (*ACCEPT) inside a qualified group whose minimum repetition was
|
||||
zero, for example /A(?:(*ACCEPT))?B/, which incorrectly computed a minimum
|
||||
of 2. The minimum length scan no longer happens for a pattern that
|
||||
contains (*ACCEPT).
|
||||
|
||||
* When no minimum length is set by the normal scan, but a first and/or last
|
||||
code unit is recorded, set the minimum to 1 or 2 as appropriate.
|
||||
|
||||
* When a pattern contains multiple groups with the same number, a back
|
||||
reference cannot know which one to scan for a minimum length. This used to
|
||||
cause the minimum length finder to give up with no result. Now it treats
|
||||
such references as not adding to the minimum length (which it should have
|
||||
done all along).
|
||||
|
||||
* Furthermore, the above action now happens only if the back reference is to
|
||||
a group that exists more than once in a pattern instead of any back
|
||||
reference in a pattern with duplicate numbers.
|
||||
|
||||
10. A (*MARK) value inside a successful condition was not being returned by the
|
||||
interpretive matcher (it was returned by JIT). This bug has been mended.
|
||||
|
||||
11. A bug in pcre2grep meant that -o without an argument (or -o0) didn't work
|
||||
if the pattern had more than 32 capturing parentheses. This is fixed. In
|
||||
addition (a) the default limit for groups requested by -o<n> has been raised to
|
||||
50, (b) the new --om-capture option changes the limit, (c) an error is raised
|
||||
if -o asks for a group that is above the limit.
|
||||
|
||||
12. The quantifier {1} was always being ignored, but this is incorrect when it
|
||||
is made possessive and applied to an item in parentheses, because a
|
||||
parenthesized item may contain multiple branches or other backtracking points,
|
||||
for example /(a|ab){1}+c/ or /(a+){1}+a/.
|
||||
|
||||
13. For partial matches, pcre2test was always showing the maximum lookbehind
|
||||
characters, flagged with "<", which is misleading when the lookbehind didn't
|
||||
actually look behind the start (because it was later in the pattern). Showing
|
||||
all consulted preceding characters for partial matches is now controlled by the
|
||||
existing "allusedtext" modifier and, as for complete matches, this facility is
|
||||
available only for non-JIT matching, because JIT does not maintain the first
|
||||
and last consulted characters.
|
||||
|
||||
14. DFA matching (using pcre2_dfa_match()) was not recognising a partial match
|
||||
if the end of the subject was encountered in a lookahead (conditional or
|
||||
otherwise), an atomic group, or a recursion.
|
||||
|
||||
15. Give error if pcre2test -t, -T, -tm or -TM is given an argument of zero.
|
||||
|
||||
16. Check for integer overflow when computing lookbehind lengths. Fixes
|
||||
Clusterfuzz issue 15636.
|
||||
|
||||
17. Implemented non-atomic positive lookaround assertions.
|
||||
|
||||
18. If a lookbehind contained a lookahead that contained another lookbehind
|
||||
within it, the nested lookbehind was not correctly processed. For example, if
|
||||
/(?<=(?=(?<=a)))b/ was matched to "ab" it gave no match instead of matching
|
||||
"b".
|
||||
|
||||
19. Implemented pcre2_get_match_data_size().
|
||||
|
||||
20. Two alterations to partial matching:
|
||||
|
||||
(a) The definition of a partial match is slightly changed: if a pattern
|
||||
contains any lookbehinds, an empty partial match may be given, because this
|
||||
is another situation where adding characters to the current subject can
|
||||
lead to a full match. Example: /c*+(?<=[bc])/ with subject "ab".
|
||||
|
||||
(b) Similarly, if a pattern could match an empty string, an empty partial
|
||||
match may be given. Example: /(?![ab]).*/ with subject "ab". This case
|
||||
applies only to PCRE2_PARTIAL_HARD.
|
||||
|
||||
(c) An empty string partial hard match can be returned for \z and \Z as it
|
||||
is documented that they shouldn't match.
|
||||
|
||||
21. A branch that started with (*ACCEPT) was not being recognized as one that
|
||||
could match an empty string.
|
||||
|
||||
22. Corrected pcre2_set_character_tables() tables data type: was const unsigned
|
||||
char * instead of const uint8_t *, as generated by pcre2_maketables().
|
||||
|
||||
23. Upgraded to Unicode 12.1.0.
|
||||
|
||||
24. Add -jitfast command line option to pcre2test (to make all the jit options
|
||||
available directly).
|
||||
|
||||
25. Make pcre2test -C show if libreadline or libedit is supported.
|
||||
|
||||
26. If the length of one branch of a group exceeded 65535 (the maximum value
|
||||
that is remembered as a minimum length), the whole group's length was
|
||||
incorrectly recorded as 65535, leading to incorrect "no match" when start-up
|
||||
optimizations were in force.
|
||||
|
||||
27. The "rightmost consulted character" value was not always correct; in
|
||||
particular, if a pattern ended with a negative lookahead, characters that were
|
||||
inspected in that lookahead were not included.
|
||||
|
||||
28. Add the pcre2_maketables_free() function.
|
||||
|
||||
29. The start-up optimization that looks for a unique initial matching
|
||||
code unit in the interpretive engines uses memchr() in 8-bit mode. When the
|
||||
search is caseless, it was doing so inefficiently, which ended up slowing down
|
||||
the match drastically when the subject was very long. The revised code (a)
|
||||
remembers if one case is not found, so it never repeats the search for that
|
||||
case after a bumpalong and (b) when one case has been found, it searches only
|
||||
up to that position for an earlier occurrence of the other case. This fix
|
||||
applies to both interpretive pcre2_match() and to pcre2_dfa_match().
|
||||
|
||||
30. While scanning to find the minimum length of a group, if any branch has
|
||||
minimum length zero, there is no need to scan any subsequent branches (a small
|
||||
compile-time performance improvement).
|
||||
|
||||
31. Installed a .gitignore file on a user's suggestion. When using the svn
|
||||
repository with git (through git svn) this helps keep it tidy.
|
||||
|
||||
32. Add underflow check in JIT which may occur when the value of subject
|
||||
string pointer is close to 0.
|
||||
|
||||
33. Arrange for classes such as [Aa] which contain just the two cases of the
|
||||
same character, to be treated as a single caseless character. This causes the
|
||||
first and required code unit optimizations to kick in where relevant.
|
||||
|
||||
34. Improve the bitmap of starting bytes for positive classes that include wide
|
||||
characters, but no property types, in UTF-8 mode. Previously, on encountering
|
||||
such a class, the bits for all bytes greater than \xc4 were set, thus
|
||||
specifying any character with codepoint >= 0x100. Now the only bits that are
|
||||
set are for the relevant bytes that start the wide characters. This can give a
|
||||
noticeable performance improvement.
|
||||
|
||||
35. If the bitmap of starting code units contains only 1 or 2 bits, replace it
|
||||
with a single starting code unit (1 bit) or a caseless single starting code
|
||||
unit if the two relevant characters are case-partners. This is particularly
|
||||
relevant to the 8-bit library, though it applies to all. It can give a
|
||||
performance boost for patterns such as [Ww]ord and (word|WORD). However, this
|
||||
optimization doesn't happen if there is a "required" code unit of the same
|
||||
value (because the search for a "required" code unit starts at the match start
|
||||
for non-unique first code unit patterns, but after a unique first code unit,
|
||||
and patterns such as a*a need the former action).
|
||||
|
||||
36. Small patch to pcre2posix.c to set the erroroffset field to -1 immediately
|
||||
after a successful compile, instead of at the start of matching to avoid a
|
||||
sanitizer complaint (regexec is supposed to be thread safe).
|
||||
|
||||
37. Add NEON vectorization to JIT to speed up matching of first character and
|
||||
pairs of characters on ARM64 CPUs.
|
||||
|
||||
38. If a non-ASCII character was the first in a starting assertion in a
|
||||
caseless match, the "first code unit" optimization did not get the casing
|
||||
right, and the assertion failed to match a character in the other case if it
|
||||
did not start with the same code unit.
|
||||
|
||||
39. Fixed the incorrect computation of jump sizes on x86 CPUs in JIT. A masking
|
||||
operation was incorrectly removed in r1136. Reported by Ralf Junker.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
|
@ -153,7 +894,7 @@ Patch by Guillem Jover.
|
|||
warnings were reported.
|
||||
|
||||
38. Using the clang compiler with sanitizing options causes runtime complaints
|
||||
about truncation for statments such as x = ~x when x is an 8-bit value; it
|
||||
about truncation for statements such as x = ~x when x is an 8-bit value; it
|
||||
seems to compute ~x as a 32-bit value. Changing such statements to x = 255 ^ x
|
||||
gets rid of the warnings. There were also two missing casts in pcre2test.
|
||||
|
||||
|
|
25
CheckMan
25
CheckMan
|
@ -16,6 +16,7 @@ while (scalar(@ARGV) > 0)
|
|||
|
||||
while (<IN>)
|
||||
{
|
||||
$count = 0;
|
||||
$line++;
|
||||
if (/^\s*$/)
|
||||
{
|
||||
|
@ -50,14 +51,24 @@ while (scalar(@ARGV) > 0)
|
|||
$yield = 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
elsif (/\\[^ef]|\\f[^IBP]/)
|
||||
{
|
||||
if (/\\[^ef]|\\f[^IBP]/)
|
||||
{
|
||||
printf "Bad backslash in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
}
|
||||
printf "Bad backslash in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
while (/\\f[BI]/g)
|
||||
{
|
||||
$count++;
|
||||
}
|
||||
while (/\\fP/g)
|
||||
{
|
||||
$count--;
|
||||
}
|
||||
if ($count != 0)
|
||||
{
|
||||
printf "Mismatching formatting in line $line of $file\n";
|
||||
$yield = 1;
|
||||
}
|
||||
}
|
||||
|
||||
close(IN);
|
||||
|
|
94
HACKING
94
HACKING
|
@ -8,8 +8,8 @@ library is referred to as PCRE1 below. For information about testing PCRE2, see
|
|||
the pcre2test documentation and the comment at the head of the RunTest file.
|
||||
|
||||
PCRE1 releases were up to 8.3x when PCRE2 was developed, and later bug fix
|
||||
releases remain in the 8.xx series. PCRE2 releases started at 10.00 to avoid
|
||||
confusion with PCRE1.
|
||||
releases carried on the 8.xx series, up to the final 8.45 release. PCRE2
|
||||
releases started at 10.00 to avoid confusion with PCRE1.
|
||||
|
||||
|
||||
Historical note 1
|
||||
|
@ -38,8 +38,8 @@ Historical note 2
|
|||
By contrast, the code originally written by Henry Spencer (which was
|
||||
subsequently heavily modified for Perl) compiles the expression twice: once in
|
||||
a dummy mode in order to find out how much store will be needed, and then for
|
||||
real. (The Perl version probably doesn't do this any more; I'm talking about
|
||||
the original library.) The execution function operates by backtracking and
|
||||
real. (The Perl version may or may not still do this; I'm talking about the
|
||||
original library.) The execution function operates by backtracking and
|
||||
maximizing (or, optionally, minimizing, in Perl) the amount of the subject that
|
||||
matches individual wild portions of the pattern. This is an "NFA algorithm" in
|
||||
Friedl's terminology.
|
||||
|
@ -151,8 +151,8 @@ of code units in the item itself. The exception is the aforementioned large
|
|||
advance to check for such values. When auto-callouts are enabled, the generous
|
||||
assumption is made that there will be a callout for each pattern code unit
|
||||
(which of course is only actually true if all code units are literals) plus one
|
||||
at the end. There is a default parsed pattern vector on the system stack, but
|
||||
if this is not big enough, heap memory is used.
|
||||
at the end. A default parsed pattern vector is defined on the system stack, to
|
||||
minimize memory handling, but if this is not big enough, heap memory is used.
|
||||
|
||||
As before, the actual compiling function is run twice, the first time to
|
||||
determine the amount of memory needed for the final compiled pattern. It
|
||||
|
@ -187,7 +187,7 @@ META_CLASS_EMPTY [] empty class - only with PCRE2_ALLOW_EMPTY_CLASS
|
|||
META_CLASS_EMPTY_NOT [^] negative empty class - ditto
|
||||
META_CLASS_END ] end of non-empty class
|
||||
META_CLASS_NOT [^ start non-empty negative class
|
||||
META_COMMIT (*COMMIT)
|
||||
META_COMMIT (*COMMIT) - no argument (see below for with argument)
|
||||
META_COND_ASSERT (?(?assertion)
|
||||
META_DOLLAR $ metacharacter
|
||||
META_DOT . metacharacter
|
||||
|
@ -195,23 +195,24 @@ META_END End of pattern (this value is 0x80000000)
|
|||
META_FAIL (*FAIL)
|
||||
META_KET ) closing parenthesis
|
||||
META_LOOKAHEAD (?= start of lookahead
|
||||
META_LOOKAHEAD_NA (*napla: start of non-atomic lookahead
|
||||
META_LOOKAHEADNOT (?! start of negative lookahead
|
||||
META_NOCAPTURE (?: no capture parens
|
||||
META_PLUS +
|
||||
META_PLUS_PLUS ++
|
||||
META_PLUS_QUERY +?
|
||||
META_PRUNE (*PRUNE) - no argument
|
||||
META_PRUNE (*PRUNE) - no argument (see below for with argument)
|
||||
META_QUERY ?
|
||||
META_QUERY_PLUS ?+
|
||||
META_QUERY_QUERY ??
|
||||
META_RANGE_ESCAPED hyphen in class range with at least one escape
|
||||
META_RANGE_LITERAL hyphen in class range defined literally
|
||||
META_SKIP (*SKIP) - no argument
|
||||
META_THEN (*THEN) - no argument
|
||||
META_SKIP (*SKIP) - no argument (see below for with argument)
|
||||
META_THEN (*THEN) - no argument (see below for with argument)
|
||||
|
||||
The two RANGE values occur only in character classes. They are positioned
|
||||
between two literals that define the start and end of the range. In an EBCDIC
|
||||
evironment it is necessary to know whether either of the range values was
|
||||
environment it is necessary to know whether either of the range values was
|
||||
specified as an escape. In an ASCII/Unicode environment the distinction is not
|
||||
relevant.
|
||||
|
||||
|
@ -228,17 +229,16 @@ If the data for META_ALT is non-zero, it is inside a lookbehind, and the data
|
|||
is the length of its branch, for which OP_REVERSE must be generated.
|
||||
|
||||
META_BACKREF, META_CAPTURE, and META_RECURSE have the capture group number as
|
||||
their data in the lower 16 bits of the element.
|
||||
their data in the lower 16 bits of the element. META_RECURSE is followed by an
|
||||
offset, for use in error messages.
|
||||
|
||||
META_BACKREF is followed by an offset if the back reference group number is 10
|
||||
or more. The offsets of the first ocurrences of references to groups whose
|
||||
or more. The offsets of the first occurrences of references to groups whose
|
||||
numbers are less than 10 are put in cb->small_ref_offset[] (only the first
|
||||
occurrence is useful). On 64-bit systems this avoids using more than two parsed
|
||||
pattern elements for items such as \3. The offset is used when an error occurs
|
||||
because the reference is to a non-existent group.
|
||||
|
||||
META_RECURSE is always followed by an offset, for use in error messages.
|
||||
|
||||
META_ESCAPE has an ESC_xxx value as its data. For ESC_P and ESC_p, the next
|
||||
element contains the 16-bit type and data property values, packed together.
|
||||
ESC_g and ESC_k are used only for named references - numerical ones are turned
|
||||
|
@ -286,12 +286,13 @@ The following are also followed just by an offset, but also the lower 16 bits
|
|||
of the main word contain the length of the first branch of the lookbehind
|
||||
group; this is used when generating OP_REVERSE for that branch.
|
||||
|
||||
META_LOOKBEHIND (?<=
|
||||
META_LOOKBEHINDNOT (?<!
|
||||
META_LOOKBEHIND (?<= start of lookbehind
|
||||
META_LOOKBEHIND_NA (*naplb: start of non-atomic lookbehind
|
||||
META_LOOKBEHINDNOT (?<! start of negative lookbehind
|
||||
|
||||
The following are followed by two elements, the minimum and maximum. Repeat
|
||||
values are limited to 65535 (MAX_REPEAT). A maximum value of "unlimited" is
|
||||
represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
The following are followed by two elements, the minimum and maximum. The
|
||||
maximum value is limited to 65535 (MAX_REPEAT). A maximum value of "unlimited"
|
||||
is represented by UNLIMITED_REPEAT, which is bigger than MAX_REPEAT:
|
||||
|
||||
META_MINMAX {n,m} repeat
|
||||
META_MINMAX_PLUS {n,m}+ repeat
|
||||
|
@ -345,11 +346,11 @@ support is not available for this kind of matching.
|
|||
Changeable options
|
||||
------------------
|
||||
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL, and
|
||||
others) may be changed in the middle of patterns by items such as (?i). Their
|
||||
processing is handled entirely at compile time by generating different opcodes
|
||||
for the different settings. The runtime functions do not need to keep track of
|
||||
an option's state.
|
||||
The /i, /m, or /s options (PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL) and
|
||||
some others may be changed in the middle of patterns by items such as (?i).
|
||||
Their processing is handled entirely at compile time by generating different
|
||||
opcodes for the different settings. The runtime functions do not need to keep
|
||||
track of an option's state.
|
||||
|
||||
PCRE2_DUPNAMES, PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE
|
||||
are tracked and processed during the parsing pre-pass. The others are handled
|
||||
|
@ -435,7 +436,7 @@ Backtracking control verbs
|
|||
--------------------------
|
||||
|
||||
Verbs with no arguments generate opcodes with no following data (as listed
|
||||
in the section above).
|
||||
in the section above).
|
||||
|
||||
(*MARK:NAME) generates OP_MARK followed by the mark name, preceded by a
|
||||
length in one code unit, and followed by a binary zero. The name length is
|
||||
|
@ -466,8 +467,8 @@ Caseless matching (positive or negative) of characters that have more than two
|
|||
case-equivalent code points (which is possible only in UTF mode) is handled by
|
||||
compiling a Unicode property item (see below), with the pseudo-property
|
||||
PT_CLIST. The value of this property is an offset in a vector called
|
||||
"ucd_caseless_sets" which identifies the start of a short list of equivalent
|
||||
characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
"ucd_caseless_sets" which identifies the start of a short list of case
|
||||
equivalent characters, terminated by the value NOTACHAR (0xffffffff).
|
||||
|
||||
|
||||
Repeating single characters
|
||||
|
@ -544,8 +545,9 @@ Each is followed by two code units that encode the desired property as a type
|
|||
and a value. The types are a set of #defines of the form PT_xxx, and the values
|
||||
are enumerations of the form ucp_xx, defined in the pcre2_ucp.h source file.
|
||||
The value is relevant only for PT_GC (General Category), PT_PC (Particular
|
||||
Category), PT_SC (Script), and the pseudo-property PT_CLIST, which is used to
|
||||
identify a list of case-equivalent characters when there are three or more.
|
||||
Category), PT_SC (Script), PT_BIDICL (Bidi Class), PT_BOOL (Boolean property),
|
||||
and the pseudo-property PT_CLIST, which is used to identify a list of
|
||||
case-equivalent characters when there are three or more (see above).
|
||||
|
||||
Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by
|
||||
three code units: OP_PROP or OP_NOTPROP, and then the desired property type and
|
||||
|
@ -663,9 +665,9 @@ a count that immediately follows the offset.
|
|||
There are several opcodes that mark the end of a subpattern group. OP_KET is
|
||||
used for subpatterns that do not repeat indefinitely, OP_KETRMIN and
|
||||
OP_KETRMAX are used for indefinite repetitions, minimally or maximally
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
respectively, and OP_KETRPOS for possessive repetitions (see below for more
|
||||
details). All four are followed by a LINK_SIZE value giving (as a positive
|
||||
number) the offset back to the matching bracket opcode.
|
||||
number) the offset back to the matching opening bracket opcode.
|
||||
|
||||
If a subpattern is quantified such that it is permitted to match zero times, it
|
||||
is preceded by one of OP_BRAZERO, OP_BRAMINZERO, or OP_SKIPZERO. These are
|
||||
|
@ -715,13 +717,15 @@ Assertions
|
|||
----------
|
||||
|
||||
Forward assertions are also just like other subpatterns, but starting with one
|
||||
of the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
|
||||
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
|
||||
is OP_REVERSE, followed by a count of the number of characters to move back the
|
||||
pointer in the subject string. In ASCII or UTF-32 mode, the count is also the
|
||||
number of code units, but in UTF-8/16 mode each character may occupy more than
|
||||
one code unit. A separate count is present in each alternative of a lookbehind
|
||||
assertion, allowing them to have different (but fixed) lengths.
|
||||
of the opcodes OP_ASSERT, OP_ASSERT_NA (non-atomic assertion), or
|
||||
OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK,
|
||||
OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT, and the first opcode inside the
|
||||
assertion is OP_REVERSE, followed by a count of the number of characters to
|
||||
move back the pointer in the subject string. In ASCII or UTF-32 mode, the count
|
||||
is also the number of code units, but in UTF-8/16 mode each character may
|
||||
occupy more than one code unit. A separate count is present in each alternative
|
||||
of a lookbehind assertion, allowing each branch to have a different (but fixed)
|
||||
length.
|
||||
|
||||
|
||||
Conditional subpatterns
|
||||
|
@ -754,11 +758,11 @@ tests the PCRE2 version number. This compiles into one of the opcodes OP_TRUE
|
|||
or OP_FALSE.
|
||||
|
||||
If a condition is not a back reference, recursion test, DEFINE, or VERSION, it
|
||||
must start with a parenthesized assertion, whose opcode normally immediately
|
||||
follows OP_COND or OP_SCOND. However, if automatic callouts are enabled, a
|
||||
callout is inserted immediately before the assertion. It is also possible to
|
||||
insert a manual callout at this point. Only assertion conditions may have
|
||||
callouts preceding the condition.
|
||||
must start with a parenthesized atomic assertion, whose opcode normally
|
||||
immediately follows OP_COND or OP_SCOND. However, if automatic callouts are
|
||||
enabled, a callout is inserted immediately before the assertion. It is also
|
||||
possible to insert a manual callout at this point. Only assertion conditions
|
||||
may have callouts preceding the condition.
|
||||
|
||||
A condition that is the negative assertion (?!) is optimized to OP_FAIL in all
|
||||
parts of the pattern, so this is another opcode that may appear as a condition.
|
||||
|
@ -823,4 +827,4 @@ not a real opcode, but is used to check at compile time that tables indexed by
|
|||
opcode are the correct length, in order to catch updating errors.
|
||||
|
||||
Philip Hazel
|
||||
20 July 2018
|
||||
April 2022
|
||||
|
|
12
LICENCE
12
LICENCE
|
@ -20,13 +20,13 @@ THE BASIC LIBRARY FUNCTIONS
|
|||
---------------------------
|
||||
|
||||
Written by: Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
|
||||
University of Cambridge Computing Service,
|
||||
Retired from University of Cambridge Computing Service,
|
||||
Cambridge, England.
|
||||
|
||||
Copyright (c) 1997-2019 University of Cambridge
|
||||
Copyright (c) 1997-2022 University of Cambridge
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -37,7 +37,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2010-2019 Zoltan Herczeg
|
||||
Copyright(c) 2010-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
@ -48,7 +48,7 @@ Written by: Zoltan Herczeg
|
|||
Email local part: hzmester
|
||||
Email domain: freemail.hu
|
||||
|
||||
Copyright(c) 2009-2019 Zoltan Herczeg
|
||||
Copyright(c) 2009-2022 Zoltan Herczeg
|
||||
All rights reserved.
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
module(
|
||||
name = "pcre2",
|
||||
version = "10.40",
|
||||
compatibility_level = 1,
|
||||
)
|
||||
|
||||
bazel_dep(name = "rules_cc", version = "0.0.1")
|
||||
bazel_dep(name = "bazel_skylib", version = "1.2.1")
|
49
Makefile.am
49
Makefile.am
|
@ -46,6 +46,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2_general_context_free.html \
|
||||
doc/html/pcre2_get_error_message.html \
|
||||
doc/html/pcre2_get_mark.html \
|
||||
doc/html/pcre2_get_match_data_size.html \
|
||||
doc/html/pcre2_get_ovector_count.html \
|
||||
doc/html/pcre2_get_ovector_pointer.html \
|
||||
doc/html/pcre2_get_startchar.html \
|
||||
|
@ -56,6 +57,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2_jit_stack_create.html \
|
||||
doc/html/pcre2_jit_stack_free.html \
|
||||
doc/html/pcre2_maketables.html \
|
||||
doc/html/pcre2_maketables_free.html \
|
||||
doc/html/pcre2_match.html \
|
||||
doc/html/pcre2_match_context_copy.html \
|
||||
doc/html/pcre2_match_context_create.html \
|
||||
|
@ -140,6 +142,7 @@ dist_man_MANS = \
|
|||
doc/pcre2_general_context_free.3 \
|
||||
doc/pcre2_get_error_message.3 \
|
||||
doc/pcre2_get_mark.3 \
|
||||
doc/pcre2_get_match_data_size.3 \
|
||||
doc/pcre2_get_ovector_count.3 \
|
||||
doc/pcre2_get_ovector_pointer.3 \
|
||||
doc/pcre2_get_startchar.3 \
|
||||
|
@ -150,6 +153,7 @@ dist_man_MANS = \
|
|||
doc/pcre2_jit_stack_create.3 \
|
||||
doc/pcre2_jit_stack_free.3 \
|
||||
doc/pcre2_maketables.3 \
|
||||
doc/pcre2_maketables_free.3 \
|
||||
doc/pcre2_match.3 \
|
||||
doc/pcre2_match_context_copy.3 \
|
||||
doc/pcre2_match_context_create.3 \
|
||||
|
@ -321,18 +325,18 @@ include_HEADERS = src/pcre2posix.h
|
|||
bin_SCRIPTS = pcre2-config
|
||||
|
||||
## ---------------------------------------------------------------
|
||||
## The dftables program is used to rebuild character tables before compiling
|
||||
## PCRE2, if --enable-rebuild-chartables is specified. It is not a user-visible
|
||||
## program. The default (when --enable-rebuild-chartables is not specified) is
|
||||
## to copy a distributed set of tables that are defined for ASCII code. In this
|
||||
## case, dftables is not needed.
|
||||
## The pcre2_dftables program is used to rebuild character tables before
|
||||
## compiling PCRE2, if --enable-rebuild-chartables is specified. It is not an
|
||||
## installed program. The default (when --enable-rebuild-chartables is not
|
||||
## specified) is to copy a distributed set of tables that are defined for ASCII
|
||||
## code. In this case, pcre2_dftables is not needed.
|
||||
|
||||
if WITH_REBUILD_CHARTABLES
|
||||
noinst_PROGRAMS += dftables
|
||||
dftables_SOURCES = src/dftables.c
|
||||
src/pcre2_chartables.c: dftables$(EXEEXT)
|
||||
noinst_PROGRAMS += pcre2_dftables
|
||||
pcre2_dftables_SOURCES = src/pcre2_dftables.c
|
||||
src/pcre2_chartables.c: pcre2_dftables$(EXEEXT)
|
||||
rm -f $@
|
||||
./dftables$(EXEEXT) $@
|
||||
./pcre2_dftables$(EXEEXT) $@
|
||||
else
|
||||
src/pcre2_chartables.c: $(srcdir)/src/pcre2_chartables.c.dist
|
||||
rm -f $@
|
||||
|
@ -358,6 +362,8 @@ COMMON_SOURCES = \
|
|||
src/pcre2_internal.h \
|
||||
src/pcre2_intmodedep.h \
|
||||
src/pcre2_jit_compile.c \
|
||||
src/pcre2_jit_neon_inc.h \
|
||||
src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_maketables.c \
|
||||
src/pcre2_match.c \
|
||||
src/pcre2_match_data.c \
|
||||
|
@ -376,6 +382,10 @@ COMMON_SOURCES = \
|
|||
src/pcre2_valid_utf.c \
|
||||
src/pcre2_xclass.c
|
||||
|
||||
# The pcre2_ucptables.c file is #included by pcre2_tables.c
|
||||
|
||||
EXTRA_DIST += src/pcre2_ucptables.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
lib_LTLIBRARIES += libpcre2-8.la
|
||||
libpcre2_8_la_SOURCES = \
|
||||
|
@ -385,6 +395,7 @@ nodist_libpcre2_8_la_SOURCES = \
|
|||
libpcre2_8_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=8 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_8_la_LIBADD =
|
||||
endif # WITH_PCRE2_8
|
||||
|
@ -398,6 +409,7 @@ nodist_libpcre2_16_la_SOURCES = \
|
|||
libpcre2_16_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=16 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_16_la_LIBADD =
|
||||
endif # WITH_PCRE2_16
|
||||
|
@ -411,6 +423,7 @@ nodist_libpcre2_32_la_SOURCES = \
|
|||
libpcre2_32_la_CFLAGS = \
|
||||
-DPCRE2_CODE_UNIT_WIDTH=32 \
|
||||
$(VISIBILITY_CFLAGS) \
|
||||
$(CET_CFLAGS) \
|
||||
$(AM_CFLAGS)
|
||||
libpcre2_32_la_LIBADD =
|
||||
endif # WITH_PCRE2_32
|
||||
|
@ -439,15 +452,16 @@ EXTRA_DIST += \
|
|||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeTILEGX-encoder.c \
|
||||
src/sljit/sljitNativeTILEGX_64.c \
|
||||
src/sljit/sljitNativeRISCV_32.c \
|
||||
src/sljit/sljitNativeRISCV_64.c \
|
||||
src/sljit/sljitNativeRISCV_common.c \
|
||||
src/sljit/sljitNativeS390X.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitUtils.c
|
||||
src/sljit/sljitUtils.c \
|
||||
src/sljit/sljitWXExecAllocator.c
|
||||
|
||||
# Some of the JIT sources are also in separate files that are #included.
|
||||
|
||||
|
@ -628,6 +642,7 @@ EXTRA_DIST += \
|
|||
testdata/grepoutputCN \
|
||||
testdata/grepoutputN \
|
||||
testdata/greppatN4 \
|
||||
testdata/testbtables \
|
||||
testdata/testinput1 \
|
||||
testdata/testinput2 \
|
||||
testdata/testinput3 \
|
||||
|
@ -653,6 +668,7 @@ EXTRA_DIST += \
|
|||
testdata/testinput23 \
|
||||
testdata/testinput24 \
|
||||
testdata/testinput25 \
|
||||
testdata/testinput26 \
|
||||
testdata/testinputEBC \
|
||||
testdata/testoutput1 \
|
||||
testdata/testoutput2 \
|
||||
|
@ -695,6 +711,7 @@ EXTRA_DIST += \
|
|||
testdata/testoutput23 \
|
||||
testdata/testoutput24 \
|
||||
testdata/testoutput25 \
|
||||
testdata/testoutput26 \
|
||||
testdata/testoutputEBC \
|
||||
testdata/valgrind-jit.supp \
|
||||
testdata/wintestinput3 \
|
||||
|
@ -849,9 +866,11 @@ endif # WITH_GCOV
|
|||
|
||||
EXTRA_DIST += \
|
||||
cmake/COPYING-CMAKE-SCRIPTS \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/FindPackageHandleStandardArgs.cmake \
|
||||
cmake/FindReadline.cmake \
|
||||
cmake/FindEditline.cmake \
|
||||
cmake/pcre2-config-version.cmake.in \
|
||||
cmake/pcre2-config.cmake.in \
|
||||
CMakeLists.txt \
|
||||
config-cmake.h.in
|
||||
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
#
|
||||
# Project: pcre2
|
||||
#
|
||||
# Created on: 10-01-2022 22:01:46
|
||||
#
|
||||
# commands to use:
|
||||
# make -f Makefile.os4 libpcre2.a
|
||||
# make -f Makefile.os4 libpcre2-posix.a
|
||||
# make -f Makefile.os4 pcre2test
|
||||
# sh RunTest
|
||||
# make -f Makefile.os4 clean
|
||||
#
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Objects
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2_OBJ := \
|
||||
src/pcre2_chartables.o src/pcre2_auto_possess.o src/pcre2_compile.o \
|
||||
src/pcre2_config.o src/pcre2_context.o src/pcre2_convert.o \
|
||||
src/pcre2_dfa_match.o src/pcre2_error.o src/pcre2_extuni.o \
|
||||
src/pcre2_find_bracket.o src/pcre2_jit_compile.o src/pcre2_maketables.o \
|
||||
src/pcre2_match.o src/pcre2_match_data.o src/pcre2_newline.o \
|
||||
src/pcre2_ord2utf.o src/pcre2_pattern_info.o src/pcre2_script_run.o \
|
||||
src/pcre2_serialize.o src/pcre2_string_utils.o src/pcre2_study.o \
|
||||
src/pcre2_substitute.o src/pcre2_substring.o src/pcre2_tables.o \
|
||||
src/pcre2_ucd.o src/pcre2_valid_utf.o src/pcre2_xclass.o \
|
||||
|
||||
|
||||
|
||||
pcre2posix_OBJ := \
|
||||
src/pcre2posix.o
|
||||
|
||||
|
||||
pcre2test_OBJ := \
|
||||
src/pcre2test.o
|
||||
|
||||
|
||||
pcre2grep_OBJ := \
|
||||
src/pcre2grep.o
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Variables and Environment
|
||||
##
|
||||
###################################################################
|
||||
|
||||
MCRT := -mcrt=newlib
|
||||
ifeq ($(USE_CLIB2), yes)
|
||||
MCRT := -mcrt=clib2
|
||||
endif
|
||||
|
||||
CC := gcc:bin/gcc
|
||||
|
||||
INCPATH := -I. -Isrc
|
||||
|
||||
# for pcre2test
|
||||
CFLAGS := $(MCRT) $(INCPATH) -O2 -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// General rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
.PHONY: all all-before all-after clean clean-custom realclean
|
||||
|
||||
all: all-before libpcre2.a libpcre2-posix.a all-after
|
||||
|
||||
all-before:
|
||||
# You can add rules here to execute before the project is built
|
||||
|
||||
all-after:
|
||||
# You can add rules here to execute after the project is built
|
||||
|
||||
tests: pcre2test pcre2grep
|
||||
|
||||
clean: clean-custom
|
||||
@echo "Cleaning compiler objects..."
|
||||
@rm -f $(libpcre2_OBJ) $(pcre2posix_OBJ) $(pcre2test_OBJ)
|
||||
|
||||
cleanall: clean
|
||||
@echo "Cleaning compiler targets..."
|
||||
@rm -f libpcre.a libpcre-posix.a pcre2test pcre2grep
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Targets
|
||||
##
|
||||
###################################################################
|
||||
|
||||
libpcre2.a: $(libpcre2_OBJ)
|
||||
ar -rcs libpcre2.a $(libpcre2_OBJ)
|
||||
ranlib libpcre2.a
|
||||
|
||||
libpcre2-posix.a: $(pcre2posix_OBJ)
|
||||
ar -rcs libpcre2-posix.a $(pcre2posix_OBJ)
|
||||
ranlib libpcre2-posix.a
|
||||
|
||||
pcre2test: libpcre2.a libpcre2-posix.a $(pcre2test_OBJ)
|
||||
@echo "Linking pcre2test"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2test $(pcre2test_OBJ) -L. -lauto -lpcre2 -lpcre2-posix
|
||||
@echo "Removing stale debug target: pcre2test"
|
||||
@rm -f pcre2test.debug
|
||||
|
||||
pcre2grep: libpcre2.a $(pcre2grep_OBJ)
|
||||
@echo "Linking pcre2grep"
|
||||
@gcc:bin/gcc $(MCRT) -o pcre2grep $(pcre2grep_OBJ) -L . -lauto -lpcre2
|
||||
@echo "Removing stale debug target: pcre2grep"
|
||||
@rm -f pcre2grep.debug
|
||||
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Standard rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
# A default rule to make all the objects listed below
|
||||
# because we are hiding compiler commands from the output
|
||||
|
||||
.c.o:
|
||||
@echo "Compiling $<"
|
||||
@$(CC) -c $< -o $*.o $(CFLAGS)
|
||||
|
||||
src/pcre2_chartables.o: src/pcre2_chartables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_auto_possess.o: src/pcre2_auto_possess.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_compile.o: src/pcre2_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_config.o: src/pcre2_config.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_context.o: src/pcre2_context.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_convert.o: src/pcre2_convert.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_dfa_match.o: src/pcre2_dfa_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_error.o: src/pcre2_error.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_extuni.o: src/pcre2_extuni.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_find_bracket.o: src/pcre2_find_bracket.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_jit_compile.o: src/pcre2_jit_compile.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
src/sljit/sljitLir.c src/sljit/sljitLir.h src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h src/sljit/sljitUtils.c src/sljit/sljitProtExecAllocator.c \
|
||||
src/sljit/sljitWXExecAllocator.c src/sljit/sljitExecAllocator.c src/pcre2_jit_simd_inc.h \
|
||||
src/pcre2_jit_neon_inc.h src/pcre2_jit_match.c
|
||||
|
||||
src/pcre2_maketables.o: src/pcre2_maketables.c
|
||||
|
||||
src/pcre2_match.o: src/pcre2_match.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_match_data.o: src/pcre2_match_data.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_newline.o: src/pcre2_newline.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_ord2utf.o: src/pcre2_ord2utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_pattern_info.o: src/pcre2_pattern_info.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_script_run.o: src/pcre2_script_run.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_serialize.o: src/pcre2_serialize.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2test.o: src/pcre2test.c src/config.h src/pcre2.h \
|
||||
src/pcre2posix.h src/pcre2_internal.h src/pcre2_ucp.h \
|
||||
src/pcre2_intmodedep.h src/pcre2_tables.c src/pcre2_ucptables.c \
|
||||
src/pcre2_ucd.c src/pcre2_printint.c
|
||||
|
||||
src/pcre2_string_utils.o: src/pcre2_string_utils.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_study.o: src/pcre2_study.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substitute.o: src/pcre2_substitute.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_substring.o: src/pcre2_substring.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2posix.o: src/pcre2posix.c src/config.h src/pcre2.h \
|
||||
|
||||
|
||||
src/pcre2_tables.o: src/pcre2_tables.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h src/pcre2_intmodedep.h \
|
||||
|
||||
|
||||
src/pcre2_ucd.o: src/pcre2_ucd.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_valid_utf.o: src/pcre2_valid_utf.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
src/pcre2_xclass.o: src/pcre2_xclass.c src/config.h src/pcre2_internal.h \
|
||||
src/pcre2.h src/pcre2_ucp.h
|
||||
|
||||
|
||||
src/pcre2grep.o: src/pcre2grep.c src/config.h
|
||||
|
||||
###################################################################
|
||||
##
|
||||
##//// Custom rules
|
||||
##
|
||||
###################################################################
|
||||
|
||||
runtests: libpcre2.a libpcre2-posix.a tests
|
||||
sh RunTest
|
||||
sh RunGrepTest
|
||||
|
||||
release:
|
||||
@echo "Create release folders..."
|
||||
@mkdir -p release/local/newlib/lib release/local/clib2/lib release/local/Documentation/pcre2 release/local/common/include
|
||||
|
||||
@echo "Building newlib based libraries..."
|
||||
@make -f Makefile.os4 all
|
||||
@cp libpcre2.a release/local/newlib/lib/
|
||||
@cp libpcre2-posix.a release/local/newlib/lib/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Building clib2 based libraries..."
|
||||
@make -f Makefile.os4 all USE_CLIB2=yes
|
||||
@cp libpcre2.a release/local/clib2/lib/
|
||||
@cp libpcre2-posix.a release/local/clib2/lib/
|
||||
|
||||
@echo "Copy the necessary files..."
|
||||
@cp src/pcre2.h release/local/common/include/
|
||||
@cp src/pcre2posix.h release/local/common/include/
|
||||
@cp COPYING release/local/Documentation/pcre2/
|
||||
@cp HACKING release/local/Documentation/pcre2/
|
||||
@cp LICENCE release/local/Documentation/pcre2/
|
||||
@cp README release/local/Documentation/pcre2/
|
||||
@cp README-OS4.md release/local/Documentation/pcre2/
|
||||
|
||||
@echo "Clean build and libraries files..."
|
||||
@make -f Makefile.os4 cleanall
|
||||
|
||||
@echo "Creating the lha release file..."
|
||||
@rm -f pcre2.lha
|
||||
@lha -aeqr3 a pcre2.lha release/
|
||||
|
||||
@rm -rf release
|
||||
|
||||
###################################################################
|
||||
|
121
NEWS
121
NEWS
|
@ -2,8 +2,125 @@ News about PCRE2 releases
|
|||
-------------------------
|
||||
|
||||
|
||||
Version 10.33-RC1 16-April-2019
|
||||
-------------------------------
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
||||
This is mostly a bug-fixing and code-tidying release. However, there are some
|
||||
extensions to Unicode property handling:
|
||||
|
||||
* Added support for Bidi_Class and a number of binary Unicode properties,
|
||||
including Bidi_Control.
|
||||
|
||||
* A number of changes to script matching for \p and \P:
|
||||
|
||||
(a) Script extensions for a character are now coded as a bitmap instead of
|
||||
a list of script numbers, which should be faster and does not need a
|
||||
loop.
|
||||
|
||||
(b) Added the syntax \p{script:xxx} and \p{script_extensions:xxx} (synonyms
|
||||
sc and scx).
|
||||
|
||||
(c) Changed \p{scriptname} from being the same as \p{sc:scriptname} to being
|
||||
the same as \p{scx:scriptname} because this change happened in Perl at
|
||||
release 5.26.
|
||||
|
||||
(d) The standard Unicode 4-letter abbreviations for script names are now
|
||||
recognized.
|
||||
|
||||
(e) In accordance with Unicode and Perl's "loose matching" rules, spaces,
|
||||
hyphens, and underscores are ignored in property names, which are then
|
||||
matched independent of case.
|
||||
|
||||
As always, see ChangeLog for a list of all changes (also the Git log).
|
||||
|
||||
|
||||
Version 10.39 29-October-2021
|
||||
-----------------------------
|
||||
|
||||
This release is happening soon after 10.38 because the bug fix is important.
|
||||
|
||||
1. Fix incorrect detection of alternatives in first character search in JIT.
|
||||
|
||||
2. Update to Unicode 14.0.0.
|
||||
|
||||
3. Some code cleanups (see ChangeLog).
|
||||
|
||||
|
||||
Version 10.38 01-October-2021
|
||||
-----------------------------
|
||||
|
||||
As well as some bug fixes and tidies (as always, see ChangeLog for details),
|
||||
the documentation is updated to list the new URLs, following the move of the
|
||||
source repository to GitHub and the mailing list to Google Groups.
|
||||
|
||||
* The CMake build system can now build both static and shared libraries in one
|
||||
go.
|
||||
|
||||
* Following Perl's lead, \K is now locked out in lookaround assertions by
|
||||
default, but an option is provided to re-enable the previous behaviour.
|
||||
|
||||
|
||||
Version 10.37 26-May-2021
|
||||
-------------------------
|
||||
|
||||
A few more bug fixes and tidies. The only change of real note is the removal of
|
||||
the actual POSIX names regcomp etc. from the POSIX wrapper library because
|
||||
these have caused issues for some applications (see 10.33 #2 below).
|
||||
|
||||
|
||||
Version 10.36 04-December-2020
|
||||
------------------------------
|
||||
|
||||
Again, mainly bug fixes and tidies. The only enhancements are the addition of
|
||||
GNU grep's -m (aka --max-count) option to pcre2grep, and also unifying the
|
||||
handling of substitution strings for both -O and callouts in pcre2grep, with
|
||||
the addition of $x{...} and $o{...} to allow for characters whose code points
|
||||
are greater than 255 in Unicode mode.
|
||||
|
||||
NOTE: there is an outstanding issue with JIT support for MacOS on arm64
|
||||
hardware. For details, please see Bugzilla issue #2618.
|
||||
|
||||
|
||||
Version 10.35 15-April-2020
|
||||
---------------------------
|
||||
|
||||
Bugfixes, tidies, and a few new enhancements.
|
||||
|
||||
1. Capturing groups that contain recursive backreferences to themselves are no
|
||||
longer automatically atomic, because the restriction is no longer necessary
|
||||
as a result of the 10.30 restructuring.
|
||||
|
||||
2. Several new options for pcre2_substitute().
|
||||
|
||||
3. When Unicode is supported and PCRE2_UCP is set without PCRE2_UTF, Unicode
|
||||
character properties are used for upper/lower case computations on characters
|
||||
whose code points are greater than 127.
|
||||
|
||||
4. The character tables (for low-valued characters) can now more easily be
|
||||
saved and restored in binary.
|
||||
|
||||
5. Updated to Unicode 13.0.0.
|
||||
|
||||
|
||||
Version 10.34 21-November-2019
|
||||
------------------------------
|
||||
|
||||
Another release with a few enhancements as well as bugfixes and tidies. The
|
||||
main new features are:
|
||||
|
||||
1. There is now some support for matching in invalid UTF strings.
|
||||
|
||||
2. Non-atomic positive lookarounds are implemented in the pcre2_match()
|
||||
interpreter, but not in JIT.
|
||||
|
||||
3. Added two new functions: pcre2_get_match_data_size() and
|
||||
pcre2_maketables_free().
|
||||
|
||||
4. Upgraded to Unicode 12.1.0.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
||||
Yet more bugfixes, tidies, and a few enhancements, summarized here (see
|
||||
ChangeLog for the full list):
|
||||
|
|
|
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
|
|||
|
||||
The following are generic instructions for building the PCRE2 C library "by
|
||||
hand". If you are going to use CMake, this section does not apply to you; you
|
||||
can skip ahead to the CMake section.
|
||||
can skip ahead to the CMake section. Note that the settings concerned with
|
||||
8-bit, 16-bit, and 32-bit code units relate to the type of data string that
|
||||
PCRE2 processes. They are NOT referring to the underlying operating system bit
|
||||
width. You do not have to do anything special to compile in a 64-bit
|
||||
environment, for example.
|
||||
|
||||
(1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
|
||||
macro settings that it contains to whatever is appropriate for your
|
||||
|
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
|
|||
src/pcre2_chartables.c.
|
||||
|
||||
OR:
|
||||
Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
|
||||
if you have set up src/config.h), and then run it with the single
|
||||
argument "src/pcre2_chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file. The tables are generated
|
||||
using the default C locale for your system. If you want to use a locale
|
||||
that is specified by LC_xxx environment variables, add the -L option to
|
||||
the dftables command. You must use this method if you are building on a
|
||||
system that uses EBCDIC code.
|
||||
Compile src/pcre2_dftables.c as a stand-alone program (using
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
|
||||
the single argument "src/pcre2_chartables.c". This generates a set of
|
||||
standard character tables and writes them to that file. The tables are
|
||||
generated using the default C locale for your system. If you want to use
|
||||
a locale that is specified by LC_xxx environment variables, add the -L
|
||||
option to the pcre2_dftables command. You must use this method if you
|
||||
are building on a system that uses EBCDIC code.
|
||||
|
||||
The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
|
||||
specify alternative tables at run time.
|
||||
|
||||
(4) For an 8-bit library, compile the following source files from the src
|
||||
directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
|
||||
set -DHAVE_CONFIG_H if you have set up src/config.h with your
|
||||
configuration, or else use other -D settings to change the configuration
|
||||
as required.
|
||||
(4) For a library that supports 8-bit code units in the character strings that
|
||||
it processes, compile the following source files from the src directory,
|
||||
setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
|
||||
or else use other -D settings to change the configuration as required.
|
||||
|
||||
pcre2_auto_possess.c
|
||||
pcre2_chartables.c
|
||||
|
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -142,9 +147,9 @@ can skip ahead to the CMake section.
|
|||
If your system has static and shared libraries, you may have to do this
|
||||
once for each type.
|
||||
|
||||
(6) If you want to build a 16-bit library or 32-bit library (as well as, or
|
||||
instead of the 8-bit library) just supply 16 or 32 as the value of
|
||||
-DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
(6) If you want to build a library that supports 16-bit or 32-bit code units,
|
||||
(as well as, or instead of the 8-bit library) just supply 16 or 32 as the
|
||||
value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
|
||||
(7) If you want to build the POSIX wrapper functions (which apply only to the
|
||||
8-bit library), ensure that you have the src/pcre2posix.h file and then
|
||||
|
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -369,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (wherein which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
|
|||
z/OS file formats. The port provides an API for LE languages such as COBOL and
|
||||
for the z/OS and z/VM versions of the Rexx languages.
|
||||
|
||||
==============================
|
||||
Last Updated: 14 November 2018
|
||||
==============================
|
||||
===========================
|
||||
Last Updated: 28 April 2021
|
||||
===========================
|
||||
|
|
|
@ -190,7 +190,7 @@ files="\
|
|||
libpcre2-16.pc.in \
|
||||
libpcre2-32.pc.in \
|
||||
libpcre2-posix.pc.in \
|
||||
src/dftables.c \
|
||||
src/pcre2_dftables.c \
|
||||
src/pcre2.h.in \
|
||||
src/pcre2_auto_possess.c \
|
||||
src/pcre2_compile.c \
|
||||
|
|
162
README
162
README
|
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
|
|||
PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
||||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features and the internals have been improved. The latest release of PCRE2 is
|
||||
always available in three alternative formats from:
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
subscribe or manage your subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -112,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -164,9 +172,11 @@ library. They are also documented in the pcre2build man page.
|
|||
will be a compile time error. If in doubt, use --enable-jit=auto, which
|
||||
enables JIT only if the current hardware is supported.
|
||||
|
||||
. If you are enabling JIT under SELinux you may also want to add
|
||||
--enable-jit-sealloc, which enables the use of an execmem allocator in JIT
|
||||
that is compatible with SELinux. This has no effect if JIT is not enabled.
|
||||
. If you are enabling JIT under SELinux environment you may also want to add
|
||||
--enable-jit-sealloc, which enables the use of an executable memory allocator
|
||||
that is compatible with SELinux. Warning: this allocator is experimental!
|
||||
It does not support fork() operation and may crash when no disk space is
|
||||
available. This option has no effect if JIT is disabled.
|
||||
|
||||
. If you do not want to make use of the default support for UTF-8 Unicode
|
||||
character strings in the 8-bit library, UTF-16 Unicode character strings in
|
||||
|
@ -184,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -267,9 +277,9 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
a program called dftables is compiled and run in the default C locale when
|
||||
you obey "make". It builds a source file called pcre2_chartables.c. If you do
|
||||
not specify this option, pcre2_chartables.c is created as a copy of
|
||||
a program called pcre2_dftables is compiled and run in the default C locale
|
||||
when you obey "make". It builds a source file called pcre2_chartables.c. If
|
||||
you do not specify this option, pcre2_chartables.c is created as a copy of
|
||||
pcre2_chartables.c.dist. See "Character tables" below for further
|
||||
information.
|
||||
|
||||
|
@ -295,8 +305,8 @@ library. They are also documented in the pcre2build man page.
|
|||
unaddressable. This allows it to detect invalid memory accesses, and is
|
||||
mostly useful for debugging PCRE2 itself.
|
||||
|
||||
. In environments where the gcc compiler is used and lcov version 1.6 or above
|
||||
is installed, if you specify
|
||||
. In environments where the gcc compiler is used and lcov is installed, if you
|
||||
specify
|
||||
|
||||
--enable-coverage
|
||||
|
||||
|
@ -365,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -390,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -407,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -546,11 +557,11 @@ Cross-compiling using autotools
|
|||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE2 for some other host. However, you should NOT
|
||||
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||
file is compiled and run on the local host, in order to generate the inbuilt
|
||||
character tables (the pcre2_chartables.c file). This will probably not work,
|
||||
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||
compiler.
|
||||
specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
|
||||
source file is compiled and run on the local host, in order to generate the
|
||||
inbuilt character tables (the pcre2_chartables.c file). This will probably not
|
||||
work, because pcre2_dftables.c needs to be compiled with the local compiler,
|
||||
not the cross compiler.
|
||||
|
||||
When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
|
||||
created by making a copy of pcre2_chartables.c.dist, which is a default set of
|
||||
|
@ -558,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
|
|||
not be a problem.
|
||||
|
||||
If you need to modify the character tables when cross-compiling, you should
|
||||
move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
|
||||
and run it on the local host to make a new version of pcre2_chartables.c.dist.
|
||||
Then when you cross-compile PCRE2 this new version of the tables will be used.
|
||||
move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
|
||||
hand and run it on the local host to make a new version of
|
||||
pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
|
||||
at build time" for more details.
|
||||
|
||||
|
||||
Making new tarballs
|
||||
|
@ -597,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -684,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -719,8 +731,8 @@ compile context.
|
|||
The source file called pcre2_chartables.c contains the default set of tables.
|
||||
By default, this is created as a copy of pcre2_chartables.c.dist, which
|
||||
contains tables for ASCII coding. However, if --enable-rebuild-chartables is
|
||||
specified for ./configure, a different version of pcre2_chartables.c is built
|
||||
by the program dftables (compiled from dftables.c), which uses the ANSI C
|
||||
specified for ./configure, a new version of pcre2_chartables.c is built by the
|
||||
program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
|
||||
character handling functions such as isalnum(), isalpha(), isupper(),
|
||||
islower(), etc. to build the table sources. This means that the default C
|
||||
locale that is set for your system will control the contents of these default
|
||||
|
@ -730,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
|
|||
move pcre2_chartables.c.dist out of the way and replace it with your customized
|
||||
tables.
|
||||
|
||||
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||
it uses the default C locale that is set on your system. It does not pay
|
||||
attention to the LC_xxx environment variables. In other words, it uses the
|
||||
system's default locale rather than whatever the compiling user happens to have
|
||||
set. If you really do want to build a source set of character tables in a
|
||||
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||
program by hand with the -L option. For example:
|
||||
When the pcre2_dftables program is run as a result of specifying
|
||||
--enable-rebuild-chartables, it uses the default C locale that is set on your
|
||||
system. It does not pay attention to the LC_xxx environment variables. In other
|
||||
words, it uses the system's default locale rather than whatever the compiling
|
||||
user happens to have set. If you really do want to build a source set of
|
||||
character tables in a locale that is specified by the LC_xxx variables, you can
|
||||
run the pcre2_dftables program by hand with the -L option. For example:
|
||||
|
||||
./dftables -L pcre2_chartables.c.special
|
||||
./pcre2_dftables -L pcre2_chartables.c.special
|
||||
|
||||
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||
respectively. The next table consists of three 32-byte bit maps which identify
|
||||
digits, "word" characters, and white space, respectively. These are used when
|
||||
building 32-byte bit maps that represent character classes for code points less
|
||||
than 256. The final 256-byte table has bits indicating various character types,
|
||||
as follows:
|
||||
The second argument names the file where the source code for the tables is
|
||||
written. The first two 256-byte tables provide lower casing and case flipping
|
||||
functions, respectively. The next table consists of a number of 32-byte bit
|
||||
maps which identify certain character classes such as digits, "word"
|
||||
characters, white space, etc. These are used when building 32-byte bit maps
|
||||
that represent character classes for code points less than 256. The final
|
||||
256-byte table has bits indicating various character types, as follows:
|
||||
|
||||
1 white space character
|
||||
2 letter
|
||||
4 decimal digit
|
||||
8 hexadecimal digit
|
||||
4 lower case letter
|
||||
8 decimal digit
|
||||
16 alphanumeric or '_'
|
||||
128 regular expression metacharacter or binary zero
|
||||
|
||||
You should not alter the set of characters that contain the 128 bit, as that
|
||||
will cause PCRE2 to malfunction.
|
||||
You can also specify -b (with or without -L) when running pcre2_dftables. This
|
||||
causes the tables to be written in binary instead of as source code. A set of
|
||||
binary tables can be loaded into memory by an application and passed to
|
||||
pcre2_compile() in the same way as tables created dynamically by calling
|
||||
pcre2_maketables(). The tables are just a string of bytes, independent of
|
||||
hardware characteristics such as endianness. This means they can be bundled
|
||||
with an application that runs in different environments, to ensure consistent
|
||||
behaviour.
|
||||
|
||||
See also the pcre2build section "Creating character tables at build time".
|
||||
|
||||
|
||||
File manifest
|
||||
|
@ -766,7 +786,7 @@ The distribution should contain the files listed below.
|
|||
(A) Source files for the PCRE2 library functions and their headers are found in
|
||||
the src directory:
|
||||
|
||||
src/dftables.c auxiliary program for building pcre2_chartables.c
|
||||
src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c
|
||||
when --enable-rebuild-chartables is specified
|
||||
|
||||
src/pcre2_chartables.c.dist a default set of character tables that assume
|
||||
|
@ -890,6 +910,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 April 2019
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
PCRE2 (Perl-compatible regular expression library)
|
||||
---------------------------------------------------------------------------
|
||||
|
||||
This is a port of PCRE2 10.40 by Philip Hazel for AmigaOS 4, as found at the
|
||||
GitHub repository https://github.com/PCRE2Project/pcre2
|
||||
|
||||
More information about PCRE can be found at its official website
|
||||
at https://www.pcre.org and at the documentation that comes with this
|
||||
package.
|
||||
|
||||
In the archive both newlib and clib2 libraries are included. It has been
|
||||
tested with various applications, but in case you find issues please
|
||||
contact me.
|
||||
|
||||
To install it into your AmigaOS 4 SDK installation, just extract all the
|
||||
files in the SDK: path.
|
||||
|
||||
Compile
|
||||
--------------------------
|
||||
The source and the changes I did can be found at my personale repository
|
||||
https://git.walkero.gr/walkero/pcre2
|
||||
|
||||
You can compile it using the Makefile.os4 file, and produce the libraries
|
||||
yourself.
|
||||
|
||||
* with newlib run:
|
||||
```bash
|
||||
make -f Makefile.os4 all
|
||||
```
|
||||
* with clib2 run:
|
||||
```bash
|
||||
make -f Makefile.os4 all USE_CLIB2=yes
|
||||
```
|
||||
|
||||
Changelog
|
||||
--------------------------
|
||||
v10.40r1 - 2022-07-31
|
||||
* First release
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
# PCRE2 - Perl-Compatible Regular Expressions
|
||||
|
||||
The PCRE2 library is a set of C functions that implement regular expression
|
||||
pattern matching using the same syntax and semantics as Perl 5. PCRE2 has its
|
||||
own native API, as well as a set of wrapper functions that correspond to the
|
||||
POSIX regular expression API. The PCRE2 library is free, even for building
|
||||
proprietary software. It comes in three forms, for processing 8-bit, 16-bit,
|
||||
or 32-bit code units, in either literal or UTF encoding.
|
||||
|
||||
PCRE2 was first released in 2015 to replace the API in the original PCRE
|
||||
library, which is now obsolete and no longer maintained. As well as a more
|
||||
flexible API, the code of PCRE2 has been much improved since the fork.
|
||||
|
||||
## Download
|
||||
|
||||
As well as downloading from the
|
||||
[GitHub site](https://github.com/PCRE2Project/pcre2), you can download PCRE2
|
||||
or the older, unmaintained PCRE1 library from an
|
||||
[*unofficial* mirror](https://sourceforge.net/projects/pcre/files/) at SourceForge.
|
||||
|
||||
You can check out the PCRE2 source code via Git or Subversion:
|
||||
|
||||
git clone https://github.com/PCRE2Project/pcre2.git
|
||||
svn co https://github.com/PCRE2Project/pcre2.git
|
||||
|
||||
## Contributed Ports
|
||||
|
||||
If you just need the command-line PCRE2 tools on Windows, precompiled binary
|
||||
versions are available at this
|
||||
[Rexegg page](http://www.rexegg.com/pcregrep-pcretest.html).
|
||||
|
||||
A PCRE2 port for z/OS, a mainframe operating system which uses EBCDIC as its
|
||||
default character encoding, can be found at
|
||||
[http://www.cbttape.org](http://www.cbttape.org/) (File 939).
|
||||
|
||||
## Documentation
|
||||
|
||||
You can read the PCRE2 documentation
|
||||
[here](https://PCRE2Project.github.io/pcre2/doc/html/index.html).
|
||||
|
||||
Comparisons to Perl's regular expression semantics can be found in the
|
||||
community authored Wikipedia entry for PCRE.
|
||||
|
||||
There is a curated summary of changes for each PCRE release, copies of
|
||||
documentation from older releases, and other useful information from the third
|
||||
party authored
|
||||
[RexEgg PCRE Documentation and Change Log page](http://www.rexegg.com/pcre-documentation.html).
|
||||
|
||||
## Contact
|
||||
|
||||
To report a problem with the PCRE2 library, or to make a feature request, please
|
||||
use the PCRE2 GitHub issues tracker. There is a mailing list for discussion of
|
||||
PCRE2 issues and development at pcre2-dev@googlegroups.com, which is where any
|
||||
announcements will be made. You can browse the
|
||||
[list archives](https://groups.google.com/g/pcre2-dev).
|
||||
|
106
RunGrepTest
106
RunGrepTest
|
@ -8,7 +8,7 @@
|
|||
# * Put printf arguments in single, not double quotes to avoid unwanted
|
||||
# escaping.
|
||||
# * Use \0 for binary zero in printf, not \x0, for the benefit of older
|
||||
# versions.
|
||||
# versions (and use octal for other special values).
|
||||
|
||||
# Set the C locale, so that sort(1) behaves predictably.
|
||||
|
||||
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -558,7 +574,7 @@ echo "RC=$?" >>testtrygrep
|
|||
echo "---------------------------- Test 107 -----------------------------" >>testtrygrep
|
||||
echo "a" >testtemp1grep
|
||||
echo "aaaaa" >>testtemp1grep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets --allow-lookaround-bsk '(?<=\Ka)' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 108 ------------------------------" >>testtrygrep
|
||||
|
@ -638,13 +654,13 @@ echo "RC=$?" >>testtrygrep
|
|||
|
||||
echo "---------------------------- Test 125 -----------------------------" >>testtrygrep
|
||||
printf 'abcd\n' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K.)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=.\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?<=\K[ac])' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep --colour=always --allow-lookaround-bsk '(?=[ac]\K)' testNinputgrep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 126 -----------------------------" >>testtrygrep
|
||||
|
@ -653,6 +669,47 @@ printf 'ABC\0XYZ\nABCDEF\nDEFABC\n' >testtemp2grep
|
|||
$valgrind $vjs $pcre2grep -a -f testtemp1grep testtemp2grep >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 127 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 128 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; head -1 <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; exec 3<testdata/grepinput; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; echo '---'; $valgrind $vjs $pcre2grep -m1 -A3 '^match' <&3; exec 3<&-) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 134 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
|
@ -674,7 +731,21 @@ if [ $utf8 -ne 0 ] ; then
|
|||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U3 ------------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep --line-offsets -u --newline=any --allow-lookaround-bsk '(?<=\K\x{17f})' ./testdata/grepinput8) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U4 ------------------------------" >>testtrygrep
|
||||
printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -u -o '....' $builddir/testtemp1grep) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U5 ------------------------------" >>testtrygrep
|
||||
printf 'A\341\200\200\200CD\342\200\200Z\n' >testtemp1grep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
||||
|
@ -714,24 +785,10 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# It seems impossible to handle NUL characters easily in many operating
|
||||
# systems, including Solaris (aka SunOS), where the version of sed explicitly
|
||||
# doesn't like them, and also MacOS (Darwin), OpenBSD, FreeBSD, and NetBSD. So
|
||||
# now we run this test only on OS that are known to work. For the rest, we
|
||||
# fudge the output so that the comparison works.
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
uname=`uname`
|
||||
case $uname in
|
||||
Linux)
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | sed 's/\x00/ZERO/' >>testtrygrep
|
||||
echo "" >>testtrygrep
|
||||
;;
|
||||
*)
|
||||
echo '1:abcZERO2:def' >>testtrygrep
|
||||
;;
|
||||
esac
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
echo "" >>testtrygrep
|
||||
|
||||
$cf $srcdir/testdata/grepoutputN testtrygrep
|
||||
if [ $? != 0 ] ; then exit 1; fi
|
||||
|
@ -747,6 +804,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
|
|||
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
|
||||
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
|
||||
$cf $srcdir/testdata/grepoutputCN testtrygrep
|
||||
|
|
74
RunTest
74
RunTest
|
@ -17,8 +17,16 @@
|
|||
# individual test numbers, ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||
# end), or a number preceded by ~ to exclude a test. For example, "3-15 ~10"
|
||||
# runs tests 3 to 15, excluding test 10, and just "~10" runs all the tests
|
||||
# except test 10. Whatever order the arguments are in, the tests are always run
|
||||
# in numerical order.
|
||||
# except test 10. Whatever order the arguments are in, these tests are always
|
||||
# run in numerical order.
|
||||
#
|
||||
# If no specific tests are selected (which is the case when this script is run
|
||||
# via 'make check') the default is to run all the numbered tests.
|
||||
#
|
||||
# There may also be named (as well as numbered) tests for special purposes. At
|
||||
# present there is just one, called "heap". This test's output contains the
|
||||
# sizes of heap frames and frame vectors, which depend on the environment. It
|
||||
# is therefore not run unless explicitly requested.
|
||||
#
|
||||
# Inappropriate tests are automatically skipped (with a comment to say so). For
|
||||
# example, if JIT support is not compiled, test 16 is skipped, whereas if JIT
|
||||
|
@ -80,7 +88,9 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
|
|||
title23="Test 23: \C disabled test"
|
||||
title24="Test 24: Non-UTF pattern conversion tests"
|
||||
title25="Test 25: UTF pattern conversion tests"
|
||||
maxtest=25
|
||||
title26="Test 26: Auto-generated unicode property tests"
|
||||
maxtest=26
|
||||
titleheap="Test 'heap': Environment-specific heap tests"
|
||||
|
||||
if [ $# -eq 1 -a "$1" = "list" ]; then
|
||||
echo $title0
|
||||
|
@ -109,6 +119,12 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
|
|||
echo $title23
|
||||
echo $title24
|
||||
echo $title25
|
||||
echo $title26
|
||||
echo ""
|
||||
echo $titleheap
|
||||
echo ""
|
||||
echo "Numbered tests are automatically run if nothing selected."
|
||||
echo "Named tests must be explicitly selected."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
@ -238,6 +254,8 @@ do22=no
|
|||
do23=no
|
||||
do24=no
|
||||
do25=no
|
||||
do26=no
|
||||
doheap=no
|
||||
|
||||
while [ $# -gt 0 ] ; do
|
||||
case $1 in
|
||||
|
@ -267,6 +285,8 @@ while [ $# -gt 0 ] ; do
|
|||
23) do23=yes;;
|
||||
24) do24=yes;;
|
||||
25) do25=yes;;
|
||||
26) do26=yes;;
|
||||
heap) doheap=yes;;
|
||||
-8) arg8=yes;;
|
||||
-16) arg16=yes;;
|
||||
-32) arg32=yes;;
|
||||
|
@ -320,7 +340,8 @@ fi
|
|||
# set up a large stack.
|
||||
|
||||
$sim ./pcre2test -S 64 /dev/null /dev/null
|
||||
if [ $? -eq 0 -a "$bigstack" != "" ] ; then
|
||||
support_setstack=$?
|
||||
if [ $support_setstack -eq 0 -a "$bigstack" != "" ] ; then
|
||||
setstack="-S 64"
|
||||
else
|
||||
setstack=""
|
||||
|
@ -407,8 +428,8 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
# relevant will be automatically skipped.
|
||||
# If no specific tests were requested, select all the numbered tests. Those
|
||||
# that are not relevant will be automatically skipped.
|
||||
|
||||
if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||
$do4 = no -a $do5 = no -a $do6 = no -a $do7 = no -a \
|
||||
|
@ -416,7 +437,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
|
||||
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
|
||||
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
|
||||
$do24 = no -a $do25 = no \
|
||||
$do24 = no -a $do25 = no -a $do26 = no -a $doheap = no \
|
||||
]; then
|
||||
do0=yes
|
||||
do1=yes
|
||||
|
@ -444,6 +465,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do23=yes
|
||||
do24=yes
|
||||
do25=yes
|
||||
do26=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -479,7 +501,9 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
echo '' >testtry
|
||||
checkspecial '-C'
|
||||
checkspecial '--help'
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
if [ $support_setstack -eq 0 ] ; then
|
||||
checkspecial '-S 1 -t 10 testSinput'
|
||||
fi
|
||||
echo " OK"
|
||||
fi
|
||||
|
||||
|
@ -493,15 +517,20 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
done
|
||||
fi
|
||||
|
||||
# PCRE2 tests that are not Perl-compatible: API, errors, internals
|
||||
# PCRE2 tests that are not Perl-compatible: API, errors, internals. We copy
|
||||
# the testbtables file to the current directory for use by this test.
|
||||
|
||||
if [ $do2 = yes ] ; then
|
||||
echo $title2 "(excluding UTF-$bits)"
|
||||
cp $testdata/testbtables .
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
|
||||
if [ $? = 0 ] ; then
|
||||
saverc=$?
|
||||
if [ $saverc = 0 ] ; then
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -70,-62,-2,-1,0,100,101,191,200 >>testtry
|
||||
checkresult $? 2 "$opt"
|
||||
else
|
||||
checkresult $saverc 2 "$opt"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
@ -855,10 +884,33 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
fi
|
||||
|
||||
# Auto-generated unicode property tests
|
||||
|
||||
if [ $do26 = yes ] ; then
|
||||
echo $title26
|
||||
if [ $utf -eq 0 ] ; then
|
||||
echo " Skipped because UTF-$bits support is not available"
|
||||
else
|
||||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
|
||||
checkresult $? 26 "$opt"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
# Manually selected heap tests - output may vary in different environments,
|
||||
# which is why that are not automatically run.
|
||||
|
||||
if [ $doheap = yes ] ; then
|
||||
echo $titleheap
|
||||
$sim $valgrind ./pcre2test -q $setstack $bmode $testdata/testinputheap testtry
|
||||
checkresult $? heap-$bits ""
|
||||
fi
|
||||
|
||||
# End of loop for 8/16/32-bit tests
|
||||
done
|
||||
|
||||
# Clean up local working files
|
||||
rm -f testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry
|
||||
rm -f testbtables testSinput test3input testsaved1 testsaved2 test3output test3outputA test3outputB teststdout teststderr testtry
|
||||
|
||||
# End
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
@rem Updated for new test 14 (moving others up a number), August 2015.
|
||||
@rem Tidied and updated for new tests 21, 22, 23 by PH, October 2015.
|
||||
@rem PH added missing "set type" for test 22, April 2016.
|
||||
@rem PH added copy command for new testbtables file, November 2020
|
||||
|
||||
|
||||
setlocal enabledelayedexpansion
|
||||
|
@ -134,9 +135,9 @@ if "%all%" == "yes" (
|
|||
set do7=yes
|
||||
set do8=yes
|
||||
set do9=yes
|
||||
set do10=yes
|
||||
set do10=no
|
||||
set do11=yes
|
||||
set do12=yes
|
||||
set do12=no
|
||||
set do13=yes
|
||||
set do14=yes
|
||||
set do15=yes
|
||||
|
@ -305,6 +306,7 @@ if %jit% EQU 1 call :runsub 1 testoutjit "Test with JIT Override" -q -jit
|
|||
goto :eof
|
||||
|
||||
:do2
|
||||
copy /y %srcdir%\testdata\testbtables testbtables
|
||||
call :runsub 2 testout "API, errors, internals, and non-Perl stuff" -q
|
||||
if %jit% EQU 1 call :runsub 2 testoutjit "Test with JIT Override" -q -jit
|
||||
goto :eof
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# See MODULE.bazel
|
|
@ -1,17 +1,16 @@
|
|||
# Modified from FindReadline.cmake (PH Feb 2012)
|
||||
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
if(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
set(EDITLINE_FOUND TRUE)
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h
|
||||
/usr/include/editline
|
||||
/usr/include/edit/readline
|
||||
/usr/include/readline
|
||||
else(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
FIND_PATH(EDITLINE_INCLUDE_DIR readline.h PATH_SUFFIXES
|
||||
editline
|
||||
edit/readline
|
||||
)
|
||||
|
||||
FIND_LIBRARY(EDITLINE_LIBRARY NAMES edit)
|
||||
include(FindPackageHandleStandardArgs)
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY )
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Editline DEFAULT_MSG EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
|
||||
MARK_AS_ADVANCED(EDITLINE_INCLUDE_DIR EDITLINE_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY AND NCURSES_LIBRARY)
|
||||
endif(EDITLINE_INCLUDE_DIR AND EDITLINE_LIBRARY)
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
set(PACKAGE_VERSION_MAJOR @PCRE2_MAJOR@)
|
||||
set(PACKAGE_VERSION_MINOR @PCRE2_MINOR@)
|
||||
set(PACKAGE_VERSION_PATCH 0)
|
||||
set(PACKAGE_VERSION @PCRE2_MAJOR@.@PCRE2_MINOR@.0)
|
||||
|
||||
# Check whether the requested PACKAGE_FIND_VERSION is compatible
|
||||
if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION OR
|
||||
PACKAGE_VERSION_MAJOR GREATER PACKAGE_FIND_VERSION_MAJOR)
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
if(PACKAGE_VERSION VERSION_EQUAL PACKAGE_FIND_VERSION)
|
||||
set(PACKAGE_VERSION_EXACT TRUE)
|
||||
endif()
|
||||
endif()
|
|
@ -0,0 +1,145 @@
|
|||
# pcre2-config.cmake
|
||||
# ----------------
|
||||
#
|
||||
# Finds the PCRE2 library, specify the starting search path in PCRE2_ROOT.
|
||||
#
|
||||
# Static vs. shared
|
||||
# -----------------
|
||||
# To make use of the static library instead of the shared one, one needs
|
||||
# to set the variable PCRE2_USE_STATIC_LIBS to ON before calling find_package.
|
||||
# Example:
|
||||
# set(PCRE2_USE_STATIC_LIBS ON)
|
||||
# find_package(PCRE2 CONFIG COMPONENTS 8BIT)
|
||||
#
|
||||
# This will define the following variables:
|
||||
#
|
||||
# PCRE2_FOUND - True if the system has the PCRE2 library.
|
||||
# PCRE2_VERSION - The version of the PCRE2 library which was found.
|
||||
#
|
||||
# and the following imported targets:
|
||||
#
|
||||
# PCRE2::8BIT - The 8 bit PCRE2 library.
|
||||
# PCRE2::16BIT - The 16 bit PCRE2 library.
|
||||
# PCRE2::32BIT - The 32 bit PCRE2 library.
|
||||
# PCRE2::POSIX - The POSIX PCRE2 library.
|
||||
|
||||
set(PCRE2_NON_STANDARD_LIB_PREFIX @NON_STANDARD_LIB_PREFIX@)
|
||||
set(PCRE2_NON_STANDARD_LIB_SUFFIX @NON_STANDARD_LIB_SUFFIX@)
|
||||
set(PCRE2_8BIT_NAME pcre2-8)
|
||||
set(PCRE2_16BIT_NAME pcre2-16)
|
||||
set(PCRE2_32BIT_NAME pcre2-32)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix)
|
||||
find_path(PCRE2_INCLUDE_DIR NAMES pcre2.h DOC "PCRE2 include directory")
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
if (MSVC)
|
||||
set(PCRE2_8BIT_NAME pcre2-8-static)
|
||||
set(PCRE2_16BIT_NAME pcre2-16-static)
|
||||
set(PCRE2_32BIT_NAME pcre2-32-static)
|
||||
set(PCRE2_POSIX_NAME pcre2-posix-static)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX})
|
||||
set(PCRE2_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX})
|
||||
else ()
|
||||
set(PCRE2_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
set(PCRE2_PREFIX "")
|
||||
endif ()
|
||||
|
||||
set(PCRE2_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX})
|
||||
if (MINGW AND PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
set(PCRE2_SUFFIX "-0.dll")
|
||||
endif ()
|
||||
endif ()
|
||||
find_library(PCRE2_8BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit PCRE2 library")
|
||||
find_library(PCRE2_16BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_16BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "16 bit PCRE2 library")
|
||||
find_library(PCRE2_32BIT_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_32BIT_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "32 bit PCRE2 library")
|
||||
find_library(PCRE2_POSIX_LIBRARY NAMES ${PCRE2_PREFIX}${PCRE2_POSIX_NAME}${PCRE2_SUFFIX} ${PCRE2_PREFIX}${PCRE2_8BIT_NAME}d${PCRE2_SUFFIX} DOC "8 bit POSIX PCRE2 library")
|
||||
unset(PCRE2_NON_STANDARD_LIB_PREFIX)
|
||||
unset(PCRE2_NON_STANDARD_LIB_SUFFIX)
|
||||
unset(PCRE2_8BIT_NAME)
|
||||
unset(PCRE2_16BIT_NAME)
|
||||
unset(PCRE2_32BIT_NAME)
|
||||
unset(PCRE2_POSIX_NAME)
|
||||
|
||||
# Set version
|
||||
if (PCRE2_INCLUDE_DIR)
|
||||
set(PCRE2_VERSION "@PCRE2_MAJOR@.@PCRE2_MINOR@.0")
|
||||
endif ()
|
||||
|
||||
# Which components have been found.
|
||||
if (PCRE2_8BIT_LIBRARY)
|
||||
set(PCRE2_8BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_16BIT_LIBRARY)
|
||||
set(PCRE2_16BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_32BIT_LIBRARY)
|
||||
set(PCRE2_32BIT_FOUND TRUE)
|
||||
endif ()
|
||||
if (PCRE2_POSIX_LIBRARY)
|
||||
set(PCRE2_POSIX_FOUND TRUE)
|
||||
endif ()
|
||||
|
||||
# Check if at least one component has been specified.
|
||||
list(LENGTH PCRE2_FIND_COMPONENTS PCRE2_NCOMPONENTS)
|
||||
if (PCRE2_NCOMPONENTS LESS 1)
|
||||
message(FATAL_ERROR "No components have been specified. This is not allowed. Please, specify at least one component.")
|
||||
endif ()
|
||||
unset(PCRE2_NCOMPONENTS)
|
||||
|
||||
# When POSIX component has been specified make sure that also 8BIT component is specified.
|
||||
set(PCRE2_8BIT_COMPONENT FALSE)
|
||||
set(PCRE2_POSIX_COMPONENT FALSE)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (component STREQUAL "8BIT")
|
||||
set(PCRE2_8BIT_COMPONENT TRUE)
|
||||
elseif (component STREQUAL "POSIX")
|
||||
set(PCRE2_POSIX_COMPONENT TRUE)
|
||||
endif ()
|
||||
endforeach()
|
||||
|
||||
if (PCRE2_POSIX_COMPONENT AND NOT PCRE2_8BIT_COMPONENT)
|
||||
message(FATAL_ERROR "The component POSIX is specified while the 8BIT one is not. This is not allowed. Please, also specify the 8BIT component.")
|
||||
endif()
|
||||
unset(PCRE2_8BIT_COMPONENT)
|
||||
unset(PCRE2_POSIX_COMPONENT)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
|
||||
find_package_handle_standard_args(PCRE2
|
||||
FOUND_VAR PCRE2_FOUND
|
||||
REQUIRED_VARS PCRE2_INCLUDE_DIR
|
||||
HANDLE_COMPONENTS
|
||||
VERSION_VAR PCRE2_VERSION
|
||||
CONFIG_MODE
|
||||
)
|
||||
|
||||
set(PCRE2_LIBRARIES)
|
||||
if (PCRE2_FOUND)
|
||||
foreach(component ${PCRE2_FIND_COMPONENTS})
|
||||
if (PCRE2_USE_STATIC_LIBS)
|
||||
add_library(PCRE2::${component} STATIC IMPORTED)
|
||||
target_compile_definitions(PCRE2::${component} INTERFACE PCRE2_STATIC)
|
||||
else ()
|
||||
add_library(PCRE2::${component} SHARED IMPORTED)
|
||||
endif ()
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
IMPORTED_LOCATION "${PCRE2_${component}_LIBRARY}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${PCRE2_INCLUDE_DIR}"
|
||||
)
|
||||
if (component STREQUAL "POSIX")
|
||||
set_target_properties(PCRE2::${component} PROPERTIES
|
||||
INTERFACE_LINK_LIBRARIES "PCRE2::8BIT"
|
||||
LINK_LIBRARIES "PCRE2::8BIT"
|
||||
)
|
||||
endif ()
|
||||
|
||||
set(PCRE2_LIBRARIES ${PCRE2_LIBRARIES} ${PCRE2_${component}_LIBRARY})
|
||||
mark_as_advanced(PCRE2_${component}_LIBRARY)
|
||||
endforeach()
|
||||
endif ()
|
||||
|
||||
mark_as_advanced(
|
||||
PCRE2_INCLUDE_DIR
|
||||
)
|
|
@ -1,8 +1,7 @@
|
|||
/* config.h for CMake builds */
|
||||
|
||||
#cmakedefine HAVE_ATTRIBUTE_UNINITIALIZED 1
|
||||
#cmakedefine HAVE_DIRENT_H 1
|
||||
#cmakedefine HAVE_INTTYPES_H 1
|
||||
#cmakedefine HAVE_STDINT_H 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
#cmakedefine HAVE_SYS_STAT_H 1
|
||||
#cmakedefine HAVE_SYS_TYPES_H 1
|
||||
|
@ -10,9 +9,10 @@
|
|||
#cmakedefine HAVE_WINDOWS_H 1
|
||||
|
||||
#cmakedefine HAVE_BCOPY 1
|
||||
#cmakedefine HAVE_MEMFD_CREATE 1
|
||||
#cmakedefine HAVE_MEMMOVE 1
|
||||
|
||||
#cmakedefine PCRE2_STATIC 1
|
||||
#cmakedefine HAVE_SECURE_GETENV 1
|
||||
#cmakedefine HAVE_STRERROR 1
|
||||
|
||||
#cmakedefine SUPPORT_PCRE2_8 1
|
||||
#cmakedefine SUPPORT_PCRE2_16 1
|
||||
|
|
113
configure.ac
113
configure.ac
|
@ -9,21 +9,21 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
|
|||
dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [33])
|
||||
m4_define(pcre2_minor, [41])
|
||||
m4_define(pcre2_prerelease, [])
|
||||
m4_define(pcre2_date, [2019-04-16])
|
||||
m4_define(pcre2_date, [2022-xx-xx])
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [11:0:11])
|
||||
m4_define(libpcre2_16_version, [11:0:11])
|
||||
m4_define(libpcre2_32_version, [11:0:11])
|
||||
m4_define(libpcre2_posix_version, [3:2:0])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [8:0:8])
|
||||
m4_define(libpcre2_16_version, [8:0:8])
|
||||
m4_define(libpcre2_32_version, [8:0:8])
|
||||
m4_define(libpcre2_posix_version, [2:2:0])
|
||||
|
||||
AC_PREREQ(2.57)
|
||||
AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
|
||||
AC_PREREQ([2.60])
|
||||
AC_INIT([PCRE2],pcre2_major.pcre2_minor[]pcre2_prerelease,[],[pcre2])
|
||||
AC_CONFIG_SRCDIR([src/pcre2.h.in])
|
||||
AM_INIT_AUTOMAKE([dist-bzip2 dist-zip])
|
||||
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
||||
|
@ -64,14 +64,31 @@ m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
|
|||
AC_TYPE_INT64_T
|
||||
|
||||
AC_PROG_INSTALL
|
||||
AC_LIBTOOL_WIN32_DLL
|
||||
LT_INIT
|
||||
LT_INIT([win32-dll])
|
||||
AC_PROG_LN_S
|
||||
|
||||
# Check for GCC visibility feature
|
||||
|
||||
PCRE2_VISIBILITY
|
||||
|
||||
# Check for Clang __attribute__((uninitialized)) feature
|
||||
|
||||
AC_MSG_CHECKING([for __attribute__((uninitialized))])
|
||||
AC_LANG_PUSH([C])
|
||||
tmp_CFLAGS=$CFLAGS
|
||||
CFLAGS="$CFLAGS -Werror"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
|
||||
[[char buf[128] __attribute__((uninitialized));(void)buf]])],
|
||||
[pcre2_cc_cv_attribute_uninitialized=yes],
|
||||
[pcre2_cc_cv_attribute_uninitialized=no])
|
||||
AC_MSG_RESULT([$pcre2_cc_cv_attribute_uninitialized])
|
||||
if test "$pcre2_cc_cv_attribute_uninitialized" = yes; then
|
||||
AC_DEFINE([HAVE_ATTRIBUTE_UNINITIALIZED], 1, [Define this if your compiler
|
||||
supports __attribute__((uninitialized))])
|
||||
fi
|
||||
CFLAGS=$tmp_CFLAGS
|
||||
AC_LANG_POP([C])
|
||||
|
||||
# Versioning
|
||||
|
||||
PCRE2_MAJOR="pcre2_major"
|
||||
|
@ -158,11 +175,18 @@ if test "$enable_jit" = "auto"; then
|
|||
echo checking for JIT support on this hardware... $enable_jit
|
||||
fi
|
||||
|
||||
# Handle --enable-jit-sealloc (disabled by default)
|
||||
AC_ARG_ENABLE(jit-sealloc,
|
||||
AS_HELP_STRING([--enable-jit-sealloc],
|
||||
[enable SELinux compatible execmem allocator in JIT]),
|
||||
, enable_jit_sealloc=no)
|
||||
# Handle --enable-jit-sealloc (disabled by default and only experimental)
|
||||
case $host_os in
|
||||
linux* | netbsd*)
|
||||
AC_ARG_ENABLE(jit-sealloc,
|
||||
AS_HELP_STRING([--enable-jit-sealloc],
|
||||
[enable SELinux compatible execmem allocator in JIT (experimental)]),
|
||||
,enable_jit_sealloc=no)
|
||||
;;
|
||||
*)
|
||||
enable_jit_sealloc=unsupported
|
||||
;;
|
||||
esac
|
||||
|
||||
# Handle --disable-pcre2grep-jit (enabled by default)
|
||||
AC_ARG_ENABLE(pcre2grep-jit,
|
||||
|
@ -399,7 +423,7 @@ case "$enable_newline" in
|
|||
anycrlf) ac_pcre2_newline_value=5 ;;
|
||||
nul) ac_pcre2_newline_value=6 ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
|
||||
AC_MSG_ERROR([invalid argument "$enable_newline" to --enable-newline option])
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -428,7 +452,7 @@ fi
|
|||
case "$with_link_size" in
|
||||
2|3|4) ;;
|
||||
*)
|
||||
AC_MSG_ERROR([invalid argument \"$with_link_size\" to --with-link-size option])
|
||||
AC_MSG_ERROR([invalid argument "$with_link_size" to --with-link-size option])
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -461,7 +485,6 @@ HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
|
|||
sure both macros are undefined; an emulation function will then be used. */])
|
||||
|
||||
# Checks for header files.
|
||||
AC_HEADER_STDC
|
||||
AC_CHECK_HEADERS(limits.h sys/types.h sys/stat.h dirent.h)
|
||||
AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1])
|
||||
AC_CHECK_HEADERS([sys/wait.h], [HAVE_SYS_WAIT_H=1])
|
||||
|
@ -489,7 +512,20 @@ AC_TYPE_SIZE_T
|
|||
|
||||
# Checks for library functions.
|
||||
|
||||
AC_CHECK_FUNCS(bcopy memmove strerror mkostemp secure_getenv)
|
||||
AC_CHECK_FUNCS(bcopy memfd_create memmove mkostemp secure_getenv strerror)
|
||||
AC_MSG_CHECKING([for realpath])
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
]],[[
|
||||
char buffer[PATH_MAX];
|
||||
realpath(".", buffer);
|
||||
]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_REALPATH], 1,
|
||||
[Define to 1 if you have the `realpath' function.])
|
||||
],
|
||||
AC_MSG_RESULT([no]))
|
||||
|
||||
# Check for the availability of libz (aka zlib)
|
||||
|
||||
|
@ -561,14 +597,14 @@ if test "$enable_pcre2test_libreadline" = "yes"; then
|
|||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Check for the availability of libedit. Different distributions put its
|
||||
# headers in different places. Try to cover the most common ones.
|
||||
|
||||
if test "$enable_pcre2test_libedit" = "yes"; then
|
||||
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
|
||||
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
|
||||
AC_CHECK_HEADERS([editline/readline.h edit/readline/readline.h readline.h], [
|
||||
HAVE_LIBEDIT_HEADER=1
|
||||
break
|
||||
])
|
||||
AC_CHECK_LIB([edit], [readline], [LIBEDIT="-ledit"])
|
||||
fi
|
||||
|
||||
|
@ -904,10 +940,9 @@ if test "$enable_pcre2test_libedit" = "yes"; then
|
|||
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
|
||||
exit 1
|
||||
fi
|
||||
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
|
||||
"$HAVE_READLINE_READLINE_H" != "1"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
|
||||
echo "** nor readline/readline.h was found."
|
||||
if test -z "$HAVE_LIBEDIT_HEADER"; then
|
||||
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h,"
|
||||
echo "** edit/readline/readline.h nor a compatible header was found."
|
||||
exit 1
|
||||
fi
|
||||
if test -z "$LIBEDIT"; then
|
||||
|
@ -981,7 +1016,27 @@ fi # enable_coverage
|
|||
|
||||
AM_CONDITIONAL([WITH_GCOV],[test "x$enable_coverage" = "xyes"])
|
||||
|
||||
AC_MSG_CHECKING([whether Intel CET is enabled])
|
||||
AC_LANG_PUSH([C])
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,
|
||||
[[#ifndef __CET__
|
||||
# error CET is not enabled
|
||||
#endif]])],
|
||||
[pcre2_cc_cv_intel_cet_enabled=yes],
|
||||
[pcre2_cc_cv_intel_cet_enabled=no])
|
||||
AC_MSG_RESULT([$pcre2_cc_cv_intel_cet_enabled])
|
||||
if test "$pcre2_cc_cv_intel_cet_enabled" = yes; then
|
||||
CET_CFLAGS="-mshstk"
|
||||
AC_SUBST([CET_CFLAGS])
|
||||
fi
|
||||
AC_LANG_POP([C])
|
||||
|
||||
# LIB_POSTFIX is used by CMakeLists.txt for Windows debug builds.
|
||||
# Pass empty LIB_POSTFIX to *.pc files and pcre2-config here.
|
||||
AC_SUBST(LIB_POSTFIX)
|
||||
|
||||
# Produce these files, in addition to config.h.
|
||||
|
||||
AC_CONFIG_FILES(
|
||||
Makefile
|
||||
libpcre2-8.pc
|
||||
|
|
|
@ -40,7 +40,11 @@ GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
|
|||
|
||||
The following are generic instructions for building the PCRE2 C library "by
|
||||
hand". If you are going to use CMake, this section does not apply to you; you
|
||||
can skip ahead to the CMake section.
|
||||
can skip ahead to the CMake section. Note that the settings concerned with
|
||||
8-bit, 16-bit, and 32-bit code units relate to the type of data string that
|
||||
PCRE2 processes. They are NOT referring to the underlying operating system bit
|
||||
width. You do not have to do anything special to compile in a 64-bit
|
||||
environment, for example.
|
||||
|
||||
(1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
|
||||
macro settings that it contains to whatever is appropriate for your
|
||||
|
@ -74,23 +78,23 @@ can skip ahead to the CMake section.
|
|||
src/pcre2_chartables.c.
|
||||
|
||||
OR:
|
||||
Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
|
||||
if you have set up src/config.h), and then run it with the single
|
||||
argument "src/pcre2_chartables.c". This generates a set of standard
|
||||
character tables and writes them to that file. The tables are generated
|
||||
using the default C locale for your system. If you want to use a locale
|
||||
that is specified by LC_xxx environment variables, add the -L option to
|
||||
the dftables command. You must use this method if you are building on a
|
||||
system that uses EBCDIC code.
|
||||
Compile src/pcre2_dftables.c as a stand-alone program (using
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h), and then run it with
|
||||
the single argument "src/pcre2_chartables.c". This generates a set of
|
||||
standard character tables and writes them to that file. The tables are
|
||||
generated using the default C locale for your system. If you want to use
|
||||
a locale that is specified by LC_xxx environment variables, add the -L
|
||||
option to the pcre2_dftables command. You must use this method if you
|
||||
are building on a system that uses EBCDIC code.
|
||||
|
||||
The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
|
||||
specify alternative tables at run time.
|
||||
|
||||
(4) For an 8-bit library, compile the following source files from the src
|
||||
directory, setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also
|
||||
set -DHAVE_CONFIG_H if you have set up src/config.h with your
|
||||
configuration, or else use other -D settings to change the configuration
|
||||
as required.
|
||||
(4) For a library that supports 8-bit code units in the character strings that
|
||||
it processes, compile the following source files from the src directory,
|
||||
setting -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set
|
||||
-DHAVE_CONFIG_H if you have set up src/config.h with your configuration,
|
||||
or else use other -D settings to change the configuration as required.
|
||||
|
||||
pcre2_auto_possess.c
|
||||
pcre2_chartables.c
|
||||
|
@ -117,6 +121,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -142,9 +147,9 @@ can skip ahead to the CMake section.
|
|||
If your system has static and shared libraries, you may have to do this
|
||||
once for each type.
|
||||
|
||||
(6) If you want to build a 16-bit library or 32-bit library (as well as, or
|
||||
instead of the 8-bit library) just supply 16 or 32 as the value of
|
||||
-DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
(6) If you want to build a library that supports 16-bit or 32-bit code units,
|
||||
(as well as, or instead of the 8-bit library) just supply 16 or 32 as the
|
||||
value of -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||
|
||||
(7) If you want to build the POSIX wrapper functions (which apply only to the
|
||||
8-bit library), ensure that you have the src/pcre2posix.h file and then
|
||||
|
@ -302,7 +307,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||
|
||||
4. Run cmake-gui from the Shell envirornment of your build tool, for example,
|
||||
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||
to start Cmake from the Windows Start menu, as this can lead to errors.
|
||||
|
||||
|
@ -339,10 +344,10 @@ cache can be deleted by selecting "File > Delete Cache".
|
|||
|
||||
BUILDING PCRE2 ON WINDOWS WITH VISUAL STUDIO
|
||||
|
||||
The code currently cannot be compiled without a stdint.h header, which is
|
||||
available only in relatively recent versions of Visual Studio. However, this
|
||||
portable and permissively-licensed implementation of the header worked without
|
||||
issue:
|
||||
The code currently cannot be compiled without an inttypes.h header, which is
|
||||
available only with Visual Studio 2013 or newer. However, this portable and
|
||||
permissively-licensed implementation of the stdint.h header could be used as an
|
||||
alternative:
|
||||
|
||||
http://www.azillionmonkeys.com/qed/pstdint.h
|
||||
|
||||
|
@ -369,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (wherein which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
@ -401,6 +406,6 @@ Everything in that location, source and executable, is in EBCDIC and native
|
|||
z/OS file formats. The port provides an API for LE languages such as COBOL and
|
||||
for the z/OS and z/VM versions of the Rexx languages.
|
||||
|
||||
==============================
|
||||
Last Updated: 14 November 2018
|
||||
==============================
|
||||
===========================
|
||||
Last Updated: 28 April 2021
|
||||
===========================
|
||||
|
|
|
@ -4,18 +4,20 @@ README file for PCRE2 (Perl-compatible regular expression library)
|
|||
PCRE2 is a re-working of the original PCRE1 library to provide an entirely new
|
||||
API. Since its initial release in 2015, there has been further development of
|
||||
the code and it now differs from PCRE1 in more than just the API. There are new
|
||||
features and the internals have been improved. The latest release of PCRE2 is
|
||||
always available in three alternative formats from:
|
||||
features, and the internals have been improved. The original PCRE1 library is
|
||||
now obsolete and no longer maintained. The latest release of PCRE2 is available
|
||||
in .tar.gz, tar.bz2, or .zip form from this GitHub repository:
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||
https://github.com/PCRE2Project/pcre2/releases
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
subscribe or manage your subscription here:
|
||||
There is a mailing list for discussion about the development of PCRE2 at
|
||||
pcre2-dev@googlegroups.com. You can subscribe by sending an email to
|
||||
pcre2-dev+subscribe@googlegroups.com.
|
||||
|
||||
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -112,12 +114,18 @@ Building PCRE2 using autotools
|
|||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
||||
To build PCRE2 on system that supports autotools, first run the "configure"
|
||||
command from the PCRE2 distribution directory, with your current directory set
|
||||
If you have downloaded and unpacked a PCRE2 release tarball, run the
|
||||
"configure" command from the PCRE2 directory, with your current directory set
|
||||
to the directory where you want the files to be created. This command is a
|
||||
standard GNU "autoconf" configuration script, for which generic instructions
|
||||
are supplied in the file INSTALL.
|
||||
|
||||
The files in the GitHub repository do not contain "configure". If you have
|
||||
downloaded the PCRE2 source files from GitHub, before you can run "configure"
|
||||
you must run the shell script called autogen.sh. This runs a number of
|
||||
autotools to create a "configure" script (you must of course have the autotools
|
||||
commands installed in order to do this).
|
||||
|
||||
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||
this case, on many systems, just running "./configure" is sufficient. However,
|
||||
the usual methods of changing standard defaults are available. For example:
|
||||
|
@ -164,9 +172,11 @@ library. They are also documented in the pcre2build man page.
|
|||
will be a compile time error. If in doubt, use --enable-jit=auto, which
|
||||
enables JIT only if the current hardware is supported.
|
||||
|
||||
. If you are enabling JIT under SELinux you may also want to add
|
||||
--enable-jit-sealloc, which enables the use of an execmem allocator in JIT
|
||||
that is compatible with SELinux. This has no effect if JIT is not enabled.
|
||||
. If you are enabling JIT under SELinux environment you may also want to add
|
||||
--enable-jit-sealloc, which enables the use of an executable memory allocator
|
||||
that is compatible with SELinux. Warning: this allocator is experimental!
|
||||
It does not support fork() operation and may crash when no disk space is
|
||||
available. This option has no effect if JIT is disabled.
|
||||
|
||||
. If you do not want to make use of the default support for UTF-8 Unicode
|
||||
character strings in the 8-bit library, UTF-16 Unicode character strings in
|
||||
|
@ -184,10 +194,10 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
As well as supporting UTF strings, Unicode support includes support for the
|
||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
Escape sequences such as \d and \w in patterns do not by default make use of
|
||||
Unicode properties, but can be made to do so by setting the PCRE2_UCP option
|
||||
or starting a pattern with (*UCP).
|
||||
However, only a subset of Unicode properties are supported; see the
|
||||
pcre2pattern man page for details. Escape sequences such as \d and \w in
|
||||
patterns do not by default make use of Unicode properties, but can be made to
|
||||
do so by setting the PCRE2_UCP option or starting a pattern with (*UCP).
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, or the NUL (zero)
|
||||
|
@ -267,9 +277,9 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
a program called dftables is compiled and run in the default C locale when
|
||||
you obey "make". It builds a source file called pcre2_chartables.c. If you do
|
||||
not specify this option, pcre2_chartables.c is created as a copy of
|
||||
a program called pcre2_dftables is compiled and run in the default C locale
|
||||
when you obey "make". It builds a source file called pcre2_chartables.c. If
|
||||
you do not specify this option, pcre2_chartables.c is created as a copy of
|
||||
pcre2_chartables.c.dist. See "Character tables" below for further
|
||||
information.
|
||||
|
||||
|
@ -295,8 +305,8 @@ library. They are also documented in the pcre2build man page.
|
|||
unaddressable. This allows it to detect invalid memory accesses, and is
|
||||
mostly useful for debugging PCRE2 itself.
|
||||
|
||||
. In environments where the gcc compiler is used and lcov version 1.6 or above
|
||||
is installed, if you specify
|
||||
. In environments where the gcc compiler is used and lcov is installed, if you
|
||||
specify
|
||||
|
||||
--enable-coverage
|
||||
|
||||
|
@ -365,19 +375,20 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If --disable-percent-zt is specified, no use is made
|
||||
of the z or t modifiers. Instead or %td or %zu, %lu is used, with a cast for
|
||||
size_t values.
|
||||
environments other than Microsoft Visual Studio versions earlier than 2013
|
||||
when __STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating C99). However, there is at least one environment that
|
||||
claims to be C99 but does not support these modifiers. If
|
||||
--disable-percent-zt is specified, no use is made of the z or t modifiers.
|
||||
Instead of %td or %zu, %lu is used, with a cast for size_t values.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
|
@ -390,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -407,7 +418,7 @@ The "configure" script builds the following files for the basic C library:
|
|||
. Makefile the makefile that builds the library
|
||||
. src/config.h build-time configuration options for the library
|
||||
. src/pcre2.h the public PCRE2 header file
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
. pcre2-config script that shows the building settings such as CFLAGS
|
||||
that were set for "configure"
|
||||
. libpcre2-8.pc )
|
||||
. libpcre2-16.pc ) data for the pkg-config command
|
||||
|
@ -546,11 +557,11 @@ Cross-compiling using autotools
|
|||
|
||||
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||
order to cross-compile PCRE2 for some other host. However, you should NOT
|
||||
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||
file is compiled and run on the local host, in order to generate the inbuilt
|
||||
character tables (the pcre2_chartables.c file). This will probably not work,
|
||||
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||
compiler.
|
||||
specify --enable-rebuild-chartables, because if you do, the pcre2_dftables.c
|
||||
source file is compiled and run on the local host, in order to generate the
|
||||
inbuilt character tables (the pcre2_chartables.c file). This will probably not
|
||||
work, because pcre2_dftables.c needs to be compiled with the local compiler,
|
||||
not the cross compiler.
|
||||
|
||||
When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
|
||||
created by making a copy of pcre2_chartables.c.dist, which is a default set of
|
||||
|
@ -558,9 +569,10 @@ tables that assumes ASCII code. Cross-compiling with the default tables should
|
|||
not be a problem.
|
||||
|
||||
If you need to modify the character tables when cross-compiling, you should
|
||||
move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
|
||||
and run it on the local host to make a new version of pcre2_chartables.c.dist.
|
||||
Then when you cross-compile PCRE2 this new version of the tables will be used.
|
||||
move pcre2_chartables.c.dist out of the way, then compile pcre2_dftables.c by
|
||||
hand and run it on the local host to make a new version of
|
||||
pcre2_chartables.c.dist. See the pcre2build section "Creating character tables
|
||||
at build time" for more details.
|
||||
|
||||
|
||||
Making new tarballs
|
||||
|
@ -597,13 +609,13 @@ is available. RunTest outputs a comment when it skips a test.
|
|||
|
||||
Many (but not all) of the tests that are not skipped are run twice if JIT
|
||||
support is available. On the second run, JIT compilation is forced. This
|
||||
testing can be suppressed by putting "nojit" on the RunTest command line.
|
||||
testing can be suppressed by putting "-nojit" on the RunTest command line.
|
||||
|
||||
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||
libraries that are enabled. If you want to run just one set of tests, call
|
||||
RunTest with either the -8, -16 or -32 option.
|
||||
|
||||
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||
If valgrind is installed, you can run the tests under it by putting "-valgrind"
|
||||
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||
files, give their numbers as arguments to RunTest, for example:
|
||||
|
||||
|
@ -684,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
@ -719,8 +731,8 @@ compile context.
|
|||
The source file called pcre2_chartables.c contains the default set of tables.
|
||||
By default, this is created as a copy of pcre2_chartables.c.dist, which
|
||||
contains tables for ASCII coding. However, if --enable-rebuild-chartables is
|
||||
specified for ./configure, a different version of pcre2_chartables.c is built
|
||||
by the program dftables (compiled from dftables.c), which uses the ANSI C
|
||||
specified for ./configure, a new version of pcre2_chartables.c is built by the
|
||||
program pcre2_dftables (compiled from pcre2_dftables.c), which uses the ANSI C
|
||||
character handling functions such as isalnum(), isalpha(), isupper(),
|
||||
islower(), etc. to build the table sources. This means that the default C
|
||||
locale that is set for your system will control the contents of these default
|
||||
|
@ -730,32 +742,40 @@ file does not get automatically re-generated. The best way to do this is to
|
|||
move pcre2_chartables.c.dist out of the way and replace it with your customized
|
||||
tables.
|
||||
|
||||
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||
it uses the default C locale that is set on your system. It does not pay
|
||||
attention to the LC_xxx environment variables. In other words, it uses the
|
||||
system's default locale rather than whatever the compiling user happens to have
|
||||
set. If you really do want to build a source set of character tables in a
|
||||
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||
program by hand with the -L option. For example:
|
||||
When the pcre2_dftables program is run as a result of specifying
|
||||
--enable-rebuild-chartables, it uses the default C locale that is set on your
|
||||
system. It does not pay attention to the LC_xxx environment variables. In other
|
||||
words, it uses the system's default locale rather than whatever the compiling
|
||||
user happens to have set. If you really do want to build a source set of
|
||||
character tables in a locale that is specified by the LC_xxx variables, you can
|
||||
run the pcre2_dftables program by hand with the -L option. For example:
|
||||
|
||||
./dftables -L pcre2_chartables.c.special
|
||||
./pcre2_dftables -L pcre2_chartables.c.special
|
||||
|
||||
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||
respectively. The next table consists of three 32-byte bit maps which identify
|
||||
digits, "word" characters, and white space, respectively. These are used when
|
||||
building 32-byte bit maps that represent character classes for code points less
|
||||
than 256. The final 256-byte table has bits indicating various character types,
|
||||
as follows:
|
||||
The second argument names the file where the source code for the tables is
|
||||
written. The first two 256-byte tables provide lower casing and case flipping
|
||||
functions, respectively. The next table consists of a number of 32-byte bit
|
||||
maps which identify certain character classes such as digits, "word"
|
||||
characters, white space, etc. These are used when building 32-byte bit maps
|
||||
that represent character classes for code points less than 256. The final
|
||||
256-byte table has bits indicating various character types, as follows:
|
||||
|
||||
1 white space character
|
||||
2 letter
|
||||
4 decimal digit
|
||||
8 hexadecimal digit
|
||||
4 lower case letter
|
||||
8 decimal digit
|
||||
16 alphanumeric or '_'
|
||||
128 regular expression metacharacter or binary zero
|
||||
|
||||
You should not alter the set of characters that contain the 128 bit, as that
|
||||
will cause PCRE2 to malfunction.
|
||||
You can also specify -b (with or without -L) when running pcre2_dftables. This
|
||||
causes the tables to be written in binary instead of as source code. A set of
|
||||
binary tables can be loaded into memory by an application and passed to
|
||||
pcre2_compile() in the same way as tables created dynamically by calling
|
||||
pcre2_maketables(). The tables are just a string of bytes, independent of
|
||||
hardware characteristics such as endianness. This means they can be bundled
|
||||
with an application that runs in different environments, to ensure consistent
|
||||
behaviour.
|
||||
|
||||
See also the pcre2build section "Creating character tables at build time".
|
||||
|
||||
|
||||
File manifest
|
||||
|
@ -766,7 +786,7 @@ The distribution should contain the files listed below.
|
|||
(A) Source files for the PCRE2 library functions and their headers are found in
|
||||
the src directory:
|
||||
|
||||
src/dftables.c auxiliary program for building pcre2_chartables.c
|
||||
src/pcre2_dftables.c auxiliary program for building pcre2_chartables.c
|
||||
when --enable-rebuild-chartables is specified
|
||||
|
||||
src/pcre2_chartables.c.dist a default set of character tables that assume
|
||||
|
@ -890,6 +910,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 16 April 2019
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 15 April 2022
|
||||
|
|
|
@ -146,6 +146,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_get_mark.html">pcre2_get_mark</a></td>
|
||||
<td> Get a (*MARK) name</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_get_match_data_size.html">pcre2_get_match_data_size</a></td>
|
||||
<td> Get the size of a match data block</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_get_ovector_count.html">pcre2_get_ovector_count</a></td>
|
||||
<td> Get the ovector count</td></tr>
|
||||
|
||||
|
@ -176,6 +179,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
|
||||
<td> Build character tables in current locale</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_maketables_free.html">pcre2_maketables_free</a></td>
|
||||
<td> Free character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_match.html">pcre2_match</a></td>
|
||||
<td> Match a compiled pattern to a subject string
|
||||
(Perl compatible)</td></tr>
|
||||
|
|
|
@ -28,7 +28,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
</P>
|
||||
<P>
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
|
@ -38,8 +39,14 @@ Oniguruma syntax items, and there are options for requesting some minor changes
|
|||
that give better ECMAScript (aka JavaScript) compatibility.
|
||||
</P>
|
||||
<P>
|
||||
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||
code units, which means that up to three separate libraries may be installed.
|
||||
The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
|
||||
or 32-bit code units, which means that up to three separate libraries may be
|
||||
installed, one for each code unit size. The size of code unit is not related to
|
||||
the bit size of the underlying hardware. In a 64-bit environment that also
|
||||
supports 32-bit applications, versions of PCRE2 that are compiled in both
|
||||
64-bit and 32-bit modes may be needed.
|
||||
</P>
|
||||
<P>
|
||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||
|
@ -187,20 +194,20 @@ function, listing its arguments and results.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<P>
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 September 2018
|
||||
Last updated: 27 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -65,6 +65,7 @@ The option bits are:
|
|||
PCRE2_EXTENDED Ignore white space and # comments
|
||||
PCRE2_FIRSTLINE Force matching to be before newline
|
||||
PCRE2_LITERAL Pattern characters are all literal
|
||||
PCRE2_MATCH_INVALID_UTF Enable support for matching invalid UTF
|
||||
PCRE2_MATCH_UNSET_BACKREF Match unset backreferences
|
||||
PCRE2_MULTILINE ^ and $ match newlines within data
|
||||
PCRE2_NEVER_BACKSLASH_C Lock out the use of \C in patterns
|
||||
|
@ -91,8 +92,18 @@ Additional options may be set in the compile context via the
|
|||
function.
|
||||
</P>
|
||||
<P>
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of <i>errorcode</i> or <i>erroroffset</i> is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the <i>errorcode</i> argument to the the
|
||||
<b>pcre2_get_error_message()</b> function. The offset (in code units) where the
|
||||
error was encountered is returned via the <i>erroroffset</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
If there is no error, the value passed via <i>errorcode</i> returns the message
|
||||
"no error" if passed to <b>pcre2_get_error_message()</b>, and the value passed
|
||||
via <i>erroroffset</i> is zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
|
|
|
@ -45,10 +45,16 @@ just once (except when processing lookaround assertions). This function is
|
|||
<i>workspace</i> Points to a vector of ints used as working space
|
||||
<i>wscount</i> Number of elements in the vector
|
||||
</pre>
|
||||
For <b>pcre2_dfa_match()</b>, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The <i>length</i> and <i>startoffset</i> values are code units, not
|
||||
characters. The options are:
|
||||
The size of output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
</P>
|
||||
<P>
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
<i>length</i> and <i>startoffset</i> values are code units, not characters. The
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_get_match_data_size specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_get_match_data_size man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *<i>match_data</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function returns the size, in bytes, of the match data block that is its
|
||||
argument.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -40,11 +40,17 @@ bits:
|
|||
PCRE2_JIT_COMPLETE compile code for full matching
|
||||
PCRE2_JIT_PARTIAL_SOFT compile code for soft partial matching
|
||||
PCRE2_JIT_PARTIAL_HARD compile code for hard partial matching
|
||||
PCRE2_JIT_INVALID_UTF compile code to handle invalid UTF
|
||||
</pre>
|
||||
There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been
|
||||
superseded by the <b>pcre2_compile()</b> option PCRE2_MATCH_INVALID_UTF. The old
|
||||
option is deprecated and may be removed in the future.
|
||||
</P>
|
||||
<P>
|
||||
The yield of the function is 0 for success, or a negative error code otherwise.
|
||||
In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or
|
||||
if an unknown bit is set in <i>options</i>.
|
||||
if an unknown bit is set in <i>options</i>. The function can also return
|
||||
PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the
|
||||
compiler, even if it was because of a system security restriction.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -29,7 +29,7 @@ This function frees unused JIT executable memory. The argument is a general
|
|||
context, for custom memory management, or NULL for standard memory management.
|
||||
JIT memory allocation retains some memory in order to improve future JIT
|
||||
compilation speed. In low memory conditions,
|
||||
\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
|
||||
<b>pcre2_jit_free_unused_memory()</b> can be used to cause this memory to be
|
||||
freed.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -33,7 +33,9 @@ processed by the JIT compiler against a given subject string, using a matching
|
|||
algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
|
||||
it bypasses some of the sanity checks that <b>pcre2_match()</b> applies.
|
||||
Its arguments are exactly the same as for
|
||||
<a href="pcre2_match.html"><b>pcre2_match()</b>.</a>
|
||||
<a href="pcre2_match.html"><b>pcre2_match()</b>,</a>
|
||||
except that the subject string must be specified with a length;
|
||||
PCRE2_ZERO_TERMINATED is not supported.
|
||||
</P>
|
||||
<P>
|
||||
The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
|
|
|
@ -34,7 +34,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
<b>pcre2_jit_stack_assign()</b> to associate the stack with a compiled pattern,
|
||||
which can then be processed by <b>pcre2_match()</b> or <b>pcre2_jit_match()</b>.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>const unsigned char *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>const uint8_t *pcre2_maketables(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_maketables_free specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_maketables_free man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>void pcre2_maketables_free(pcre2_general_context *<i>gcontext</i>,</b>
|
||||
<b> const uint8_t *<i>tables</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function discards a set of character tables that were created by a call
|
||||
to
|
||||
<a href="pcre2_maketables.html"><b>pcre2_maketables()</b>.</a>
|
||||
</P>
|
||||
<P>
|
||||
The <i>gcontext</i> parameter should match what was used in that call to
|
||||
account for any custom allocators that might be in use; if it is NULL
|
||||
the system <b>free()</b> is used.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -30,8 +30,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if <b>ovecsize</b> is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with <b>pcre2_match()</b>, or a number of different
|
||||
matches at the same point when used with <b>pcre2_dfa_match()</b>. There is
|
||||
always one pair of offsets; if <b>ovecsize</b> is zero, it is treated as one.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -26,12 +26,15 @@ SYNOPSIS
|
|||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
<b>pcre2_match()</b>. If you are using <b>pcre2_dfa_match()</b>, which uses the
|
||||
outut vector in a different way, you should use <b>pcre2_match_data_create()</b>
|
||||
instead of this function.
|
||||
</P>
|
||||
<P>
|
||||
The second argument points to a general context, for custom memory management,
|
||||
|
|
|
@ -48,7 +48,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA <i>number_of_codes</i> is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in <i>bytes</i>
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL <i>codes</i> or <i>bytes</i> is NULL
|
||||
</pre>
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -20,16 +20,19 @@ SYNOPSIS
|
|||
</P>
|
||||
<P>
|
||||
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<b> const uint8_t *<i>tables</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets a pointer to custom character tables within a compile
|
||||
context. The second argument must be the result of a call to
|
||||
<b>pcre2_maketables()</b> or NULL to request the default tables. The result is
|
||||
always zero.
|
||||
context. The second argument must point to a set of PCRE2 character tables or
|
||||
be NULL to request the default tables. The result is always zero. Character
|
||||
tables can be created by calling <b>pcre2_maketables()</b> or by running the
|
||||
<b>pcre2_dftables</b> maintenance command in binary mode (see the
|
||||
<a href="pcre2build.html"><b>pcre2build</b></a>
|
||||
documentation).
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -30,7 +30,8 @@ This function sets additional option bits for <b>pcre2_compile()</b> that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
|
|
|
@ -48,8 +48,8 @@ Its arguments are:
|
|||
<i>outlengthptr</i> Points to the length of the output buffer
|
||||
</pre>
|
||||
A match data block is needed only if you want to inspect the data from the
|
||||
match that is returned in that block. A match context is needed only if you
|
||||
want to:
|
||||
final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
|
||||
set. A match context is needed only if you want to:
|
||||
<pre>
|
||||
Set up a callout function
|
||||
Set a matching offset limit
|
||||
|
@ -57,29 +57,46 @@ want to:
|
|||
Change the backtracking depth limit
|
||||
Set custom memory management in the match context
|
||||
</pre>
|
||||
The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code
|
||||
units, not characters, as is the contents of the variable pointed at by
|
||||
<i>outlengthptr</i>, which is updated to the actual length of the new string.
|
||||
The <i>length</i>, <i>startoffset</i> and <i>rlength</i> values are code units,
|
||||
not characters, as is the contents of the variable pointed at by
|
||||
<i>outlengthptr</i>. This variable must contain the length of the output buffer
|
||||
when the function is called. If the function is successful, the value is
|
||||
changed to the length of the new string, excluding the trailing zero that is
|
||||
automatically added.
|
||||
</P>
|
||||
<P>
|
||||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
<pre>
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement
|
||||
(only relevant if PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
</pre>
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, <i>match_data</i> must be non-NULL; its
|
||||
contents must be the result of a call to <b>pcre2_match()</b> using the same
|
||||
pattern and subject.
|
||||
</P>
|
||||
<P>
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
are no matches. The result may be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
</P>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -128,7 +128,7 @@ To build it without Unicode support, add
|
|||
--disable-unicode
|
||||
</pre>
|
||||
to the <b>configure</b> command. This setting applies to all three libraries. It
|
||||
is not possible to build one library with Unicode support, and another without,
|
||||
is not possible to build one library with Unicode support and another without
|
||||
in the same configuration.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \P, \p,
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i> are
|
||||
supported. Details are given in the
|
||||
and \X. Only the general category properties such as <i>Lu</i> and <i>Nd</i>,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -188,11 +189,11 @@ which enables the use of an execmem allocator in JIT that is compatible with
|
|||
SELinux. This has no effect if JIT is not enabled. See the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
documentation for a discussion of JIT usage. When JIT support is enabled,
|
||||
pcre2grep automatically makes use of it, unless you add
|
||||
<b>pcre2grep</b> automatically makes use of it, unless you add
|
||||
<pre>
|
||||
--disable-pcre2grep-jit
|
||||
</pre>
|
||||
to the "configure" command.
|
||||
to the <b>configure</b> command.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">NEWLINE RECOGNITION</a><br>
|
||||
<P>
|
||||
|
@ -283,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -307,7 +307,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
<pre>
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
</pre>
|
||||
to the <b>configure</b> command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -321,7 +321,7 @@ As well as applying to <b>pcre2_match()</b>, the depth limit also controls
|
|||
the depth of recursive function calls in <b>pcre2_dfa_match()</b>. These are
|
||||
used for lookaround assertions, atomic groups, and recursion within patterns.
|
||||
The limit does not apply to JIT matching.
|
||||
</P>
|
||||
<a name="createtables"></a></P>
|
||||
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
|
||||
<P>
|
||||
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||
|
@ -332,12 +332,34 @@ only. If you add
|
|||
--enable-rebuild-chartables
|
||||
</pre>
|
||||
to the <b>configure</b> command, the distributed tables are no longer used.
|
||||
Instead, a program called <b>dftables</b> is compiled and run. This outputs the
|
||||
source for new set of tables, created in the default locale of your C run-time
|
||||
system. This method of replacing the tables does not work if you are cross
|
||||
compiling, because <b>dftables</b> is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".
|
||||
Instead, a program called <b>pcre2_dftables</b> is compiled and run. This
|
||||
outputs the source for new set of tables, created in the default locale of your
|
||||
C run-time system. This method of replacing the tables does not work if you are
|
||||
cross compiling, because <b>pcre2_dftables</b> needs to be run on the local
|
||||
host and therefore not compiled with the cross compiler.
|
||||
</P>
|
||||
<P>
|
||||
If you need to create alternative tables when cross compiling, you will have to
|
||||
do so "by hand". There may also be other reasons for creating tables manually.
|
||||
To cause <b>pcre2_dftables</b> to be built on the local host, run a normal
|
||||
compiling command, and then run the program with the output file as its
|
||||
argument, for example:
|
||||
<pre>
|
||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||
./pcre2_dftables src/pcre2_chartables.c
|
||||
</pre>
|
||||
This builds the tables in the default locale of the local host. If you want to
|
||||
specify a locale, you must use the -L option:
|
||||
<pre>
|
||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||
</pre>
|
||||
You can also specify -b (with or without -L). This causes the tables to be
|
||||
written in binary instead of as source code. A set of binary tables can be
|
||||
loaded into memory by an application and passed to <b>pcre2_compile()</b> in the
|
||||
same way as tables created by calling <b>pcre2_maketables()</b>. The tables are
|
||||
just a string of bytes, independent of hardware characteristics such as
|
||||
endianness. This means they can be bundled with an application that runs in
|
||||
different environments, to ensure consistent behaviour.
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">USING EBCDIC CODE</a><br>
|
||||
<P>
|
||||
|
@ -414,7 +436,7 @@ default parameter values by adding, for example,
|
|||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
</pre>
|
||||
to the <b>configure</b> command. The caller of \fPpcre2grep\fP can override
|
||||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
|
@ -531,15 +553,16 @@ documentation.
|
|||
<P>
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
<pre>
|
||||
--disable-percent-zt
|
||||
</pre>
|
||||
is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
a suitable format is used depending in the size of long for the platform.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||
<P>
|
||||
|
@ -585,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 March 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -16,32 +16,43 @@ please consult the man page, in case the conversion went wrong.
|
|||
DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||
</b><br>
|
||||
<P>
|
||||
This document describes the differences in the ways that PCRE2 and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
versions 5.26, but as both Perl and PCRE2 are continually changing, the
|
||||
information may sometimes be out of date.
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
</P>
|
||||
<P>
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
</P>
|
||||
<P>
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<P>
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \b* (but not \b{3}), but these do not seem to have any use.
|
||||
for example, \b* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
</P>
|
||||
<P>
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
</P>
|
||||
<P>
|
||||
4. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
5. The following Perl escape sequences are not supported: \F, \l, \L, \u,
|
||||
\U, and \N when followed by a character name. \N on its own, matching a
|
||||
non-newline character, and \N{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -52,24 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript
|
|||
interprets them.
|
||||
</P>
|
||||
<P>
|
||||
5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \p and \P are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
|
||||
documentation says "Because Perl hides the need for the user to understand the
|
||||
internal representation of Unicode characters, there is no need to implement
|
||||
the somewhat messy concept of surrogates."
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
</P>
|
||||
<P>
|
||||
6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \Q and \E which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \Q and \E just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \Q
|
||||
and \E which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \Q and \E just like any other character. Note the
|
||||
following examples:
|
||||
<pre>
|
||||
Pattern PCRE2 matches Perl matches
|
||||
|
||||
|
@ -79,41 +92,38 @@ other character. Note the following examples:
|
|||
\QA\B\E A\B A\B
|
||||
\Q\\E \ \\E
|
||||
</pre>
|
||||
The \Q...\E sequence is recognized both inside and outside character classes.
|
||||
The \Q...\E sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
</P>
|
||||
<P>
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
</P>
|
||||
<P>
|
||||
9. If any of the backtracking control verbs are used in a group that is called
|
||||
as a subroutine (whether or not recursively), their effect is confined to that
|
||||
group; it does not extend to the surrounding pattern. This is not always the
|
||||
case in Perl. In particular, if (*THEN) is present in a group that is called as
|
||||
a subroutine, its action is limited to that group, even if the group does not
|
||||
contain any | characters. Note that such groups are processed as anchored
|
||||
at the point where they are tested.
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
that is called as a subroutine, its action is limited to that group, even if
|
||||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
</P>
|
||||
<P>
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||
not confined to the assertion.
|
||||
</P>
|
||||
<P>
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
|
@ -123,7 +133,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
|||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact the PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
capture groups have the same number but different names, is not supported, and
|
||||
causes an error at compile time. If it were allowed, it would not be possible
|
||||
to distinguish which group matched, because both names map to capture group
|
||||
|
@ -146,19 +156,27 @@ certainly user mistakes.
|
|||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \p{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.24), \p{Lu} and \p{Ll} match all
|
||||
in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
</P>
|
||||
<P>
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 includes new features that are not in earlier versions of Perl, some
|
||||
17. From release 5.32.0, Perl locks out the use of \K in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\K is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
</P>
|
||||
<P>
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.26:
|
||||
list is with respect to Perl 5.34:
|
||||
<br>
|
||||
<br>
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative branch of a lookbehind assertion can match a different length
|
||||
of string. Perl requires them all to have the same length.
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
<br>
|
||||
<br>
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
|
@ -203,16 +221,21 @@ different way and is not Perl-compatible.
|
|||
<br>
|
||||
<br>
|
||||
(l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
|
||||
the start of a pattern that set overall options that cannot be changed within
|
||||
the start of a pattern. These set overall options that cannot be changed within
|
||||
the pattern.
|
||||
<br>
|
||||
<br>
|
||||
(m) PCRE2 supports non-atomic positive lookaround assertions. This is an
|
||||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
</P>
|
||||
<P>
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
</P>
|
||||
<P>
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
<a href="pcre2limit.html"><b>pcre2limit</b></a>
|
||||
documentation for details. Perl went with 5.10 from recursion to iteration
|
||||
keeping the intermediate matches on the heap, which is ~10% slower but does not
|
||||
|
@ -225,7 +248,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -234,9 +257,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 12 February 2019
|
||||
Last updated: 08 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -141,8 +141,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
</P>
|
||||
<P>
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">CONVERTING POSIX PATTERNS</a><br>
|
||||
<P>
|
||||
|
|
|
@ -104,12 +104,11 @@ uint32_t newline;
|
|||
|
||||
PCRE2_SIZE erroroffset;
|
||||
PCRE2_SIZE *ovector;
|
||||
PCRE2_SIZE subject_length;
|
||||
|
||||
size_t subject_length;
|
||||
pcre2_match_data *match_data;
|
||||
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
|
@ -138,12 +137,14 @@ if (argc - i != 2)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* As pattern and subject are char arguments, they can be straightforwardly
|
||||
cast to PCRE2_SPTR as we are working in 8-bit code units. */
|
||||
/* Pattern and subject are char arguments, so they can be straightforwardly
|
||||
cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
|
||||
length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
|
||||
defined to be size_t. */
|
||||
|
||||
pattern = (PCRE2_SPTR)argv[i];
|
||||
subject = (PCRE2_SPTR)argv[i+1];
|
||||
subject_length = strlen((char *)subject);
|
||||
subject_length = (PCRE2_SIZE)strlen((char *)subject);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -172,17 +173,22 @@ if (re == NULL)
|
|||
|
||||
|
||||
/*************************************************************************
|
||||
* If the compilation succeeded, we call PCRE again, in order to do a *
|
||||
* If the compilation succeeded, we call PCRE2 again, in order to do a *
|
||||
* pattern match against the subject string. This does just ONE match. If *
|
||||
* further matching is needed, it will be done below. Before running the *
|
||||
* match we must set up a match_data block for holding the result. *
|
||||
* match we must set up a match_data block for holding the result. Using *
|
||||
* pcre2_match_data_create_from_pattern() ensures that the block is *
|
||||
* exactly the right size for the number of capturing parentheses in the *
|
||||
* pattern. If you need to know the actual size of a match_data block as *
|
||||
* a number of bytes, you can find it like this: *
|
||||
* *
|
||||
* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); *
|
||||
*************************************************************************/
|
||||
|
||||
/* Using this function ensures that the block is exactly the right size for
|
||||
the number of capturing parentheses in the pattern. */
|
||||
|
||||
match_data = pcre2_match_data_create_from_pattern(re, NULL);
|
||||
|
||||
/* Now run the match. */
|
||||
|
||||
rc = pcre2_match(
|
||||
re, /* the compiled pattern */
|
||||
subject, /* the subject string */
|
||||
|
@ -205,12 +211,12 @@ if (rc < 0)
|
|||
default: printf("Matching error %d\n", rc); break;
|
||||
}
|
||||
pcre2_match_data_free(match_data); /* Release memory used for the match */
|
||||
pcre2_code_free(re); /* data and the compiled pattern. */
|
||||
pcre2_code_free(re); /* data and the compiled pattern. */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
@ -228,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\n");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -249,7 +258,7 @@ application you might want to do things other than print them. */
|
|||
for (i = 0; i < rc; i++)
|
||||
{
|
||||
PCRE2_SPTR substring_start = subject + ovector[2*i];
|
||||
size_t substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
|
@ -447,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -111,8 +113,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
</P>
|
||||
<P>
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
|
@ -146,11 +148,10 @@ ignored.
|
|||
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
||||
<P>
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as "nul", that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the <b>--binary-files</b> option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the <b>--binary-files</b>
|
||||
option for a means of changing the way binary files are handled.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
|
||||
<P>
|
||||
|
@ -179,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -189,14 +192,21 @@ Treat binary files as text. This is equivalent to
|
|||
<b>--binary-files</b>=<i>text</i>.
|
||||
</P>
|
||||
<P>
|
||||
<b>--allow-lookaround-bsk</b>
|
||||
PCRE2 now forbids the use of \K in lookarounds by default, in line with Perl.
|
||||
This option causes <b>pcre2grep</b> to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
</P>
|
||||
<P>
|
||||
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||
Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -406,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -443,8 +455,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
<P>
|
||||
<b>--include</b>=<i>pattern</i>
|
||||
If any <b>--include</b> patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude</b> pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
<b>--exclude</b> pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -463,8 +475,8 @@ may be given any number of times; all the files are read.
|
|||
<P>
|
||||
<b>--include-dir</b>=<i>pattern</i>
|
||||
If any <b>--include-dir</b> patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The <b>-F</b>,
|
||||
|
@ -476,19 +488,22 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches. This
|
||||
opeion overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options.
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--label</b>=<i>name</i>
|
||||
|
@ -501,8 +516,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the <b>-M</b> (multiline) option
|
||||
|
@ -528,46 +543,6 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
|||
used. There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>--match-limit</b>=<i>number</i>
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--match-limit</b> option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by <b>--match-limit</b> is reached, an error occurs.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than <b>--match-limit</b>.
|
||||
<br>
|
||||
<br>
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
</P>
|
||||
<P>
|
||||
\fB--max-buffer-size=<i>number</i>
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>, <b>--multiline</b>
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
|
@ -597,29 +572,84 @@ well as possibly handling a two-character newline sequence.
|
|||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||
does not work when input is read line by line (see \fP--line-buffered\fP.)
|
||||
does not work when input is read line by line (see <b>--line-buffered</b>.)
|
||||
</P>
|
||||
<P>
|
||||
<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
|
||||
Stop processing after finding <i>number</i> matching lines, or non-matching
|
||||
lines if <b>-v</b> is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If <b>-c</b> is also set, the count that is output is never greater than
|
||||
<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
|
||||
<b>-q</b>, or when just checking for a match in a binary file.
|
||||
</P>
|
||||
<P>
|
||||
<b>--match-limit</b>=<i>number</i>
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--match-limit</b> option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by <b>--match-limit</b> is reached, an error occurs.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than <b>--match-limit</b>.
|
||||
<br>
|
||||
<br>
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
</P>
|
||||
<P>
|
||||
<b>--max-buffer-size</b>=<i>number</i>
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
||||
The PCRE2 library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
<pre>
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
</pre>
|
||||
The newline type may be specified in upper, lower, or mixed case. If the
|
||||
newline type is NUL, lines are separated by binary zero characters. The other
|
||||
types are the single-character sequences CR (carriage return) and LF
|
||||
(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
|
||||
recognizes any of the preceding three types, and an "any" type, for which any
|
||||
Unicode line ending sequence is assumed to end a line. The Unicode sequences
|
||||
are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
<br>
|
||||
<br>
|
||||
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, <b>pcre2grep</b> uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use <b>pcre2grep</b> to scan files that have come from
|
||||
other environments without having to modify their line endings. If the data
|
||||
that is being scanned does not agree with the convention set by this option,
|
||||
<b>pcre2grep</b> may behave in strange ways. Note that this option does not
|
||||
apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
|
||||
<br>
|
||||
<br>
|
||||
This option makes it possible to use <b>pcre2grep</b> to scan files that have
|
||||
come from other environments without having to modify their line endings. If
|
||||
the data that is being scanned does not agree with the convention set by this
|
||||
option, <b>pcre2grep</b> may behave in strange ways. Note that this option does
|
||||
not apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
|
||||
<b>--include-from</b> options, which are expected to use the operating system's
|
||||
standard newline sequence.
|
||||
</P>
|
||||
|
@ -641,29 +671,41 @@ It should never be needed in normal use.
|
|||
</P>
|
||||
<P>
|
||||
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text. This option is mutually exclusive with
|
||||
<b>--only-matching</b>, <b>--file-offsets</b>, and <b>--line-offsets</b>. Escape
|
||||
sequences starting with a dollar character may be used to insert the contents
|
||||
of the matched part of the line and/or captured substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
|
||||
and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
|
||||
this option, which is mutually exclusive with <b>--only-matching</b>,
|
||||
<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
|
||||
<b>--only-matching</b>, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
<br>
|
||||
<br>
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
<br>
|
||||
<br>
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
|
@ -685,20 +727,32 @@ otherwise empty line. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||
Show only the part of the line that matched the capturing parentheses of the
|
||||
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||
equivalent to <b>-o</b> without a number. Because these options can be given
|
||||
without an argument (see above), if an argument is present, it must be given in
|
||||
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||
for the non-argument case above also apply to this option. If the specified
|
||||
capturing parentheses do not exist in the pattern, or were not set in the
|
||||
match, nothing is output unless the file name or line number are being output.
|
||||
given number. Up to 50 capturing parentheses are supported by default. This
|
||||
limit can be changed via the <b>--om-capture</b> option. A pattern may contain
|
||||
any number of capturing parentheses, but only those whose number is within the
|
||||
limit can be accessed by <b>-o</b>. An error occurs if the number specified by
|
||||
<b>-o</b> is greater than the limit.
|
||||
<br>
|
||||
<br>
|
||||
-o0 is the same as <b>-o</b> without a number. Because these options can be
|
||||
given without an argument (see above), if an argument is present, it must be
|
||||
given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||
comments given for the non-argument case above also apply to this option. If
|
||||
the specified capturing parentheses do not exist in the pattern, or were not
|
||||
set in the match, nothing is output unless the file name or line number are
|
||||
being output.
|
||||
<br>
|
||||
<br>
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
option).
|
||||
but one option).
|
||||
</P>
|
||||
<P>
|
||||
<b>--om-capture</b>=<i>number</i>
|
||||
Set the number of capturing parentheses that can be accessed by <b>-o</b>. The
|
||||
default is 50.
|
||||
</P>
|
||||
<P>
|
||||
<b>--om-separator</b>=<i>text</i>
|
||||
|
@ -720,7 +774,8 @@ option to "recurse".
|
|||
</P>
|
||||
<P>
|
||||
<b>--recursion-limit</b>=<i>number</i>
|
||||
See <b>--match-limit</b> above.
|
||||
This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
|
||||
above for details.
|
||||
</P>
|
||||
<P>
|
||||
<b>-s</b>, <b>--no-messages</b>
|
||||
|
@ -741,11 +796,23 @@ ignored when used with <b>-L</b> (list files without matches), because the grand
|
|||
total would always be zero.
|
||||
</P>
|
||||
<P>
|
||||
<b>-u</b>, <b>--utf-8</b>
|
||||
<b>-u</b>, <b>--utf</b>
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||
<b>--include</b> options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
<b>--include</b> options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
</P>
|
||||
<P>
|
||||
<b>-U</b>, <b>--utf-allow-invalid</b>
|
||||
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
<b>-V</b>, <b>--version</b>
|
||||
|
@ -756,7 +823,9 @@ ignored.
|
|||
<P>
|
||||
<b>-v</b>, <b>--invert-match</b>
|
||||
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
||||
|
@ -776,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate files names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -786,16 +862,27 @@ by the <b>--locale</b> option. If no locale is set, the PCRE2 library's default
|
|||
<br><a name="SEC8" href="#TOC1">NEWLINES</a><br>
|
||||
<P>
|
||||
The <b>-N</b> (<b>--newline</b>) option allows <b>pcre2grep</b> to scan files with
|
||||
different newline conventions from the default. Any parts of the input files
|
||||
that are written to the standard output are copied identically, with whatever
|
||||
newline sequences they have in the input. However, the setting of this option
|
||||
affects only the way scanned files are processed. It does not affect the
|
||||
interpretation of files specified by the <b>-f</b>, <b>--file-list</b>,
|
||||
<b>--exclude-from</b>, or <b>--include-from</b> options, nor does it affect the
|
||||
way in which <b>pcre2grep</b> writes informational messages to the standard
|
||||
error and output streams. For these it uses the string "\n" to indicate
|
||||
newlines, relying on the C I/O library to convert this to an appropriate
|
||||
sequence.
|
||||
newline conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation of files
|
||||
specified by the <b>-f</b>, <b>--file-list</b>, <b>--exclude-from</b>, or
|
||||
<b>--include-from</b> options.
|
||||
</P>
|
||||
<P>
|
||||
Any parts of the scanned input files that are written to the standard output
|
||||
are copied with whatever newline sequences they have in the input. However, if
|
||||
the final line of a file is output, and it does not end with a newline
|
||||
sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
|
||||
or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
|
||||
single NL is used.
|
||||
</P>
|
||||
<P>
|
||||
The newline setting does not affect the way in which <b>pcre2grep</b> writes
|
||||
newlines in informational messages to the standard output and error streams.
|
||||
Under Windows, the standard output is set to be binary, so that "\r\n" at the
|
||||
ends of output lines that are copied from the input is not converted to
|
||||
"\r\r\n" by the C I/O library. This means that any messages written to the
|
||||
standard output must end with "\r\n". For all other operating systems, and
|
||||
for all messages to the standard error stream, "\n" is used.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
|
||||
<P>
|
||||
|
@ -806,9 +893,9 @@ as in the GNU <b>grep</b> program. Any long option of the form
|
|||
<b>--file-offsets</b>, <b>--heap-limit</b>, <b>--include-dir</b>,
|
||||
<b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>, <b>-M</b>,
|
||||
<b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
|
||||
<b>--output</b>, <b>-u</b>, and <b>--utf-8</b> options are specific to
|
||||
<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
|
||||
capturing parentheses number.
|
||||
<b>--output</b>, <b>-u</b>, <b>--utf</b>, <b>-U</b>, and <b>--utf-allow-invalid</b>
|
||||
options are specific to <b>pcre2grep</b>, as is the use of the
|
||||
<b>--only-matching</b> option with a capturing parentheses number.
|
||||
</P>
|
||||
<P>
|
||||
Although most of the common options work the same way, a few are different in
|
||||
|
@ -868,12 +955,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
|
|||
only callouts with string arguments are useful.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
<b>pcre2grep</b> was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) causes the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
<pre>
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
</pre>
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
</P>
|
||||
<br><b>
|
||||
Calling external programs or scripts
|
||||
</b><br>
|
||||
<P>
|
||||
This facility can be independently disabled when <b>pcre2grep</b> is built. It
|
||||
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
|
||||
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
|
||||
where <b>lib$spawn()</b> is used, and for any Unix-like environment where
|
||||
<b>fork()</b> and <b>execv()</b> are available.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -885,14 +996,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
</pre>
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
</P>
|
||||
<P>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the <b>--output</b>
|
||||
(<b>-O</b>) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
<pre>
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -905,28 +1013,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
</pre>
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when <b>pcre2grep</b> was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) causes the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
|
||||
<P>
|
||||
|
@ -958,22 +1052,23 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3).
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2unicode</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 24 November 2018
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -54,6 +54,7 @@ platforms:
|
|||
<pre>
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64 bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -90,7 +91,7 @@ or a negative error code.
|
|||
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||
of machine stack that it uses. The exact rules are not documented because they
|
||||
may change at any time, in particular, when new optimizations are introduced.
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
|
||||
If a pattern is too big, a call to <b>pcre2_jit_compile()</b> returns
|
||||
PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -147,25 +148,29 @@ pattern.
|
|||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">MATCHING SUBJECTS CONTAINING INVALID UTF</a><br>
|
||||
<P>
|
||||
When a pattern is compiled with the PCRE2_UTF option, the interpretive matching
|
||||
function expects its subject string to be a valid sequence of UTF code units.
|
||||
If it is not, the result is undefined. This is also true by default of matching
|
||||
via JIT. However, if the option PCRE2_JIT_INVALID_UTF is passed to
|
||||
<b>pcre2_jit_compile()</b>, code that can process a subject containing invalid
|
||||
UTF is compiled.
|
||||
When a pattern is compiled with the PCRE2_UTF option, subject strings are
|
||||
normally expected to be a valid sequence of UTF code units. By default, this is
|
||||
checked at the start of matching and an error is generated if invalid UTF is
|
||||
detected. The PCRE2_NO_UTF_CHECK option can be passed to <b>pcre2_match()</b> to
|
||||
skip the check (for improved performance) if you are sure that a subject string
|
||||
is valid. If this option is used with an invalid string, the result is
|
||||
undefined.
|
||||
</P>
|
||||
<P>
|
||||
In this mode, an invalid code unit sequence never matches any pattern item. It
|
||||
does not match dot, it does not match \p{Any}, it does not even match negative
|
||||
items such as [^X]. A lookbehind assertion fails if it encounters an invalid
|
||||
sequence while moving the current point backwards. In other words, an invalid
|
||||
UTF code unit sequence acts as a barrier which no match can cross. Reaching an
|
||||
invalid sequence causes an immediate backtrack.
|
||||
However, a way of running matches on strings that may contain invalid UTF
|
||||
sequences is available. Calling <b>pcre2_compile()</b> with the
|
||||
PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in
|
||||
<b>pcre2_match()</b> to support invalid UTF, and, if <b>pcre2_jit_compile()</b>
|
||||
is called, the compiled JIT code also supports invalid UTF. Details of how this
|
||||
support works, in both the JIT and the interpretive cases, is given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Using this option, an application can run matches in arbitrary data, knowing
|
||||
that any matched strings that are returned will be valid UTF. This can be
|
||||
useful when searching for text in executable or other binary files.
|
||||
There is also an obsolete option for <b>pcre2_jit_compile()</b> called
|
||||
PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility.
|
||||
It is superseded by the <b>pcre2_compile()</b> option PCRE2_MATCH_INVALID_UTF
|
||||
and should no longer be used. It may be removed in future.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">UNSUPPORTED OPTIONS AND PATTERN ITEMS</a><br>
|
||||
<P>
|
||||
|
@ -264,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used
|
|||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
</P>
|
||||
<P>
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
|
@ -282,7 +287,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
<pre>
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
|
||||
During thread exit
|
||||
|
@ -335,12 +340,12 @@ stack through the JIT callback function.
|
|||
You can free a JIT stack at any time, as long as it will not be used by
|
||||
<b>pcre2_match()</b> again. When you assign the stack to a match context, only a
|
||||
pointer is set. There is no reference counting or any other magic. You can free
|
||||
compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
|
||||
not\fP call <b>pcre2_match()</b> with a match context pointing to an already
|
||||
freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
|
||||
used by <b>pcre2_match()</b> in another thread). You can also replace the stack
|
||||
in a context at any time when it is not in use. You should free the previous
|
||||
stack before assigning a replacement.
|
||||
compiled patterns, contexts, and stacks in any order, anytime.
|
||||
Just <i>do not</i> call <b>pcre2_match()</b> with a match context pointing to an
|
||||
already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
|
||||
currently used by <b>pcre2_match()</b> in another thread). You can also replace
|
||||
the stack in a context at any time when it is not in use. You should free the
|
||||
previous stack before assigning a replacement.
|
||||
</P>
|
||||
<P>
|
||||
(5) Should I allocate/free a stack every time before/after calling
|
||||
|
@ -377,8 +382,8 @@ out this complicated API.
|
|||
<b>void pcre2_jit_free_unused_memory(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -437,10 +442,10 @@ that was not compiled.
|
|||
<P>
|
||||
When you call <b>pcre2_match()</b>, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
</P>
|
||||
<P>
|
||||
Bypassing the sanity checks and the <b>pcre2_match()</b> wrapping can give
|
||||
|
@ -461,9 +466,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 06 March 2019
|
||||
Last updated: 30 November 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -78,8 +78,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
</P>
|
||||
<P>
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
|
@ -109,11 +110,17 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
</P>
|
||||
<P>
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using <b>pcre2_match_data_create_from_pattern()</b> to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
</P>
|
||||
<P>
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
<pre>
|
||||
cat(er(pillar)?)?
|
||||
|
@ -188,23 +195,20 @@ code unit) at a time, for all active paths through the tree.
|
|||
9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
|
||||
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
|
||||
</P>
|
||||
<P>
|
||||
10. The PCRE2_MATCH_INVALID_UTF option for <b>pcre2_compile()</b> is not
|
||||
supported by <b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">ADVANTAGES OF THE ALTERNATIVE ALGORITHM</a><br>
|
||||
<P>
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
</P>
|
||||
<P>
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
</P>
|
||||
<P>
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||
documentation gives details of partial matching and discusses multi-segment
|
||||
matching.
|
||||
|
@ -219,26 +223,30 @@ because it has to search for all possible matches, but is also because it is
|
|||
less susceptible to optimization.
|
||||
</P>
|
||||
<P>
|
||||
2. Capturing parentheses, backreferences, and script runs are not supported.
|
||||
2. Capturing parentheses, backreferences, script runs, and matching within
|
||||
invalid UTF string are not supported.
|
||||
</P>
|
||||
<P>
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
</P>
|
||||
<P>
|
||||
4. JIT optimization is not supported.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 10 October 2018
|
||||
Last updated: 28 August 2021
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -14,78 +14,123 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE2</a>
|
||||
<li><a name="TOC2" href="#SEC2">PARTIAL MATCHING USING pcre2_match()</a>
|
||||
<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_dfa_match()</a>
|
||||
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHING AND WORD BOUNDARIES</a>
|
||||
<li><a name="TOC5" href="#SEC5">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a>
|
||||
<li><a name="TOC2" href="#SEC2">REQUIREMENTS FOR A PARTIAL MATCH</a>
|
||||
<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_match()</a>
|
||||
<li><a name="TOC4" href="#SEC4">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
|
||||
<li><a name="TOC5" href="#SEC5">PARTIAL MATCHING USING pcre2_dfa_match()</a>
|
||||
<li><a name="TOC6" href="#SEC6">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a>
|
||||
<li><a name="TOC7" href="#SEC7">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
|
||||
<li><a name="TOC8" href="#SEC8">ISSUES WITH MULTI-SEGMENT MATCHING</a>
|
||||
<li><a name="TOC9" href="#SEC9">AUTHOR</a>
|
||||
<li><a name="TOC10" href="#SEC10">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
|
||||
<li><a name="TOC8" href="#SEC8">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE2</a><br>
|
||||
<P>
|
||||
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||
function matches as far as it goes, but is too short to match the entire
|
||||
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||
might be helpful to distinguish this case from other cases in which there is no
|
||||
match.
|
||||
In normal use of PCRE2, if there is a match up to the end of a subject string,
|
||||
but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH
|
||||
is returned, just like any other failing match. There are circumstances where
|
||||
it might be helpful to distinguish this "partial match" case.
|
||||
</P>
|
||||
<P>
|
||||
Consider, for example, an application where a human is required to type in data
|
||||
for a field with specific formatting requirements. An example might be a date
|
||||
in the form <i>ddmmmyy</i>, defined by this pattern:
|
||||
One example is an application where the subject string is very long, and not
|
||||
all available at once. The requirement here is to be able to do the matching
|
||||
segment by segment, but special action is needed when a matched substring spans
|
||||
the boundary between two segments.
|
||||
</P>
|
||||
<P>
|
||||
Another example is checking a user input string as it is typed, to ensure that
|
||||
it conforms to a required format. Invalid characters can be immediately
|
||||
diagnosed and rejected, giving instant feedback.
|
||||
</P>
|
||||
<P>
|
||||
Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is
|
||||
requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT
|
||||
options when calling a matching function. The difference between the two
|
||||
options is whether or not a partial match is preferred to an alternative
|
||||
complete match, though the details differ between the two types of matching
|
||||
function. If both options are set, PCRE2_PARTIAL_HARD takes precedence.
|
||||
</P>
|
||||
<P>
|
||||
If you want to use partial matching with just-in-time optimized code, as well
|
||||
as setting a partial match option for the matching function, you must also call
|
||||
<b>pcre2_jit_compile()</b> with one or both of these options:
|
||||
<pre>
|
||||
^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
|
||||
</pre>
|
||||
If the application sees the user's keystrokes one by one, and can check that
|
||||
what has been typed so far is potentially valid, it is able to raise an error
|
||||
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||
has been typed, for example. This immediate feedback is likely to be a better
|
||||
user interface than a check that is delayed until the entire string has been
|
||||
entered. Partial matching can also be useful when the subject string is very
|
||||
long and is not all available at once.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||
The difference between the two options is whether or not a partial match is
|
||||
preferred to an alternative complete match, though the details differ between
|
||||
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||
takes precedence.
|
||||
</P>
|
||||
<P>
|
||||
If you want to use partial matching with just-in-time optimized code, you must
|
||||
call <b>pcre2_jit_compile()</b> with one or both of these options:
|
||||
<pre>
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
PCRE2_JIT_PARTIAL_HARD
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
</pre>
|
||||
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||
interpretive matching code is used.
|
||||
matches on the same pattern. Separate code is compiled for each mode. If the
|
||||
appropriate JIT mode has not been compiled, interpretive matching code is used.
|
||||
</P>
|
||||
<P>
|
||||
Setting a partial matching option disables two of PCRE2's standard
|
||||
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||
abandons matching immediately if it is not present in the subject string. This
|
||||
optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||
optimization hints. PCRE2 remembers the last literal code unit in a pattern,
|
||||
and abandons matching immediately if it is not present in the subject string.
|
||||
This optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also remembers a minimum length of a matching string, and does
|
||||
not bother to run the matching function on shorter strings. This optimization
|
||||
is also disabled for partial matching.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
|
||||
<br><a name="SEC2" href="#TOC1">REQUIREMENTS FOR A PARTIAL MATCH</a><br>
|
||||
<P>
|
||||
A partial match occurs during a call to <b>pcre2_match()</b> when the end of the
|
||||
subject string is reached successfully, but matching cannot continue because
|
||||
more characters are needed. However, at least one character in the subject must
|
||||
have been inspected. This character need not form part of the final matched
|
||||
string; lookbehind assertions and the \K escape sequence provide ways of
|
||||
inspecting characters before the start of a matched string. The requirement for
|
||||
inspecting at least one character exists because an empty string can always be
|
||||
matched; without such a restriction there would always be a partial match of an
|
||||
empty string at the end of the subject.
|
||||
A possible partial match occurs during matching when the end of the subject
|
||||
string is reached successfully, but either more characters are needed to
|
||||
complete the match, or the addition of more characters might change what is
|
||||
matched.
|
||||
</P>
|
||||
<P>
|
||||
Example 1: if the pattern is /abc/ and the subject is "ab", more characters are
|
||||
definitely needed to complete a match. In this case both hard and soft matching
|
||||
options yield a partial match.
|
||||
</P>
|
||||
<P>
|
||||
Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match
|
||||
can be found, but the addition of more characters might change what is
|
||||
matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match;
|
||||
PCRE2_PARTIAL_SOFT returns the complete match.
|
||||
</P>
|
||||
<P>
|
||||
On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next
|
||||
pattern item is \z, \Z, \b, \B, or $ there is always a partial match.
|
||||
Otherwise, for both options, the next pattern item must be one that inspects a
|
||||
character, and at least one of the following must be true:
|
||||
</P>
|
||||
<P>
|
||||
(1) At least one character has already been inspected. An inspected character
|
||||
need not form part of the final matched string; lookbehind assertions and the
|
||||
\K escape sequence provide ways of inspecting characters before the start of a
|
||||
matched string.
|
||||
</P>
|
||||
<P>
|
||||
(2) The pattern contains one or more lookbehind assertions. This condition
|
||||
exists in case there is a lookbehind that inspects characters before the start
|
||||
of the match.
|
||||
</P>
|
||||
<P>
|
||||
(3) There is a special case when the whole pattern can match an empty string.
|
||||
When the starting point is at the end of the subject, the empty string match is
|
||||
a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above
|
||||
conditions is true, it is returned. However, because adding more characters
|
||||
might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match,
|
||||
which in this case means "there is going to be a match at this point, but until
|
||||
some more characters are added, we do not know if it will be an empty string or
|
||||
something longer".
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
|
||||
<P>
|
||||
When a partial matching option is set, the result of calling
|
||||
<b>pcre2_match()</b> can be one of the following:
|
||||
</P>
|
||||
<P>
|
||||
<b>A successful match</b>
|
||||
A complete match has been found, starting and ending within this subject.
|
||||
</P>
|
||||
<P>
|
||||
<b>PCRE2_ERROR_NOMATCH</b>
|
||||
No match can start anywhere in this subject.
|
||||
</P>
|
||||
<P>
|
||||
<b>PCRE2_ERROR_PARTIAL</b>
|
||||
Adding more characters may result in a complete match that uses one or more
|
||||
characters from the end of this subject.
|
||||
</P>
|
||||
<P>
|
||||
When a partial match is returned, the first two elements in the ovector point
|
||||
|
@ -103,54 +148,42 @@ these characters are needed for a subsequent re-match with additional
|
|||
characters.
|
||||
</P>
|
||||
<P>
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options are set.
|
||||
</P>
|
||||
<br><b>
|
||||
PCRE2_PARTIAL_SOFT WITH pcre2_match()
|
||||
</b><br>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_SOFT is set when <b>pcre2_match()</b> identifies a partial
|
||||
match, the partial match is remembered, but matching continues as normal, and
|
||||
other alternatives in the pattern are tried. If no complete match can be found,
|
||||
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||
</P>
|
||||
<P>
|
||||
This option is "soft" because it prefers a complete match over a partial match.
|
||||
All the various matching items in a pattern behave as if the subject string is
|
||||
potentially complete. For example, \z, \Z, and $ match at the end of the
|
||||
subject, as normal, and for \b and \B the end of the subject is treated as a
|
||||
non-alphanumeric.
|
||||
</P>
|
||||
<P>
|
||||
If there is more than one partial match, the first one that was found provides
|
||||
the data that is returned. Consider this pattern:
|
||||
<pre>
|
||||
/123\w+X|dogY/
|
||||
</pre>
|
||||
If this is matched against the subject string "abc123dog", both
|
||||
alternatives fail to match, but the end of the subject is reached during
|
||||
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||
identifying "123dog" as the first partial match that was found. (In this
|
||||
example, there are two partial matches, because "dog" on its own partially
|
||||
matches the second alternative.)
|
||||
If this is matched against the subject string "abc123dog", both alternatives
|
||||
fail to match, but the end of the subject is reached during matching, so
|
||||
PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying
|
||||
"123dog" as the first partial match. (In this example, there are two partial
|
||||
matches, because "dog" on its own partially matches the second alternative.)
|
||||
</P>
|
||||
<br><b>
|
||||
PCRE2_PARTIAL_HARD WITH pcre2_match()
|
||||
How a partial match is processed by pcre2_match()
|
||||
</b><br>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_HARD is set for <b>pcre2_match()</b>, PCRE2_ERROR_PARTIAL is
|
||||
returned as soon as a partial match is found, without continuing to search for
|
||||
possible complete matches. This option is "hard" because it prefers an earlier
|
||||
partial match over a later complete match. For this reason, the assumption is
|
||||
made that the end of the supplied subject string may not be the true end of the
|
||||
available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end
|
||||
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||
character in the subject has been inspected.
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options is set.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a
|
||||
partial match is found, without continuing to search for possible complete
|
||||
matches. This option is "hard" because it prefers an earlier partial match over
|
||||
a later complete match. For this reason, the assumption is made that the end of
|
||||
the supplied subject string is not the true end of the available data, which is
|
||||
why \z, \Z, \b, \B, and $ always give a partial match.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching
|
||||
continues as normal, and other alternatives in the pattern are tried. If no
|
||||
complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of
|
||||
PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match
|
||||
over a partial match. All the various matching items in a pattern behave as if
|
||||
the subject string is potentially complete; \z, \Z, and $ match at the end of
|
||||
the subject, as normal, and for \b and \B the end of the subject is treated
|
||||
as a non-alphanumeric.
|
||||
</P>
|
||||
<br><b>
|
||||
Comparing hard and soft partial matching
|
||||
</b><br>
|
||||
<P>
|
||||
The difference between the two partial matching options can be illustrated by a
|
||||
pattern such as:
|
||||
|
@ -175,26 +208,135 @@ to follow this explanation by thinking of the two patterns like this:
|
|||
The second pattern will never match "dogsbody", because it will always find the
|
||||
shorter match first.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
|
||||
<br><b>
|
||||
Example of partial matching using pcre2test
|
||||
</b><br>
|
||||
<P>
|
||||
The DFA functions move along the subject string character by character, without
|
||||
The <b>pcre2test</b> data modifiers <b>partial_hard</b> (or <b>ph</b>) and
|
||||
<b>partial_soft</b> (or <b>ps</b>) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT,
|
||||
respectively, when calling <b>pcre2_match()</b>. Here is a run of
|
||||
<b>pcre2test</b> using a pattern that matches the whole subject in the form of a
|
||||
date:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 25dec3\=ph
|
||||
Partial match: 23dec3
|
||||
data> 3ju\=ph
|
||||
Partial match: 3ju
|
||||
data> 3juj\=ph
|
||||
No match
|
||||
</pre>
|
||||
This example gives the same results for both hard and soft partial matching
|
||||
options. Here is an example where there is a difference:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 25jun04\=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25jun04\=ph
|
||||
Partial match: 25jun04
|
||||
</pre>
|
||||
With PCRE2_PARTIAL_SOFT, the subject is matched completely. For
|
||||
PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so
|
||||
there is only a partial match.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
|
||||
<P>
|
||||
PCRE was not originally designed with multi-segment matching in mind. However,
|
||||
over time, features (including partial matching) that make multi-segment
|
||||
matching possible have been added. A very long string can be searched segment
|
||||
by segment by calling <b>pcre2_match()</b> repeatedly, with the aim of achieving
|
||||
the same results that would happen if the entire string was available for
|
||||
searching all the time. Normally, the strings that are being sought are much
|
||||
shorter than each individual segment, and are in the middle of very long
|
||||
strings, so the pattern is normally not anchored.
|
||||
</P>
|
||||
<P>
|
||||
Special logic must be implemented to handle a matched substring that spans a
|
||||
segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a
|
||||
partial match at the end of a segment whenever there is the possibility of
|
||||
changing the match by adding more characters. The PCRE2_NOTBOL option should
|
||||
also be set for all but the first segment.
|
||||
</P>
|
||||
<P>
|
||||
When a partial match occurs, the next segment must be added to the current
|
||||
subject and the match re-run, using the <i>startoffset</i> argument of
|
||||
<b>pcre2_match()</b> to begin at the point where the partial match started.
|
||||
For example:
|
||||
<pre>
|
||||
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
|
||||
data> ...the date is 23ja\=ph
|
||||
Partial match: 23ja
|
||||
data> ...the date is 23jan19 and on that day...\=offset=15
|
||||
0: 23jan19
|
||||
1: jan
|
||||
</pre>
|
||||
Note the use of the <b>offset</b> modifier to start the new match where the
|
||||
partial match was found. In this example, the next segment was added to the one
|
||||
in which the partial match was found. This is the most straightforward
|
||||
approach, typically using a memory buffer that is twice the size of each
|
||||
segment. After a partial match, the first half of the buffer is discarded, the
|
||||
second half is moved to the start of the buffer, and a new segment is added
|
||||
before repeating the match as in the example above. After a no match, the
|
||||
entire buffer can be discarded.
|
||||
</P>
|
||||
<P>
|
||||
If there are memory constraints, you may want to discard text that precedes a
|
||||
partial match before adding the next segment. Unfortunately, this is not at
|
||||
present straightforward. In cases such as the above, where the pattern does not
|
||||
contain any lookbehinds, it is sufficient to retain only the partially matched
|
||||
substring. However, if the pattern contains a lookbehind assertion, characters
|
||||
that precede the start of the partial match may have been inspected during the
|
||||
matching process. When <b>pcre2test</b> displays a partial match, it indicates
|
||||
these characters with '<' if the <b>allusedtext</b> modifier is set:
|
||||
<pre>
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\=ph,allusedtext
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
</pre>
|
||||
However, the <b>allusedtext</b> modifier is not available for JIT matching,
|
||||
because JIT matching does not record the first (or last) consulted characters.
|
||||
For this reason, this information is not available via the API. It is therefore
|
||||
not possible in general to obtain the exact number of characters that must be
|
||||
retained in order to get the right match result. If you cannot retain the
|
||||
entire segment, you must find some heuristic way of choosing.
|
||||
</P>
|
||||
<P>
|
||||
If you know the approximate length of the matching substrings, you can use that
|
||||
to decide how much text to retain. The only lookbehind information that is
|
||||
currently available via the API is the length of the longest individual
|
||||
lookbehind in a pattern, but this can be misleading if there are nested
|
||||
lookbehinds. The value returned by calling <b>pcre2_pattern_info()</b> with the
|
||||
PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code
|
||||
units) that any individual lookbehind moves back when it is processed. A
|
||||
pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but
|
||||
inspects two characters before its starting point.
|
||||
</P>
|
||||
<P>
|
||||
In a non-UTF or a 32-bit case, moving back is just a subtraction, but in
|
||||
UTF-8 or UTF-16 you have to count characters while moving back through the code
|
||||
units.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
|
||||
<P>
|
||||
The DFA function moves along the subject string character by character, without
|
||||
backtracking, searching for all possible matches simultaneously. If the end of
|
||||
the subject is reached before the end of the pattern, there is the possibility
|
||||
of a partial match, again provided that at least one character has been
|
||||
inspected.
|
||||
of a partial match.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||
have been no complete matches. Otherwise, the complete matches are returned.
|
||||
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||
any complete matches. The portion of the string that was matched when the
|
||||
longest partial match was found is set as the first matching string.
|
||||
If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any
|
||||
complete matches. The portion of the string that was matched when the longest
|
||||
partial match was found is set as the first matching string.
|
||||
</P>
|
||||
<P>
|
||||
Because the DFA functions always search for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, their behaviour is
|
||||
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||
the string "dog" matched against the ungreedy pattern shown above:
|
||||
Because the DFA function always searches for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, its behaviour is
|
||||
different from the <b>pcre2_match()</b>. Consider the string "dog" matched
|
||||
against this ungreedy pattern:
|
||||
<pre>
|
||||
/dog(sbody)??/
|
||||
</pre>
|
||||
|
@ -202,58 +344,16 @@ Whereas the standard function stops as soon as it finds the complete match for
|
|||
"dog", the DFA function also finds the partial match for "dogsbody", and so
|
||||
returns that when PCRE2_PARTIAL_HARD is set.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
||||
<P>
|
||||
If a pattern ends with one of sequences \b or \B, which test for word
|
||||
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||
results. Consider this pattern:
|
||||
<pre>
|
||||
/\bcat\b/
|
||||
</pre>
|
||||
This matches "cat", provided there is a word boundary at either end. If the
|
||||
subject string is "the cat", the comparison of the final "t" with a following
|
||||
character cannot take place, so a partial match is found. However, normal
|
||||
matching carries on, and \b matches at the end of the subject when the last
|
||||
character is a letter, so a complete match is found. The result, therefore, is
|
||||
<i>not</i> PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a><br>
|
||||
<P>
|
||||
If the <b>partial_soft</b> (or <b>ps</b>) modifier is present on a
|
||||
<b>pcre2test</b> data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||
Here is a run of <b>pcre2test</b> that uses the date example quoted above:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 25jun04\=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25dec3\=ps
|
||||
Partial match: 23dec3
|
||||
data> 3ju\=ps
|
||||
Partial match: 3ju
|
||||
data> 3juj\=ps
|
||||
No match
|
||||
data> j\=ps
|
||||
No match
|
||||
</pre>
|
||||
The first data string is matched completely, so <b>pcre2test</b> shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. Similar output is obtained
|
||||
if DFA matching is used.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>partial_hard</b> (or <b>ph</b>) modifier is present on a
|
||||
<b>pcre2test</b> data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a><br>
|
||||
<P>
|
||||
When a partial match has been found using a DFA matching function, it is
|
||||
When a partial match has been found using the DFA matching function, it is
|
||||
possible to continue the match by providing additional subject data and calling
|
||||
the function again with the same compiled regular expression, this time setting
|
||||
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||
because this is where details of the previous partial match are stored. Here is
|
||||
an example using <b>pcre2test</b>:
|
||||
because this is where details of the previous partial match are stored. You can
|
||||
set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART
|
||||
to continue partial matching over multiple segments. Here is an example using
|
||||
<b>pcre2test</b>:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 23ja\=dfa,ps
|
||||
|
@ -265,155 +365,10 @@ The first call has "23ja" as the subject, and requests partial matching; the
|
|||
second call has "n05" as the subject for the continued (restarted) match.
|
||||
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||
not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
</P>
|
||||
<P>
|
||||
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||
not possible to try again at a new starting point. All this facility is capable
|
||||
of doing is continuing with the previous match attempt. In the previous
|
||||
example, if the second set of data is "ug23" the result is no match, even
|
||||
though there would be a match for "aug23" if the entire string were given at
|
||||
once. Depending on the application, this may or may not be what you want.
|
||||
The only way to allow for starting again at the next character is to retain the
|
||||
matched part of the subject and try a new complete match.
|
||||
</P>
|
||||
<P>
|
||||
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||
facility can be used to pass very long subject strings to the DFA matching
|
||||
functions.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
|
||||
<P>
|
||||
Unlike the DFA function, it is not possible to restart the previous match with
|
||||
a new segment of data when using <b>pcre2_match()</b>. Instead, new data must be
|
||||
added to the previous subject string, and the entire match re-run, starting
|
||||
from the point where the partial match occurred. Earlier data can be discarded.
|
||||
</P>
|
||||
<P>
|
||||
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||
treat the end of a segment as the end of the subject when matching \z, \Z,
|
||||
\b, \B, and $. Consider an unanchored pattern that matches dates:
|
||||
<pre>
|
||||
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
|
||||
data> The date is 23ja\=ph
|
||||
Partial match: 23ja
|
||||
</pre>
|
||||
At this stage, an application could discard the text preceding "23ja", add on
|
||||
text from the next segment, and call the matching function again. Unlike the
|
||||
DFA matching function, the entire matching string must always be available,
|
||||
and the complete matching process occurs for each call, so more memory and more
|
||||
processing time is needed.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">ISSUES WITH MULTI-SEGMENT MATCHING</a><br>
|
||||
<P>
|
||||
Certain types of pattern may give problems with multi-segment matching,
|
||||
whichever matching function is used.
|
||||
</P>
|
||||
<P>
|
||||
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||
the PCRE2_NOTBOL option when the subject string for any call does start at the
|
||||
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||
includes the effect of PCRE2_NOTEOL.
|
||||
</P>
|
||||
<P>
|
||||
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||
start of the partial match may have been inspected during the matching process.
|
||||
When using <b>pcre2_match()</b>, sufficient characters must be retained for the
|
||||
next match attempt. You can ensure that enough characters are retained by doing
|
||||
the following:
|
||||
</P>
|
||||
<P>
|
||||
Before doing any matching, find the length of the longest lookbehind in the
|
||||
pattern by calling <b>pcre2_pattern_info()</b> with the PCRE2_INFO_MAXLOOKBEHIND
|
||||
option. Note that the resulting count is in characters, not code units. After a
|
||||
partial match, moving back from the ovector[0] offset in the subject by the
|
||||
number of characters given for the maximum lookbehind gets you to the earliest
|
||||
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||
while moving back through the code units.
|
||||
</P>
|
||||
<P>
|
||||
Characters before the point you have now reached can be discarded, and after
|
||||
the next segment has been added to what is retained, you should run the next
|
||||
match with the <b>startoffset</b> argument set so that the match begins at the
|
||||
same point as before.
|
||||
</P>
|
||||
<P>
|
||||
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||
value of <b>startoffset</b> for the next match should be 3. When <b>pcre2test</b>
|
||||
displays a partial match, it indicates the lookbehind characters with '<'
|
||||
characters:
|
||||
<pre>
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\=ph
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
3. Because a partial match must always contain at least one character, what
|
||||
might be considered a partial match of an empty string actually gives a "no
|
||||
match" result. For example:
|
||||
<pre>
|
||||
re> /c(?<=abc)x/
|
||||
data> ab\=ps
|
||||
No match
|
||||
</pre>
|
||||
If the next segment begins "cx", a match should be found, but this will only
|
||||
happen if characters from the previous segment are retained. For this reason, a
|
||||
"no match" result should be interpreted as "partial match of an empty string"
|
||||
when the pattern contains lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
4. Matching a subject string that is split into multiple segments may not
|
||||
always produce exactly the same result as matching over one single long string,
|
||||
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||
\b or \B. Another kind of difference may occur when there are multiple
|
||||
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||
is given only when there are no completed matches. This means that as soon as
|
||||
the shortest match has been found, continuation to a new subject segment is no
|
||||
longer possible. Consider this <b>pcre2test</b> example:
|
||||
<pre>
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\=ps
|
||||
0: dog
|
||||
data> do\=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\=ps,dfa,dfa_restart
|
||||
0: g
|
||||
data> dogsbody\=dfa
|
||||
0: dogsbody
|
||||
1: dog
|
||||
</pre>
|
||||
The first data line passes the string "dogsb" to a standard matching function,
|
||||
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||
the match stops when "dog" has been found, and it is not possible to continue.
|
||||
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||
matching function finds both matches.
|
||||
</P>
|
||||
<P>
|
||||
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||
multi-segment data. The example above then behaves differently:
|
||||
<pre>
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\=ph
|
||||
Partial match: dogsb
|
||||
data> do\=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\=ph,dfa,dfa_restart
|
||||
Partial match: gsb
|
||||
</pre>
|
||||
5. Patterns that contain alternatives at the top level which do not all start
|
||||
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||
used. For example, consider this pattern:
|
||||
program to do that if it needs to. This means that, for an unanchored pattern,
|
||||
if a continued match fails, it is not possible to try again at a new starting
|
||||
point. All this facility is capable of doing is continuing with the previous
|
||||
match attempt. For example, consider this pattern:
|
||||
<pre>
|
||||
1234|3789
|
||||
</pre>
|
||||
|
@ -422,30 +377,18 @@ alternative is found at offset 3. There is no partial match for the second
|
|||
alternative, because such a match does not start at the same point in the
|
||||
subject string. Attempting to continue with the string "7890" does not yield a
|
||||
match because only those alternatives that match at one point in the subject
|
||||
are remembered. The problem arises because the start of the second alternative
|
||||
matches within the first alternative. There is no problem with anchored
|
||||
patterns or patterns such as:
|
||||
<pre>
|
||||
1234|ABCD
|
||||
</pre>
|
||||
where no string can be a partial match for both alternatives. This is not a
|
||||
problem if a standard matching function is used, because the entire match has
|
||||
to be rerun each time:
|
||||
<pre>
|
||||
re> /1234|3789/
|
||||
data> ABC123\=ph
|
||||
Partial match: 123
|
||||
data> 1237890
|
||||
0: 3789
|
||||
</pre>
|
||||
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||
the entire match can also be used with the DFA matching function. Another
|
||||
possibility is to work with two buffers. If a partial match at offset <i>n</i>
|
||||
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||
the second buffer, you can then try a new match starting at offset <i>n+1</i> in
|
||||
the first buffer.
|
||||
are remembered. Depending on the application, this may or may not be what you
|
||||
want.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
If you do want to allow for starting again at the next character, one way of
|
||||
doing it is to retain some or all of the segment and try a new complete match,
|
||||
as described for <b>pcre2_match()</b> above. Another possibility is to work with
|
||||
two buffers. If a partial match at offset <i>n</i> in the first buffer is
|
||||
followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you
|
||||
can then try a new match starting at offset <i>n+1</i> in the first buffer.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -454,11 +397,11 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 December 2014
|
||||
Last updated: 04 September 2019
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -33,17 +33,18 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC18" href="#SEC18">ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS</a>
|
||||
<li><a name="TOC19" href="#SEC19">BACKREFERENCES</a>
|
||||
<li><a name="TOC20" href="#SEC20">ASSERTIONS</a>
|
||||
<li><a name="TOC21" href="#SEC21">SCRIPT RUNS</a>
|
||||
<li><a name="TOC22" href="#SEC22">CONDITIONAL GROUPS</a>
|
||||
<li><a name="TOC23" href="#SEC23">COMMENTS</a>
|
||||
<li><a name="TOC24" href="#SEC24">RECURSIVE PATTERNS</a>
|
||||
<li><a name="TOC25" href="#SEC25">GROUPS AS SUBROUTINES</a>
|
||||
<li><a name="TOC26" href="#SEC26">ONIGURUMA SUBROUTINE SYNTAX</a>
|
||||
<li><a name="TOC27" href="#SEC27">CALLOUTS</a>
|
||||
<li><a name="TOC28" href="#SEC28">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
<li><a name="TOC21" href="#SEC21">NON-ATOMIC ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">SCRIPT RUNS</a>
|
||||
<li><a name="TOC23" href="#SEC23">CONDITIONAL GROUPS</a>
|
||||
<li><a name="TOC24" href="#SEC24">COMMENTS</a>
|
||||
<li><a name="TOC25" href="#SEC25">RECURSIVE PATTERNS</a>
|
||||
<li><a name="TOC26" href="#SEC26">GROUPS AS SUBROUTINES</a>
|
||||
<li><a name="TOC27" href="#SEC27">ONIGURUMA SUBROUTINE SYNTAX</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC30" href="#SEC30">SEE ALSO</a>
|
||||
<li><a name="TOC31" href="#SEC31">AUTHOR</a>
|
||||
<li><a name="TOC32" href="#SEC32">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION DETAILS</a><br>
|
||||
<P>
|
||||
|
@ -91,10 +92,11 @@ single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
|||
specified for the 32-bit library, in which case it constrains the character
|
||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||
built to include Unicode support (which is the default). When using UTF strings
|
||||
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||
mentioned in several places below. There is also a summary of features in the
|
||||
you must either call the compiling function with one or both of the PCRE2_UTF
|
||||
or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special
|
||||
sequence (*UTF), which is equivalent to setting the relevant PCRE2_UTF. How
|
||||
setting a UTF mode affects pattern matching is mentioned in several places
|
||||
below. There is also a summary of features in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
|
@ -112,7 +114,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
|
|||
This has the same effect as setting the PCRE2_UCP option: it causes sequences
|
||||
such as \d and \w to use Unicode properties to determine character types,
|
||||
instead of recognizing only characters with codes less than 256 via a lookup
|
||||
table.
|
||||
table. If also causes upper/lower casing operations to use Unicode properties
|
||||
for characters with code points greater than 127, even when UTF is not set.
|
||||
</P>
|
||||
<P>
|
||||
Some applications that allow their users to supply patterns may wish to
|
||||
|
@ -286,8 +289,11 @@ corresponding characters in the subject. As a trivial example, the pattern
|
|||
The quick brown fox
|
||||
</pre>
|
||||
matches a portion of a subject string that is identical to itself. When
|
||||
caseless matching is specified (the PCRE2_CASELESS option), letters are matched
|
||||
independently of case.
|
||||
caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
|
||||
pattern), letters are matched independently of case. Note that there are two
|
||||
ASCII characters, K and S, that, in addition to their lower case ASCII
|
||||
equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
|
||||
(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
</P>
|
||||
<P>
|
||||
The power of regular expressions comes from the ability to include wild cards,
|
||||
|
@ -323,6 +329,20 @@ a character class the only metacharacters are:
|
|||
[ POSIX character class (if followed by POSIX syntax)
|
||||
] terminates the character class
|
||||
</pre>
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern, other than in a character class, and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or a # character as
|
||||
part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
|
||||
applies, but in addition unescaped space and horizontal tab characters are
|
||||
ignored inside a character class. Note: only these two characters are ignored,
|
||||
not the full set of pattern white space characters that are ignored outside a
|
||||
character class. Option settings can be changed within a pattern; see the
|
||||
section entitled
|
||||
<a href="#internaloptions">"Internal Option Setting"</a>
|
||||
below.
|
||||
</P>
|
||||
<P>
|
||||
The following sections describe the use of each of the metacharacters.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">BACKSLASH</a><br>
|
||||
|
@ -340,16 +360,9 @@ precede a non-alphanumeric with backslash to specify that it stands for itself.
|
|||
In particular, if you want to match a backslash, you write \\.
|
||||
</P>
|
||||
<P>
|
||||
In a UTF mode, only ASCII digits and letters have any special meaning after a
|
||||
backslash. All other characters (in particular, those whose code points are
|
||||
greater than 127) are treated as literals.
|
||||
</P>
|
||||
<P>
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern (other than in a character class), and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or # character as part
|
||||
of the pattern.
|
||||
Only ASCII digits and letters have any special meaning after a backslash. All
|
||||
other characters (in particular, those whose code points are greater than 127)
|
||||
are treated as literals.
|
||||
</P>
|
||||
<P>
|
||||
If you want to treat all characters in a sequence as literals, you can do so by
|
||||
|
@ -428,11 +441,11 @@ There may be any number of hexadecimal digits. This syntax is from ECMAScript
|
|||
6.
|
||||
</P>
|
||||
<P>
|
||||
The \N{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
|
||||
is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
|
||||
\N{name} to specify characters by Unicode name; PCRE2 does not support this.
|
||||
Note that when \N is not followed by an opening brace (curly bracket) it has
|
||||
an entirely different meaning, matching any character that is not a newline.
|
||||
The \N{U+hhh..} escape sequence is recognized only when PCRE2 is operating in
|
||||
UTF mode. Perl also uses \N{name} to specify characters by Unicode name; PCRE2
|
||||
does not support this. Note that when \N is not followed by an opening brace
|
||||
(curly bracket) it has an entirely different meaning, matching any character
|
||||
that is not a newline.
|
||||
</P>
|
||||
<P>
|
||||
There are some legacy applications where the escape sequence \r is expected to
|
||||
|
@ -521,7 +534,7 @@ for themselves. For example, outside a character class:
|
|||
\0113 is a tab followed by the character "3"
|
||||
\113 might be a backreference, otherwise the character with octal code 113
|
||||
\377 might be a backreference, otherwise the value 255 (decimal)
|
||||
\81 is always a backreference .sp
|
||||
\81 is always a backreference
|
||||
</pre>
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
must not be introduced by a leading zero, because no more than three octal
|
||||
|
@ -732,7 +745,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
<P>
|
||||
It is possible to restrict \R to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -763,186 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
</P>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
</P>
|
||||
<P>
|
||||
The extra escape sequences that provide property support are:
|
||||
<pre>
|
||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||
\X a Unicode extended grapheme cluster
|
||||
</pre>
|
||||
The property names represented by <i>xx</i> above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
<a href="#extraprops">next section).</a>
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \P{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
The property names represented by <i>xx</i> above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
<a href="#extraprops">below).</a>
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \P{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
</P>
|
||||
<br><b>
|
||||
Script properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\p{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and a equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
</P>
|
||||
<P>
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
<pre>
|
||||
\p{Greek}
|
||||
\P{Han}
|
||||
</pre>
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The general category property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
|
@ -1004,9 +893,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
</pre>
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
</P>
|
||||
<P>
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
|
@ -1033,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For
|
|||
example, \p{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
</P>
|
||||
<br><b>
|
||||
Binary (yes/no) properties for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \d and \w do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
The Bidi_Class property for \p and \P
|
||||
</b><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
</pre>
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
</P>
|
||||
<br><b>
|
||||
Extended grapheme clusters
|
||||
|
@ -1069,7 +1000,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
</P>
|
||||
<P>
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
|
@ -1154,8 +1085,11 @@ For example, when the pattern
|
|||
matches "foobar", the first substring is still set to "foo".
|
||||
</P>
|
||||
<P>
|
||||
Perl documents that the use of \K within assertions is "not well defined". In
|
||||
PCRE2, \K is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \K in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
<b>pcre2_compile()</b> to re-enable the previous behaviour. When this option is
|
||||
set, \K is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\K)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \K in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1312,15 +1246,17 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
<P>
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
above).
|
||||
</P>
|
||||
<P>
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
|
||||
of CR of LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
</P>
|
||||
<P>
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
|
@ -1360,7 +1296,7 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used).
|
||||
</P>
|
||||
<P>
|
||||
An application can lock out the use of \C by setting the
|
||||
|
@ -1432,7 +1368,10 @@ Characters in a class may be specified by their code points using \o, \x, or
|
|||
\N{U+hh..} in the usual way. When caseless matching is set, any letters in a
|
||||
class represent both their upper case and lower case versions, so for example,
|
||||
a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
||||
match "A", whereas a caseful version would.
|
||||
match "A", whereas a caseful version would. Note that there are two ASCII
|
||||
characters, K and S, that, in addition to their lower case ASCII equivalents,
|
||||
are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
|
||||
respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
</P>
|
||||
<P>
|
||||
Characters that might indicate line breaks are never treated in any special way
|
||||
|
@ -1644,7 +1583,7 @@ that succeeds is used. If the alternatives are within a group
|
|||
<a href="#group">(defined below),</a>
|
||||
"succeeds" means matching the rest of the main pattern as well as the
|
||||
alternative in the group.
|
||||
</P>
|
||||
<a name="internaloptions"></a></P>
|
||||
<br><a name="SEC13" href="#TOC1">INTERNAL OPTION SETTING</a><br>
|
||||
<P>
|
||||
The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
|
||||
|
@ -1895,12 +1834,19 @@ are permitted for groups with the same number, for example:
|
|||
(?|(?<AA>aa)|(?<AA>bb))
|
||||
</pre>
|
||||
The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
|
||||
option at compile time, or by the use of (?J) within the pattern. Duplicate
|
||||
names can be useful for patterns where only one instance of the named capture
|
||||
group can match. Suppose you want to match the name of a weekday, either as a
|
||||
3-letter abbreviation or as the full name, and in both cases you want to
|
||||
extract the abbreviation. This pattern (ignoring the line breaks) does the job:
|
||||
option at compile time, or by the use of (?J) within the pattern, as described
|
||||
in the section entitled
|
||||
<a href="#internaloptions">"Internal Option Setting"</a>
|
||||
above.
|
||||
</P>
|
||||
<P>
|
||||
Duplicate names can be useful for patterns where only one instance of the named
|
||||
capture group can match. Suppose you want to match the name of a weekday,
|
||||
either as a 3-letter abbreviation or as the full name, and in both cases you
|
||||
want to extract the abbreviation. This pattern (ignoring the line breaks) does
|
||||
the job:
|
||||
<pre>
|
||||
(?J)
|
||||
(?<DN>Mon|Fri|Sun)(?:day)?|
|
||||
(?<DN>Tue)(?:sday)?|
|
||||
(?<DN>Wed)(?:nesday)?|
|
||||
|
@ -1921,7 +1867,7 @@ they appear in the overall pattern. The first one that is set is used for the
|
|||
reference. For example, this pattern matches both "foofoo" and "barbar" but not
|
||||
"foobar" or "barfoo":
|
||||
<pre>
|
||||
(?:(?<n>foo)|(?<n>bar))\k<n>
|
||||
(?J)(?:(?<n>foo)|(?<n>bar))\k<n>
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
|
@ -1955,7 +1901,7 @@ items:
|
|||
an escape such as \d or \pL that matches a single character
|
||||
a character class
|
||||
a backreference
|
||||
a parenthesized group (including most assertions)
|
||||
a parenthesized group (including lookaround assertions)
|
||||
a subroutine call (recursive or otherwise)
|
||||
</pre>
|
||||
The general repetition quantifier specifies a minimum and maximum number of
|
||||
|
@ -2013,8 +1959,10 @@ no characters with a quantifier that has no upper limit, for example:
|
|||
</pre>
|
||||
Earlier versions of Perl and PCRE1 used to give an error at compile time for
|
||||
such patterns. However, because there are cases where this can be useful, such
|
||||
patterns are now accepted, but if any repetition of the group does in fact
|
||||
match no characters, the loop is forcibly broken.
|
||||
patterns are now accepted, but whenever an iteration of such a group matches no
|
||||
characters, matching moves on to the next item in the pattern instead of
|
||||
repeatedly matching an empty string. This does not prevent backtracking into
|
||||
any of the iterations if a subsequent item fails to match.
|
||||
</P>
|
||||
<P>
|
||||
By default, quantifiers are "greedy", that is, they match as much as possible
|
||||
|
@ -2139,10 +2087,10 @@ be easier to remember:
|
|||
<pre>
|
||||
(*atomic:\d+)foo
|
||||
</pre>
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
</P>
|
||||
<P>
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
|
@ -2341,11 +2289,11 @@ using alternation, as in the example above, or by a quantifier with a minimum
|
|||
of zero.
|
||||
</P>
|
||||
<P>
|
||||
Backreferences of this type cause the group that they reference to be treated
|
||||
as an
|
||||
For versions of PCRE2 less than 10.25, backreferences of this type used to
|
||||
cause the group that they reference to be treated as an
|
||||
<a href="#atomicgroup">atomic group.</a>
|
||||
Once the whole group has been matched, a subsequent matching failure cannot
|
||||
cause backtracking into the middle of the group.
|
||||
This restriction no longer applies, and backtracking into such groups can occur
|
||||
as normal.
|
||||
<a name="bigassertions"></a></P>
|
||||
<br><a name="SEC20" href="#TOC1">ASSERTIONS</a><br>
|
||||
<P>
|
||||
|
@ -2361,10 +2309,18 @@ those that look behind it, and in each case an assertion may be positive (must
|
|||
match for the assertion to be true) or negative (must not match for the
|
||||
assertion to be true). An assertion group is matched in the normal way,
|
||||
and if it is true, matching continues after it, but with the matching position
|
||||
in the subject string is was it was before the assertion was processed.
|
||||
in the subject string reset to what it was before the assertion was processed.
|
||||
</P>
|
||||
<P>
|
||||
A lookaround assertion may also appear as the condition in a
|
||||
The Perl-compatible lookaround assertions are atomic. If an assertion is true,
|
||||
but there is a subsequent matching failure, there is no backtracking into the
|
||||
assertion. However, there are some cases where non-atomic assertions can be
|
||||
useful. PCRE2 has some support for these, described in the section entitled
|
||||
<a href="#nonatomicassertions">"Non-atomic assertions"</a>
|
||||
below, but they are not Perl-compatible.
|
||||
</P>
|
||||
<P>
|
||||
A lookaround assertion may appear as the condition in a
|
||||
<a href="#conditions">conditional group</a>
|
||||
(see below). In this case, the result of matching the assertion determines
|
||||
which branch of the condition is followed.
|
||||
|
@ -2397,36 +2353,23 @@ control passes to the previous backtracking point, thus discarding any captured
|
|||
strings within the assertion.
|
||||
</P>
|
||||
<P>
|
||||
For compatibility with Perl, most assertion groups may be repeated; though it
|
||||
makes no sense to assert the same thing several times, the side effect of
|
||||
capturing may occasionally be useful. However, an assertion that forms the
|
||||
condition for a conditional group may not be quantified. In practice, for
|
||||
other assertions, there only three cases:
|
||||
<br>
|
||||
<br>
|
||||
(1) If the quantifier is {0}, the assertion is never obeyed during matching.
|
||||
However, it may contain internal capture groups that are called from elsewhere
|
||||
via the
|
||||
<a href="#groupsassubroutines">subroutine mechanism.</a>
|
||||
<br>
|
||||
<br>
|
||||
(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
|
||||
were {0,1}. At run time, the rest of the pattern match is tried with and
|
||||
without the assertion, the order depending on the greediness of the quantifier.
|
||||
<br>
|
||||
<br>
|
||||
(3) If the minimum repetition is greater than zero, the quantifier is ignored.
|
||||
The assertion is obeyed just once when encountered during matching.
|
||||
Most assertion groups may be repeated; though it makes no sense to assert the
|
||||
same thing several times, the side effect of capturing in positive assertions
|
||||
may occasionally be useful. However, an assertion that forms the condition for
|
||||
a conditional group may not be quantified. PCRE2 used to restrict the
|
||||
repetition of assertions, but from release 10.35 the only restriction is that
|
||||
an unlimited maximum repetition is changed to be one more than the minimum. For
|
||||
example, {3,} is treated as {3,4}.
|
||||
</P>
|
||||
<br><b>
|
||||
Alphabetic assertion names
|
||||
</b><br>
|
||||
<P>
|
||||
Traditionally, symbolic sequences such as (?= and (?<= have been used to specify
|
||||
lookaround assertions. Perl 5.28 introduced some experimental alphabetic
|
||||
alternatives which might be easier to remember. They all start with (* instead
|
||||
of (? and must be written using lower case letters. PCRE2 supports the
|
||||
following synonyms:
|
||||
Traditionally, symbolic sequences such as (?= and (?<= have been used to
|
||||
specify lookaround assertions. Perl 5.28 introduced some experimental
|
||||
alphabetic alternatives which might be easier to remember. They all start with
|
||||
(* instead of (? and must be written using lower case letters. PCRE2 supports
|
||||
the following synonyms:
|
||||
<pre>
|
||||
(*positive_lookahead: or (*pla: is the same as (?=
|
||||
(*negative_lookahead: or (*nla: is the same as (?!
|
||||
|
@ -2599,8 +2542,69 @@ preceded by "foo", while
|
|||
</pre>
|
||||
is another pattern that matches "foo" preceded by three digits and any three
|
||||
characters that are not "999".
|
||||
<a name="nonatomicassertions"></a></P>
|
||||
<br><a name="SEC21" href="#TOC1">NON-ATOMIC ASSERTIONS</a><br>
|
||||
<P>
|
||||
The traditional Perl-compatible lookaround assertions are atomic. That is, if
|
||||
an assertion is true, but there is a subsequent matching failure, there is no
|
||||
backtracking into the assertion. However, there are some cases where non-atomic
|
||||
positive assertions can be useful. PCRE2 provides these using the following
|
||||
syntax:
|
||||
<pre>
|
||||
(*non_atomic_positive_lookahead: or (*napla: or (?*
|
||||
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
|
||||
</pre>
|
||||
Consider the problem of finding the right-most word in a string that also
|
||||
appears earlier in the string, that is, it must appear at least twice in total.
|
||||
This pattern returns the required result as captured substring 1:
|
||||
<pre>
|
||||
^(?x)(*napla: .* \b(\w++)) (?> .*? \b\1\b ){2}
|
||||
</pre>
|
||||
For a subject such as "word1 word2 word3 word2 word3 word4" the result is
|
||||
"word3". How does it work? At the start, ^(?x) anchors the pattern and sets the
|
||||
"x" option, which causes white space (introduced for readability) to be
|
||||
ignored. Inside the assertion, the greedy .* at first consumes the entire
|
||||
string, but then has to backtrack until the rest of the assertion can match a
|
||||
word, which is captured by group 1. In other words, when the assertion first
|
||||
succeeds, it captures the right-most word in the string.
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
The current matching point is then reset to the start of the subject, and the
|
||||
rest of the pattern match checks for two occurrences of the captured word,
|
||||
using an ungreedy .*? to scan from the left. If this succeeds, we are done, but
|
||||
if the last word in the string does not occur twice, this part of the pattern
|
||||
fails. If a traditional atomic lookhead (?= or (*pla: had been used, the
|
||||
assertion could not be re-entered, and the whole match would fail. The pattern
|
||||
would succeed only if the very last word in the subject was found twice.
|
||||
</P>
|
||||
<P>
|
||||
Using a non-atomic lookahead, however, means that when the last word does not
|
||||
occur twice in the string, the lookahead can backtrack and find the second-last
|
||||
word, and so on, until either the match succeeds, or all words have been
|
||||
tested.
|
||||
</P>
|
||||
<P>
|
||||
Two conditions must be met for a non-atomic assertion to be useful: the
|
||||
contents of one or more capturing groups must change after a backtrack into the
|
||||
assertion, and there must be a backreference to a changed group later in the
|
||||
pattern. If this is not the case, the rest of the pattern match fails exactly
|
||||
as before because nothing has changed, so using a non-atomic assertion just
|
||||
wastes resources.
|
||||
</P>
|
||||
<P>
|
||||
There is one exception to backtracking into a non-atomic assertion. If an
|
||||
(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
|
||||
is, a subsequent match failure cannot backtrack into the assertion.
|
||||
</P>
|
||||
<P>
|
||||
Non-atomic assertions are not supported by the alternative matching function
|
||||
<b>pcre2_dfa_match()</b>. They are supported by JIT, but only if they do not
|
||||
contain any control verbs such as (*ACCEPT). (This may change in future). Note
|
||||
that assertions that appear as conditions for
|
||||
<a href="#conditions">conditional groups</a>
|
||||
(see below) must be atomic.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
In concept, a script run is a sequence of characters that are all from the same
|
||||
Unicode script such as Latin or Greek. However, because some scripts are
|
||||
|
@ -2662,7 +2666,7 @@ parentheses.
|
|||
should not be used within a script run group, because it causes an immediate
|
||||
exit from the group, bypassing the script run checking.
|
||||
<a name="conditions"></a></P>
|
||||
<br><a name="SEC22" href="#TOC1">CONDITIONAL GROUPS</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">CONDITIONAL GROUPS</a><br>
|
||||
<P>
|
||||
It is possible to cause the matching process to obey a pattern fragment
|
||||
conditionally or to choose between two alternative fragments, depending on
|
||||
|
@ -2807,7 +2811,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
||||
\b (?&byte) (\.(?&byte)){3} \b
|
||||
</pre>
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -2838,8 +2842,13 @@ Assertion conditions
|
|||
<P>
|
||||
If the condition is not in any of the above formats, it must be a parenthesized
|
||||
assertion. This may be a positive or negative lookahead or lookbehind
|
||||
assertion. Consider this pattern, again containing non-significant white space,
|
||||
and with the two alternatives on the second line:
|
||||
assertion. However, it must be a traditional atomic assertion, not one of the
|
||||
PCRE2-specific
|
||||
<a href="#nonatomicassertions">non-atomic assertions.</a>
|
||||
</P>
|
||||
<P>
|
||||
Consider this pattern, again containing non-significant white space, and with
|
||||
the two alternatives on the second line:
|
||||
<pre>
|
||||
(?(?=[^a-z]*[a-z])
|
||||
\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
|
||||
|
@ -2858,7 +2867,7 @@ positive and negative assertions, because matching always continues after the
|
|||
assertion, whether it succeeds or fails. (Compare non-conditional assertions,
|
||||
for which captures are retained only for positive assertions that succeed.)
|
||||
<a name="comments"></a></P>
|
||||
<br><a name="SEC23" href="#TOC1">COMMENTS</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">COMMENTS</a><br>
|
||||
<P>
|
||||
There are two ways of including comments in patterns that are processed by
|
||||
PCRE2. In both cases, the start of the comment must not be in a character
|
||||
|
@ -2888,7 +2897,7 @@ a newline in the pattern. The sequence \n is still literal at this stage, so
|
|||
it does not terminate the comment. Only an actual character with the code value
|
||||
0x0a (the default newline) does so.
|
||||
<a name="recursion"></a></P>
|
||||
<br><a name="SEC24" href="#TOC1">RECURSIVE PATTERNS</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">RECURSIVE PATTERNS</a><br>
|
||||
<P>
|
||||
Consider the problem of matching a string in parentheses, allowing for
|
||||
unlimited nested parentheses. Without the use of recursion, the best that can
|
||||
|
@ -3076,7 +3085,7 @@ alternative matches "a" and then recurses. In the recursion, \1 does now match
|
|||
"b" and so the whole match succeeds. This match used to fail in Perl, but in
|
||||
later versions (I tried 5.024) it now works.
|
||||
<a name="groupsassubroutines"></a></P>
|
||||
<br><a name="SEC25" href="#TOC1">GROUPS AS SUBROUTINES</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">GROUPS AS SUBROUTINES</a><br>
|
||||
<P>
|
||||
If the syntax for a recursive group call (either by number or by name) is used
|
||||
outside the parentheses to which it refers, it operates a bit like a subroutine
|
||||
|
@ -3124,7 +3133,7 @@ in groups when called as subroutines is described in the section entitled
|
|||
<a href="#btsub">"Backtracking verbs in subroutines"</a>
|
||||
below.
|
||||
<a name="onigurumasubroutines"></a></P>
|
||||
<br><a name="SEC26" href="#TOC1">ONIGURUMA SUBROUTINE SYNTAX</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">ONIGURUMA SUBROUTINE SYNTAX</a><br>
|
||||
<P>
|
||||
For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or
|
||||
a number enclosed either in angle brackets or single quotes, is an alternative
|
||||
|
@ -3142,7 +3151,7 @@ plus or a minus sign it is taken as a relative reference. For example:
|
|||
Note that \g{...} (Perl syntax) and \g<...> (Oniguruma syntax) are <i>not</i>
|
||||
synonymous. The former is a backreference; the latter is a subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
Perl has a feature whereby using the sequence (?{...}) causes arbitrary Perl
|
||||
code to be obeyed in the middle of matching a regular expression. This makes it
|
||||
|
@ -3218,13 +3227,13 @@ example:
|
|||
</pre>
|
||||
The doubling is removed before the string is passed to the callout function.
|
||||
<a name="backtrackcontrol"></a></P>
|
||||
<br><a name="SEC28" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
There are a number of special "Backtracking Control Verbs" (to use Perl's
|
||||
terminology) that modify the behaviour of backtracking during matching. They
|
||||
are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form,
|
||||
possibly behaving differently depending on whether or not a name is present.
|
||||
The names are not required to be unique within the pattern.
|
||||
and may behave differently depending on whether or not a name argument is
|
||||
present. The names are not required to be unique within the pattern.
|
||||
</P>
|
||||
<P>
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
|
@ -3252,7 +3261,8 @@ PCRE2_ALT_VERBNAMES is also set.
|
|||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
not there. Any number of these verbs may occur in a pattern. Except for
|
||||
(*ACCEPT), they may not be quantified.
|
||||
</P>
|
||||
<P>
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
|
@ -3315,6 +3325,19 @@ This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
|
|||
the outer parentheses.
|
||||
</P>
|
||||
<P>
|
||||
(*ACCEPT) is the only backtracking verb that is allowed to be quantified
|
||||
because an ungreedy quantification with a minimum of zero acts only when a
|
||||
backtrack happens. Consider, for example,
|
||||
<pre>
|
||||
(A(*ACCEPT)??B)C
|
||||
</pre>
|
||||
where A, B, and C may be complex expressions. After matching "A", the matcher
|
||||
processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and
|
||||
the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT)
|
||||
(see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means
|
||||
"succeed on backtrack".
|
||||
</P>
|
||||
<P>
|
||||
<b>Warning:</b> (*ACCEPT) should not be used within a script run group, because
|
||||
it causes an immediate exit from the group, bypassing the script run checking.
|
||||
<pre>
|
||||
|
@ -3332,8 +3355,9 @@ A match with the string "aaaa" always fails, but the callout is taken before
|
|||
each backtrack happens (in this example, 10 times).
|
||||
</P>
|
||||
<P>
|
||||
(*ACCEPT:NAME) and (*FAIL:NAME) are treated as (*MARK:NAME)(*ACCEPT) and
|
||||
(*MARK:NAME)(*FAIL), respectively.
|
||||
(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and
|
||||
(*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before
|
||||
the verb acts.
|
||||
</P>
|
||||
<br><b>
|
||||
Recording which path was taken
|
||||
|
@ -3497,10 +3521,16 @@ successful match if there is a later mismatch. Consider:
|
|||
</pre>
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
</P>
|
||||
<P>
|
||||
If (*SKIP) is used to specify a new starting position that is the same as the
|
||||
starting position of the current match, or (by being inside a lookbehind)
|
||||
earlier, the position specified by (*SKIP) is ignored, and instead the normal
|
||||
"bumpalong" occurs.
|
||||
<pre>
|
||||
(*SKIP:NAME)
|
||||
</pre>
|
||||
|
@ -3665,11 +3695,20 @@ retained in both cases.
|
|||
</P>
|
||||
<P>
|
||||
The remaining verbs act only when a later failure causes a backtrack to
|
||||
reach them. This means that their effect is confined to the assertion,
|
||||
because lookaround assertions are atomic. A backtrack that occurs after an
|
||||
assertion is complete does not jump back into the assertion. Note in particular
|
||||
that a (*MARK) name that is set in an assertion is not "seen" by an instance of
|
||||
(*SKIP:NAME) latter in the pattern.
|
||||
reach them. This means that, for the Perl-compatible assertions, their effect
|
||||
is confined to the assertion, because Perl lookaround assertions are atomic. A
|
||||
backtrack that occurs after such an assertion is complete does not jump back
|
||||
into the assertion. Note in particular that a (*MARK) name that is set in an
|
||||
assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2 now supports non-atomic positive assertions, as described in the section
|
||||
entitled
|
||||
<a href="#nonatomicassertions">"Non-atomic assertions"</a>
|
||||
above. These assertions must be standalone (not used as conditions). They are
|
||||
not Perl-compatible. For these assertions, a later backtrack does jump back
|
||||
into the assertion, and therefore verbs such as (*COMMIT) can be triggered by
|
||||
backtracks from later in the pattern.
|
||||
</P>
|
||||
<P>
|
||||
The effect of (*THEN) is not allowed to escape beyond an assertion. If there
|
||||
|
@ -3711,25 +3750,25 @@ enclosing group that has alternatives (its normal behaviour). However, if there
|
|||
is no such group within the subroutine's group, the subroutine match fails and
|
||||
there is a backtrack at the outer level.
|
||||
</P>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2api</b>(3), <b>pcre2callout</b>(3), <b>pcre2matching</b>(3),
|
||||
<b>pcre2syntax</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 12 February 2019
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -68,11 +68,14 @@ application. Because the POSIX functions call the native ones, it is also
|
|||
necessary to add <b>-lpcre2-8</b>.
|
||||
</P>
|
||||
<P>
|
||||
Although they are not defined as protypes in <b>pcre2posix.h</b>, the library
|
||||
does contain functions with the POSIX names <b>regcomp()</b> etc. These simply
|
||||
pass their arguments to the PCRE2 functions. These functions are provided for
|
||||
backwards compatibility with earlier versions of PCRE2, so that existing
|
||||
programs do not have to be recompiled.
|
||||
Although they were not defined as protypes in <b>pcre2posix.h</b>, releases
|
||||
10.33 to 10.36 of the library contained functions with the POSIX names
|
||||
<b>regcomp()</b> etc. These simply passed their arguments to the PCRE2
|
||||
functions. These functions were provided for backwards compatibility with
|
||||
earlier versions of PCRE2, which had only POSIX names. However, this has proved
|
||||
troublesome in situations where a program links with several libraries, some of
|
||||
which use PCRE2's POSIX interface while others use the real POSIX functions.
|
||||
For this reason, the POSIX names have been removed since release 10.37.
|
||||
</P>
|
||||
<P>
|
||||
Calling the header file <b>pcre2posix.h</b> avoids any conflict with other POSIX
|
||||
|
@ -344,9 +347,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 January 2019
|
||||
Last updated: 26 April 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
|
|||
<br><a name="SEC1" href="#TOC1">SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS</a><br>
|
||||
<P>
|
||||
<b>int32_t pcre2_serialize_decode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint32_t *<i>bytes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, const uint8_t *<i>bytes</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int32_t pcre2_serialize_encode(pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint32_t **<i>serialized_bytes</i>,</b>
|
||||
<b>int32_t pcre2_serialize_encode(const pcre2_code **<i>codes</i>,</b>
|
||||
<b> int32_t <i>number_of_codes</i>, uint8_t **<i>serialized_bytes</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>serialized_size</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -94,7 +94,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
<pre>
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
</pre>
|
||||
|
@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
<b>malloc()</b> and <b>free()</b> are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
<pre>
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
|
@ -19,28 +19,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC4" href="#SEC4">CHARACTER TYPES</a>
|
||||
<li><a name="TOC5" href="#SEC5">GENERAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC6" href="#SEC6">PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P</a>
|
||||
<li><a name="TOC7" href="#SEC7">SCRIPT NAMES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC9" href="#SEC9">QUANTIFIERS</a>
|
||||
<li><a name="TOC10" href="#SEC10">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC11" href="#SEC11">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC12" href="#SEC12">ALTERNATION</a>
|
||||
<li><a name="TOC13" href="#SEC13">CAPTURING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC15" href="#SEC15">COMMENT</a>
|
||||
<li><a name="TOC16" href="#SEC16">OPTION SETTING</a>
|
||||
<li><a name="TOC17" href="#SEC17">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC18" href="#SEC18">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC20" href="#SEC20">SCRIPT RUNS</a>
|
||||
<li><a name="TOC21" href="#SEC21">BACKREFERENCES</a>
|
||||
<li><a name="TOC22" href="#SEC22">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC23" href="#SEC23">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC25" href="#SEC25">CALLOUTS</a>
|
||||
<li><a name="TOC26" href="#SEC26">SEE ALSO</a>
|
||||
<li><a name="TOC27" href="#SEC27">AUTHOR</a>
|
||||
<li><a name="TOC28" href="#SEC28">REVISION</a>
|
||||
<li><a name="TOC7" href="#SEC7">BINARY PROPERTIES FOR \p AND \P</a>
|
||||
<li><a name="TOC8" href="#SEC8">SCRIPT MATCHING WITH \p AND \P</a>
|
||||
<li><a name="TOC9" href="#SEC9">THE BIDI_CLASS PROPERTY FOR \p AND \P</a>
|
||||
<li><a name="TOC10" href="#SEC10">CHARACTER CLASSES</a>
|
||||
<li><a name="TOC11" href="#SEC11">QUANTIFIERS</a>
|
||||
<li><a name="TOC12" href="#SEC12">ANCHORS AND SIMPLE ASSERTIONS</a>
|
||||
<li><a name="TOC13" href="#SEC13">REPORTED MATCH POINT SETTING</a>
|
||||
<li><a name="TOC14" href="#SEC14">ALTERNATION</a>
|
||||
<li><a name="TOC15" href="#SEC15">CAPTURING</a>
|
||||
<li><a name="TOC16" href="#SEC16">ATOMIC GROUPS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMMENT</a>
|
||||
<li><a name="TOC18" href="#SEC18">OPTION SETTING</a>
|
||||
<li><a name="TOC19" href="#SEC19">NEWLINE CONVENTION</a>
|
||||
<li><a name="TOC20" href="#SEC20">WHAT \R MATCHES</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a>
|
||||
<li><a name="TOC22" href="#SEC22">NON-ATOMIC LOOKAROUND ASSERTIONS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SCRIPT RUNS</a>
|
||||
<li><a name="TOC24" href="#SEC24">BACKREFERENCES</a>
|
||||
<li><a name="TOC25" href="#SEC25">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a>
|
||||
<li><a name="TOC26" href="#SEC26">CONDITIONAL PATTERNS</a>
|
||||
<li><a name="TOC27" href="#SEC27">BACKTRACKING CONTROL</a>
|
||||
<li><a name="TOC28" href="#SEC28">CALLOUTS</a>
|
||||
<li><a name="TOC29" href="#SEC29">SEE ALSO</a>
|
||||
<li><a name="TOC30" href="#SEC30">AUTHOR</a>
|
||||
<li><a name="TOC31" href="#SEC31">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY</a><br>
|
||||
<P>
|
||||
|
@ -135,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range
|
|||
sequences is changed to use Unicode properties and they match many more
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
Property descriptions in \p and \P are matched caselessly; hyphens,
|
||||
underscores, and white space are ignored, in accordance with Unicode's "loose
|
||||
matching" rules.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">GENERAL CATEGORY PROPERTIES FOR \p and \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
|
@ -151,6 +159,7 @@ characters.
|
|||
Lo Other letter
|
||||
Lt Title case letter
|
||||
Lu Upper case letter
|
||||
Lc Ll, Lu, or Lt
|
||||
L& Ll, Lu, or Lt
|
||||
|
||||
M Mark
|
||||
|
@ -197,158 +206,58 @@ characters.
|
|||
Perl and POSIX space are now the same. Perl added VT to its space character set
|
||||
at release 5.18.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<br><a name="SEC7" href="#TOC1">BINARY PROPERTIES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Vai,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\p and \P, along with their abbreviations, by running this command:
|
||||
<pre>
|
||||
pcre2test -LP
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">SCRIPT MATCHING WITH \p AND \P</a><br>
|
||||
<P>
|
||||
Many script names and their 4-letter abbreviations are recognized in
|
||||
\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of
|
||||
course). You can obtain a list of these scripts by running this command:
|
||||
<pre>
|
||||
pcre2test -LS
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">THE BIDI_CLASS PROPERTY FOR \p AND \P</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\p{Bidi_Class:<class>} matches a character with the given class
|
||||
\p{BC:<class>} matches a character with the given class
|
||||
</pre>
|
||||
The recognized classes are:
|
||||
<pre>
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">CHARACTER CLASSES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
[...] positive character class
|
||||
|
@ -376,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default,
|
|||
but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
||||
\Q...\E inside a character class.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<br><a name="SEC11" href="#TOC1">QUANTIFIERS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
? 0 or 1, greedy
|
||||
|
@ -397,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
{n,}? n or more, lazy
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<br><a name="SEC12" href="#TOC1">ANCHORS AND SIMPLE ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\b word boundary
|
||||
|
@ -415,20 +324,23 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\G first matching position in subject
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">REPORTED MATCH POINT SETTING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\K set reported start of match
|
||||
</pre>
|
||||
From release 10.38 \K is not permitted by default in lookaround assertions,
|
||||
for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option is set, the previous behaviour is re-enabled. When this option is set,
|
||||
\K is honoured in positive assertions, but ignored in negative ones.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">ALTERNATION</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">ALTERNATION</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
expr|expr|expr...
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">CAPTURING</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">CAPTURING</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(...) capture group
|
||||
|
@ -443,26 +355,26 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits;
|
|||
in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In
|
||||
both cases, a name must not start with a digit.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">ATOMIC GROUPS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?>...) atomic non-capture group
|
||||
(*atomic:...) atomic non-capture group
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">COMMENT</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMMENT</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?#....) comment (not nestable)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">OPTION SETTING</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">OPTION SETTING</a><br>
|
||||
<P>
|
||||
Changes of these options within a group are automatically cancelled at the end
|
||||
of the group.
|
||||
<pre>
|
||||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?J) allow duplicate named groups
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
|
@ -501,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The
|
|||
application can lock out the use of (*UTF) and (*UCP) by setting the
|
||||
PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">NEWLINE CONVENTION</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
settings with a similar syntax.
|
||||
|
@ -514,7 +426,7 @@ settings with a similar syntax.
|
|||
(*NUL) the NUL character (binary zero)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">WHAT \R MATCHES</a><br>
|
||||
<P>
|
||||
These are recognized only at the very start of the pattern or after option
|
||||
setting with a similar syntax.
|
||||
|
@ -523,7 +435,7 @@ setting with a similar syntax.
|
|||
(*BSR_UNICODE) any Unicode newline sequence
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC19" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">LOOKAHEAD AND LOOKBEHIND ASSERTIONS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?=...) )
|
||||
|
@ -544,7 +456,20 @@ setting with a similar syntax.
|
|||
</pre>
|
||||
Each top-level branch of a lookbehind must be of a fixed length.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">NON-ATOMIC LOOKAROUND ASSERTIONS</a><br>
|
||||
<P>
|
||||
These assertions are specific to PCRE2 and are not Perl-compatible.
|
||||
<pre>
|
||||
(?*...) )
|
||||
(*napla:...) ) synonyms
|
||||
(*non_atomic_positive_lookahead:...) )
|
||||
|
||||
(?<*...) )
|
||||
(*naplb:...) ) synonyms
|
||||
(*non_atomic_positive_lookbehind:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SCRIPT RUNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(*script_run:...) ) script run, can be backtracked into
|
||||
|
@ -554,7 +479,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
|
|||
(*asr:...) )
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">BACKREFERENCES</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
\n reference by number (can be ambiguous)
|
||||
|
@ -571,7 +496,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
|
|||
(?P=name) reference by name (Python)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?R) recurse whole pattern
|
||||
|
@ -590,7 +515,7 @@ Each top-level branch of a lookbehind must be of a fixed length.
|
|||
\g'-n' call subroutine by relative number (PCRE2 extension)
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">CONDITIONAL PATTERNS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?(condition)yes-pattern)
|
||||
|
@ -613,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
|||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the
|
||||
name is mandatory, for the others it is optional. (*SKIP) changes its behaviour
|
||||
|
@ -640,7 +565,7 @@ pattern is not anchored.
|
|||
The effect of one of these verbs in a group called as a subroutine is confined
|
||||
to the subroutine call.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">CALLOUTS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout (assumed number 0)
|
||||
|
@ -651,25 +576,25 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
|||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2api</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 February 2019
|
||||
Last updated: 12 January 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -59,12 +59,7 @@ patterns, and the subject lines specify PCRE2 function options, control how the
|
|||
subject is processed, and what output is produced.
|
||||
</P>
|
||||
<P>
|
||||
As the original fairly simple PCRE library evolved, it acquired many different
|
||||
features, and as a result, the original <b>pcretest</b> program ended up with a
|
||||
lot of options in a messy, arcane syntax for testing all the features. The
|
||||
move to the new PCRE2 API provided an opportunity to re-implement the test
|
||||
program as <b>pcre2test</b>, with a cleaner modifier syntax. Nevertheless, there
|
||||
are still many obscure modifiers, some of which are specifically designed for
|
||||
There are many obscure modifiers, some of which are specifically designed for
|
||||
use in conjunction with the test script and data files that are distributed as
|
||||
part of PCRE2. All the modifiers are documented here, some without much
|
||||
justification, but many of them are unlikely to be of use except when testing
|
||||
|
@ -83,16 +78,16 @@ to 8-bit code units for output.
|
|||
</P>
|
||||
<P>
|
||||
In the rest of this document, the names of library functions and structures
|
||||
are given in generic form, for example, <b>pcre_compile()</b>. The actual
|
||||
are given in generic form, for example, <b>pcre2_compile()</b>. The actual
|
||||
names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||
<a name="inputencoding"></a></P>
|
||||
<br><a name="SEC3" href="#TOC1">INPUT ENCODING</a><br>
|
||||
<P>
|
||||
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> library. In some
|
||||
Windows environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read, so this character should be avoided unless you really
|
||||
want that action.
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> or <b>libedit</b>
|
||||
library. In some Windows environments character 26 (hex 1A) causes an immediate
|
||||
end of file, and no further data is read, so this character should be avoided
|
||||
unless you really want that action.
|
||||
</P>
|
||||
<P>
|
||||
The input is processed using using C's string functions, so must not
|
||||
|
@ -242,19 +237,38 @@ Behave as if each pattern line has the <b>jit</b> modifier; after successful
|
|||
compilation, each pattern is passed to the just-in-time compiler, if available.
|
||||
</P>
|
||||
<P>
|
||||
<b>-jitfast</b>
|
||||
Behave as if each pattern line has the <b>jitfast</b> modifier; after
|
||||
successful compilation, each pattern is passed to the just-in-time compiler, if
|
||||
available, and each subject line is passed directly to the JIT matcher via its
|
||||
"fast path".
|
||||
</P>
|
||||
<P>
|
||||
<b>-jitverify</b>
|
||||
Behave as if each pattern line has the <b>jitverify</b> modifier; after
|
||||
successful compilation, each pattern is passed to the just-in-time compiler, if
|
||||
available, and the use of JIT is verified.
|
||||
available, and the use of JIT for matching is verified.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LM</b>
|
||||
List modifiers: write a list of available pattern and subject modifiers to the
|
||||
standard output, then exit with zero exit code. All other options are ignored.
|
||||
If both -C and -LM are present, whichever is first is recognized.
|
||||
If both -C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
\fB-pattern\fB <i>modifier-list</i>
|
||||
<b>-LP</b>
|
||||
List properties: write a list of recognized Unicode properties to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-LS</b>
|
||||
List scripts: write a list of recogized Unicode script names to the standard
|
||||
output, then exit with zero exit code. All other options are ignored. If both
|
||||
-C and any -Lx options are present, whichever is first is recognized.
|
||||
</P>
|
||||
<P>
|
||||
<b>-pattern</b> <i>modifier-list</i>
|
||||
Behave as if each pattern line contains the given modifiers.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -316,7 +330,7 @@ test data, command lines that begin with # may appear. This file format, with
|
|||
some restrictions, can also be processed by the <b>perltest.sh</b> script that
|
||||
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
||||
and Perl is the same. For a specification of <b>perltest.sh</b>, see the
|
||||
comments near its beginning.
|
||||
comments near its beginning. See also the #perltest command below.
|
||||
</P>
|
||||
<P>
|
||||
When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
|
||||
|
@ -368,6 +382,12 @@ output.
|
|||
This command is used to load a set of precompiled patterns from a file, as
|
||||
described in the section entitled "Saving and restoring compiled patterns"
|
||||
<a href="#saverestore">below.</a>
|
||||
<pre>
|
||||
#loadtables <filename>
|
||||
</pre>
|
||||
This command is used to load a set of binary character tables that can be
|
||||
accessed by the tables=3 qualifier. Such tables can be created by the
|
||||
<b>pcre2_dftables</b> program with the -b option.
|
||||
<pre>
|
||||
#newline_default [<newline-list>]
|
||||
</pre>
|
||||
|
@ -407,14 +427,20 @@ patterns. Modifiers on a pattern can change these settings.
|
|||
<pre>
|
||||
#perltest
|
||||
</pre>
|
||||
The appearance of this line causes all subsequent modifier settings to be
|
||||
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||
lines, #pattern commands, and #subject commands that set or unset "mark", no
|
||||
command lines are permitted, because they and many of the modifiers are
|
||||
specific to <b>pcre2test</b>, and should not be used in test files that are also
|
||||
processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests
|
||||
that are accidentally put in the wrong file.
|
||||
This line is used in test files that can also be processed by <b>perltest.sh</b>
|
||||
to confirm that Perl gives the same results as PCRE2. Subsequent tests are
|
||||
checked for the use of <b>pcre2test</b> features that are incompatible with the
|
||||
<b>perltest.sh</b> script.
|
||||
</P>
|
||||
<P>
|
||||
Patterns must use '/' as their delimiter, and only certain modifiers are
|
||||
supported. Comment lines, #pattern commands, and #subject commands that set or
|
||||
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
|
||||
#newline_default commands, which are needed in the relevant pcre2test files,
|
||||
are silently ignored. All other command lines are ignored, but give a warning
|
||||
message. The <b>#perltest</b> command helps detect tests that are accidentally
|
||||
put in the wrong file or use the wrong delimiter. For more details of the
|
||||
<b>perltest.sh</b> script see the comments it contains.
|
||||
<pre>
|
||||
#pop [<modifiers>]
|
||||
#popcopy [<modifiers>]
|
||||
|
@ -467,15 +493,17 @@ excluding pattern meta-characters):
|
|||
</pre>
|
||||
This is interpreted as the pattern's delimiter. A regular expression may be
|
||||
continued over several input lines, in which case the newline characters are
|
||||
included within it. It is possible to include the delimiter within the pattern
|
||||
by escaping it with a backslash, for example
|
||||
included within it. It is possible to include the delimiter as a literal within
|
||||
the pattern by escaping it with a backslash, for example
|
||||
<pre>
|
||||
/abc\/def/
|
||||
</pre>
|
||||
If you do this, the escape and the delimiter form part of the pattern, but
|
||||
since the delimiters are all non-alphanumeric, this does not affect its
|
||||
interpretation. If the terminating delimiter is immediately followed by a
|
||||
backslash, for example,
|
||||
since the delimiters are all non-alphanumeric, the inclusion of the backslash
|
||||
does not affect the pattern's interpretation. Note, however, that this trick
|
||||
does not work within \Q...\E literal bracketing because the backslash will
|
||||
itself be interpreted as a literal. If the terminating delimiter is immediately
|
||||
followed by a backslash, for example,
|
||||
<pre>
|
||||
/abc/\
|
||||
</pre>
|
||||
|
@ -493,11 +521,11 @@ A pattern can be followed by a modifier list (details below).
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">SUBJECT LINE SYNTAX</a><br>
|
||||
<P>
|
||||
Before each subject line is passed to <b>pcre2_match()</b> or
|
||||
<b>pcre2_dfa_match()</b>, leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes, unless the <b>subject_literal</b>
|
||||
modifier was set for the pattern. The following provide a means of encoding
|
||||
non-printing characters in a visible way:
|
||||
Before each subject line is passed to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b>, or <b>pcre2_jit_match()</b>, leading and trailing white
|
||||
space is removed, and the line is scanned for backslash escapes, unless the
|
||||
<b>subject_literal</b> modifier was set for the pattern. The following provide a
|
||||
means of encoding non-printing characters in a visible way:
|
||||
<pre>
|
||||
\a alarm (BEL, \x07)
|
||||
\b backspace (\x08)
|
||||
|
@ -594,6 +622,7 @@ way <b>pcre2_compile()</b> behaves. See
|
|||
for a description of the effects of these options.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
|
@ -613,6 +642,7 @@ for a description of the effects of these options.
|
|||
firstline set PCRE2_FIRSTLINE
|
||||
literal set PCRE2_LITERAL
|
||||
match_line set PCRE2_EXTRA_MATCH_LINE
|
||||
match_invalid_utf set PCRE2_MATCH_INVALID_UTF
|
||||
match_unset_backref set PCRE2_MATCH_UNSET_BACKREF
|
||||
match_word set PCRE2_EXTRA_MATCH_WORD
|
||||
/m multiline set PCRE2_MULTILINE
|
||||
|
@ -671,7 +701,7 @@ heavily used in the test files.
|
|||
pushcopy push a copy onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
subject_literal treat all subject lines as literal
|
||||
tables=[0|1|2] select internal tables
|
||||
tables=[0|1|2|3] select internal tables
|
||||
use_length do not zero-terminate the pattern
|
||||
utf8_input treat input as UTF-8
|
||||
</pre>
|
||||
|
@ -738,7 +768,9 @@ options, the line is omitted. "First code unit" is where any match must start;
|
|||
if there is more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match. This is
|
||||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
ending code units are recorded. The subject length line is omitted when
|
||||
<b>no_start_optimize</b> is set because the minimum length is not calculated
|
||||
when it can never be used.
|
||||
</P>
|
||||
<P>
|
||||
The <b>framesize</b> modifier shows the size, in bytes, of the storage frames
|
||||
|
@ -1017,18 +1049,20 @@ Using alternative character tables
|
|||
</b><br>
|
||||
<P>
|
||||
The value specified for the <b>tables</b> modifier must be one of the digits 0,
|
||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||
<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
|
||||
different character tables. The digit specifies the tables as follows:
|
||||
1, 2, or 3. It causes a specific set of built-in character tables to be passed
|
||||
to <b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour
|
||||
with different character tables. The digit specifies the tables as follows:
|
||||
<pre>
|
||||
0 do not pass any special character tables
|
||||
1 the default ASCII tables, as distributed in
|
||||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
3 a set of tables loaded by the #loadtables command
|
||||
</pre>
|
||||
In table 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Setting alternate character tables and a locale
|
||||
are mutually exclusive.
|
||||
In tables 2, some characters whose codes are greater than 128 are identified as
|
||||
letters, digits, spaces, etc. Tables 3 can be used only after a
|
||||
<b>#loadtables</b> command has loaded them from a binary file. Setting alternate
|
||||
character tables and a locale are mutually exclusive.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting certain match controls
|
||||
|
@ -1040,24 +1074,27 @@ modifier list, in which case they are applied to every subject line that is
|
|||
processed with that pattern. These modifiers do not affect the compilation
|
||||
process.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution <n>
|
||||
substitute_stop=<n> skip substitution <n> and following
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
</pre>
|
||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||
defaults, set them in a <b>#subject</b> command.
|
||||
|
@ -1186,7 +1223,7 @@ Setting match controls
|
|||
The following modifiers affect the matching process or request additional
|
||||
information. Some of them may also be specified on a pattern line (see above),
|
||||
in which case they apply to every subject line that is matched against that
|
||||
pattern.
|
||||
pattern, but can be overridden by modifiers on the subject.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
|
@ -1204,7 +1241,8 @@ pattern.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1214,6 +1252,8 @@ pattern.
|
|||
match_limit=<n> set a match limit
|
||||
memory show heap memory usage
|
||||
null_context match with a NULL context
|
||||
null_replacement substitute with NULL replacement
|
||||
null_subject match with NULL subject
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -1223,8 +1263,11 @@ pattern.
|
|||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extedded use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_literal use PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched use PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only use PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_skip=<n> skip substitution number n
|
||||
substitute_stop=<n> skip substitution number n and greater
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
@ -1249,22 +1292,27 @@ following line with a plus character following the capture number.
|
|||
</P>
|
||||
<P>
|
||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown. This
|
||||
feature is not supported for JIT matching, and if requested with JIT it is
|
||||
ignored (with a warning message). Setting this modifier affects the output if
|
||||
there is a lookbehind at the start of a match, or a lookahead at the end, or if
|
||||
\K is used in the pattern. Characters that precede or follow the start and end
|
||||
of the actual match are indicated in the output by '<' or '>' characters
|
||||
underneath them. Here is an example:
|
||||
during a successful pattern match by the interpreter should be shown, for both
|
||||
full and partial matches. This feature is not supported for JIT matching, and
|
||||
if requested with JIT it is ignored (with a warning message). Setting this
|
||||
modifier affects the output if there is a lookbehind at the start of a match,
|
||||
or, for a complete match, a lookahead at the end, or if \K is used in the
|
||||
pattern. Characters that precede or follow the start and end of the actual
|
||||
match are indicated in the output by '<' or '>' characters underneath them.
|
||||
Here is an example:
|
||||
<pre>
|
||||
re> /(?<=pqr)abc(?=xyz)/
|
||||
data> 123pqrabcxyz456\=allusedtext
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
data> 123pqrabcxy\=ph,allusedtext
|
||||
Partial match: pqrabcxy
|
||||
<<<
|
||||
</pre>
|
||||
This shows that the matched string is "abc", with the preceding and following
|
||||
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||
the assertions).
|
||||
The first, complete match shows that the matched string is "abc", with the
|
||||
preceding and following strings "pqr" and "xyz" having been consulted during
|
||||
the match (when processing the assertions). The partial match can indicate only
|
||||
the preceding string.
|
||||
</P>
|
||||
<P>
|
||||
The <b>startchar</b> modifier requests that the starting character for the match
|
||||
|
@ -1380,9 +1428,15 @@ Testing the substitution function
|
|||
</b><br>
|
||||
<P>
|
||||
If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
|
||||
called instead of one of the matching functions. Note that replacement strings
|
||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||
not thought to be an issue in a test program.
|
||||
called instead of one of the matching functions (or after one call of
|
||||
<b>pcre2_match()</b> in the case of PCRE2_SUBSTITUTE_MATCHED). Note that
|
||||
replacement strings cannot contain commas, because a comma signifies the end of
|
||||
a modifier. This is not thought to be an issue in a test program.
|
||||
</P>
|
||||
<P>
|
||||
Specifying a completely empty replacement string disables this modifier.
|
||||
However, it is possible to specify an empty replacement by providing a buffer
|
||||
length, as described below, for an otherwise empty replacement.
|
||||
</P>
|
||||
<P>
|
||||
Unlike subject strings, <b>pcre2test</b> does not process replacement strings
|
||||
|
@ -1398,11 +1452,16 @@ for <b>pcre2_substitute()</b>:
|
|||
<pre>
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_literal PCRE2_SUBSTITUTE_LITERAL
|
||||
substitute_matched PCRE2_SUBSTITUTE_MATCHED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_replacement_only PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
</PRE>
|
||||
</pre>
|
||||
See the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation for details of these options.
|
||||
</P>
|
||||
<P>
|
||||
After a successful substitution, the modified string is output, preceded by the
|
||||
|
@ -1506,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1516,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1545,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1565,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace that the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1624,7 +1683,7 @@ When testing <b>pcre2_substitute()</b>, this modifier also has the effect of
|
|||
passing the replacement string as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Passing a NULL context
|
||||
Passing a NULL context, subject, or replacement
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
|
@ -1632,7 +1691,13 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
<b>null_replacement</b> modifier is set, the subject or replacement string
|
||||
pointers are passed as NULL, respectively, to the relevant functions.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
|
@ -1779,7 +1844,7 @@ restart the match with additional subject data by means of the
|
|||
<b>dfa_restart</b> modifier. For example:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 23ja\=P,dfa
|
||||
data> 23ja\=ps,dfa
|
||||
Partial match: 23ja
|
||||
data> n05\=dfa,dfa_restart
|
||||
0: n05
|
||||
|
@ -2071,16 +2136,16 @@ on the stack.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 March 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -16,22 +16,33 @@ please consult the man page, in case the conversion went wrong.
|
|||
UNICODE AND UTF SUPPORT
|
||||
</b><br>
|
||||
<P>
|
||||
When PCRE2 is built with Unicode support (which is the default), it has
|
||||
knowledge of Unicode character properties and can process text strings in
|
||||
UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
|
||||
default, PCRE2 assumes that one code unit is one character. To process a
|
||||
pattern as a UTF string, where a character may require more than one code unit,
|
||||
you must call
|
||||
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
||||
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||
(*UTF). When either of these is the case, both the pattern and any subject
|
||||
strings that are matched against it are treated as UTF strings instead of
|
||||
strings of individual one-code-unit characters. There are also some other
|
||||
changes to the way characters are handled, as documented below.
|
||||
PCRE2 is normally built with Unicode support, though if you do not need it, you
|
||||
can build it without, in which case the library will be smaller. With Unicode
|
||||
support, PCRE2 has knowledge of Unicode character properties and can process
|
||||
strings of text in UTF-8, UTF-16, and UTF-32 format (depending on the code unit
|
||||
width), but this is not the default. Unless specifically requested, PCRE2
|
||||
treats each code unit in a string as one character.
|
||||
</P>
|
||||
<P>
|
||||
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||
case the library will be smaller.
|
||||
There are two ways of telling PCRE2 to switch to UTF mode, where characters may
|
||||
consist of more than one code unit and the range of values is constrained. The
|
||||
program can call
|
||||
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
||||
with the PCRE2_UTF option, or the pattern may start with the sequence (*UTF).
|
||||
However, the latter facility can be locked out by the PCRE2_NEVER_UTF option.
|
||||
That is, the programmer can prevent the supplier of the pattern from switching
|
||||
to UTF mode.
|
||||
</P>
|
||||
<P>
|
||||
Note that the PCRE2_MATCH_INVALID_UTF option (see
|
||||
<a href="#matchinvalid">below)</a>
|
||||
forces PCRE2_UTF to be set.
|
||||
</P>
|
||||
<P>
|
||||
In UTF mode, both the pattern and any subject strings that are matched against
|
||||
it are treated as UTF strings instead of strings of individual one-code-unit
|
||||
characters. There are also some other changes to the way characters are
|
||||
handled, as documented below.
|
||||
</P>
|
||||
<br><b>
|
||||
UNICODE PROPERTY SUPPORT
|
||||
|
@ -39,17 +50,18 @@ UNICODE PROPERTY SUPPORT
|
|||
<P>
|
||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||
\P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting.
|
||||
The Unicode properties that can be tested are limited to the general category
|
||||
properties such as Lu for an upper case letter or Nd for a decimal number, the
|
||||
Unicode script names such as Arabic or Han, and the derived properties Any and
|
||||
L&. Full lists are given in the
|
||||
The Unicode properties that can be tested are a subset of those that Perl
|
||||
supports. Currently they are limited to the general category properties such as
|
||||
Lu for an upper case letter or Nd for a decimal number, the Unicode script
|
||||
names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived
|
||||
properties Any and LC (synonym L&). Full lists are given in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
and
|
||||
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
documentation. In general, only the short names for properties are supported.
|
||||
For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not
|
||||
supported. Furthermore, in Perl, many properties may optionally be prefixed by
|
||||
"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -63,22 +75,22 @@ also recognized; larger ones can be coded using \o{...}.
|
|||
<P>
|
||||
The escape sequence \N{U+<hex digits>} is recognized as another way of
|
||||
specifying a Unicode character by code point in a UTF mode. It is not allowed
|
||||
in non-UTF modes.
|
||||
in non-UTF mode.
|
||||
</P>
|
||||
<P>
|
||||
In UTF modes, repeat quantifiers apply to complete UTF characters, not to
|
||||
In UTF mode, repeat quantifiers apply to complete UTF characters, not to
|
||||
individual code units.
|
||||
</P>
|
||||
<P>
|
||||
In UTF modes, the dot metacharacter matches one UTF character instead of a
|
||||
In UTF mode, the dot metacharacter matches one UTF character instead of a
|
||||
single code unit.
|
||||
</P>
|
||||
<P>
|
||||
In UTF modes, capture group names are not restricted to ASCII, and may contain
|
||||
In UTF mode, capture group names are not restricted to ASCII, and may contain
|
||||
any Unicode letters and decimal digits, as well as underscore.
|
||||
</P>
|
||||
<P>
|
||||
The escape sequence \C can be used to match a single code unit in a UTF mode,
|
||||
The escape sequence \C can be used to match a single code unit in UTF mode,
|
||||
but its use can lead to some strange effects because it breaks up multi-unit
|
||||
characters (see the description of \C in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
|
@ -93,7 +105,7 @@ may consist of more than one code unit. The use of \C in these modes provokes
|
|||
a match-time error. Also, the JIT optimization does not support \C in these
|
||||
modes. If JIT optimization is requested for a UTF-8 or UTF-16 pattern that
|
||||
contains \C, it will not succeed, and so when <b>pcre2_match()</b> is called,
|
||||
the matching will be carried out by the normal interpretive function.
|
||||
the matching will be carried out by the interpretive function.
|
||||
</P>
|
||||
<P>
|
||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||
|
@ -123,14 +135,16 @@ However, the special horizontal and vertical white space matching escapes (\h,
|
|||
not PCRE2_UCP is set.
|
||||
</P>
|
||||
<br><b>
|
||||
CASE-EQUIVALENCE IN UTF MODES
|
||||
UNICODE CASE-EQUIVALENCE
|
||||
</b><br>
|
||||
<P>
|
||||
Case-insensitive matching in a UTF mode makes use of Unicode properties except
|
||||
for characters whose code points are less than 128 and that have at most two
|
||||
case-equivalent values. For these, a direct table lookup is used for speed. A
|
||||
few Unicode characters such as Greek sigma have more than two code points that
|
||||
are case-equivalent, and these are treated as such.
|
||||
If either PCRE2_UTF or PCRE2_UCP is set, upper/lower case processing makes use
|
||||
of Unicode properties except for characters whose code points are less than 128
|
||||
and that have at most two case-equivalent values. For these, a direct table
|
||||
lookup is used for speed. A few Unicode characters such as Greek sigma have
|
||||
more than two code points that are case-equivalent, and these are treated
|
||||
specially. Setting PCRE2_UCP without PCRE2_UTF allows Unicode-style case
|
||||
processing for non-UTF character encodings such as UCS-2.
|
||||
<a name="scriptruns"></a></P>
|
||||
<br><b>
|
||||
SCRIPT RUNS
|
||||
|
@ -248,7 +262,7 @@ VALIDITY OF UTF STRINGS
|
|||
<P>
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
||||
are (by default) checked for validity on entry to the relevant functions. If an
|
||||
invalid UTF string is passed, an negative error code is returned. The code unit
|
||||
invalid UTF string is passed, a negative error code is returned. The code unit
|
||||
offset to the offending character can be extracted from the match data block by
|
||||
calling <b>pcre2_get_startchar()</b>, which is used for this purpose after a UTF
|
||||
error.
|
||||
|
@ -263,17 +277,16 @@ only valid UTF code unit sequences.
|
|||
</P>
|
||||
<P>
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
|
||||
is usually undefined and your program may crash or loop indefinitely. There is,
|
||||
however, one mode of matching that can handle invalid UTF subject strings. This
|
||||
is matching via the JIT optimization using the PCRE2_JIT_INVALID_UTF option
|
||||
when calling <b>pcre2_jit_compile()</b>. For details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
documentation.
|
||||
is undefined and your program may crash or loop indefinitely or give incorrect
|
||||
results. There is, however, one mode of matching that can handle invalid UTF
|
||||
subject strings. This is enabled by passing PCRE2_MATCH_INVALID_UTF to
|
||||
<b>pcre2_compile()</b> and is discussed below in the next section. The rest of
|
||||
this section covers the case when PCRE2_MATCH_INVALID_UTF is not set.
|
||||
</P>
|
||||
<P>
|
||||
Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the check for
|
||||
the pattern; it does not also apply to subject strings. If you want to disable
|
||||
the check for a subject string you must pass this same option to
|
||||
Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the UTF check
|
||||
for the pattern; it does not also apply to subject strings. If you want to
|
||||
disable the check for a subject string you must pass this same option to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -352,7 +365,7 @@ these code points are excluded by RFC 3629.
|
|||
<pre>
|
||||
PCRE2_ERROR_UTF8_ERR13
|
||||
</pre>
|
||||
A 4-byte character has a value greater than 0x10fff; these code points are
|
||||
A 4-byte character has a value greater than 0x10ffff; these code points are
|
||||
excluded by RFC 3629.
|
||||
<pre>
|
||||
PCRE2_ERROR_UTF8_ERR14
|
||||
|
@ -405,7 +418,59 @@ The following negative error codes are given for invalid UTF-32 strings:
|
|||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (0xd800 to 0xdfff)
|
||||
PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff
|
||||
|
||||
</PRE>
|
||||
<a name="matchinvalid"></a></PRE>
|
||||
</P>
|
||||
<br><b>
|
||||
MATCHING IN INVALID UTF STRINGS
|
||||
</b><br>
|
||||
<P>
|
||||
You can run pattern matches on subject strings that may contain invalid UTF
|
||||
sequences if you call <b>pcre2_compile()</b> with the PCRE2_MATCH_INVALID_UTF
|
||||
option. This is supported by <b>pcre2_match()</b>, including JIT matching, but
|
||||
not by <b>pcre2_dfa_match()</b>. When PCRE2_MATCH_INVALID_UTF is set, it forces
|
||||
PCRE2_UTF to be set as well. Note, however, that the pattern itself must be a
|
||||
valid UTF string.
|
||||
</P>
|
||||
<P>
|
||||
Setting PCRE2_MATCH_INVALID_UTF does not affect what <b>pcre2_compile()</b>
|
||||
generates, but if <b>pcre2_jit_compile()</b> is subsequently called, it does
|
||||
generate different code. If JIT is not used, the option affects the behaviour
|
||||
of the interpretive code in <b>pcre2_match()</b>. When PCRE2_MATCH_INVALID_UTF
|
||||
is set at compile time, PCRE2_NO_UTF_CHECK is ignored at match time.
|
||||
</P>
|
||||
<P>
|
||||
In this mode, an invalid code unit sequence in the subject never matches any
|
||||
pattern item. It does not match dot, it does not match \p{Any}, it does not
|
||||
even match negative items such as [^X]. A lookbehind assertion fails if it
|
||||
encounters an invalid sequence while moving the current point backwards. In
|
||||
other words, an invalid UTF code unit sequence acts as a barrier which no match
|
||||
can cross.
|
||||
</P>
|
||||
<P>
|
||||
You can also think of this as the subject being split up into fragments of
|
||||
valid UTF, delimited internally by invalid code unit sequences. The pattern is
|
||||
matched fragment by fragment. The result of a successful match, however, is
|
||||
given as code unit offsets in the entire subject string in the usual way. There
|
||||
are a few points to consider:
|
||||
</P>
|
||||
<P>
|
||||
The internal boundaries are not interpreted as the beginnings or ends of lines
|
||||
and so do not match circumflex or dollar characters in the pattern.
|
||||
</P>
|
||||
<P>
|
||||
If <b>pcre2_match()</b> is called with an offset that points to an invalid
|
||||
UTF-sequence, that sequence is skipped, and the match starts at the next valid
|
||||
UTF character, or the end of the subject.
|
||||
</P>
|
||||
<P>
|
||||
At internal fragment boundaries, \b and \B behave in the same way as at the
|
||||
beginning and end of the subject. For example, a sequence such as \bWORD\b
|
||||
would match an instance of WORD that is surrounded by invalid UTF code units.
|
||||
</P>
|
||||
<P>
|
||||
Using PCRE2_MATCH_INVALID_UTF, an application can run matches on arbitrary
|
||||
data, knowing that any matched strings that are returned are valid UTF. This
|
||||
can be useful when searching for UTF text in executable or other binary files.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
|
@ -413,7 +478,7 @@ AUTHOR
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -422,9 +487,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 06 March 2019
|
||||
Last updated: 22 December 2021
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -146,6 +146,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_get_mark.html">pcre2_get_mark</a></td>
|
||||
<td> Get a (*MARK) name</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_get_match_data_size.html">pcre2_get_match_data_size</a></td>
|
||||
<td> Get the size of a match data block</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_get_ovector_count.html">pcre2_get_ovector_count</a></td>
|
||||
<td> Get the ovector count</td></tr>
|
||||
|
||||
|
@ -176,6 +179,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
|
||||
<td> Build character tables in current locale</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_maketables_free.html">pcre2_maketables_free</a></td>
|
||||
<td> Free character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_match.html">pcre2_match</a></td>
|
||||
<td> Match a compiled pattern to a subject string
|
||||
(Perl compatible)</td></tr>
|
||||
|
|
|
@ -16,8 +16,8 @@ DESCRIPTION
|
|||
|
||||
pcre2-config returns the configuration of the installed PCRE2 libraries
|
||||
and the options required to compile a program to use them. Some of the
|
||||
options apply only to the 8-bit, or 16-bit, or 32-bit libraries,
|
||||
respectively, and are not available for libraries that have not been
|
||||
options apply only to the 8-bit, or 16-bit, or 32-bit libraries, re-
|
||||
spectively, and are not available for libraries that have not been
|
||||
built. If an unavailable option is encountered, the "usage" information
|
||||
is output.
|
||||
|
||||
|
@ -36,30 +36,30 @@ OPTIONS
|
|||
--version Writes the version number of the installed PCRE2 libraries to
|
||||
the standard output.
|
||||
|
||||
--libs8 Writes to the standard output the command line options
|
||||
required to link with the 8-bit PCRE2 library (-lpcre2-8 on
|
||||
--libs8 Writes to the standard output the command line options re-
|
||||
quired to link with the 8-bit PCRE2 library (-lpcre2-8 on
|
||||
many systems).
|
||||
|
||||
--libs16 Writes to the standard output the command line options
|
||||
required to link with the 16-bit PCRE2 library (-lpcre2-16 on
|
||||
--libs16 Writes to the standard output the command line options re-
|
||||
quired to link with the 16-bit PCRE2 library (-lpcre2-16 on
|
||||
many systems).
|
||||
|
||||
--libs32 Writes to the standard output the command line options
|
||||
required to link with the 32-bit PCRE2 library (-lpcre2-32 on
|
||||
--libs32 Writes to the standard output the command line options re-
|
||||
quired to link with the 32-bit PCRE2 library (-lpcre2-32 on
|
||||
many systems).
|
||||
|
||||
--libs-posix
|
||||
Writes to the standard output the command line options
|
||||
required to link with PCRE2's POSIX API wrapper library
|
||||
Writes to the standard output the command line options re-
|
||||
quired to link with PCRE2's POSIX API wrapper library
|
||||
(-lpcre2-posix -lpcre2-8 on many systems).
|
||||
|
||||
--cflags Writes to the standard output the command line options
|
||||
required to compile files that use PCRE2 (this may include
|
||||
some -I options, but is blank on many systems).
|
||||
--cflags Writes to the standard output the command line options re-
|
||||
quired to compile files that use PCRE2 (this may include some
|
||||
-I options, but is blank on many systems).
|
||||
|
||||
--cflags-posix
|
||||
Writes to the standard output the command line options
|
||||
required to compile files that use PCRE2's POSIX API wrapper
|
||||
Writes to the standard output the command line options re-
|
||||
quired to compile files that use PCRE2's POSIX API wrapper
|
||||
library (this may include some -I options, but is blank on
|
||||
many systems).
|
||||
|
||||
|
|
22
doc/pcre2.3
22
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "17 September 2018" "PCRE2 10.33"
|
||||
.TH PCRE2 3 "27 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -11,7 +11,8 @@ nearly two decades, the limitations of the original API were making development
|
|||
increasingly difficult. The new API is more extensible, and it was simplified
|
||||
by abolishing the separate "study" optimizing function; in PCRE2, patterns are
|
||||
automatically optimized where possible. Since forking from PCRE1, the code has
|
||||
been extensively refactored and new features introduced.
|
||||
been extensively refactored and new features introduced. The old library is now
|
||||
obsolete and is no longer maintained.
|
||||
.P
|
||||
As well as Perl-style regular expression patterns, some features that appeared
|
||||
in Python and the original PCRE before they appeared in Perl are available
|
||||
|
@ -19,8 +20,13 @@ using the Python syntax. There is also some support for one or two .NET and
|
|||
Oniguruma syntax items, and there are options for requesting some minor changes
|
||||
that give better ECMAScript (aka JavaScript) compatibility.
|
||||
.P
|
||||
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||
code units, which means that up to three separate libraries may be installed.
|
||||
The source code for PCRE2 can be compiled to support strings of 8-bit, 16-bit,
|
||||
or 32-bit code units, which means that up to three separate libraries may be
|
||||
installed, one for each code unit size. The size of code unit is not related to
|
||||
the bit size of the underlying hardware. In a 64-bit environment that also
|
||||
supports 32-bit applications, versions of PCRE2 that are compiled in both
|
||||
64-bit and 32-bit modes may be needed.
|
||||
.P
|
||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||
|
@ -185,18 +191,18 @@ function, listing its arguments and results.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.P
|
||||
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||
use my two names separated by a dot at gmail.com.
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
Last updated: 27 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
6151
doc/pcre2.txt
6151
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "11 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -53,6 +53,7 @@ The option bits are:
|
|||
PCRE2_EXTENDED Ignore white space and # comments
|
||||
PCRE2_FIRSTLINE Force matching to be before newline
|
||||
PCRE2_LITERAL Pattern characters are all literal
|
||||
PCRE2_MATCH_INVALID_UTF Enable support for matching invalid UTF
|
||||
PCRE2_MATCH_UNSET_BACKREF Match unset backreferences
|
||||
PCRE2_MULTILINE ^ and $ match newlines within data
|
||||
PCRE2_NEVER_BACKSLASH_C Lock out the use of \eC in patterns
|
||||
|
@ -79,8 +80,17 @@ Additional options may be set in the compile context via the
|
|||
.\"
|
||||
function.
|
||||
.P
|
||||
The yield of this function is a pointer to a private data structure that
|
||||
contains the compiled pattern, or NULL if an error was detected.
|
||||
If either of \fIerrorcode\fP or \fIerroroffset\fP is NULL, the function returns
|
||||
NULL immediately. Otherwise, the yield of this function is a pointer to a
|
||||
private data structure that contains the compiled pattern, or NULL if an error
|
||||
was detected. In the error case, a text error message can be obtained by
|
||||
passing the value returned via the \fIerrorcode\fP argument to the the
|
||||
\fBpcre2_get_error_message()\fP function. The offset (in code units) where the
|
||||
error was encountered is returned via the \fIerroroffset\fP argument.
|
||||
.P
|
||||
If there is no error, the value passed via \fIerrorcode\fP returns the message
|
||||
"no error" if passed to \fBpcre2_get_error_message()\fP, and the value passed
|
||||
via \fIerroroffset\fP is zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API, with more detail on
|
||||
each option, in the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_DFA_MATCH 3 "16 October 2018" "PCRE2 10.33"
|
||||
.TH PCRE2_DFA_MATCH 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -33,10 +33,15 @@ just once (except when processing lookaround assertions). This function is
|
|||
\fIworkspace\fP Points to a vector of ints used as working space
|
||||
\fIwscount\fP Number of elements in the vector
|
||||
.sp
|
||||
For \fBpcre2_dfa_match()\fP, a match context is needed only if you want to set
|
||||
up a callout function or specify the heap limit or the match or the recursion
|
||||
depth limits. The \fIlength\fP and \fIstartoffset\fP values are code units, not
|
||||
characters. The options are:
|
||||
The size of output vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when using this function.
|
||||
.P
|
||||
A match context is needed only if you want to set up a callout function or
|
||||
specify the heap limit or the match or the recursion depth limits. The
|
||||
\fIlength\fP and \fIstartoffset\fP values are code units, not characters. The
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_COPY_MATCHED_SUBJECT
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
.TH PCRE2_GET_MATCH_DATA_SIZE 3 "16 July 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B PCRE2_SIZE pcre2_get_match_data_size(pcre2_match_data *\fImatch_data\fP);
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function returns the size, in bytes, of the match data block that is its
|
||||
argument.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_JIT_COMPILE 3 "06 March 2019" "PCRE2 10.33"
|
||||
.TH PCRE2_JIT_COMPILE 3 "29 July 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -29,11 +29,16 @@ bits:
|
|||
PCRE2_JIT_COMPLETE compile code for full matching
|
||||
PCRE2_JIT_PARTIAL_SOFT compile code for soft partial matching
|
||||
PCRE2_JIT_PARTIAL_HARD compile code for hard partial matching
|
||||
PCRE2_JIT_INVALID_UTF compile code to handle invalid UTF
|
||||
.sp
|
||||
There is also an obsolete option called PCRE2_JIT_INVALID_UTF, which has been
|
||||
superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF. The old
|
||||
option is deprecated and may be removed in the future.
|
||||
.P
|
||||
The yield of the function is 0 for success, or a negative error code otherwise.
|
||||
In particular, PCRE2_ERROR_JIT_BADOPTION is returned if JIT is not supported or
|
||||
if an unknown bit is set in \fIoptions\fP.
|
||||
if an unknown bit is set in \fIoptions\fP. The function can also return
|
||||
PCRE2_ERROR_NOMEMORY if JIT is unable to allocate executable memory for the
|
||||
compiler, even if it was because of a system security restriction.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
|
|
|
@ -17,7 +17,7 @@ This function frees unused JIT executable memory. The argument is a general
|
|||
context, for custom memory management, or NULL for standard memory management.
|
||||
JIT memory allocation retains some memory in order to improve future JIT
|
||||
compilation speed. In low memory conditions,
|
||||
\fBpcre2_jit_free_unused_memory()\fB can be used to cause this memory to be
|
||||
\fBpcre2_jit_free_unused_memory()\fP can be used to cause this memory to be
|
||||
freed.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_JIT_MATCH 3 "03 November 2014" "PCRE2 10.0"
|
||||
.TH PCRE2_JIT_MATCH 3 "11 February 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -22,8 +22,10 @@ algorithm that is similar to Perl's. It is a "fast path" interface to JIT, and
|
|||
it bypasses some of the sanity checks that \fBpcre2_match()\fP applies.
|
||||
Its arguments are exactly the same as for
|
||||
.\" HREF
|
||||
\fBpcre2_match()\fP.
|
||||
\fBpcre2_match()\fP,
|
||||
.\"
|
||||
except that the subject string must be specified with a length;
|
||||
PCRE2_ZERO_TERMINATED is not supported.
|
||||
.P
|
||||
The supported options are PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
PCRE2_NOTEMPTY_ATSTART, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Unsupported
|
||||
|
|
|
@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling
|
|||
\fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern,
|
||||
which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP.
|
||||
A maximum stack size of 512KiB to 1MiB should be more than enough for any
|
||||
pattern. For more details, see the
|
||||
pattern. If the stack couldn't be allocated or the values passed were not
|
||||
reasonable, NULL will be returned. For more details, see the
|
||||
.\" HREF
|
||||
\fBpcre2jit\fP
|
||||
.\"
|
||||
|
|
|
@ -7,7 +7,7 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.SM
|
||||
.B const unsigned char *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
|
||||
.B const uint8_t *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
.TH PCRE2_MAKETABLES_FREE 3 "02 September 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B void pcre2_maketables_free(pcre2_general_context *\fIgcontext\fP,
|
||||
.B " const uint8_t *\fItables\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function discards a set of character tables that were created by a call
|
||||
to
|
||||
.\" HREF
|
||||
\fBpcre2_maketables()\fP.
|
||||
.\"
|
||||
.P
|
||||
The \fIgcontext\fP parameter should match what was used in that call to
|
||||
account for any custom allocators that might be in use; if it is NULL
|
||||
the system \fBfree()\fP is used.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,8 +18,9 @@ This function creates a new match data block, which is used for holding the
|
|||
result of a match. The first argument specifies the number of pairs of offsets
|
||||
that are required. These form the "output vector" (ovector) within the match
|
||||
data block, and are used to identify the matched string and any captured
|
||||
substrings. There is always one pair of offsets; if \fBovecsize\fP is zero, it
|
||||
is treated as one.
|
||||
substrings when matching with \fBpcre2_match()\fP, or a number of different
|
||||
matches at the same point when used with \fBpcre2_dfa_match()\fP. There is
|
||||
always one pair of offsets; if \fBovecsize\fP is zero, it is treated as one.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL for system memory management. The result of the function is NULL if
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "29 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2_MATCH_DATA_CREATE_FROM_PATTERN 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -14,12 +14,15 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function creates a new match data block, which is used for holding the
|
||||
result of a match. The first argument points to a compiled pattern. The number
|
||||
of capturing parentheses within the pattern is used to compute the number of
|
||||
pairs of offsets that are required in the match data block. These form the
|
||||
"output vector" (ovector) within the match data block, and are used to identify
|
||||
the matched string and any captured substrings.
|
||||
This function creates a new match data block for holding the result of a match.
|
||||
The first argument points to a compiled pattern. The number of capturing
|
||||
parentheses within the pattern is used to compute the number of pairs of
|
||||
offsets that are required in the match data block. These form the "output
|
||||
vector" (ovector) within the match data block, and are used to identify the
|
||||
matched string and any captured substrings when matching with
|
||||
\fBpcre2_match()\fP. If you are using \fBpcre2_dfa_match()\fP, which uses the
|
||||
outut vector in a different way, you should use \fBpcre2_match_data_create()\fP
|
||||
instead of this function.
|
||||
.P
|
||||
The second argument points to a general context, for custom memory management,
|
||||
or is NULL to use the same memory allocator as was used for the compiled
|
||||
|
|
|
@ -36,7 +36,7 @@ the following negative error codes:
|
|||
PCRE2_ERROR_BADDATA \fInumber_of_codes\fP is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in \fIbytes\fP
|
||||
PCRE2_ERROR_BADMODE mismatch of variable unit size or PCRE version
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_NULL \fIcodes\fP or \fIbytes\fP is NULL
|
||||
.sp
|
||||
PCRE2_ERROR_BADMAGIC may mean that the data is corrupt, or that it was compiled
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_CHARACTER_TABLES 3 "22 October 2014" "PCRE2 10.00"
|
||||
.TH PCRE2_SET_CHARACTER_TABLES 3 "20 March 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -8,16 +8,21 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.PP
|
||||
.nf
|
||||
.B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " const unsigned char *\fItables\fP);"
|
||||
.B " const uint8_t *\fItables\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function sets a pointer to custom character tables within a compile
|
||||
context. The second argument must be the result of a call to
|
||||
\fBpcre2_maketables()\fP or NULL to request the default tables. The result is
|
||||
always zero.
|
||||
context. The second argument must point to a set of PCRE2 character tables or
|
||||
be NULL to request the default tables. The result is always zero. Character
|
||||
tables can be created by calling \fBpcre2_maketables()\fP or by running the
|
||||
\fBpcre2_dftables\fP maintenance command in binary mode (see the
|
||||
.\" HREF
|
||||
\fBpcre2build\fP
|
||||
.\"
|
||||
documentation).
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "11 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "31 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -18,12 +18,13 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are
|
|||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff}
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex
|
||||
handling
|
||||
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and
|
||||
\ex handling
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as
|
||||
a literal following character
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SUBSTITUTE 3 "04 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2_SUBSTITUTE 3 "22 January 2020" "PCRE2 10.35"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -36,8 +36,8 @@ Its arguments are:
|
|||
\fIoutlengthptr\fP Points to the length of the output buffer
|
||||
.sp
|
||||
A match data block is needed only if you want to inspect the data from the
|
||||
match that is returned in that block. A match context is needed only if you
|
||||
want to:
|
||||
final match that is returned in that block or if PCRE2_SUBSTITUTE_MATCHED is
|
||||
set. A match context is needed only if you want to:
|
||||
.sp
|
||||
Set up a callout function
|
||||
Set a matching offset limit
|
||||
|
@ -45,33 +45,57 @@ want to:
|
|||
Change the backtracking depth limit
|
||||
Set custom memory management in the match context
|
||||
.sp
|
||||
The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code
|
||||
units, not characters, as is the contents of the variable pointed at by
|
||||
\fIoutlengthptr\fP, which is updated to the actual length of the new string.
|
||||
The \fIlength\fP, \fIstartoffset\fP and \fIrlength\fP values are code units,
|
||||
not characters, as is the contents of the variable pointed at by
|
||||
\fIoutlengthptr\fP. This variable must contain the length of the output buffer
|
||||
when the function is called. If the function is successful, the value is
|
||||
changed to the length of the new string, excluding the trailing zero that is
|
||||
automatically added.
|
||||
.P
|
||||
The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for
|
||||
zero-terminated strings. The options are:
|
||||
.sp
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Pattern can match only at end of subject
|
||||
PCRE2_NOTBOL Subject is not the beginning of a line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
PCRE2_NOTEMPTY An empty string is not a valid match
|
||||
PCRE2_ANCHORED Match only at the first position
|
||||
PCRE2_ENDANCHORED Match only at end of subject
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of the
|
||||
subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
PCRE2_NOTBOL Subject is not the beginning of a
|
||||
line
|
||||
PCRE2_NOTEOL Subject is not the end of a line
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check the subject or replacement
|
||||
for UTF validity (only relevant if
|
||||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_NOTEMPTY An empty string is not a
|
||||
valid match
|
||||
.\" JOIN
|
||||
PCRE2_NOTEMPTY_ATSTART An empty string at the start of
|
||||
the subject is not a valid match
|
||||
PCRE2_NO_JIT Do not use JIT matching
|
||||
.\" JOIN
|
||||
PCRE2_NO_UTF_CHECK Do not check for UTF validity in
|
||||
the subject or replacement
|
||||
.\" JOIN
|
||||
(only relevant if PCRE2_UTF was
|
||||
set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the
|
||||
subject
|
||||
PCRE2_SUBSTITUTE_LITERAL The replacement string is literal
|
||||
.\" JOIN
|
||||
PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for
|
||||
first match
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s)
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
.sp
|
||||
If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED,
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.
|
||||
.P
|
||||
If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its
|
||||
contents must be the result of a call to \fBpcre2_match()\fP using the same
|
||||
pattern and subject.
|
||||
.P
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
are no matches. The result may be greater than one only when
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a negative error code
|
||||
is returned.
|
||||
.P
|
||||
|
|
656
doc/pcre2api.3
656
doc/pcre2api.3
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2BUILD 3 "03 March 2019" "PCRE2 10.33"
|
||||
.TH PCRE2BUILD 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.
|
||||
|
@ -110,7 +110,7 @@ To build it without Unicode support, add
|
|||
--disable-unicode
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This setting applies to all three libraries. It
|
||||
is not possible to build one library with Unicode support, and another without,
|
||||
is not possible to build one library with Unicode support and another without
|
||||
in the same configuration.
|
||||
.P
|
||||
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||
|
@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF.
|
|||
UTF support allows the libraries to process character code points up to
|
||||
0x10ffff in the strings that they handle. Unicode support also gives access to
|
||||
the Unicode properties of characters, using pattern escapes such as \eP, \ep,
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||
supported. Details are given in the
|
||||
and \eX. Only the general category properties such as \fILu\fP and \fINd\fP,
|
||||
script names, and some bi-directional properties are supported. Details are
|
||||
given in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
|
@ -175,11 +176,11 @@ SELinux. This has no effect if JIT is not enabled. See the
|
|||
\fBpcre2jit\fP
|
||||
.\"
|
||||
documentation for a discussion of JIT usage. When JIT support is enabled,
|
||||
pcre2grep automatically makes use of it, unless you add
|
||||
\fBpcre2grep\fP automatically makes use of it, unless you add
|
||||
.sp
|
||||
--disable-pcre2grep-jit
|
||||
.sp
|
||||
to the "configure" command.
|
||||
to the \fBconfigure\fP command.
|
||||
.
|
||||
.
|
||||
.SH "NEWLINE RECOGNITION"
|
||||
|
@ -277,12 +278,11 @@ to the \fBconfigure\fP command. This setting also applies to the
|
|||
\fBpcre2_dfa_match()\fP matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
.P
|
||||
The \fBpcre2_match()\fP function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The \fBpcre2_match()\fP function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
|
@ -302,7 +302,7 @@ You can also explicitly limit the depth of nested backtracking in the
|
|||
for --with-match-limit. You can set a lower default limit by adding, for
|
||||
example,
|
||||
.sp
|
||||
--with-match-limit_depth=10000
|
||||
--with-match-limit-depth=10000
|
||||
.sp
|
||||
to the \fBconfigure\fP command. This value can be overridden at run time. This
|
||||
depth limit indirectly limits the amount of heap memory that is used, but
|
||||
|
@ -317,6 +317,7 @@ used for lookaround assertions, atomic groups, and recursion within patterns.
|
|||
The limit does not apply to JIT matching.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="createtables"></a>
|
||||
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -328,12 +329,33 @@ only. If you add
|
|||
--enable-rebuild-chartables
|
||||
.sp
|
||||
to the \fBconfigure\fP command, the distributed tables are no longer used.
|
||||
Instead, a program called \fBdftables\fP is compiled and run. This outputs the
|
||||
source for new set of tables, created in the default locale of your C run-time
|
||||
system. This method of replacing the tables does not work if you are cross
|
||||
compiling, because \fBdftables\fP is run on the local host. If you need to
|
||||
create alternative tables when cross compiling, you will have to do so "by
|
||||
hand".
|
||||
Instead, a program called \fBpcre2_dftables\fP is compiled and run. This
|
||||
outputs the source for new set of tables, created in the default locale of your
|
||||
C run-time system. This method of replacing the tables does not work if you are
|
||||
cross compiling, because \fBpcre2_dftables\fP needs to be run on the local
|
||||
host and therefore not compiled with the cross compiler.
|
||||
.P
|
||||
If you need to create alternative tables when cross compiling, you will have to
|
||||
do so "by hand". There may also be other reasons for creating tables manually.
|
||||
To cause \fBpcre2_dftables\fP to be built on the local host, run a normal
|
||||
compiling command, and then run the program with the output file as its
|
||||
argument, for example:
|
||||
.sp
|
||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||
./pcre2_dftables src/pcre2_chartables.c
|
||||
.sp
|
||||
This builds the tables in the default locale of the local host. If you want to
|
||||
specify a locale, you must use the -L option:
|
||||
.sp
|
||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||
.sp
|
||||
You can also specify -b (with or without -L). This causes the tables to be
|
||||
written in binary instead of as source code. A set of binary tables can be
|
||||
loaded into memory by an application and passed to \fBpcre2_compile()\fP in the
|
||||
same way as tables created by calling \fBpcre2_maketables()\fP. The tables are
|
||||
just a string of bytes, independent of hardware characteristics such as
|
||||
endianness. This means they can be bundled with an application that runs in
|
||||
different environments, to ensure consistent behaviour.
|
||||
.
|
||||
.
|
||||
.SH "USING EBCDIC CODE"
|
||||
|
@ -417,7 +439,7 @@ default parameter values by adding, for example,
|
|||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
.sp
|
||||
to the \fBconfigure\fP command. The caller of \fPpcre2grep\fP can override
|
||||
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
.
|
||||
.
|
||||
|
@ -541,15 +563,16 @@ documentation.
|
|||
.sp
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
|
||||
environments other than Microsoft Visual Studio when __STDC_VERSION__ is
|
||||
defined and has a value greater than or equal to 199901L (indicating C99).
|
||||
environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to 199901L
|
||||
(indicating support for C99).
|
||||
However, there is at least one environment that claims to be C99 but does not
|
||||
support these modifiers. If
|
||||
.sp
|
||||
--disable-percent-zt
|
||||
.sp
|
||||
is specified, no use is made of the z or t modifiers. Instead or %td or %zu,
|
||||
%lu is used, with a cast for size_t values.
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
|
||||
a suitable format is used depending in the size of long for the platform.
|
||||
.
|
||||
.
|
||||
.SH "SUPPORT FOR FUZZERS"
|
||||
|
@ -601,7 +624,7 @@ give a warning.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -610,6 +633,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 March 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,33 +1,43 @@
|
|||
.TH PCRE2COMPAT 3 "12 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||
.rs
|
||||
.sp
|
||||
This document describes the differences in the ways that PCRE2 and Perl handle
|
||||
regular expressions. The differences described here are with respect to Perl
|
||||
versions 5.26, but as both Perl and PCRE2 are continually changing, the
|
||||
information may sometimes be out of date.
|
||||
This document describes some of the differences in the ways that PCRE2 and Perl
|
||||
handle regular expressions. The differences described here are with respect to
|
||||
Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the
|
||||
information may at times be out of date.
|
||||
.P
|
||||
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the
|
||||
behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the
|
||||
next character unless it is the start of a newline sequence. This means that,
|
||||
if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF
|
||||
(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using
|
||||
EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline
|
||||
indicator.
|
||||
.P
|
||||
2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||
have are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
.P
|
||||
2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but
|
||||
they do not mean what you might think. For example, (?!a){3} does not assert
|
||||
that the next three characters are not "a". It just asserts that the next
|
||||
character is not "a" three times (in principle; PCRE2 optimizes this to run the
|
||||
assertion just once). Perl allows some repeat quantifiers on other assertions,
|
||||
for example, \eb* (but not \eb{3}), but these do not seem to have any use.
|
||||
for example, \eb* , but these do not seem to have any use. PCRE2 does not allow
|
||||
any kind of quantifier on non-lookaround assertions.
|
||||
.P
|
||||
3. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
4. Capture groups that occur inside negative lookaround assertions are counted,
|
||||
but their entries in the offsets vector are set only when a negative assertion
|
||||
is a condition that has a matching branch (that is, the condition is false).
|
||||
Perl may set such capture groups in other circumstances.
|
||||
.P
|
||||
4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu,
|
||||
\eU, and \eN when followed by a character name. \eN on its own, matching a
|
||||
non-newline character, and \eN{U+dd..}, matching a Unicode code point, are
|
||||
supported. The escapes that modify the case of following letters are
|
||||
|
@ -37,23 +47,27 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or
|
|||
PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript
|
||||
interprets them.
|
||||
.P
|
||||
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||
built with Unicode support (the default). The properties that can be tested
|
||||
with \ep and \eP are limited to the general category properties such as Lu and
|
||||
Nd, script names such as Greek or Han, and the derived properties Any and L&.
|
||||
PCRE2 does support the Cs (surrogate) property, which Perl does not; the Perl
|
||||
documentation says "Because Perl hides the need for the user to understand the
|
||||
internal representation of Unicode characters, there is no need to implement
|
||||
the somewhat messy concept of surrogates."
|
||||
Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the
|
||||
derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs
|
||||
(surrogate) property, but in PCRE2 its use is limited. See the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
documentation for details. The long synonyms for property names that Perl
|
||||
supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted
|
||||
to prefix any of these properties with "Is".
|
||||
.P
|
||||
6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters
|
||||
in between are treated as literals. However, this is slightly different from
|
||||
Perl in that $ and @ are also handled as literals inside the quotes. In Perl,
|
||||
they cause variable interpolation (but of course PCRE2 does not have
|
||||
variables). Also, Perl does "double-quotish backslash interpolation" on any
|
||||
backslashes between \eQ and \eE which, its documentation says, "may lead to
|
||||
confusing results". PCRE2 treats a backslash between \eQ and \eE just like any
|
||||
other character. Note the following examples:
|
||||
they cause variable interpolation (PCRE2 does not have variables). Also, Perl
|
||||
does "double-quotish backslash interpolation" on any backslashes between \eQ
|
||||
and \eE which, its documentation says, "may lead to confusing results". PCRE2
|
||||
treats a backslash between \eQ and \eE just like any other character. Note the
|
||||
following examples:
|
||||
.sp
|
||||
Pattern PCRE2 matches Perl matches
|
||||
.sp
|
||||
|
@ -65,9 +79,10 @@ other character. Note the following examples:
|
|||
\eQA\eB\eE A\eB A\eB
|
||||
\eQ\e\eE \e \e\eE
|
||||
.sp
|
||||
The \eQ...\eE sequence is recognized both inside and outside character classes.
|
||||
The \eQ...\eE sequence is recognized both inside and outside character classes
|
||||
by both PCRE2 and Perl.
|
||||
.P
|
||||
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||
constructions. However, PCRE2 does have a "callout" feature, which allows an
|
||||
external function to be called during pattern matching. See the
|
||||
.\" HREF
|
||||
|
@ -75,27 +90,24 @@ external function to be called during pattern matching. See the
|
|||
.\"
|
||||
documentation for details.
|
||||
.P
|
||||
8. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
9. Subroutine calls (whether recursive or not) were treated as atomic groups up
|
||||
to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking
|
||||
into subroutine calls is now supported, as in Perl.
|
||||
.P
|
||||
9. If any of the backtracking control verbs are used in a group that is called
|
||||
as a subroutine (whether or not recursively), their effect is confined to that
|
||||
group; it does not extend to the surrounding pattern. This is not always the
|
||||
case in Perl. In particular, if (*THEN) is present in a group that is called as
|
||||
a subroutine, its action is limited to that group, even if the group does not
|
||||
contain any | characters. Note that such groups are processed as anchored
|
||||
at the point where they are tested.
|
||||
10. In PCRE2, if any of the backtracking control verbs are used in a group that
|
||||
is called as a subroutine (whether or not recursively), their effect is
|
||||
confined to that group; it does not extend to the surrounding pattern. This is
|
||||
not always the case in Perl. In particular, if (*THEN) is present in a group
|
||||
that is called as a subroutine, its action is limited to that group, even if
|
||||
the group does not contain any | characters. Note that such groups are
|
||||
processed as anchored at the point where they are tested.
|
||||
.P
|
||||
10. If a pattern contains more than one backtracking control verb, the first
|
||||
11. If a pattern contains more than one backtracking control verb, the first
|
||||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
.P
|
||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||
not confined to the assertion.
|
||||
.P
|
||||
12. There are some differences that are concerned with the settings of captured
|
||||
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||
|
@ -104,7 +116,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
|||
13. PCRE2's handling of duplicate capture group numbers and names is not as
|
||||
general as Perl's. This is a consequence of the fact the PCRE2 works internally
|
||||
just with numbers, using an external table to translate between numbers and
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B), where the two
|
||||
names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two
|
||||
capture groups have the same number but different names, is not supported, and
|
||||
causes an error at compile time. If it were allowed, it would not be possible
|
||||
to distinguish which group matched, because both names map to capture group
|
||||
|
@ -124,17 +136,24 @@ certainly user mistakes.
|
|||
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||
always matches an upper case letter. I think Perl has changed in this respect;
|
||||
in the release at the time of writing (5.24), \ep{Lu} and \ep{Ll} match all
|
||||
in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all
|
||||
letters, regardless of case, when case independence is specified.
|
||||
.P
|
||||
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 includes new features that are not in earlier versions of Perl, some
|
||||
17. From release 5.32.0, Perl locks out the use of \eK in lookaround
|
||||
assertions. From release 10.38 PCRE2 does the same by default. However, there
|
||||
is an option for re-enabling the previous behaviour. When this option is set,
|
||||
\eK is acted on when it occurs in positive assertions, but is ignored in
|
||||
negative assertions.
|
||||
.P
|
||||
18. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||
Perl 5.10 included new features that were not in earlier versions of Perl, some
|
||||
of which (such as named parentheses) were in PCRE2 for some time before. This
|
||||
list is with respect to Perl 5.26:
|
||||
list is with respect to Perl 5.34:
|
||||
.sp
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||
each alternative branch of a lookbehind assertion can match a different length
|
||||
of string. Perl requires them all to have the same length.
|
||||
each alternative toplevel branch of a lookbehind assertion can match a
|
||||
different length of string. Perl used to require them all to have the same
|
||||
length, but the latest version has some variable length support.
|
||||
.sp
|
||||
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
|
@ -168,14 +187,18 @@ variable interpolation, but not general hooks on every match.
|
|||
different way and is not Perl-compatible.
|
||||
.sp
|
||||
(l) PCRE2 recognizes some special sequences such as (*CR) or (*NO_JIT) at
|
||||
the start of a pattern that set overall options that cannot be changed within
|
||||
the start of a pattern. These set overall options that cannot be changed within
|
||||
the pattern.
|
||||
.sp
|
||||
(m) PCRE2 supports non-atomic positive lookaround assertions. This is an
|
||||
extension to the lookaround facilities. The default, Perl-compatible
|
||||
lookarounds are atomic.
|
||||
.P
|
||||
18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa
|
||||
modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode
|
||||
rules. This separation cannot be represented with PCRE2_UCP.
|
||||
.P
|
||||
19. Perl has different limits than PCRE2. See the
|
||||
20. Perl has different limits than PCRE2. See the
|
||||
.\" HREF
|
||||
\fBpcre2limit\fP
|
||||
.\"
|
||||
|
@ -190,7 +213,7 @@ fall into any stack-overflow limit. PCRE2 made a similar change at release
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -199,6 +222,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 12 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -116,8 +116,8 @@ permitted to match separator characters, but the double-star (**) feature
|
|||
(which does match separators) is supported.
|
||||
.P
|
||||
PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR matches globs with wildcards allowed to
|
||||
match separator characters. PCRE2_GLOB_NO_STARSTAR matches globs with the
|
||||
double-star feature disabled. These options may be given together.
|
||||
match separator characters. PCRE2_CONVERT_GLOB_NO_STARSTAR matches globs with
|
||||
the double-star feature disabled. These options may be given together.
|
||||
.
|
||||
.
|
||||
.SH "CONVERTING POSIX PATTERNS"
|
||||
|
|
|
@ -104,12 +104,11 @@ uint32_t newline;
|
|||
|
||||
PCRE2_SIZE erroroffset;
|
||||
PCRE2_SIZE *ovector;
|
||||
PCRE2_SIZE subject_length;
|
||||
|
||||
size_t subject_length;
|
||||
pcre2_match_data *match_data;
|
||||
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
|
@ -138,12 +137,14 @@ if (argc - i != 2)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* As pattern and subject are char arguments, they can be straightforwardly
|
||||
cast to PCRE2_SPTR as we are working in 8-bit code units. */
|
||||
/* Pattern and subject are char arguments, so they can be straightforwardly
|
||||
cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
|
||||
length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
|
||||
defined to be size_t. */
|
||||
|
||||
pattern = (PCRE2_SPTR)argv[i];
|
||||
subject = (PCRE2_SPTR)argv[i+1];
|
||||
subject_length = strlen((char *)subject);
|
||||
subject_length = (PCRE2_SIZE)strlen((char *)subject);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -172,17 +173,22 @@ if (re == NULL)
|
|||
|
||||
|
||||
/*************************************************************************
|
||||
* If the compilation succeeded, we call PCRE again, in order to do a *
|
||||
* If the compilation succeeded, we call PCRE2 again, in order to do a *
|
||||
* pattern match against the subject string. This does just ONE match. If *
|
||||
* further matching is needed, it will be done below. Before running the *
|
||||
* match we must set up a match_data block for holding the result. *
|
||||
* match we must set up a match_data block for holding the result. Using *
|
||||
* pcre2_match_data_create_from_pattern() ensures that the block is *
|
||||
* exactly the right size for the number of capturing parentheses in the *
|
||||
* pattern. If you need to know the actual size of a match_data block as *
|
||||
* a number of bytes, you can find it like this: *
|
||||
* *
|
||||
* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); *
|
||||
*************************************************************************/
|
||||
|
||||
/* Using this function ensures that the block is exactly the right size for
|
||||
the number of capturing parentheses in the pattern. */
|
||||
|
||||
match_data = pcre2_match_data_create_from_pattern(re, NULL);
|
||||
|
||||
/* Now run the match. */
|
||||
|
||||
rc = pcre2_match(
|
||||
re, /* the compiled pattern */
|
||||
subject, /* the subject string */
|
||||
|
@ -205,12 +211,12 @@ if (rc < 0)
|
|||
default: printf("Matching error %d\en", rc); break;
|
||||
}
|
||||
pcre2_match_data_free(match_data); /* Release memory used for the match */
|
||||
pcre2_code_free(re); /* data and the compiled pattern. */
|
||||
pcre2_code_free(re); /* data and the compiled pattern. */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded. Get a pointer to the output vector, where string offsets are
|
||||
stored. */
|
||||
/* Match succeeded. Get a pointer to the output vector, where string offsets
|
||||
are stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||
|
@ -228,9 +234,12 @@ pcre2_match_data_create_from_pattern() above. */
|
|||
if (rc == 0)
|
||||
printf("ovector was not big enough for all the captured substrings\en");
|
||||
|
||||
/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion
|
||||
to set the start of a match later than its end. In this demonstration program,
|
||||
we just detect this case and give up. */
|
||||
/* Since release 10.38 PCRE2 has locked out the use of \eK in lookaround
|
||||
assertions. However, there is an option to re-enable the old behaviour. If that
|
||||
is set, it is possible to run patterns such as /(?=.\eK)/ that use \eK in an
|
||||
assertion to set the start of a match later than its end. In this demonstration
|
||||
program, we show how to detect this case, but it shouldn't arise because the
|
||||
option is never set. */
|
||||
|
||||
if (ovector[0] > ovector[1])
|
||||
{
|
||||
|
@ -249,7 +258,7 @@ application you might want to do things other than print them. */
|
|||
for (i = 0; i < rc; i++)
|
||||
{
|
||||
PCRE2_SPTR substring_start = subject + ovector[2*i];
|
||||
size_t substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
|
||||
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
|
@ -447,7 +456,7 @@ for (;;)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/* Match succeded */
|
||||
/* Match succeeded */
|
||||
|
||||
printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
|
||||
|
||||
|
|
420
doc/pcre2grep.1
420
doc/pcre2grep.1
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "24 November 2018" "PCRE2 10.33"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -79,8 +81,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
.P
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
can affect the output when one of the above options is used. This is no longer
|
||||
|
@ -115,11 +117,10 @@ ignored.
|
|||
.rs
|
||||
.sp
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as "nul", that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the \fB--binary-files\fP option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the \fB--binary-files\fP
|
||||
option for a means of changing the way binary files are handled.
|
||||
.
|
||||
.
|
||||
.SH "BINARY ZEROS IN PATTERNS"
|
||||
|
@ -150,22 +151,30 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
\fB--binary-files\fP=\fItext\fP.
|
||||
.TP
|
||||
\fB--allow-lookaround-bsk\fP
|
||||
PCRE2 now forbids the use of \eK in lookarounds by default, in line with Perl.
|
||||
This option causes \fBpcre2grep\fP to set the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
|
||||
option, which enables this somewhat dangerous usage.
|
||||
.TP
|
||||
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||
Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -352,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -383,8 +394,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
.TP
|
||||
\fB--include\fP=\fIpattern\fP
|
||||
If any \fB--include\fP patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude\fP pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
\fB--exclude\fP pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -401,8 +412,8 @@ may be given any number of times; all the files are read.
|
|||
.TP
|
||||
\fB--include-dir\fP=\fIpattern\fP
|
||||
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The \fB-F\fP,
|
||||
|
@ -413,18 +424,21 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches. This
|
||||
opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options.
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB--label\fP=\fIname\fP
|
||||
This option supplies a name to be used for the standard input when file names
|
||||
|
@ -435,8 +449,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
||||
|
@ -459,40 +473,6 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
|||
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||
used. There is no short form for this option.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
.sp
|
||||
The \fB--match-limit\fP option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than \fB--match-limit\fP.
|
||||
.sp
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
.TP
|
||||
\fB--max-buffer-size=\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
|
@ -520,27 +500,74 @@ well as possibly handling a two-character newline sequence.
|
|||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||
does not work when input is read line by line (see \fP--line-buffered\fP.)
|
||||
does not work when input is read line by line (see \fB--line-buffered\fP.)
|
||||
.TP
|
||||
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
|
||||
Stop processing after finding \fInumber\fP matching lines, or non-matching
|
||||
lines if \fB-v\fP is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If \fB-c\fP is also set, the count that is output is never greater than
|
||||
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
|
||||
\fB-q\fP, or when just checking for a match in a binary file.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
memory. There are three options that set resource limits for matching.
|
||||
.sp
|
||||
The \fB--match-limit\fP option provides a means of limiting computing resource
|
||||
usage when processing patterns that are not going to match, but which have a
|
||||
very large number of possibilities in their search trees. The classic example
|
||||
is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
|
||||
counter that is incremented each time around its main processing loop. If the
|
||||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
needed for each backtracking point depends on the number of capturing
|
||||
parentheses in the pattern, so the amount of memory that is used before this
|
||||
limit acts varies from pattern to pattern. This limit is of use only if it is
|
||||
set smaller than \fB--match-limit\fP.
|
||||
.sp
|
||||
There are no short forms for these options. The default limits can be set
|
||||
when the PCRE2 library is compiled; if they are not specified, the defaults
|
||||
are very large and so effectively unlimited.
|
||||
.TP
|
||||
\fB--max-buffer-size\fP=\fInumber\fP
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||
The PCRE2 library supports five different conventions for indicating
|
||||
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||
which recognizes any of the preceding three types, and an "any" convention, in
|
||||
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||
PS (paragraph separator, U+2029).
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
.sp
|
||||
pcre2grep -N CRLF 'some pattern' <file>
|
||||
.sp
|
||||
The newline type may be specified in upper, lower, or mixed case. If the
|
||||
newline type is NUL, lines are separated by binary zero characters. The other
|
||||
types are the single-character sequences CR (carriage return) and LF
|
||||
(linefeed), the two-character sequence CRLF, an "anycrlf" type, which
|
||||
recognizes any of the preceding three types, and an "any" type, for which any
|
||||
Unicode line ending sequence is assumed to end a line. The Unicode sequences
|
||||
are the three just mentioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and PS
|
||||
(paragraph separator, U+2029).
|
||||
.sp
|
||||
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||
This is normally the standard sequence for the operating system. Unless
|
||||
otherwise specified by this option, \fBpcre2grep\fP uses the library's default.
|
||||
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||
makes it possible to use \fBpcre2grep\fP to scan files that have come from
|
||||
other environments without having to modify their line endings. If the data
|
||||
that is being scanned does not agree with the convention set by this option,
|
||||
\fBpcre2grep\fP may behave in strange ways. Note that this option does not
|
||||
apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||
.sp
|
||||
This option makes it possible to use \fBpcre2grep\fP to scan files that have
|
||||
come from other environments without having to modify their line endings. If
|
||||
the data that is being scanned does not agree with the convention set by this
|
||||
option, \fBpcre2grep\fP may behave in strange ways. Note that this option does
|
||||
not apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options, which are expected to use the operating system's
|
||||
standard newline sequence.
|
||||
.TP
|
||||
|
@ -559,25 +586,36 @@ use of JIT at run time. It is provided for testing and working round problems.
|
|||
It should never be needed in normal use.
|
||||
.TP
|
||||
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text. This option is mutually exclusive with
|
||||
\fB--only-matching\fP, \fB--file-offsets\fP, and \fB--line-offsets\fP. Escape
|
||||
sequences starting with a dollar character may be used to insert the contents
|
||||
of the matched part of the line and/or captured substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
|
||||
and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
|
||||
this option, which is mutually exclusive with \fB--only-matching\fP,
|
||||
\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
|
||||
\fB--only-matching\fP, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
.sp
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
.sp
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
.sp
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
.sp
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar.
|
||||
|
@ -596,19 +634,29 @@ otherwise empty line. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||||
Show only the part of the line that matched the capturing parentheses of the
|
||||
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||
equivalent to \fB-o\fP without a number. Because these options can be given
|
||||
without an argument (see above), if an argument is present, it must be given in
|
||||
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||
for the non-argument case above also apply to this option. If the specified
|
||||
capturing parentheses do not exist in the pattern, or were not set in the
|
||||
match, nothing is output unless the file name or line number are being output.
|
||||
given number. Up to 50 capturing parentheses are supported by default. This
|
||||
limit can be changed via the \fB--om-capture\fP option. A pattern may contain
|
||||
any number of capturing parentheses, but only those whose number is within the
|
||||
limit can be accessed by \fB-o\fP. An error occurs if the number specified by
|
||||
\fB-o\fP is greater than the limit.
|
||||
.sp
|
||||
-o0 is the same as \fB-o\fP without a number. Because these options can be
|
||||
given without an argument (see above), if an argument is present, it must be
|
||||
given in the same shell item, for example, -o3 or --only-matching=2. The
|
||||
comments given for the non-argument case above also apply to this option. If
|
||||
the specified capturing parentheses do not exist in the pattern, or were not
|
||||
set in the match, nothing is output unless the file name or line number are
|
||||
being output.
|
||||
.sp
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
option).
|
||||
but one option).
|
||||
.TP
|
||||
\fB--om-capture\fP=\fInumber\fP
|
||||
Set the number of capturing parentheses that can be accessed by \fB-o\fP. The
|
||||
default is 50.
|
||||
.TP
|
||||
\fB--om-separator\fP=\fItext\fP
|
||||
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||||
|
@ -626,7 +674,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
|||
option to "recurse".
|
||||
.TP
|
||||
\fB--recursion-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP above.
|
||||
This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
|
||||
above for details.
|
||||
.TP
|
||||
\fB-s\fP, \fB--no-messages\fP
|
||||
Suppress error messages about non-existent or unreadable files. Such files are
|
||||
|
@ -644,11 +693,24 @@ is listed. If file names are being output, the grand total is preceded by
|
|||
ignored when used with \fB-L\fP (list files without matches), because the grand
|
||||
total would always be zero.
|
||||
.TP
|
||||
\fB-u\fP, \fB--utf-8\fP
|
||||
\fB-u\fP, \fB--utf\fP
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||
\fB--include\fP options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
\fB--include\fP options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
.TP
|
||||
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP(3)
|
||||
.\"
|
||||
documentation.
|
||||
.TP
|
||||
\fB-V\fP, \fB--version\fP
|
||||
Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
|
||||
|
@ -657,7 +719,9 @@ ignored.
|
|||
.TP
|
||||
\fB-v\fP, \fB--invert-match\fP
|
||||
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
.TP
|
||||
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||
Force the patterns only to match "words". That is, there must be a word
|
||||
|
@ -674,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate files names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -689,16 +759,25 @@ by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default
|
|||
.rs
|
||||
.sp
|
||||
The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with
|
||||
different newline conventions from the default. Any parts of the input files
|
||||
that are written to the standard output are copied identically, with whatever
|
||||
newline sequences they have in the input. However, the setting of this option
|
||||
affects only the way scanned files are processed. It does not affect the
|
||||
interpretation of files specified by the \fB-f\fP, \fB--file-list\fP,
|
||||
\fB--exclude-from\fP, or \fB--include-from\fP options, nor does it affect the
|
||||
way in which \fBpcre2grep\fP writes informational messages to the standard
|
||||
error and output streams. For these it uses the string "\en" to indicate
|
||||
newlines, relying on the C I/O library to convert this to an appropriate
|
||||
sequence.
|
||||
newline conventions that differ from the default. This option affects only the
|
||||
way scanned files are processed. It does not affect the interpretation of files
|
||||
specified by the \fB-f\fP, \fB--file-list\fP, \fB--exclude-from\fP, or
|
||||
\fB--include-from\fP options.
|
||||
.P
|
||||
Any parts of the scanned input files that are written to the standard output
|
||||
are copied with whatever newline sequences they have in the input. However, if
|
||||
the final line of a file is output, and it does not end with a newline
|
||||
sequence, a newline sequence is added. If the newline setting is CR, LF, CRLF
|
||||
or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a
|
||||
single NL is used.
|
||||
.P
|
||||
The newline setting does not affect the way in which \fBpcre2grep\fP writes
|
||||
newlines in informational messages to the standard output and error streams.
|
||||
Under Windows, the standard output is set to be binary, so that "\er\en" at the
|
||||
ends of output lines that are copied from the input is not converted to
|
||||
"\er\er\en" by the C I/O library. This means that any messages written to the
|
||||
standard output must end with "\er\en". For all other operating systems, and
|
||||
for all messages to the standard error stream, "\en" is used.
|
||||
.
|
||||
.
|
||||
.SH "OPTIONS COMPATIBILITY"
|
||||
|
@ -711,9 +790,9 @@ as in the GNU \fBgrep\fP program. Any long option of the form
|
|||
\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
|
||||
\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
|
||||
\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
|
||||
\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
|
||||
\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
|
||||
capturing parentheses number.
|
||||
\fB--output\fP, \fB-u\fP, \fB--utf\fP, \fB-U\fP, and \fB--utf-allow-invalid\fP
|
||||
options are specific to \fBpcre2grep\fP, as is the use of the
|
||||
\fB--only-matching\fP option with a capturing parentheses number.
|
||||
.P
|
||||
Although most of the common options work the same way, a few are different in
|
||||
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
|
||||
|
@ -775,12 +854,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
|
|||
only callouts with string arguments are useful.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) causes the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
.sp
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
.sp
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
.
|
||||
.
|
||||
.SS "Calling external programs or scripts"
|
||||
.rs
|
||||
.sp
|
||||
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
||||
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
||||
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
|
||||
where \fBlib$spawn()\fP is used, and for any Unix-like environment where
|
||||
\fBfork()\fP and \fBexecv()\fP are available.
|
||||
.P
|
||||
If the callout string does not start with a pipe (vertical bar) character, it
|
||||
|
@ -791,13 +894,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
.sp
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
.P
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the \fB--output\fP
|
||||
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
.sp
|
||||
echo -e "abcde\en12345" | pcre2grep \e
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -810,28 +911,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
.sp
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when \fBpcre2grep\fP was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) causes the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING ERRORS"
|
||||
|
@ -867,7 +954,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
|
||||
\fBpcre2unicode\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -875,7 +963,7 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -884,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 November 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2JIT 3 "06 March 2019" "PCRE2 10.33"
|
||||
.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT"
|
||||
|
@ -29,6 +29,7 @@ platforms:
|
|||
.sp
|
||||
ARM 32-bit (v5, v7, and Thumb2)
|
||||
ARM 64-bit
|
||||
IBM s390x 64 bit
|
||||
Intel x86 32-bit and 64-bit
|
||||
MIPS 32-bit and 64-bit
|
||||
Power PC 32-bit and 64-bit
|
||||
|
@ -64,7 +65,7 @@ or a negative error code.
|
|||
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||
of machine stack that it uses. The exact rules are not documented because they
|
||||
may change at any time, in particular, when new optimizations are introduced.
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fB returns
|
||||
If a pattern is too big, a call to \fBpcre2_jit_compile()\fP returns
|
||||
PCRE2_ERROR_NOMEMORY.
|
||||
.P
|
||||
PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
|
||||
|
@ -123,23 +124,29 @@ pattern.
|
|||
.SH "MATCHING SUBJECTS CONTAINING INVALID UTF"
|
||||
.rs
|
||||
.sp
|
||||
When a pattern is compiled with the PCRE2_UTF option, the interpretive matching
|
||||
function expects its subject string to be a valid sequence of UTF code units.
|
||||
If it is not, the result is undefined. This is also true by default of matching
|
||||
via JIT. However, if the option PCRE2_JIT_INVALID_UTF is passed to
|
||||
\fBpcre2_jit_compile()\fP, code that can process a subject containing invalid
|
||||
UTF is compiled.
|
||||
When a pattern is compiled with the PCRE2_UTF option, subject strings are
|
||||
normally expected to be a valid sequence of UTF code units. By default, this is
|
||||
checked at the start of matching and an error is generated if invalid UTF is
|
||||
detected. The PCRE2_NO_UTF_CHECK option can be passed to \fBpcre2_match()\fP to
|
||||
skip the check (for improved performance) if you are sure that a subject string
|
||||
is valid. If this option is used with an invalid string, the result is
|
||||
undefined.
|
||||
.P
|
||||
In this mode, an invalid code unit sequence never matches any pattern item. It
|
||||
does not match dot, it does not match \ep{Any}, it does not even match negative
|
||||
items such as [^X]. A lookbehind assertion fails if it encounters an invalid
|
||||
sequence while moving the current point backwards. In other words, an invalid
|
||||
UTF code unit sequence acts as a barrier which no match can cross. Reaching an
|
||||
invalid sequence causes an immediate backtrack.
|
||||
However, a way of running matches on strings that may contain invalid UTF
|
||||
sequences is available. Calling \fBpcre2_compile()\fP with the
|
||||
PCRE2_MATCH_INVALID_UTF option has two effects: it tells the interpreter in
|
||||
\fBpcre2_match()\fP to support invalid UTF, and, if \fBpcre2_jit_compile()\fP
|
||||
is called, the compiled JIT code also supports invalid UTF. Details of how this
|
||||
support works, in both the JIT and the interpretive cases, is given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
documentation.
|
||||
.P
|
||||
Using this option, an application can run matches in arbitrary data, knowing
|
||||
that any matched strings that are returned will be valid UTF. This can be
|
||||
useful when searching for text in executable or other binary files.
|
||||
There is also an obsolete option for \fBpcre2_jit_compile()\fP called
|
||||
PCRE2_JIT_INVALID_UTF, which currently exists only for backward compatibility.
|
||||
It is superseded by the \fBpcre2_compile()\fP option PCRE2_MATCH_INVALID_UTF
|
||||
and should no longer be used. It may be removed in future.
|
||||
.
|
||||
.
|
||||
.SH "UNSUPPORTED OPTIONS AND PATTERN ITEMS"
|
||||
|
@ -244,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function
|
|||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
.P
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
In a multithread application, if you do not specify a JIT stack, or if you
|
||||
assign or pass back NULL from a callback, that is thread-safe, because each
|
||||
thread has its own machine stack. However, if you assign or pass back a
|
||||
non-NULL JIT stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
.P
|
||||
Strictly speaking, even more is allowed. You can assign the same non-NULL stack
|
||||
to a match context that is used by any number of patterns, as long as they are
|
||||
|
@ -260,7 +267,7 @@ inefficient solution, and not recommended.
|
|||
This is a suggestion for how a multithreaded program that needs to set up
|
||||
non-default JIT stacks might operate:
|
||||
.sp
|
||||
During thread initalization
|
||||
During thread initialization
|
||||
thread_local_var = pcre2_jit_stack_create(...)
|
||||
.sp
|
||||
During thread exit
|
||||
|
@ -309,12 +316,12 @@ stack through the JIT callback function.
|
|||
You can free a JIT stack at any time, as long as it will not be used by
|
||||
\fBpcre2_match()\fP again. When you assign the stack to a match context, only a
|
||||
pointer is set. There is no reference counting or any other magic. You can free
|
||||
compiled patterns, contexts, and stacks in any order, anytime. Just \fIdo
|
||||
not\fP call \fBpcre2_match()\fP with a match context pointing to an already
|
||||
freed stack, as that will cause SEGFAULT. (Also, do not free a stack currently
|
||||
used by \fBpcre2_match()\fP in another thread). You can also replace the stack
|
||||
in a context at any time when it is not in use. You should free the previous
|
||||
stack before assigning a replacement.
|
||||
compiled patterns, contexts, and stacks in any order, anytime.
|
||||
Just \fIdo not\fP call \fBpcre2_match()\fP with a match context pointing to an
|
||||
already freed stack, as that will cause SEGFAULT. (Also, do not free a stack
|
||||
currently used by \fBpcre2_match()\fP in another thread). You can also replace
|
||||
the stack in a context at any time when it is not in use. You should free the
|
||||
previous stack before assigning a replacement.
|
||||
.P
|
||||
(5) Should I allocate/free a stack every time before/after calling
|
||||
\fBpcre2_match()\fP?
|
||||
|
@ -348,8 +355,8 @@ out this complicated API.
|
|||
.B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP);
|
||||
.fi
|
||||
.P
|
||||
The JIT executable allocator does not free all memory when it is possible.
|
||||
It expects new allocations, and keeps some free memory around to improve
|
||||
The JIT executable allocator does not free all memory when it is possible. It
|
||||
expects new allocations, and keeps some free memory around to improve
|
||||
allocation speed. However, in low memory conditions, it might be better to free
|
||||
all possible memory. You can cause this to happen by calling
|
||||
pcre2_jit_free_unused_memory(). Its argument is a general context, for custom
|
||||
|
@ -409,10 +416,10 @@ that was not compiled.
|
|||
.P
|
||||
When you call \fBpcre2_match()\fP, as well as testing for invalid options, a
|
||||
number of other sanity checks are performed on the arguments. For example, if
|
||||
the subject pointer is NULL, an immediate error is given. Also, unless
|
||||
PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the
|
||||
interests of speed, these checks do not happen on the JIT fast path, and if
|
||||
invalid data is passed, the result is undefined.
|
||||
the subject pointer is NULL but the length is non-zero, an immediate error is
|
||||
given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested
|
||||
for validity. In the interests of speed, these checks do not happen on the JIT
|
||||
fast path, and if invalid data is passed, the result is undefined.
|
||||
.P
|
||||
Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give
|
||||
speedups of more than 10%.
|
||||
|
@ -438,6 +445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 06 March 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 30 November 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2LIMITS 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2LIMITS 3 "26 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SIZE AND OTHER LIMITATIONS"
|
||||
|
@ -51,6 +51,10 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
.P
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
.P
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -58,7 +62,7 @@ The maximum length of a string argument to a callout is the largest number a
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -67,6 +71,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2MATCHING 3 "10 October 2018" "PCRE2 10.33"
|
||||
.TH PCRE2MATCHING 3 "28 August 2021" "PCRE2 10.38"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 MATCHING ALGORITHMS"
|
||||
|
@ -61,8 +61,9 @@ tried is controlled by the greedy or ungreedy nature of the quantifier.
|
|||
If a leaf node is reached, a matching string has been found, and at that point
|
||||
the algorithm stops. Thus, if there is more than one possible match, this
|
||||
algorithm returns the first one that it finds. Whether this is the shortest,
|
||||
the longest, or some intermediate length depends on the way the greedy and
|
||||
ungreedy repetition quantifiers are specified in the pattern.
|
||||
the longest, or some intermediate length depends on the way the alternations
|
||||
and the greedy or ungreedy repetition quantifiers are specified in the
|
||||
pattern.
|
||||
.P
|
||||
Because it ends up with a single path through the tree, it is relatively
|
||||
straightforward for this algorithm to keep track of the substrings that are
|
||||
|
@ -91,10 +92,15 @@ no more unterminated paths. At this point, terminated paths represent the
|
|||
different matching possibilities (if there are none, the match has failed).
|
||||
Thus, if there is more than one possible match, this algorithm finds all of
|
||||
them, and in particular, it finds the longest. The matches are returned in
|
||||
decreasing order of length. There is an option to stop the algorithm after the
|
||||
first match (which is necessarily the shortest) is found.
|
||||
the output vector in decreasing order of length. There is an option to stop the
|
||||
algorithm after the first match (which is necessarily the shortest) is found.
|
||||
.P
|
||||
Note that all the matches that are found start at the same point in the
|
||||
Note that the size of vector needed to contain all the results depends on the
|
||||
number of simultaneous matches, not on the number of parentheses in the
|
||||
pattern. Using \fBpcre2_match_data_create_from_pattern()\fP to create the match
|
||||
data block is therefore not advisable when doing DFA matching.
|
||||
.P
|
||||
Note also that all the matches that are found start at the same point in the
|
||||
subject. If the pattern
|
||||
.sp
|
||||
cat(er(pillar)?)?
|
||||
|
@ -157,24 +163,21 @@ code unit) at a time, for all active paths through the tree.
|
|||
.P
|
||||
9. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
|
||||
supported. (*FAIL) is supported, and behaves like a failing negative assertion.
|
||||
.P
|
||||
10. The PCRE2_MATCH_INVALID_UTF option for \fBpcre2_compile()\fP is not
|
||||
supported by \fBpcre2_dfa_match()\fP.
|
||||
.
|
||||
.
|
||||
.SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
|
||||
.rs
|
||||
.sp
|
||||
Using the alternative matching algorithm provides the following advantages:
|
||||
The main advantage of the alternative algorithm is that all possible matches
|
||||
(at a single point in the subject) are automatically found, and in particular,
|
||||
the longest match is found. To find more than one match at the same point using
|
||||
the standard algorithm, you have to do kludgy things with callouts.
|
||||
.P
|
||||
1. All possible matches (at a single point in the subject) are automatically
|
||||
found, and in particular, the longest match is found. To find more than one
|
||||
match using the standard algorithm, you have to do kludgy things with
|
||||
callouts.
|
||||
.P
|
||||
2. Because the alternative algorithm scans the subject string just once, and
|
||||
never needs to backtrack (except for lookbehinds), it is possible to pass very
|
||||
long subject strings to the matching function in several pieces, checking for
|
||||
partial matching each time. Although it is also possible to do multi-segment
|
||||
matching using the standard algorithm, by retaining partially matched
|
||||
substrings, it is more complicated. The
|
||||
Partial matching is possible with this algorithm, though it has some
|
||||
limitations. The
|
||||
.\" HREF
|
||||
\fBpcre2partial\fP
|
||||
.\"
|
||||
|
@ -191,10 +194,13 @@ The alternative algorithm suffers from a number of disadvantages:
|
|||
because it has to search for all possible matches, but is also because it is
|
||||
less susceptible to optimization.
|
||||
.P
|
||||
2. Capturing parentheses, backreferences, and script runs are not supported.
|
||||
2. Capturing parentheses, backreferences, script runs, and matching within
|
||||
invalid UTF string are not supported.
|
||||
.P
|
||||
3. Although atomic groups are supported, their use does not provide the
|
||||
performance advantage that it does for the standard algorithm.
|
||||
.P
|
||||
4. JIT optimization is not supported.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -202,7 +208,7 @@ performance advantage that it does for the standard algorithm.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -211,6 +217,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 10 October 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
Last updated: 28 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,67 +1,107 @@
|
|||
.TH PCRE2PARTIAL 3 "22 December 2014" "PCRE2 10.00"
|
||||
.TH PCRE2PARTIAL 3 "04 September 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions
|
||||
.SH "PARTIAL MATCHING IN PCRE2"
|
||||
.rs
|
||||
.sp
|
||||
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||
function matches as far as it goes, but is too short to match the entire
|
||||
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||
might be helpful to distinguish this case from other cases in which there is no
|
||||
match.
|
||||
In normal use of PCRE2, if there is a match up to the end of a subject string,
|
||||
but more characters are needed to match the entire pattern, PCRE2_ERROR_NOMATCH
|
||||
is returned, just like any other failing match. There are circumstances where
|
||||
it might be helpful to distinguish this "partial match" case.
|
||||
.P
|
||||
Consider, for example, an application where a human is required to type in data
|
||||
for a field with specific formatting requirements. An example might be a date
|
||||
in the form \fIddmmmyy\fP, defined by this pattern:
|
||||
.sp
|
||||
^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
|
||||
.sp
|
||||
If the application sees the user's keystrokes one by one, and can check that
|
||||
what has been typed so far is potentially valid, it is able to raise an error
|
||||
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||
has been typed, for example. This immediate feedback is likely to be a better
|
||||
user interface than a check that is delayed until the entire string has been
|
||||
entered. Partial matching can also be useful when the subject string is very
|
||||
long and is not all available at once.
|
||||
One example is an application where the subject string is very long, and not
|
||||
all available at once. The requirement here is to be able to do the matching
|
||||
segment by segment, but special action is needed when a matched substring spans
|
||||
the boundary between two segments.
|
||||
.P
|
||||
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||
The difference between the two options is whether or not a partial match is
|
||||
preferred to an alternative complete match, though the details differ between
|
||||
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||
takes precedence.
|
||||
Another example is checking a user input string as it is typed, to ensure that
|
||||
it conforms to a required format. Invalid characters can be immediately
|
||||
diagnosed and rejected, giving instant feedback.
|
||||
.P
|
||||
If you want to use partial matching with just-in-time optimized code, you must
|
||||
call \fBpcre2_jit_compile()\fP with one or both of these options:
|
||||
Partial matching is a PCRE2-specific feature; it is not Perl-compatible. It is
|
||||
requested by setting one of the PCRE2_PARTIAL_HARD or PCRE2_PARTIAL_SOFT
|
||||
options when calling a matching function. The difference between the two
|
||||
options is whether or not a partial match is preferred to an alternative
|
||||
complete match, though the details differ between the two types of matching
|
||||
function. If both options are set, PCRE2_PARTIAL_HARD takes precedence.
|
||||
.P
|
||||
If you want to use partial matching with just-in-time optimized code, as well
|
||||
as setting a partial match option for the matching function, you must also call
|
||||
\fBpcre2_jit_compile()\fP with one or both of these options:
|
||||
.sp
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
PCRE2_JIT_PARTIAL_HARD
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
.sp
|
||||
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||
interpretive matching code is used.
|
||||
matches on the same pattern. Separate code is compiled for each mode. If the
|
||||
appropriate JIT mode has not been compiled, interpretive matching code is used.
|
||||
.P
|
||||
Setting a partial matching option disables two of PCRE2's standard
|
||||
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||
abandons matching immediately if it is not present in the subject string. This
|
||||
optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||
optimization hints. PCRE2 remembers the last literal code unit in a pattern,
|
||||
and abandons matching immediately if it is not present in the subject string.
|
||||
This optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also remembers a minimum length of a matching string, and does
|
||||
not bother to run the matching function on shorter strings. This optimization
|
||||
is also disabled for partial matching.
|
||||
.
|
||||
.
|
||||
.SH "REQUIREMENTS FOR A PARTIAL MATCH"
|
||||
.rs
|
||||
.sp
|
||||
A possible partial match occurs during matching when the end of the subject
|
||||
string is reached successfully, but either more characters are needed to
|
||||
complete the match, or the addition of more characters might change what is
|
||||
matched.
|
||||
.P
|
||||
Example 1: if the pattern is /abc/ and the subject is "ab", more characters are
|
||||
definitely needed to complete a match. In this case both hard and soft matching
|
||||
options yield a partial match.
|
||||
.P
|
||||
Example 2: if the pattern is /ab+/ and the subject is "ab", a complete match
|
||||
can be found, but the addition of more characters might change what is
|
||||
matched. In this case, only PCRE2_PARTIAL_HARD returns a partial match;
|
||||
PCRE2_PARTIAL_SOFT returns the complete match.
|
||||
.P
|
||||
On reaching the end of the subject, when PCRE2_PARTIAL_HARD is set, if the next
|
||||
pattern item is \ez, \eZ, \eb, \eB, or $ there is always a partial match.
|
||||
Otherwise, for both options, the next pattern item must be one that inspects a
|
||||
character, and at least one of the following must be true:
|
||||
.P
|
||||
(1) At least one character has already been inspected. An inspected character
|
||||
need not form part of the final matched string; lookbehind assertions and the
|
||||
\eK escape sequence provide ways of inspecting characters before the start of a
|
||||
matched string.
|
||||
.P
|
||||
(2) The pattern contains one or more lookbehind assertions. This condition
|
||||
exists in case there is a lookbehind that inspects characters before the start
|
||||
of the match.
|
||||
.P
|
||||
(3) There is a special case when the whole pattern can match an empty string.
|
||||
When the starting point is at the end of the subject, the empty string match is
|
||||
a possibility, and if PCRE2_PARTIAL_SOFT is set and neither of the above
|
||||
conditions is true, it is returned. However, because adding more characters
|
||||
might result in a non-empty match, PCRE2_PARTIAL_HARD returns a partial match,
|
||||
which in this case means "there is going to be a match at this point, but until
|
||||
some more characters are added, we do not know if it will be an empty string or
|
||||
something longer".
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING USING pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
A partial match occurs during a call to \fBpcre2_match()\fP when the end of the
|
||||
subject string is reached successfully, but matching cannot continue because
|
||||
more characters are needed. However, at least one character in the subject must
|
||||
have been inspected. This character need not form part of the final matched
|
||||
string; lookbehind assertions and the \eK escape sequence provide ways of
|
||||
inspecting characters before the start of a matched string. The requirement for
|
||||
inspecting at least one character exists because an empty string can always be
|
||||
matched; without such a restriction there would always be a partial match of an
|
||||
empty string at the end of the subject.
|
||||
When a partial matching option is set, the result of calling
|
||||
\fBpcre2_match()\fP can be one of the following:
|
||||
.TP 2
|
||||
\fBA successful match\fP
|
||||
A complete match has been found, starting and ending within this subject.
|
||||
.TP
|
||||
\fBPCRE2_ERROR_NOMATCH\fP
|
||||
No match can start anywhere in this subject.
|
||||
.TP
|
||||
\fBPCRE2_ERROR_PARTIAL\fP
|
||||
Adding more characters may result in a complete match that uses one or more
|
||||
characters from the end of this subject.
|
||||
.P
|
||||
When a partial match is returned, the first two elements in the ovector point
|
||||
to the portion of the subject that was matched, but the values in the rest of
|
||||
|
@ -77,53 +117,40 @@ is "456abc12", a partial match is found for the string "abc12", because all
|
|||
these characters are needed for a subsequent re-match with additional
|
||||
characters.
|
||||
.P
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options are set.
|
||||
.
|
||||
.
|
||||
.SS "PCRE2_PARTIAL_SOFT WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
If PCRE2_PARTIAL_SOFT is set when \fBpcre2_match()\fP identifies a partial
|
||||
match, the partial match is remembered, but matching continues as normal, and
|
||||
other alternatives in the pattern are tried. If no complete match can be found,
|
||||
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||
.P
|
||||
This option is "soft" because it prefers a complete match over a partial match.
|
||||
All the various matching items in a pattern behave as if the subject string is
|
||||
potentially complete. For example, \ez, \eZ, and $ match at the end of the
|
||||
subject, as normal, and for \eb and \eB the end of the subject is treated as a
|
||||
non-alphanumeric.
|
||||
.P
|
||||
If there is more than one partial match, the first one that was found provides
|
||||
the data that is returned. Consider this pattern:
|
||||
.sp
|
||||
/123\ew+X|dogY/
|
||||
.sp
|
||||
If this is matched against the subject string "abc123dog", both
|
||||
alternatives fail to match, but the end of the subject is reached during
|
||||
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||
identifying "123dog" as the first partial match that was found. (In this
|
||||
example, there are two partial matches, because "dog" on its own partially
|
||||
matches the second alternative.)
|
||||
If this is matched against the subject string "abc123dog", both alternatives
|
||||
fail to match, but the end of the subject is reached during matching, so
|
||||
PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9, identifying
|
||||
"123dog" as the first partial match. (In this example, there are two partial
|
||||
matches, because "dog" on its own partially matches the second alternative.)
|
||||
.
|
||||
.
|
||||
.SS "PCRE2_PARTIAL_HARD WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
If PCRE2_PARTIAL_HARD is set for \fBpcre2_match()\fP, PCRE2_ERROR_PARTIAL is
|
||||
returned as soon as a partial match is found, without continuing to search for
|
||||
possible complete matches. This option is "hard" because it prefers an earlier
|
||||
partial match over a later complete match. For this reason, the assumption is
|
||||
made that the end of the supplied subject string may not be the true end of the
|
||||
available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end
|
||||
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||
character in the subject has been inspected.
|
||||
.
|
||||
.
|
||||
.SS "Comparing hard and soft partial matching"
|
||||
.SS "How a partial match is processed by pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options is set.
|
||||
.P
|
||||
If PCRE2_PARTIAL_HARD is set, PCRE2_ERROR_PARTIAL is returned as soon as a
|
||||
partial match is found, without continuing to search for possible complete
|
||||
matches. This option is "hard" because it prefers an earlier partial match over
|
||||
a later complete match. For this reason, the assumption is made that the end of
|
||||
the supplied subject string is not the true end of the available data, which is
|
||||
why \ez, \eZ, \eb, \eB, and $ always give a partial match.
|
||||
.P
|
||||
If PCRE2_PARTIAL_SOFT is set, the partial match is remembered, but matching
|
||||
continues as normal, and other alternatives in the pattern are tried. If no
|
||||
complete match can be found, PCRE2_ERROR_PARTIAL is returned instead of
|
||||
PCRE2_ERROR_NOMATCH. This option is "soft" because it prefers a complete match
|
||||
over a partial match. All the various matching items in a pattern behave as if
|
||||
the subject string is potentially complete; \ez, \eZ, and $ match at the end of
|
||||
the subject, as normal, and for \eb and \eB the end of the subject is treated
|
||||
as a non-alphanumeric.
|
||||
.P
|
||||
The difference between the two partial matching options can be illustrated by a
|
||||
pattern such as:
|
||||
.sp
|
||||
|
@ -148,25 +175,132 @@ The second pattern will never match "dogsbody", because it will always find the
|
|||
shorter match first.
|
||||
.
|
||||
.
|
||||
.SS "Example of partial matching using pcre2test"
|
||||
.rs
|
||||
.sp
|
||||
The \fBpcre2test\fP data modifiers \fBpartial_hard\fP (or \fBph\fP) and
|
||||
\fBpartial_soft\fP (or \fBps\fP) set PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT,
|
||||
respectively, when calling \fBpcre2_match()\fP. Here is a run of
|
||||
\fBpcre2test\fP using a pattern that matches the whole subject in the form of a
|
||||
date:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 25dec3\e=ph
|
||||
Partial match: 23dec3
|
||||
data> 3ju\e=ph
|
||||
Partial match: 3ju
|
||||
data> 3juj\e=ph
|
||||
No match
|
||||
.sp
|
||||
This example gives the same results for both hard and soft partial matching
|
||||
options. Here is an example where there is a difference:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 25jun04\e=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25jun04\e=ph
|
||||
Partial match: 25jun04
|
||||
.sp
|
||||
With PCRE2_PARTIAL_SOFT, the subject is matched completely. For
|
||||
PCRE2_PARTIAL_HARD, however, the subject is assumed not to be complete, so
|
||||
there is only a partial match.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
PCRE was not originally designed with multi-segment matching in mind. However,
|
||||
over time, features (including partial matching) that make multi-segment
|
||||
matching possible have been added. A very long string can be searched segment
|
||||
by segment by calling \fBpcre2_match()\fP repeatedly, with the aim of achieving
|
||||
the same results that would happen if the entire string was available for
|
||||
searching all the time. Normally, the strings that are being sought are much
|
||||
shorter than each individual segment, and are in the middle of very long
|
||||
strings, so the pattern is normally not anchored.
|
||||
.P
|
||||
Special logic must be implemented to handle a matched substring that spans a
|
||||
segment boundary. PCRE2_PARTIAL_HARD should be used, because it returns a
|
||||
partial match at the end of a segment whenever there is the possibility of
|
||||
changing the match by adding more characters. The PCRE2_NOTBOL option should
|
||||
also be set for all but the first segment.
|
||||
.P
|
||||
When a partial match occurs, the next segment must be added to the current
|
||||
subject and the match re-run, using the \fIstartoffset\fP argument of
|
||||
\fBpcre2_match()\fP to begin at the point where the partial match started.
|
||||
For example:
|
||||
.sp
|
||||
re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
|
||||
data> ...the date is 23ja\e=ph
|
||||
Partial match: 23ja
|
||||
data> ...the date is 23jan19 and on that day...\e=offset=15
|
||||
0: 23jan19
|
||||
1: jan
|
||||
.sp
|
||||
Note the use of the \fBoffset\fP modifier to start the new match where the
|
||||
partial match was found. In this example, the next segment was added to the one
|
||||
in which the partial match was found. This is the most straightforward
|
||||
approach, typically using a memory buffer that is twice the size of each
|
||||
segment. After a partial match, the first half of the buffer is discarded, the
|
||||
second half is moved to the start of the buffer, and a new segment is added
|
||||
before repeating the match as in the example above. After a no match, the
|
||||
entire buffer can be discarded.
|
||||
.P
|
||||
If there are memory constraints, you may want to discard text that precedes a
|
||||
partial match before adding the next segment. Unfortunately, this is not at
|
||||
present straightforward. In cases such as the above, where the pattern does not
|
||||
contain any lookbehinds, it is sufficient to retain only the partially matched
|
||||
substring. However, if the pattern contains a lookbehind assertion, characters
|
||||
that precede the start of the partial match may have been inspected during the
|
||||
matching process. When \fBpcre2test\fP displays a partial match, it indicates
|
||||
these characters with '<' if the \fBallusedtext\fP modifier is set:
|
||||
.sp
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\e=ph,allusedtext
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
.sp
|
||||
However, the \fBallusedtext\fP modifier is not available for JIT matching,
|
||||
because JIT matching does not record the first (or last) consulted characters.
|
||||
For this reason, this information is not available via the API. It is therefore
|
||||
not possible in general to obtain the exact number of characters that must be
|
||||
retained in order to get the right match result. If you cannot retain the
|
||||
entire segment, you must find some heuristic way of choosing.
|
||||
.P
|
||||
If you know the approximate length of the matching substrings, you can use that
|
||||
to decide how much text to retain. The only lookbehind information that is
|
||||
currently available via the API is the length of the longest individual
|
||||
lookbehind in a pattern, but this can be misleading if there are nested
|
||||
lookbehinds. The value returned by calling \fBpcre2_pattern_info()\fP with the
|
||||
PCRE2_INFO_MAXLOOKBEHIND option is the maximum number of characters (not code
|
||||
units) that any individual lookbehind moves back when it is processed. A
|
||||
pattern such as "(?<=(?<!b)a)" has a maximum lookbehind value of one, but
|
||||
inspects two characters before its starting point.
|
||||
.P
|
||||
In a non-UTF or a 32-bit case, moving back is just a subtraction, but in
|
||||
UTF-8 or UTF-16 you have to count characters while moving back through the code
|
||||
units.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING USING pcre2_dfa_match()"
|
||||
.rs
|
||||
.sp
|
||||
The DFA functions move along the subject string character by character, without
|
||||
The DFA function moves along the subject string character by character, without
|
||||
backtracking, searching for all possible matches simultaneously. If the end of
|
||||
the subject is reached before the end of the pattern, there is the possibility
|
||||
of a partial match, again provided that at least one character has been
|
||||
inspected.
|
||||
of a partial match.
|
||||
.P
|
||||
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||
have been no complete matches. Otherwise, the complete matches are returned.
|
||||
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||
any complete matches. The portion of the string that was matched when the
|
||||
longest partial match was found is set as the first matching string.
|
||||
If PCRE2_PARTIAL_HARD is set, a partial match takes precedence over any
|
||||
complete matches. The portion of the string that was matched when the longest
|
||||
partial match was found is set as the first matching string.
|
||||
.P
|
||||
Because the DFA functions always search for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, their behaviour is
|
||||
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||
the string "dog" matched against the ungreedy pattern shown above:
|
||||
Because the DFA function always searches for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, its behaviour is
|
||||
different from the \fBpcre2_match()\fP. Consider the string "dog" matched
|
||||
against this ungreedy pattern:
|
||||
.sp
|
||||
/dog(sbody)??/
|
||||
.sp
|
||||
|
@ -175,62 +309,17 @@ Whereas the standard function stops as soon as it finds the complete match for
|
|||
returns that when PCRE2_PARTIAL_HARD is set.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
||||
.rs
|
||||
.sp
|
||||
If a pattern ends with one of sequences \eb or \eB, which test for word
|
||||
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||
results. Consider this pattern:
|
||||
.sp
|
||||
/\ebcat\eb/
|
||||
.sp
|
||||
This matches "cat", provided there is a word boundary at either end. If the
|
||||
subject string is "the cat", the comparison of the final "t" with a following
|
||||
character cannot take place, so a partial match is found. However, normal
|
||||
matching carries on, and \eb matches at the end of the subject when the last
|
||||
character is a letter, so a complete match is found. The result, therefore, is
|
||||
\fInot\fP PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||
.
|
||||
.
|
||||
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST"
|
||||
.rs
|
||||
.sp
|
||||
If the \fBpartial_soft\fP (or \fBps\fP) modifier is present on a
|
||||
\fBpcre2test\fP data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||
Here is a run of \fBpcre2test\fP that uses the date example quoted above:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 25jun04\e=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25dec3\e=ps
|
||||
Partial match: 23dec3
|
||||
data> 3ju\e=ps
|
||||
Partial match: 3ju
|
||||
data> 3juj\e=ps
|
||||
No match
|
||||
data> j\e=ps
|
||||
No match
|
||||
.sp
|
||||
The first data string is matched completely, so \fBpcre2test\fP shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. Similar output is obtained
|
||||
if DFA matching is used.
|
||||
.P
|
||||
If the \fBpartial_hard\fP (or \fBph\fP) modifier is present on a
|
||||
\fBpcre2test\fP data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||
.
|
||||
.
|
||||
.SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()"
|
||||
.rs
|
||||
.sp
|
||||
When a partial match has been found using a DFA matching function, it is
|
||||
When a partial match has been found using the DFA matching function, it is
|
||||
possible to continue the match by providing additional subject data and calling
|
||||
the function again with the same compiled regular expression, this time setting
|
||||
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||
because this is where details of the previous partial match are stored. Here is
|
||||
an example using \fBpcre2test\fP:
|
||||
because this is where details of the previous partial match are stored. You can
|
||||
set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with PCRE2_DFA_RESTART
|
||||
to continue partial matching over multiple segments. Here is an example using
|
||||
\fBpcre2test\fP:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 23ja\e=dfa,ps
|
||||
|
@ -242,147 +331,10 @@ The first call has "23ja" as the subject, and requests partial matching; the
|
|||
second call has "n05" as the subject for the continued (restarted) match.
|
||||
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||
not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
.P
|
||||
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||
not possible to try again at a new starting point. All this facility is capable
|
||||
of doing is continuing with the previous match attempt. In the previous
|
||||
example, if the second set of data is "ug23" the result is no match, even
|
||||
though there would be a match for "aug23" if the entire string were given at
|
||||
once. Depending on the application, this may or may not be what you want.
|
||||
The only way to allow for starting again at the next character is to retain the
|
||||
matched part of the subject and try a new complete match.
|
||||
.P
|
||||
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||
facility can be used to pass very long subject strings to the DFA matching
|
||||
functions.
|
||||
.
|
||||
.
|
||||
.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
Unlike the DFA function, it is not possible to restart the previous match with
|
||||
a new segment of data when using \fBpcre2_match()\fP. Instead, new data must be
|
||||
added to the previous subject string, and the entire match re-run, starting
|
||||
from the point where the partial match occurred. Earlier data can be discarded.
|
||||
.P
|
||||
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||
treat the end of a segment as the end of the subject when matching \ez, \eZ,
|
||||
\eb, \eB, and $. Consider an unanchored pattern that matches dates:
|
||||
.sp
|
||||
re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
|
||||
data> The date is 23ja\e=ph
|
||||
Partial match: 23ja
|
||||
.sp
|
||||
At this stage, an application could discard the text preceding "23ja", add on
|
||||
text from the next segment, and call the matching function again. Unlike the
|
||||
DFA matching function, the entire matching string must always be available,
|
||||
and the complete matching process occurs for each call, so more memory and more
|
||||
processing time is needed.
|
||||
.
|
||||
.
|
||||
.SH "ISSUES WITH MULTI-SEGMENT MATCHING"
|
||||
.rs
|
||||
.sp
|
||||
Certain types of pattern may give problems with multi-segment matching,
|
||||
whichever matching function is used.
|
||||
.P
|
||||
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||
the PCRE2_NOTBOL option when the subject string for any call does start at the
|
||||
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||
includes the effect of PCRE2_NOTEOL.
|
||||
.P
|
||||
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||
start of the partial match may have been inspected during the matching process.
|
||||
When using \fBpcre2_match()\fP, sufficient characters must be retained for the
|
||||
next match attempt. You can ensure that enough characters are retained by doing
|
||||
the following:
|
||||
.P
|
||||
Before doing any matching, find the length of the longest lookbehind in the
|
||||
pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND
|
||||
option. Note that the resulting count is in characters, not code units. After a
|
||||
partial match, moving back from the ovector[0] offset in the subject by the
|
||||
number of characters given for the maximum lookbehind gets you to the earliest
|
||||
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||
while moving back through the code units.
|
||||
.P
|
||||
Characters before the point you have now reached can be discarded, and after
|
||||
the next segment has been added to what is retained, you should run the next
|
||||
match with the \fBstartoffset\fP argument set so that the match begins at the
|
||||
same point as before.
|
||||
.P
|
||||
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||
value of \fBstartoffset\fP for the next match should be 3. When \fBpcre2test\fP
|
||||
displays a partial match, it indicates the lookbehind characters with '<'
|
||||
characters:
|
||||
.sp
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\e=ph
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
.P
|
||||
3. Because a partial match must always contain at least one character, what
|
||||
might be considered a partial match of an empty string actually gives a "no
|
||||
match" result. For example:
|
||||
.sp
|
||||
re> /c(?<=abc)x/
|
||||
data> ab\e=ps
|
||||
No match
|
||||
.sp
|
||||
If the next segment begins "cx", a match should be found, but this will only
|
||||
happen if characters from the previous segment are retained. For this reason, a
|
||||
"no match" result should be interpreted as "partial match of an empty string"
|
||||
when the pattern contains lookbehinds.
|
||||
.P
|
||||
4. Matching a subject string that is split into multiple segments may not
|
||||
always produce exactly the same result as matching over one single long string,
|
||||
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||
\eb or \eB. Another kind of difference may occur when there are multiple
|
||||
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||
is given only when there are no completed matches. This means that as soon as
|
||||
the shortest match has been found, continuation to a new subject segment is no
|
||||
longer possible. Consider this \fBpcre2test\fP example:
|
||||
.sp
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\e=ps
|
||||
0: dog
|
||||
data> do\e=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\e=ps,dfa,dfa_restart
|
||||
0: g
|
||||
data> dogsbody\e=dfa
|
||||
0: dogsbody
|
||||
1: dog
|
||||
.sp
|
||||
The first data line passes the string "dogsb" to a standard matching function,
|
||||
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||
the match stops when "dog" has been found, and it is not possible to continue.
|
||||
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||
matching function finds both matches.
|
||||
.P
|
||||
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||
multi-segment data. The example above then behaves differently:
|
||||
.sp
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\e=ph
|
||||
Partial match: dogsb
|
||||
data> do\e=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\e=ph,dfa,dfa_restart
|
||||
Partial match: gsb
|
||||
.sp
|
||||
5. Patterns that contain alternatives at the top level which do not all start
|
||||
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||
used. For example, consider this pattern:
|
||||
program to do that if it needs to. This means that, for an unanchored pattern,
|
||||
if a continued match fails, it is not possible to try again at a new starting
|
||||
point. All this facility is capable of doing is continuing with the previous
|
||||
match attempt. For example, consider this pattern:
|
||||
.sp
|
||||
1234|3789
|
||||
.sp
|
||||
|
@ -391,28 +343,15 @@ alternative is found at offset 3. There is no partial match for the second
|
|||
alternative, because such a match does not start at the same point in the
|
||||
subject string. Attempting to continue with the string "7890" does not yield a
|
||||
match because only those alternatives that match at one point in the subject
|
||||
are remembered. The problem arises because the start of the second alternative
|
||||
matches within the first alternative. There is no problem with anchored
|
||||
patterns or patterns such as:
|
||||
.sp
|
||||
1234|ABCD
|
||||
.sp
|
||||
where no string can be a partial match for both alternatives. This is not a
|
||||
problem if a standard matching function is used, because the entire match has
|
||||
to be rerun each time:
|
||||
.sp
|
||||
re> /1234|3789/
|
||||
data> ABC123\e=ph
|
||||
Partial match: 123
|
||||
data> 1237890
|
||||
0: 3789
|
||||
.sp
|
||||
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||
the entire match can also be used with the DFA matching function. Another
|
||||
possibility is to work with two buffers. If a partial match at offset \fIn\fP
|
||||
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||
the second buffer, you can then try a new match starting at offset \fIn+1\fP in
|
||||
the first buffer.
|
||||
are remembered. Depending on the application, this may or may not be what you
|
||||
want.
|
||||
.P
|
||||
If you do want to allow for starting again at the next character, one way of
|
||||
doing it is to retain some or all of the segment and try a new complete match,
|
||||
as described for \fBpcre2_match()\fP above. Another possibility is to work with
|
||||
two buffers. If a partial match at offset \fIn\fP in the first buffer is
|
||||
followed by "no match" when PCRE2_DFA_RESTART is used on the second buffer, you
|
||||
can then try a new match starting at offset \fIn+1\fP in the first buffer.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -429,6 +368,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 22 December 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 04 September 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "12 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -52,10 +52,11 @@ single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
|||
specified for the 32-bit library, in which case it constrains the character
|
||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||
built to include Unicode support (which is the default). When using UTF strings
|
||||
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||
mentioned in several places below. There is also a summary of features in the
|
||||
you must either call the compiling function with one or both of the PCRE2_UTF
|
||||
or PCRE2_MATCH_INVALID_UTF options, or the pattern must start with the special
|
||||
sequence (*UTF), which is equivalent to setting the relevant PCRE2_UTF. How
|
||||
setting a UTF mode affects pattern matching is mentioned in several places
|
||||
below. There is also a summary of features in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
|
@ -74,7 +75,8 @@ Another special sequence that may appear at the start of a pattern is (*UCP).
|
|||
This has the same effect as setting the PCRE2_UCP option: it causes sequences
|
||||
such as \ed and \ew to use Unicode properties to determine character types,
|
||||
instead of recognizing only characters with codes less than 256 via a lookup
|
||||
table.
|
||||
table. If also causes upper/lower casing operations to use Unicode properties
|
||||
for characters with code points greater than 127, even when UTF is not set.
|
||||
.P
|
||||
Some applications that allow their users to supply patterns may wish to
|
||||
restrict them for security reasons. If the PCRE2_NEVER_UCP option is passed to
|
||||
|
@ -261,8 +263,11 @@ corresponding characters in the subject. As a trivial example, the pattern
|
|||
The quick brown fox
|
||||
.sp
|
||||
matches a portion of a subject string that is identical to itself. When
|
||||
caseless matching is specified (the PCRE2_CASELESS option), letters are matched
|
||||
independently of case.
|
||||
caseless matching is specified (the PCRE2_CASELESS option or (?i) within the
|
||||
pattern), letters are matched independently of case. Note that there are two
|
||||
ASCII characters, K and S, that, in addition to their lower case ASCII
|
||||
equivalents, are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F
|
||||
(long S) respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
.P
|
||||
The power of regular expressions comes from the ability to include wild cards,
|
||||
character classes, alternatives, and repetitions in the pattern. These are
|
||||
|
@ -296,6 +301,22 @@ a character class the only metacharacters are:
|
|||
[ POSIX character class (if followed by POSIX syntax)
|
||||
] terminates the character class
|
||||
.sp
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern, other than in a character class, and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or a # character as
|
||||
part of the pattern. If the PCRE2_EXTENDED_MORE option is set, the same
|
||||
applies, but in addition unescaped space and horizontal tab characters are
|
||||
ignored inside a character class. Note: only these two characters are ignored,
|
||||
not the full set of pattern white space characters that are ignored outside a
|
||||
character class. Option settings can be changed within a pattern; see the
|
||||
section entitled
|
||||
.\" HTML <a href="#internaloptions">
|
||||
.\" </a>
|
||||
"Internal Option Setting"
|
||||
.\"
|
||||
below.
|
||||
.P
|
||||
The following sections describe the use of each of the metacharacters.
|
||||
.
|
||||
.
|
||||
|
@ -313,15 +334,9 @@ would otherwise be interpreted as a metacharacter, so it is always safe to
|
|||
precede a non-alphanumeric with backslash to specify that it stands for itself.
|
||||
In particular, if you want to match a backslash, you write \e\e.
|
||||
.P
|
||||
In a UTF mode, only ASCII digits and letters have any special meaning after a
|
||||
backslash. All other characters (in particular, those whose code points are
|
||||
greater than 127) are treated as literals.
|
||||
.P
|
||||
If a pattern is compiled with the PCRE2_EXTENDED option, most white space in
|
||||
the pattern (other than in a character class), and characters between a #
|
||||
outside a character class and the next newline, inclusive, are ignored. An
|
||||
escaping backslash can be used to include a white space or # character as part
|
||||
of the pattern.
|
||||
Only ASCII digits and letters have any special meaning after a backslash. All
|
||||
other characters (in particular, those whose code points are greater than 127)
|
||||
are treated as literals.
|
||||
.P
|
||||
If you want to treat all characters in a sequence as literals, you can do so by
|
||||
putting them between \eQ and \eE. This is different from Perl in that $ and @
|
||||
|
@ -398,11 +413,11 @@ PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition,
|
|||
There may be any number of hexadecimal digits. This syntax is from ECMAScript
|
||||
6.
|
||||
.P
|
||||
The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
|
||||
is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
|
||||
\eN{name} to specify characters by Unicode name; PCRE2 does not support this.
|
||||
Note that when \eN is not followed by an opening brace (curly bracket) it has
|
||||
an entirely different meaning, matching any character that is not a newline.
|
||||
The \eN{U+hhh..} escape sequence is recognized only when PCRE2 is operating in
|
||||
UTF mode. Perl also uses \eN{name} to specify characters by Unicode name; PCRE2
|
||||
does not support this. Note that when \eN is not followed by an opening brace
|
||||
(curly bracket) it has an entirely different meaning, matching any character
|
||||
that is not a newline.
|
||||
.P
|
||||
There are some legacy applications where the escape sequence \er is expected to
|
||||
match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \er in a
|
||||
|
@ -494,7 +509,6 @@ for themselves. For example, outside a character class:
|
|||
.\" JOIN
|
||||
\e377 might be a backreference, otherwise
|
||||
the value 255 (decimal)
|
||||
.\" JOIN
|
||||
\e81 is always a backreference
|
||||
.sp
|
||||
Note that octal values of 100 or greater that are specified using this syntax
|
||||
|
@ -726,7 +740,7 @@ Unicode support is not needed for these characters to be recognized.
|
|||
.P
|
||||
It is possible to restrict \eR to match only CR, LF, or CRLF (instead of the
|
||||
complete set of Unicode line endings) by setting the option PCRE2_BSR_ANYCRLF
|
||||
at compile time. (BSR is an abbrevation for "backslash R".) This can be made
|
||||
at compile time. (BSR is an abbreviation for "backslash R".) This can be made
|
||||
the default when PCRE2 is built; if this is the case, the other behaviour can
|
||||
be requested via the PCRE2_BSR_UNICODE option. It is also possible to specify
|
||||
these settings by starting a pattern string with one of the following
|
||||
|
@ -758,187 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these
|
|||
sequences are of course limited to testing characters whose code points are
|
||||
less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points
|
||||
greater than 0x10ffff (the Unicode limit) may be encountered. These are all
|
||||
treated as being in the Unknown script and with an unassigned type. The extra
|
||||
escape sequences are:
|
||||
treated as being in the Unknown script and with an unassigned type.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.P
|
||||
The extra escape sequences that provide property support are:
|
||||
.sp
|
||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||
\eX a Unicode extended grapheme cluster
|
||||
.sp
|
||||
The property names represented by \fIxx\fP above are case-sensitive. There is
|
||||
support for Unicode script names, Unicode general category properties, "Any",
|
||||
which matches any character (including newline), and some special PCRE2
|
||||
properties (described in the
|
||||
The property names represented by \fIxx\fP above are not case-sensitive, and in
|
||||
accordance with Unicode's "loose matching" rules, spaces, hyphens, and
|
||||
underscores are ignored. There is support for Unicode script names, Unicode
|
||||
general category properties, "Any", which matches any character (including
|
||||
newline), Bidi_Class, a number of binary (yes/no) properties, and some special
|
||||
PCRE2 properties (described
|
||||
.\" HTML <a href="#extraprops">
|
||||
.\" </a>
|
||||
next section).
|
||||
below).
|
||||
.\"
|
||||
Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2.
|
||||
Note that \eP{Any} does not match any characters, so always causes a match
|
||||
failure.
|
||||
Certain other Perl properties such as "InMusicalSymbols" are not supported by
|
||||
PCRE2. Note that \eP{Any} does not match any characters, so always causes a
|
||||
match failure.
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "Script properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
There are three different syntax forms for matching a script. Each Unicode
|
||||
character has a basic script and, optionally, a list of other scripts ("Script
|
||||
Extensions") with which it is commonly used. Using the Adlam script as an
|
||||
example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas
|
||||
\ep{scx:Adlam} matches, in addition, characters that have Adlam in their
|
||||
extensions list. The full names "script" and "script extensions" for the
|
||||
property types are recognized, and a equals sign is an alternative to the
|
||||
colon. If a script name is given without a property type, for example,
|
||||
\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this
|
||||
interpretation at release 5.26 and PCRE2 changed at release 10.40.
|
||||
.P
|
||||
Sets of Unicode characters are defined as belonging to certain scripts. A
|
||||
character from one of these sets can be matched using a script name. For
|
||||
example:
|
||||
.sp
|
||||
\ep{Greek}
|
||||
\eP{Han}
|
||||
.sp
|
||||
Unassigned characters (and in non-UTF 32-bit mode, characters with code points
|
||||
greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not
|
||||
part of an identified script are lumped together as "Common". The current list
|
||||
of scripts is:
|
||||
.P
|
||||
Adlam,
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
Balinese,
|
||||
Bamum,
|
||||
Bassa_Vah,
|
||||
Batak,
|
||||
Bengali,
|
||||
Bhaiksuki,
|
||||
Bopomofo,
|
||||
Brahmi,
|
||||
Braille,
|
||||
Buginese,
|
||||
Buhid,
|
||||
Canadian_Aboriginal,
|
||||
Carian,
|
||||
Caucasian_Albanian,
|
||||
Chakma,
|
||||
Cham,
|
||||
Cherokee,
|
||||
Common,
|
||||
Coptic,
|
||||
Cuneiform,
|
||||
Cypriot,
|
||||
Cyrillic,
|
||||
Deseret,
|
||||
Devanagari,
|
||||
Dogra,
|
||||
Duployan,
|
||||
Egyptian_Hieroglyphs,
|
||||
Elbasan,
|
||||
Ethiopic,
|
||||
Georgian,
|
||||
Glagolitic,
|
||||
Gothic,
|
||||
Grantha,
|
||||
Greek,
|
||||
Gujarati,
|
||||
Gunjala_Gondi,
|
||||
Gurmukhi,
|
||||
Han,
|
||||
Hangul,
|
||||
Hanifi_Rohingya,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
Inherited,
|
||||
Inscriptional_Pahlavi,
|
||||
Inscriptional_Parthian,
|
||||
Javanese,
|
||||
Kaithi,
|
||||
Kannada,
|
||||
Katakana,
|
||||
Kayah_Li,
|
||||
Kharoshthi,
|
||||
Khmer,
|
||||
Khojki,
|
||||
Khudawadi,
|
||||
Lao,
|
||||
Latin,
|
||||
Lepcha,
|
||||
Limbu,
|
||||
Linear_A,
|
||||
Linear_B,
|
||||
Lisu,
|
||||
Lycian,
|
||||
Lydian,
|
||||
Mahajani,
|
||||
Makasar,
|
||||
Malayalam,
|
||||
Mandaic,
|
||||
Manichaean,
|
||||
Marchen,
|
||||
Masaram_Gondi,
|
||||
Medefaidrin,
|
||||
Meetei_Mayek,
|
||||
Mende_Kikakui,
|
||||
Meroitic_Cursive,
|
||||
Meroitic_Hieroglyphs,
|
||||
Miao,
|
||||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Newa,
|
||||
Nko,
|
||||
Nushu,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
Old_Persian,
|
||||
Old_Sogdian,
|
||||
Old_South_Arabian,
|
||||
Old_Turkic,
|
||||
Oriya,
|
||||
Osage,
|
||||
Osmanya,
|
||||
Pahawh_Hmong,
|
||||
Palmyrene,
|
||||
Pau_Cin_Hau,
|
||||
Phags_Pa,
|
||||
Phoenician,
|
||||
Psalter_Pahlavi,
|
||||
Rejang,
|
||||
Runic,
|
||||
Samaritan,
|
||||
Saurashtra,
|
||||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sogdian,
|
||||
Sora_Sompeng,
|
||||
Soyombo,
|
||||
Sundanese,
|
||||
Syloti_Nagri,
|
||||
Syriac,
|
||||
Tagalog,
|
||||
Tagbanwa,
|
||||
Tai_Le,
|
||||
Tai_Tham,
|
||||
Tai_Viet,
|
||||
Takri,
|
||||
Tamil,
|
||||
Tangut,
|
||||
Telugu,
|
||||
Thaana,
|
||||
Thai,
|
||||
Tibetan,
|
||||
Tifinagh,
|
||||
Tirhuta,
|
||||
Ugaritic,
|
||||
Unknown,
|
||||
Vai,
|
||||
Warang_Citi,
|
||||
Yi,
|
||||
Zanabazar_Square.
|
||||
.P
|
||||
of recognized script names and their 4-character abbreviations can be obtained
|
||||
by running this command:
|
||||
.sp
|
||||
pcre2test -LS
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.
|
||||
.SS "The general category property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Each character has exactly one Unicode general category property, specified by
|
||||
a two-letter abbreviation. For compatibility with Perl, negation can be
|
||||
specified by including a circumflex between the opening brace and the property
|
||||
|
@ -998,9 +889,9 @@ The following general category property codes are supported:
|
|||
Zp Paragraph separator
|
||||
Zs Space separator
|
||||
.sp
|
||||
The special property L& is also supported: it matches a character that has
|
||||
the Lu, Ll, or Lt property, in other words, a letter that is not classified as
|
||||
a modifier or "other".
|
||||
The special property LC, which has the synonym L&, is also supported: it
|
||||
matches a character that has the Lu, Ll, or Lt property, in other words, a
|
||||
letter that is not classified as a modifier or "other".
|
||||
.P
|
||||
The Cs (Surrogate) property applies only to characters whose code points are in
|
||||
the range U+D800 to U+DFFF. These characters are no different to any other
|
||||
|
@ -1024,12 +915,53 @@ Unicode table.
|
|||
Specifying caseless matching does not affect these escape sequences. For
|
||||
example, \ep{Lu} always matches only upper case letters. This is different from
|
||||
the behaviour of current versions of Perl.
|
||||
.P
|
||||
Matching characters by Unicode property is not fast, because PCRE2 has to do a
|
||||
multistage table lookup in order to find a character's property. That is why
|
||||
the traditional escape sequences such as \ed and \ew do not use Unicode
|
||||
properties in PCRE2 by default, though you can make them do so by setting the
|
||||
PCRE2_UCP option or by starting the pattern with (*UCP).
|
||||
.
|
||||
.
|
||||
.SS "Binary (yes/no) properties for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
Unicode defines a number of binary properties, that is, properties whose only
|
||||
values are true or false. You can obtain a list of those that are recognized by
|
||||
\ep and \eP, along with their abbreviations, by running this command:
|
||||
.sp
|
||||
pcre2test -LP
|
||||
.sp
|
||||
.
|
||||
.
|
||||
.SS "The Bidi_Class property for \ep and \eP"
|
||||
.rs
|
||||
.sp
|
||||
\ep{Bidi_Class:<class>} matches a character with the given class
|
||||
\ep{BC:<class>} matches a character with the given class
|
||||
.sp
|
||||
The recognized classes are:
|
||||
.sp
|
||||
AL Arabic letter
|
||||
AN Arabic number
|
||||
B paragraph separator
|
||||
BN boundary neutral
|
||||
CS common separator
|
||||
EN European number
|
||||
ES European separator
|
||||
ET European terminator
|
||||
FSI first strong isolate
|
||||
L left-to-right
|
||||
LRE left-to-right embedding
|
||||
LRI left-to-right isolate
|
||||
LRO left-to-right override
|
||||
NSM non-spacing mark
|
||||
ON other neutral
|
||||
PDF pop directional format
|
||||
PDI pop directional isolate
|
||||
R right-to-left
|
||||
RLE right-to-left embedding
|
||||
RLI right-to-left isolate
|
||||
RLO right-to-left override
|
||||
S segment separator
|
||||
WS which space
|
||||
.sp
|
||||
An equals sign may be used instead of a colon. The class names are
|
||||
case-insensitive; only the short names listed above are recognized.
|
||||
.
|
||||
.
|
||||
.SS Extended grapheme clusters
|
||||
|
@ -1059,7 +991,7 @@ additional characters according to the following rules for ending a cluster:
|
|||
3. Do not break Hangul (a Korean script) syllable sequences. Hangul characters
|
||||
are of five types: L, V, T, LV, and LVT. An L character may be followed by an
|
||||
L, V, LV, or LVT character; an LV or V character may be followed by a V or T
|
||||
character; an LVT or T character may be follwed only by a T character.
|
||||
character; an LVT or T character may be followed only by a T character.
|
||||
.P
|
||||
4. Do not end before extending characters or spacing marks or the "zero-width
|
||||
joiner" character. Characters with the "mark" property always have the
|
||||
|
@ -1145,8 +1077,11 @@ For example, when the pattern
|
|||
.sp
|
||||
matches "foobar", the first substring is still set to "foo".
|
||||
.P
|
||||
Perl documents that the use of \eK within assertions is "not well defined". In
|
||||
PCRE2, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
From version 5.32.0 Perl forbids the use of \eK in lookaround assertions. From
|
||||
release 10.38 PCRE2 also forbids this by default. However, the
|
||||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option can be used when calling
|
||||
\fBpcre2_compile()\fP to re-enable the previous behaviour. When this option is
|
||||
set, \eK is acted upon when it occurs inside positive assertions, but is
|
||||
ignored in negative assertions. Note that when a pattern such as (?=ab\eK)
|
||||
matches, the reported start of the match can be greater than the end of the
|
||||
match. Using \eK in a lookbehind assertion at the start of a pattern can also
|
||||
|
@ -1305,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with
|
|||
.sp
|
||||
Outside a character class, a dot in the pattern matches any one character in
|
||||
the subject string except (by default) a character that signifies the end of a
|
||||
line.
|
||||
line. One or more characters may be specified as line terminators (see
|
||||
.\" HTML <a href="#newlines">
|
||||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
above).
|
||||
.P
|
||||
When a line ending is defined as a single character, dot never matches that
|
||||
character; when the two-character sequence CRLF is used, dot does not match CR
|
||||
if it is immediately followed by LF, but otherwise it matches all characters
|
||||
(including isolated CRs and LFs). When any Unicode line endings are being
|
||||
recognized, dot does not match CR or LF or any of the other line ending
|
||||
characters.
|
||||
Dot never matches a single line-ending character. When the two-character
|
||||
sequence CRLF is the only line ending, dot does not match CR if it is
|
||||
immediately followed by LF, but otherwise it matches all characters (including
|
||||
isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurences
|
||||
of CR of LF match dot. When all Unicode line endings are being recognized, dot
|
||||
does not match CR or LF or any of the other line ending characters.
|
||||
.P
|
||||
The behaviour of dot with regard to newlines can be changed. If the
|
||||
PCRE2_DOTALL option is set, a dot matches any one character, without exception.
|
||||
|
@ -1352,7 +1292,7 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
unless the PCRE2_NO_UTF_CHECK or PCRE2_MATCH_INVALID_UTF option is used).
|
||||
.P
|
||||
An application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
|
@ -1426,7 +1366,10 @@ Characters in a class may be specified by their code points using \eo, \ex, or
|
|||
\eN{U+hh..} in the usual way. When caseless matching is set, any letters in a
|
||||
class represent both their upper case and lower case versions, so for example,
|
||||
a caseless [aeiou] matches "A" as well as "a", and a caseless [^aeiou] does not
|
||||
match "A", whereas a caseful version would.
|
||||
match "A", whereas a caseful version would. Note that there are two ASCII
|
||||
characters, K and S, that, in addition to their lower case ASCII equivalents,
|
||||
are case-equivalent with Unicode U+212A (Kelvin sign) and U+017F (long S)
|
||||
respectively when either PCRE2_UTF or PCRE2_UCP is set.
|
||||
.P
|
||||
Characters that might indicate line breaks are never treated in any special way
|
||||
when matching character classes, whatever line-ending sequence is in use, and
|
||||
|
@ -1638,6 +1581,7 @@ that succeeds is used. If the alternatives are within a group
|
|||
alternative in the group.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="internaloptions"></a>
|
||||
.SH "INTERNAL OPTION SETTING"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -1896,12 +1840,21 @@ are permitted for groups with the same number, for example:
|
|||
(?|(?<AA>aa)|(?<AA>bb))
|
||||
.sp
|
||||
The duplicate name constraint can be disabled by setting the PCRE2_DUPNAMES
|
||||
option at compile time, or by the use of (?J) within the pattern. Duplicate
|
||||
names can be useful for patterns where only one instance of the named capture
|
||||
group can match. Suppose you want to match the name of a weekday, either as a
|
||||
3-letter abbreviation or as the full name, and in both cases you want to
|
||||
extract the abbreviation. This pattern (ignoring the line breaks) does the job:
|
||||
option at compile time, or by the use of (?J) within the pattern, as described
|
||||
in the section entitled
|
||||
.\" HTML <a href="#internaloptions">
|
||||
.\" </a>
|
||||
"Internal Option Setting"
|
||||
.\"
|
||||
above.
|
||||
.P
|
||||
Duplicate names can be useful for patterns where only one instance of the named
|
||||
capture group can match. Suppose you want to match the name of a weekday,
|
||||
either as a 3-letter abbreviation or as the full name, and in both cases you
|
||||
want to extract the abbreviation. This pattern (ignoring the line breaks) does
|
||||
the job:
|
||||
.sp
|
||||
(?J)
|
||||
(?<DN>Mon|Fri|Sun)(?:day)?|
|
||||
(?<DN>Tue)(?:sday)?|
|
||||
(?<DN>Wed)(?:nesday)?|
|
||||
|
@ -1921,7 +1874,7 @@ they appear in the overall pattern. The first one that is set is used for the
|
|||
reference. For example, this pattern matches both "foofoo" and "barbar" but not
|
||||
"foobar" or "barfoo":
|
||||
.sp
|
||||
(?:(?<n>foo)|(?<n>bar))\ek<n>
|
||||
(?J)(?:(?<n>foo)|(?<n>bar))\ek<n>
|
||||
.sp
|
||||
.P
|
||||
If you make a subroutine call to a non-unique named group, the one that
|
||||
|
@ -1960,7 +1913,7 @@ items:
|
|||
an escape such as \ed or \epL that matches a single character
|
||||
a character class
|
||||
a backreference
|
||||
a parenthesized group (including most assertions)
|
||||
a parenthesized group (including lookaround assertions)
|
||||
a subroutine call (recursive or otherwise)
|
||||
.sp
|
||||
The general repetition quantifier specifies a minimum and maximum number of
|
||||
|
@ -2021,8 +1974,10 @@ no characters with a quantifier that has no upper limit, for example:
|
|||
.sp
|
||||
Earlier versions of Perl and PCRE1 used to give an error at compile time for
|
||||
such patterns. However, because there are cases where this can be useful, such
|
||||
patterns are now accepted, but if any repetition of the group does in fact
|
||||
match no characters, the loop is forcibly broken.
|
||||
patterns are now accepted, but whenever an iteration of such a group matches no
|
||||
characters, matching moves on to the next item in the pattern instead of
|
||||
repeatedly matching an empty string. This does not prevent backtracking into
|
||||
any of the iterations if a subsequent item fails to match.
|
||||
.P
|
||||
By default, quantifiers are "greedy", that is, they match as much as possible
|
||||
(up to the maximum number of permitted times), without causing the rest of the
|
||||
|
@ -2140,10 +2095,10 @@ be easier to remember:
|
|||
.sp
|
||||
(*atomic:\ed+)foo
|
||||
.sp
|
||||
This kind of parenthesized group "locks up" the part of the pattern it
|
||||
contains once it has matched, and a failure further into the pattern is
|
||||
prevented from backtracking into it. Backtracking past it to previous items,
|
||||
however, works as normal.
|
||||
This kind of parenthesized group "locks up" the part of the pattern it contains
|
||||
once it has matched, and a failure further into the pattern is prevented from
|
||||
backtracking into it. Backtracking past it to previous items, however, works as
|
||||
normal.
|
||||
.P
|
||||
An alternative description is that a group of this type matches exactly the
|
||||
string of characters that an identical standalone pattern would match, if
|
||||
|
@ -2339,14 +2294,14 @@ the first iteration does not need to match the backreference. This can be done
|
|||
using alternation, as in the example above, or by a quantifier with a minimum
|
||||
of zero.
|
||||
.P
|
||||
Backreferences of this type cause the group that they reference to be treated
|
||||
as an
|
||||
For versions of PCRE2 less than 10.25, backreferences of this type used to
|
||||
cause the group that they reference to be treated as an
|
||||
.\" HTML <a href="#atomicgroup">
|
||||
.\" </a>
|
||||
atomic group.
|
||||
.\"
|
||||
Once the whole group has been matched, a subsequent matching failure cannot
|
||||
cause backtracking into the middle of the group.
|
||||
This restriction no longer applies, and backtracking into such groups can occur
|
||||
as normal.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="bigassertions"></a>
|
||||
|
@ -2367,9 +2322,19 @@ those that look behind it, and in each case an assertion may be positive (must
|
|||
match for the assertion to be true) or negative (must not match for the
|
||||
assertion to be true). An assertion group is matched in the normal way,
|
||||
and if it is true, matching continues after it, but with the matching position
|
||||
in the subject string is was it was before the assertion was processed.
|
||||
in the subject string reset to what it was before the assertion was processed.
|
||||
.P
|
||||
A lookaround assertion may also appear as the condition in a
|
||||
The Perl-compatible lookaround assertions are atomic. If an assertion is true,
|
||||
but there is a subsequent matching failure, there is no backtracking into the
|
||||
assertion. However, there are some cases where non-atomic assertions can be
|
||||
useful. PCRE2 has some support for these, described in the section entitled
|
||||
.\" HTML <a href="#nonatomicassertions">
|
||||
.\" </a>
|
||||
"Non-atomic assertions"
|
||||
.\"
|
||||
below, but they are not Perl-compatible.
|
||||
.P
|
||||
A lookaround assertion may appear as the condition in a
|
||||
.\" HTML <a href="#conditions">
|
||||
.\" </a>
|
||||
conditional group
|
||||
|
@ -2404,36 +2369,23 @@ the "no" branch of the condition. For other failing negative assertions,
|
|||
control passes to the previous backtracking point, thus discarding any captured
|
||||
strings within the assertion.
|
||||
.P
|
||||
For compatibility with Perl, most assertion groups may be repeated; though it
|
||||
makes no sense to assert the same thing several times, the side effect of
|
||||
capturing may occasionally be useful. However, an assertion that forms the
|
||||
condition for a conditional group may not be quantified. In practice, for
|
||||
other assertions, there only three cases:
|
||||
.sp
|
||||
(1) If the quantifier is {0}, the assertion is never obeyed during matching.
|
||||
However, it may contain internal capture groups that are called from elsewhere
|
||||
via the
|
||||
.\" HTML <a href="#groupsassubroutines">
|
||||
.\" </a>
|
||||
subroutine mechanism.
|
||||
.\"
|
||||
.sp
|
||||
(2) If quantifier is {0,n} where n is greater than zero, it is treated as if it
|
||||
were {0,1}. At run time, the rest of the pattern match is tried with and
|
||||
without the assertion, the order depending on the greediness of the quantifier.
|
||||
.sp
|
||||
(3) If the minimum repetition is greater than zero, the quantifier is ignored.
|
||||
The assertion is obeyed just once when encountered during matching.
|
||||
Most assertion groups may be repeated; though it makes no sense to assert the
|
||||
same thing several times, the side effect of capturing in positive assertions
|
||||
may occasionally be useful. However, an assertion that forms the condition for
|
||||
a conditional group may not be quantified. PCRE2 used to restrict the
|
||||
repetition of assertions, but from release 10.35 the only restriction is that
|
||||
an unlimited maximum repetition is changed to be one more than the minimum. For
|
||||
example, {3,} is treated as {3,4}.
|
||||
.
|
||||
.
|
||||
.SS "Alphabetic assertion names"
|
||||
.rs
|
||||
.sp
|
||||
Traditionally, symbolic sequences such as (?= and (?<= have been used to specify
|
||||
lookaround assertions. Perl 5.28 introduced some experimental alphabetic
|
||||
alternatives which might be easier to remember. They all start with (* instead
|
||||
of (? and must be written using lower case letters. PCRE2 supports the
|
||||
following synonyms:
|
||||
Traditionally, symbolic sequences such as (?= and (?<= have been used to
|
||||
specify lookaround assertions. Perl 5.28 introduced some experimental
|
||||
alphabetic alternatives which might be easier to remember. They all start with
|
||||
(* instead of (? and must be written using lower case letters. PCRE2 supports
|
||||
the following synonyms:
|
||||
.sp
|
||||
(*positive_lookahead: or (*pla: is the same as (?=
|
||||
(*negative_lookahead: or (*nla: is the same as (?!
|
||||
|
@ -2610,6 +2562,68 @@ is another pattern that matches "foo" preceded by three digits and any three
|
|||
characters that are not "999".
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="nonatomicassertions"></a>
|
||||
.SH "NON-ATOMIC ASSERTIONS"
|
||||
.rs
|
||||
.sp
|
||||
The traditional Perl-compatible lookaround assertions are atomic. That is, if
|
||||
an assertion is true, but there is a subsequent matching failure, there is no
|
||||
backtracking into the assertion. However, there are some cases where non-atomic
|
||||
positive assertions can be useful. PCRE2 provides these using the following
|
||||
syntax:
|
||||
.sp
|
||||
(*non_atomic_positive_lookahead: or (*napla: or (?*
|
||||
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
|
||||
.sp
|
||||
Consider the problem of finding the right-most word in a string that also
|
||||
appears earlier in the string, that is, it must appear at least twice in total.
|
||||
This pattern returns the required result as captured substring 1:
|
||||
.sp
|
||||
^(?x)(*napla: .* \eb(\ew++)) (?> .*? \eb\e1\eb ){2}
|
||||
.sp
|
||||
For a subject such as "word1 word2 word3 word2 word3 word4" the result is
|
||||
"word3". How does it work? At the start, ^(?x) anchors the pattern and sets the
|
||||
"x" option, which causes white space (introduced for readability) to be
|
||||
ignored. Inside the assertion, the greedy .* at first consumes the entire
|
||||
string, but then has to backtrack until the rest of the assertion can match a
|
||||
word, which is captured by group 1. In other words, when the assertion first
|
||||
succeeds, it captures the right-most word in the string.
|
||||
.P
|
||||
The current matching point is then reset to the start of the subject, and the
|
||||
rest of the pattern match checks for two occurrences of the captured word,
|
||||
using an ungreedy .*? to scan from the left. If this succeeds, we are done, but
|
||||
if the last word in the string does not occur twice, this part of the pattern
|
||||
fails. If a traditional atomic lookhead (?= or (*pla: had been used, the
|
||||
assertion could not be re-entered, and the whole match would fail. The pattern
|
||||
would succeed only if the very last word in the subject was found twice.
|
||||
.P
|
||||
Using a non-atomic lookahead, however, means that when the last word does not
|
||||
occur twice in the string, the lookahead can backtrack and find the second-last
|
||||
word, and so on, until either the match succeeds, or all words have been
|
||||
tested.
|
||||
.P
|
||||
Two conditions must be met for a non-atomic assertion to be useful: the
|
||||
contents of one or more capturing groups must change after a backtrack into the
|
||||
assertion, and there must be a backreference to a changed group later in the
|
||||
pattern. If this is not the case, the rest of the pattern match fails exactly
|
||||
as before because nothing has changed, so using a non-atomic assertion just
|
||||
wastes resources.
|
||||
.P
|
||||
There is one exception to backtracking into a non-atomic assertion. If an
|
||||
(*ACCEPT) control verb is triggered, the assertion succeeds atomically. That
|
||||
is, a subsequent match failure cannot backtrack into the assertion.
|
||||
.P
|
||||
Non-atomic assertions are not supported by the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP. They are supported by JIT, but only if they do not
|
||||
contain any control verbs such as (*ACCEPT). (This may change in future). Note
|
||||
that assertions that appear as conditions for
|
||||
.\" HTML <a href="#conditions">
|
||||
.\" </a>
|
||||
conditional groups
|
||||
.\"
|
||||
(see below) must be atomic.
|
||||
.
|
||||
.
|
||||
.SH "SCRIPT RUNS"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -2830,7 +2844,7 @@ breaks):
|
|||
(?(DEFINE) (?<byte> 2[0-4]\ed | 25[0-5] | 1\ed\ed | [1-9]?\ed) )
|
||||
\eb (?&byte) (\e.(?&byte)){3} \eb
|
||||
.sp
|
||||
The first part of the pattern is a DEFINE group inside which a another group
|
||||
The first part of the pattern is a DEFINE group inside which another group
|
||||
named "byte" is defined. This matches an individual component of an IPv4
|
||||
address (a number less than 256). When matching takes place, this part of the
|
||||
pattern is skipped because DEFINE acts like a false condition. The rest of the
|
||||
|
@ -2861,8 +2875,15 @@ than two digits.
|
|||
.sp
|
||||
If the condition is not in any of the above formats, it must be a parenthesized
|
||||
assertion. This may be a positive or negative lookahead or lookbehind
|
||||
assertion. Consider this pattern, again containing non-significant white space,
|
||||
and with the two alternatives on the second line:
|
||||
assertion. However, it must be a traditional atomic assertion, not one of the
|
||||
PCRE2-specific
|
||||
.\" HTML <a href="#nonatomicassertions">
|
||||
.\" </a>
|
||||
non-atomic assertions.
|
||||
.\"
|
||||
.P
|
||||
Consider this pattern, again containing non-significant white space, and with
|
||||
the two alternatives on the second line:
|
||||
.sp
|
||||
(?(?=[^a-z]*[a-z])
|
||||
\ed{2}-[a-z]{3}-\ed{2} | \ed{2}-\ed{2}-\ed{2} )
|
||||
|
@ -3261,8 +3282,8 @@ The doubling is removed before the string is passed to the callout function.
|
|||
There are a number of special "Backtracking Control Verbs" (to use Perl's
|
||||
terminology) that modify the behaviour of backtracking during matching. They
|
||||
are generally of the form (*VERB) or (*VERB:NAME). Some verbs take either form,
|
||||
possibly behaving differently depending on whether or not a name is present.
|
||||
The names are not required to be unique within the pattern.
|
||||
and may behave differently depending on whether or not a name argument is
|
||||
present. The names are not required to be unique within the pattern.
|
||||
.P
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
|
@ -3286,7 +3307,8 @@ PCRE2_ALT_VERBNAMES is also set.
|
|||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
not there. Any number of these verbs may occur in a pattern. Except for
|
||||
(*ACCEPT), they may not be quantified.
|
||||
.P
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
used only when the pattern is to be matched using the traditional matching
|
||||
|
@ -3360,6 +3382,18 @@ example:
|
|||
This matches "AB", "AAD", or "ACD"; when it matches "AB", "B" is captured by
|
||||
the outer parentheses.
|
||||
.P
|
||||
(*ACCEPT) is the only backtracking verb that is allowed to be quantified
|
||||
because an ungreedy quantification with a minimum of zero acts only when a
|
||||
backtrack happens. Consider, for example,
|
||||
.sp
|
||||
(A(*ACCEPT)??B)C
|
||||
.sp
|
||||
where A, B, and C may be complex expressions. After matching "A", the matcher
|
||||
processes "BC"; if that fails, causing a backtrack, (*ACCEPT) is triggered and
|
||||
the match succeeds. In both cases, all but C is captured. Whereas (*COMMIT)
|
||||
(see below) means "fail on backtrack", a repeated (*ACCEPT) of this type means
|
||||
"succeed on backtrack".
|
||||
.P
|
||||
\fBWarning:\fP (*ACCEPT) should not be used within a script run group, because
|
||||
it causes an immediate exit from the group, bypassing the script run checking.
|
||||
.sp
|
||||
|
@ -3376,8 +3410,9 @@ nearest equivalent is the callout feature, as for example in this pattern:
|
|||
A match with the string "aaaa" always fails, but the callout is taken before
|
||||
each backtrack happens (in this example, 10 times).
|
||||
.P
|
||||
(*ACCEPT:NAME) and (*FAIL:NAME) are treated as (*MARK:NAME)(*ACCEPT) and
|
||||
(*MARK:NAME)(*FAIL), respectively.
|
||||
(*ACCEPT:NAME) and (*FAIL:NAME) behave the same as (*MARK:NAME)(*ACCEPT) and
|
||||
(*MARK:NAME)(*FAIL), respectively, that is, a (*MARK) is recorded just before
|
||||
the verb acts.
|
||||
.
|
||||
.
|
||||
.SS "Recording which path was taken"
|
||||
|
@ -3539,10 +3574,15 @@ successful match if there is a later mismatch. Consider:
|
|||
.sp
|
||||
If the subject is "aaaac...", after the first match attempt fails (starting at
|
||||
the first character in the string), the starting point skips on to start the
|
||||
next attempt at "c". Note that a possessive quantifer does not have the same
|
||||
next attempt at "c". Note that a possessive quantifier does not have the same
|
||||
effect as this example; although it would suppress backtracking during the
|
||||
first match attempt, the second attempt would start at the second character
|
||||
instead of skipping on to "c".
|
||||
.P
|
||||
If (*SKIP) is used to specify a new starting position that is the same as the
|
||||
starting position of the current match, or (by being inside a lookbehind)
|
||||
earlier, the position specified by (*SKIP) is ignored, and instead the normal
|
||||
"bumpalong" occurs.
|
||||
.sp
|
||||
(*SKIP:NAME)
|
||||
.sp
|
||||
|
@ -3700,11 +3740,22 @@ a positive assertion and false for a negative one; captured substrings are
|
|||
retained in both cases.
|
||||
.P
|
||||
The remaining verbs act only when a later failure causes a backtrack to
|
||||
reach them. This means that their effect is confined to the assertion,
|
||||
because lookaround assertions are atomic. A backtrack that occurs after an
|
||||
assertion is complete does not jump back into the assertion. Note in particular
|
||||
that a (*MARK) name that is set in an assertion is not "seen" by an instance of
|
||||
(*SKIP:NAME) latter in the pattern.
|
||||
reach them. This means that, for the Perl-compatible assertions, their effect
|
||||
is confined to the assertion, because Perl lookaround assertions are atomic. A
|
||||
backtrack that occurs after such an assertion is complete does not jump back
|
||||
into the assertion. Note in particular that a (*MARK) name that is set in an
|
||||
assertion is not "seen" by an instance of (*SKIP:NAME) later in the pattern.
|
||||
.P
|
||||
PCRE2 now supports non-atomic positive assertions, as described in the section
|
||||
entitled
|
||||
.\" HTML <a href="#nonatomicassertions">
|
||||
.\" </a>
|
||||
"Non-atomic assertions"
|
||||
.\"
|
||||
above. These assertions must be standalone (not used as conditions). They are
|
||||
not Perl-compatible. For these assertions, a later backtrack does jump back
|
||||
into the assertion, and therefore verbs such as (*COMMIT) can be triggered by
|
||||
backtracks from later in the pattern.
|
||||
.P
|
||||
The effect of (*THEN) is not allowed to escape beyond an assertion. If there
|
||||
are no more branches to try, (*THEN) causes a positive assertion to be false,
|
||||
|
@ -3754,7 +3805,7 @@ there is a backtrack at the outer level.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -3763,6 +3814,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 12 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 12 January 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PERFORM 3 "03 February 2019" "PCRE2 10.33"
|
||||
.TH PCRE2PERFORM 3 "27 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 PERFORMANCE"
|
||||
|
@ -69,12 +69,28 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
.P
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
.P
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to \fBpcre2_match()\fP. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
.P
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to \fBpcre2_match()\fP with the same match data block does not
|
||||
affect the saved block.
|
||||
.P
|
||||
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround assertions,
|
||||
|
@ -230,7 +246,7 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
.fi
|
||||
.
|
||||
|
@ -239,6 +255,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2POSIX 3 "30 January 2019" "PCRE2 10.33"
|
||||
.TH PCRE2POSIX 3 "26 April 2021" "PCRE2 10.37"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "SYNOPSIS"
|
||||
|
@ -44,11 +44,14 @@ can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an
|
|||
application. Because the POSIX functions call the native ones, it is also
|
||||
necessary to add \fB-lpcre2-8\fP.
|
||||
.P
|
||||
Although they are not defined as protypes in \fBpcre2posix.h\fP, the library
|
||||
does contain functions with the POSIX names \fBregcomp()\fP etc. These simply
|
||||
pass their arguments to the PCRE2 functions. These functions are provided for
|
||||
backwards compatibility with earlier versions of PCRE2, so that existing
|
||||
programs do not have to be recompiled.
|
||||
Although they were not defined as protypes in \fBpcre2posix.h\fP, releases
|
||||
10.33 to 10.36 of the library contained functions with the POSIX names
|
||||
\fBregcomp()\fP etc. These simply passed their arguments to the PCRE2
|
||||
functions. These functions were provided for backwards compatibility with
|
||||
earlier versions of PCRE2, which had only POSIX names. However, this has proved
|
||||
troublesome in situations where a program links with several libraries, some of
|
||||
which use PCRE2's POSIX interface while others use the real POSIX functions.
|
||||
For this reason, the POSIX names have been removed since release 10.37.
|
||||
.P
|
||||
Calling the header file \fBpcre2posix.h\fP avoids any conflict with other POSIX
|
||||
libraries. It can, of course, be renamed or aliased as \fBregex.h\fP, which is
|
||||
|
@ -321,6 +324,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 January 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 April 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
.nf
|
||||
.B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP,"
|
||||
.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP,"
|
||||
.B " pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP,"
|
||||
.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP,
|
||||
.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP,"
|
||||
.B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);"
|
||||
.sp
|
||||
.B void pcre2_serialize_free(uint8_t *\fIbytes\fP);
|
||||
|
@ -81,7 +81,7 @@ of serialized patterns, or one of the following negative error codes:
|
|||
.sp
|
||||
PCRE2_ERROR_BADDATA the number of patterns is zero or less
|
||||
PCRE2_ERROR_BADMAGIC mismatch of id bytes in one of the patterns
|
||||
PCRE2_ERROR_MEMORY memory allocation failed
|
||||
PCRE2_ERROR_NOMEMORY memory allocation failed
|
||||
PCRE2_ERROR_MIXEDTABLES the patterns do not all use the same tables
|
||||
PCRE2_ERROR_NULL the 1st, 3rd, or 4th argument is NULL
|
||||
.sp
|
||||
|
@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
|
|||
\fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte
|
||||
stream is no longer needed and can be discarded.
|
||||
.sp
|
||||
int32_t number_of_codes;
|
||||
pcre2_code *list_of_codes[2];
|
||||
uint8_t *bytes = <serialized data>;
|
||||
int32_t number_of_codes =
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue