Implement -Z in pcre2grep and update documentation
This commit is contained in:
parent
cc5e121c8e
commit
8b133fa0ba
|
@ -49,6 +49,8 @@ tests.
|
|||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
|
42
RunGrepTest
42
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -685,6 +701,16 @@ echo "---------------------------- Test 134 -----------------------------" >>tes
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -759,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# This next test involves NUL characters. It seems impossible to handle them
|
||||
# easily in many operating systems. An earlier version of this script used sed
|
||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character (@). However, on (some
|
||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
|
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -375,7 +375,8 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
@ -400,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -695,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -3148,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -4020,9 +4018,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 December 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -284,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -609,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 08 December 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
|
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
|||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
|
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -1053,9 +1066,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 31 August 2021
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -1241,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1564,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1574,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1603,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1623,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1690,7 +1691,8 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
|
@ -2141,7 +2143,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 12 January 2022
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
|
|
430
doc/pcre2.txt
430
doc/pcre2.txt
|
@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS
|
|||
pcre2jit documentation for more details). If the limit is reached, the
|
||||
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
|
||||
limit can be set when PCRE2 is built; if it is not, the default is set
|
||||
very large and is essentially "unlimited".
|
||||
very large and is essentially unlimited.
|
||||
|
||||
A value for the heap limit may also be supplied by an item at the start
|
||||
of a pattern of the form
|
||||
|
@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS
|
|||
less ddd is less than the limit set by the caller of pcre2_match() or,
|
||||
if no such limit is set, less than the default.
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack for recording backtracking points. The more nested backtrack-
|
||||
ing points there are (that is, the deeper the search tree), the more
|
||||
memory is needed. Heap memory is used only if the initial vector is
|
||||
too small. If the heap limit is set to a value less than 21 (in partic-
|
||||
ular, zero) no heap memory will be used. In this case, only patterns
|
||||
that do not have a lot of nested backtracking can be successfully pro-
|
||||
cessed.
|
||||
The pcre2_match() function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
pcre2_match() uses the heap are given in the pcre2perform documenta-
|
||||
tion.
|
||||
|
||||
Similarly, for pcre2_dfa_match(), a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and
|
||||
only if this is not big enough is heap memory used. In this case, too,
|
||||
setting a value of zero disables the use of the heap.
|
||||
For pcre2_dfa_match(), a vector on the system stack is used when pro-
|
||||
cessing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, setting a
|
||||
value of zero disables the use of the heap.
|
||||
|
||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||
uint32_t value);
|
||||
|
@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS
|
|||
|
||||
This parameter limits the depth of nested backtracking in
|
||||
pcre2_match(). Each time a nested backtracking point is passed, a new
|
||||
memory "frame" is used to remember the state of matching at that point.
|
||||
memory frame is used to remember the state of matching at that point.
|
||||
Thus, this parameter indirectly limits the amount of memory that is
|
||||
used in a match. However, because the size of each memory "frame" de-
|
||||
pends on the number of capturing parentheses, the actual memory limit
|
||||
varies from pattern to pattern. This limit was more useful in versions
|
||||
before 10.30, where function recursion was used for backtracking.
|
||||
used in a match. However, because the size of each memory frame depends
|
||||
on the number of capturing parentheses, the actual memory limit varies
|
||||
from pattern to pattern. This limit was more useful in versions before
|
||||
10.30, where function recursion was used for backtracking.
|
||||
|
||||
The depth limit is not relevant, and is ignored, when matching is done
|
||||
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
|
||||
|
@ -3051,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match()
|
|||
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
|
||||
If a pattern contains many nested backtracking points, heap memory is
|
||||
used to remember them. This error is given when the memory allocation
|
||||
function (default or custom) fails. Note that a different error,
|
||||
PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is
|
||||
given when the memory allocation function (default or custom) fails.
|
||||
Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the
|
||||
amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
|
||||
tion fails.
|
||||
|
||||
PCRE2_ERROR_NULL
|
||||
|
||||
|
@ -3860,8 +3856,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 14 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -4118,41 +4114,40 @@ LIMITING PCRE2 RESOURCE USAGE
|
|||
pcre2_dfa_match() matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack to record backtracking points. The more nested backtracking
|
||||
points there are (that is, the deeper the search tree), the more memory
|
||||
is needed. If the initial vector is not large enough, heap memory is
|
||||
used, up to a certain limit, which is specified in kibibytes (units of
|
||||
1024 bytes). The limit can be changed at run time, as described in the
|
||||
pcre2api documentation. The default limit (in effect unlimited) is 20
|
||||
million. You can change this by a setting such as
|
||||
The pcre2_match() function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the
|
||||
deeper the search tree), the more memory is needed. There is an upper
|
||||
limit, specified in kibibytes (units of 1024 bytes). This limit can be
|
||||
changed at run time, as described in the pcre2api documentation. The
|
||||
default limit (in effect unlimited) is 20 million. You can change this
|
||||
by a setting such as
|
||||
|
||||
--with-heap-limit=500
|
||||
|
||||
which limits the amount of heap to 500 KiB. This limit applies only to
|
||||
which limits the amount of heap to 500 KiB. This limit applies only to
|
||||
interpretive matching in pcre2_match() and pcre2_dfa_match(), which may
|
||||
also use the heap for internal workspace when processing complicated
|
||||
patterns. This limit does not apply when JIT (which has its own memory
|
||||
also use the heap for internal workspace when processing complicated
|
||||
patterns. This limit does not apply when JIT (which has its own memory
|
||||
arrangements) is used.
|
||||
|
||||
You can also explicitly limit the depth of nested backtracking in the
|
||||
You can also explicitly limit the depth of nested backtracking in the
|
||||
pcre2_match() interpreter. This limit defaults to the value that is set
|
||||
for --with-match-limit. You can set a lower default limit by adding,
|
||||
for --with-match-limit. You can set a lower default limit by adding,
|
||||
for example,
|
||||
|
||||
--with-match-limit-depth=10000
|
||||
|
||||
to the configure command. This value can be overridden at run time.
|
||||
This depth limit indirectly limits the amount of heap memory that is
|
||||
used, but because the size of each backtracking "frame" depends on the
|
||||
number of capturing parentheses in a pattern, the amount of heap that
|
||||
is used before the limit is reached varies from pattern to pattern.
|
||||
to the configure command. This value can be overridden at run time.
|
||||
This depth limit indirectly limits the amount of heap memory that is
|
||||
used, but because the size of each backtracking "frame" depends on the
|
||||
number of capturing parentheses in a pattern, the amount of heap that
|
||||
is used before the limit is reached varies from pattern to pattern.
|
||||
This limit was more useful in versions before 10.30, where function re-
|
||||
cursion was used for backtracking.
|
||||
|
||||
As well as applying to pcre2_match(), the depth limit also controls the
|
||||
depth of recursive function calls in pcre2_dfa_match(). These are used
|
||||
for lookaround assertions, atomic groups, and recursion within pat-
|
||||
depth of recursive function calls in pcre2_dfa_match(). These are used
|
||||
for lookaround assertions, atomic groups, and recursion within pat-
|
||||
terns. The limit does not apply to JIT matching.
|
||||
|
||||
|
||||
|
@ -4160,67 +4155,67 @@ CREATING CHARACTER TABLES AT BUILD TIME
|
|||
|
||||
PCRE2 uses fixed tables for processing characters whose code points are
|
||||
less than 256. By default, PCRE2 is built with a set of tables that are
|
||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||
for ASCII codes only. If you add
|
||||
|
||||
--enable-rebuild-chartables
|
||||
|
||||
to the configure command, the distributed tables are no longer used.
|
||||
to the configure command, the distributed tables are no longer used.
|
||||
Instead, a program called pcre2_dftables is compiled and run. This out-
|
||||
puts the source for a new set of tables, created in the default locale of
|
||||
your C run-time system. This method of replacing the tables does not
|
||||
your C run-time system. This method of replacing the tables does not
|
||||
work if you are cross compiling, because pcre2_dftables needs to be run
|
||||
on the local host and therefore not compiled with the cross compiler.
|
||||
|
||||
If you need to create alternative tables when cross compiling, you will
|
||||
have to do so "by hand". There may also be other reasons for creating
|
||||
tables manually. To cause pcre2_dftables to be built on the local
|
||||
have to do so "by hand". There may also be other reasons for creating
|
||||
tables manually. To cause pcre2_dftables to be built on the local
|
||||
host, run a normal compiling command, and then run the program with the
|
||||
output file as its argument, for example:
|
||||
|
||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||
./pcre2_dftables src/pcre2_chartables.c
|
||||
|
||||
This builds the tables in the default locale of the local host. If you
|
||||
This builds the tables in the default locale of the local host. If you
|
||||
want to specify a locale, you must use the -L option:
|
||||
|
||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||
|
||||
You can also specify -b (with or without -L). This causes the tables to
|
||||
be written in binary instead of as source code. A set of binary tables
|
||||
can be loaded into memory by an application and passed to pcre2_com-
|
||||
be written in binary instead of as source code. A set of binary tables
|
||||
can be loaded into memory by an application and passed to pcre2_com-
|
||||
pile() in the same way as tables created by calling pcre2_maketables().
|
||||
The tables are just a string of bytes, independent of hardware charac-
|
||||
teristics such as endianness. This means they can be bundled with an
|
||||
application that runs in different environments, to ensure consistent
|
||||
The tables are just a string of bytes, independent of hardware charac-
|
||||
teristics such as endianness. This means they can be bundled with an
|
||||
application that runs in different environments, to ensure consistent
|
||||
behaviour.
|
||||
|
||||
|
||||
USING EBCDIC CODE
|
||||
|
||||
PCRE2 assumes by default that it will run in an environment where the
|
||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||
PCRE2 assumes by default that it will run in an environment where the
|
||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||
is the case for most computer operating systems. PCRE2 can, however, be
|
||||
compiled to run in an 8-bit EBCDIC environment by adding
|
||||
|
||||
--enable-ebcdic --disable-unicode
|
||||
|
||||
to the configure command. This setting implies --enable-rebuild-charta-
|
||||
bles. You should only use it if you know that you are in an EBCDIC en-
|
||||
bles. You should only use it if you know that you are in an EBCDIC en-
|
||||
vironment (for example, an IBM mainframe operating system).
|
||||
|
||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||
version of the library. Consequently, --enable-unicode and --enable-
|
||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||
version of the library. Consequently, --enable-unicode and --enable-
|
||||
ebcdic are mutually exclusive.
|
||||
|
||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||
is used. In such an environment you should use
|
||||
|
||||
--enable-ebcdic-nl25
|
||||
|
||||
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
||||
acter (which, in Unicode, is 0x85).
|
||||
|
||||
|
@ -4232,47 +4227,47 @@ USING EBCDIC CODE
|
|||
PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
|
||||
|
||||
By default pcre2grep supports the use of callouts with string arguments
|
||||
within the patterns it is matching. There are two kinds: one that gen-
|
||||
within the patterns it is matching. There are two kinds: one that gen-
|
||||
erates output using local code, and another that calls an external pro-
|
||||
gram or script. If --disable-pcre2grep-callout-fork is added to the
|
||||
configure command, only the first kind of callout is supported; if
|
||||
--disable-pcre2grep-callout is used, all callouts are completely ig-
|
||||
nored. For more details of pcre2grep callouts, see the pcre2grep docu-
|
||||
gram or script. If --disable-pcre2grep-callout-fork is added to the
|
||||
configure command, only the first kind of callout is supported; if
|
||||
--disable-pcre2grep-callout is used, all callouts are completely ig-
|
||||
nored. For more details of pcre2grep callouts, see the pcre2grep docu-
|
||||
mentation.
|
||||
|
||||
|
||||
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
||||
|
||||
By default, pcre2grep reads all files as plain text. You can build it
|
||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||
By default, pcre2grep reads all files as plain text. You can build it
|
||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||
them with libz or libbz2, respectively, by adding one or both of
|
||||
|
||||
--enable-pcre2grep-libz
|
||||
--enable-pcre2grep-libbz2
|
||||
|
||||
to the configure command. These options naturally require that the rel-
|
||||
evant libraries are installed on your system. Configuration will fail
|
||||
evant libraries are installed on your system. Configuration will fail
|
||||
if they are not.
|
||||
|
||||
|
||||
PCRE2GREP BUFFER SIZE
|
||||
|
||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when
|
||||
it finds a match. The default starting size of the buffer is 20KiB. The
|
||||
buffer itself is three times this size, but because of the way it is
|
||||
buffer itself is three times this size, but because of the way it is
|
||||
used for holding "before" lines, the longest line that is guaranteed to
|
||||
be processable is the notional buffer size. If a longer line is encoun-
|
||||
tered, pcre2grep automatically expands the buffer, up to a specified
|
||||
maximum size, whose default is 1MiB or the starting size, whichever is
|
||||
the larger. You can change the default parameter values by adding, for
|
||||
tered, pcre2grep automatically expands the buffer, up to a specified
|
||||
maximum size, whose default is 1MiB or the starting size, whichever is
|
||||
the larger. You can change the default parameter values by adding, for
|
||||
example,
|
||||
|
||||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
|
||||
to the configure command. The caller of pcre2grep can override these
|
||||
values by using --buffer-size and --max-buffer-size on the command
|
||||
to the configure command. The caller of pcre2grep can override these
|
||||
values by using --buffer-size and --max-buffer-size on the command
|
||||
line.
|
||||
|
||||
|
||||
|
@ -4283,26 +4278,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
|||
--enable-pcre2test-libreadline
|
||||
--enable-pcre2test-libedit
|
||||
|
||||
to the configure command, pcre2test is linked with the libreadline or-
|
||||
libedit library, respectively, and when its input is from a terminal,
|
||||
it reads it using the readline() function. This provides line-editing
|
||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||
you distribute a binary of pcre2test linked in this way, there may be
|
||||
to the configure command, pcre2test is linked with the libreadline or-
|
||||
libedit library, respectively, and when its input is from a terminal,
|
||||
it reads it using the readline() function. This provides line-editing
|
||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||
you distribute a binary of pcre2test linked in this way, there may be
|
||||
licensing issues. These can be avoided by linking instead with libedit,
|
||||
which has a BSD licence.
|
||||
|
||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||
be added to the pcre2test build. In many operating environments with a
|
||||
system-installed readline library this is sufficient. However, in some
|
||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||
be added to the pcre2test build. In many operating environments with a
|
||||
system-installed readline library this is sufficient. However, in some
|
||||
environments (e.g. if an unmodified distribution version of readline is
|
||||
in use), some extra configuration may be necessary. The INSTALL file
|
||||
in use), some extra configuration may be necessary. The INSTALL file
|
||||
for libreadline says this:
|
||||
|
||||
"Readline uses the termcap functions, but does not link with
|
||||
the termcap or curses library itself, allowing applications
|
||||
which link with readline the to choose an appropriate library."
|
||||
|
||||
If your environment has not been set up so that an appropriate library
|
||||
If your environment has not been set up so that an appropriate library
|
||||
is automatically included, you may need to add something like
|
||||
|
||||
LIBS="-ncurses"
|
||||
|
@ -4316,7 +4311,7 @@ INCLUDING DEBUGGING CODE
|
|||
|
||||
--enable-debug
|
||||
|
||||
to the configure command, additional debugging code is included in the
|
||||
to the configure command, additional debugging code is included in the
|
||||
build. This feature is intended for use by the PCRE2 maintainers.
|
||||
|
||||
|
||||
|
@ -4326,14 +4321,14 @@ DEBUGGING WITH VALGRIND SUPPORT
|
|||
|
||||
--enable-valgrind
|
||||
|
||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||
certain memory regions as unaddressable. This allows it to detect in-
|
||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||
certain memory regions as unaddressable. This allows it to detect in-
|
||||
valid memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||
|
||||
|
||||
CODE COVERAGE REPORTING
|
||||
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||
generate a code coverage report for its test suite. To enable this, you
|
||||
must install lcov version 1.6 or above. Then specify
|
||||
|
||||
|
@ -4342,20 +4337,20 @@ CODE COVERAGE REPORTING
|
|||
to the configure command and build PCRE2 in the usual way.
|
||||
|
||||
Note that using ccache (a caching C compiler) is incompatible with code
|
||||
coverage reporting. If you have configured ccache to run automatically
|
||||
coverage reporting. If you have configured ccache to run automatically
|
||||
on your system, you must set the environment variable
|
||||
|
||||
CCACHE_DISABLE=1
|
||||
|
||||
before running make to build PCRE2, so that ccache is not used.
|
||||
|
||||
When --enable-coverage is used, the following additional targets are
|
||||
When --enable-coverage is used, the following additional targets are
|
||||
added to the Makefile:
|
||||
|
||||
make coverage
|
||||
|
||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||
"make check", and then "make coverage-report".
|
||||
|
||||
make coverage-reset
|
||||
|
@ -4372,73 +4367,73 @@ CODE COVERAGE REPORTING
|
|||
|
||||
make coverage-clean-report
|
||||
|
||||
This removes the generated coverage report without cleaning the cover-
|
||||
This removes the generated coverage report without cleaning the cover-
|
||||
age data itself.
|
||||
|
||||
make coverage-clean-data
|
||||
|
||||
This removes the captured coverage data without removing the coverage
|
||||
This removes the captured coverage data without removing the coverage
|
||||
files created at compile time (*.gcno).
|
||||
|
||||
make coverage-clean
|
||||
|
||||
This cleans all coverage data including the generated coverage report.
|
||||
For more information about code coverage, see the gcov and lcov docu-
|
||||
This cleans all coverage data including the generated coverage report.
|
||||
For more information about code coverage, see the gcov and lcov docu-
|
||||
mentation.
|
||||
|
||||
|
||||
DISABLING THE Z AND T FORMATTING MODIFIERS
|
||||
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
|
||||
The C99 standard defines formatting modifiers z and t for size_t and
|
||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
|
||||
in environments other than old versions of Microsoft Visual Studio when
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating support for C99). However, there is at least one
|
||||
__STDC_VERSION__ is defined and has a value greater than or equal to
|
||||
199901L (indicating support for C99). However, there is at least one
|
||||
environment that claims to be C99 but does not support these modifiers.
|
||||
If
|
||||
|
||||
--disable-percent-zt
|
||||
|
||||
is specified, no use is made of the z or t modifiers. Instead of %td or
|
||||
%zu, a suitable format is used depending on the size of long for the
|
||||
%zu, a suitable format is used depending on the size of long for the
|
||||
platform.
|
||||
|
||||
|
||||
SUPPORT FOR FUZZERS
|
||||
|
||||
There is a special option for use by people who want to run fuzzing
|
||||
There is a special option for use by people who want to run fuzzing
|
||||
tests on PCRE2:
|
||||
|
||||
--enable-fuzz-support
|
||||
|
||||
At present this applies only to the 8-bit library. If set, it causes an
|
||||
extra library called libpcre2-fuzzsupport.a to be built, but not in-
|
||||
stalled. This contains a single function called LLVMFuzzerTestOneIn-
|
||||
put() whose arguments are a pointer to a string and the length of the
|
||||
string. When called, this function tries to compile the string as a
|
||||
pattern, and if that succeeds, to match it. This is done both with no
|
||||
options and with some random options bits that are generated from the
|
||||
extra library called libpcre2-fuzzsupport.a to be built, but not in-
|
||||
stalled. This contains a single function called LLVMFuzzerTestOneIn-
|
||||
put() whose arguments are a pointer to a string and the length of the
|
||||
string. When called, this function tries to compile the string as a
|
||||
pattern, and if that succeeds, to match it. This is done both with no
|
||||
options and with some random options bits that are generated from the
|
||||
string.
|
||||
|
||||
Setting --enable-fuzz-support also causes a binary called pcre2fuz-
|
||||
zcheck to be created. This is normally run under valgrind or used when
|
||||
Setting --enable-fuzz-support also causes a binary called pcre2fuz-
|
||||
zcheck to be created. This is normally run under valgrind or used when
|
||||
PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
|
||||
function and outputs information about what it is doing. The input
|
||||
strings are specified by arguments: if an argument starts with "=" the
|
||||
rest of it is a literal input string. Otherwise, it is assumed to be a
|
||||
function and outputs information about what it is doing. The input
|
||||
strings are specified by arguments: if an argument starts with "=" the
|
||||
rest of it is a literal input string. Otherwise, it is assumed to be a
|
||||
file name, and the contents of the file are the test string.
|
||||
|
||||
|
||||
OBSOLETE OPTION
|
||||
|
||||
In versions of PCRE2 prior to 10.30, there were two ways of handling
|
||||
backtracking in the pcre2_match() function. The default was to use the
|
||||
In versions of PCRE2 prior to 10.30, there were two ways of handling
|
||||
backtracking in the pcre2_match() function. The default was to use the
|
||||
system stack, but if
|
||||
|
||||
--disable-stack-for-recursion
|
||||
|
||||
was set, memory on the heap was used. From release 10.30 onwards this
|
||||
has changed (the stack is no longer used) and this option now does
|
||||
was set, memory on the heap was used. From release 10.30 onwards this
|
||||
has changed (the stack is no longer used) and this option now does
|
||||
nothing except give a warning.
|
||||
|
||||
|
||||
|
@ -4450,14 +4445,14 @@ SEE ALSO
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -5596,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS
|
|||
The maximum length of a string argument to a callout is the largest
|
||||
number a 32-bit unsigned integer can hold.
|
||||
|
||||
The maximum amount of heap memory used for matching is controlled by
|
||||
the heap limit, which can be set in a pattern or in a match context.
|
||||
The default is a very large number, effectively unlimited.
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -9773,152 +9772,169 @@ STACK AND HEAP USAGE AT RUN TIME
|
|||
sive function calls could use a great deal of stack, and this could
|
||||
cause problems, but this usage has been eliminated. Backtracking posi-
|
||||
tions are now explicitly remembered in memory frames controlled by the
|
||||
code. An initial 20KiB vector of frames is allocated on the system
|
||||
stack (enough for about 100 frames for small patterns), but if this is
|
||||
insufficient, heap memory is used. The amount of heap memory can be
|
||||
limited; if the limit is set to zero, only the initial stack vector is
|
||||
used. Rewriting patterns to be time-efficient, as described below, may
|
||||
also reduce the memory requirements.
|
||||
code.
|
||||
|
||||
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround as-
|
||||
The size of each frame depends on the size of pointer variables and the
|
||||
number of capturing parenthesized groups in the pattern being matched.
|
||||
On a 64-bit system the frame size for a pattern with no captures is 128
|
||||
bytes. For each capturing group the size increases by 16 bytes.
|
||||
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on
|
||||
the system stack, but this still caused some issues for multi-thread
|
||||
applications where each thread has a very small stack. From release
|
||||
10.41 backtracking memory frames are always held in heap memory. An
|
||||
initial heap allocation is obtained the first time any match data block
|
||||
is passed to pcre2_match(). This is remembered with the match data
|
||||
block and re-used if that block is used for another match. It is freed
|
||||
when the match data block itself is freed.
|
||||
|
||||
The size of the initial block is the larger of 20KiB or ten times the
|
||||
pattern's frame size, unless the heap limit is less than this, in which
|
||||
case the heap limit is used. If the initial block proves to be too
|
||||
small during matching, it is replaced by a larger block, subject to the
|
||||
heap limit. The heap limit is checked only when a new block is to be
|
||||
allocated. Reducing the heap limit between calls to pcre2_match() with
|
||||
the same match data block does not affect the saved block.
|
||||
|
||||
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround as-
|
||||
sertions, and recursion within the pattern. The original version of the
|
||||
code used to allocate quite large internal workspace vectors on the
|
||||
stack, which caused some problems for some patterns in environments
|
||||
with small stacks. From release 10.32 the code for pcre2_dfa_match()
|
||||
has been re-factored to use heap memory when necessary for internal
|
||||
workspace when recursing, though recursive function calls are still
|
||||
code used to allocate quite large internal workspace vectors on the
|
||||
stack, which caused some problems for some patterns in environments
|
||||
with small stacks. From release 10.32 the code for pcre2_dfa_match()
|
||||
has been re-factored to use heap memory when necessary for internal
|
||||
workspace when recursing, though recursive function calls are still
|
||||
used.
|
||||
|
||||
The "match depth" parameter can be used to limit the depth of function
|
||||
recursion, and the "match heap" parameter to limit heap memory in
|
||||
The "match depth" parameter can be used to limit the depth of function
|
||||
recursion, and the "match heap" parameter to limit heap memory in
|
||||
pcre2_dfa_match().
|
||||
|
||||
|
||||
PROCESSING TIME
|
||||
|
||||
Certain items in regular expression patterns are processed more effi-
|
||||
Certain items in regular expression patterns are processed more effi-
|
||||
ciently than others. It is more efficient to use a character class like
|
||||
[aeiou] than a set of single-character alternatives such as
|
||||
(a|e|i|o|u). In general, the simplest construction that provides the
|
||||
[aeiou] than a set of single-character alternatives such as
|
||||
(a|e|i|o|u). In general, the simplest construction that provides the
|
||||
required behaviour is usually the most efficient. Jeffrey Friedl's book
|
||||
contains a lot of useful general discussion about optimizing regular
|
||||
contains a lot of useful general discussion about optimizing regular
|
||||
expressions for efficient performance. This document contains a few ob-
|
||||
servations about PCRE2.
|
||||
|
||||
Using Unicode character properties (the \p, \P, and \X escapes) is
|
||||
slow, because PCRE2 has to use a multi-stage table lookup whenever it
|
||||
needs a character's property. If you can find an alternative pattern
|
||||
Using Unicode character properties (the \p, \P, and \X escapes) is
|
||||
slow, because PCRE2 has to use a multi-stage table lookup whenever it
|
||||
needs a character's property. If you can find an alternative pattern
|
||||
that does not use character properties, it will probably be faster.
|
||||
|
||||
By default, the escape sequences \b, \d, \s, and \w, and the POSIX
|
||||
character classes such as [:alpha:] do not use Unicode properties,
|
||||
By default, the escape sequences \b, \d, \s, and \w, and the POSIX
|
||||
character classes such as [:alpha:] do not use Unicode properties,
|
||||
partly for backwards compatibility, and partly for performance reasons.
|
||||
However, you can set the PCRE2_UCP option or start the pattern with
|
||||
(*UCP) if you want Unicode character properties to be used. This can
|
||||
double the matching time for items such as \d, when matched with
|
||||
pcre2_match(); the performance loss is less with a DFA matching func-
|
||||
However, you can set the PCRE2_UCP option or start the pattern with
|
||||
(*UCP) if you want Unicode character properties to be used. This can
|
||||
double the matching time for items such as \d, when matched with
|
||||
pcre2_match(); the performance loss is less with a DFA matching func-
|
||||
tion, and in both cases there is not much difference for \b.
|
||||
|
||||
When a pattern begins with .* not in atomic parentheses, nor in paren-
|
||||
theses that are the subject of a backreference, and the PCRE2_DOTALL
|
||||
option is set, the pattern is implicitly anchored by PCRE2, since it
|
||||
can match only at the start of a subject string. If the pattern has
|
||||
When a pattern begins with .* not in atomic parentheses, nor in paren-
|
||||
theses that are the subject of a backreference, and the PCRE2_DOTALL
|
||||
option is set, the pattern is implicitly anchored by PCRE2, since it
|
||||
can match only at the start of a subject string. If the pattern has
|
||||
multiple top-level branches, they must all be anchorable. The optimiza-
|
||||
tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
|
||||
tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
|
||||
tomatically disabled if the pattern contains (*PRUNE) or (*SKIP).
|
||||
|
||||
If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be-
|
||||
cause the dot metacharacter does not then match a newline, and if the
|
||||
subject string contains newlines, the pattern may match from the char-
|
||||
If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be-
|
||||
cause the dot metacharacter does not then match a newline, and if the
|
||||
subject string contains newlines, the pattern may match from the char-
|
||||
acter immediately following one of them instead of from the very start.
|
||||
For example, the pattern
|
||||
|
||||
.*second
|
||||
|
||||
matches the subject "first\nand second" (where \n stands for a newline
|
||||
character), with the match starting at the seventh character. In order
|
||||
to do this, PCRE2 has to retry the match starting after every newline
|
||||
matches the subject "first\nand second" (where \n stands for a newline
|
||||
character), with the match starting at the seventh character. In order
|
||||
to do this, PCRE2 has to retry the match starting after every newline
|
||||
in the subject.
|
||||
|
||||
If you are using such a pattern with subject strings that do not con-
|
||||
tain newlines, the best performance is obtained by setting
|
||||
PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex-
|
||||
plicit anchoring. That saves PCRE2 from having to scan along the sub-
|
||||
If you are using such a pattern with subject strings that do not con-
|
||||
tain newlines, the best performance is obtained by setting
|
||||
PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex-
|
||||
plicit anchoring. That saves PCRE2 from having to scan along the sub-
|
||||
ject looking for a newline to restart at.
|
||||
|
||||
Beware of patterns that contain nested indefinite repeats. These can
|
||||
take a long time to run when applied to a string that does not match.
|
||||
Beware of patterns that contain nested indefinite repeats. These can
|
||||
take a long time to run when applied to a string that does not match.
|
||||
Consider the pattern fragment
|
||||
|
||||
^(a+)*
|
||||
|
||||
This can match "aaaa" in 16 different ways, and this number increases
|
||||
very rapidly as the string gets longer. (The * repeat can match 0, 1,
|
||||
2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
|
||||
repeats can match different numbers of times.) When the remainder of
|
||||
the pattern is such that the entire match is going to fail, PCRE2 has
|
||||
in principle to try every possible variation, and this can take an ex-
|
||||
This can match "aaaa" in 16 different ways, and this number increases
|
||||
very rapidly as the string gets longer. (The * repeat can match 0, 1,
|
||||
2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
|
||||
repeats can match different numbers of times.) When the remainder of
|
||||
the pattern is such that the entire match is going to fail, PCRE2 has
|
||||
in principle to try every possible variation, and this can take an ex-
|
||||
tremely long time, even for relatively short strings.
|
||||
|
||||
An optimization catches some of the more simple cases such as
|
||||
|
||||
(a+)*b
|
||||
|
||||
where a literal character follows. Before embarking on the standard
|
||||
matching procedure, PCRE2 checks that there is a "b" later in the sub-
|
||||
ject string, and if there is not, it fails the match immediately. How-
|
||||
ever, when there is no following literal this optimization cannot be
|
||||
where a literal character follows. Before embarking on the standard
|
||||
matching procedure, PCRE2 checks that there is a "b" later in the sub-
|
||||
ject string, and if there is not, it fails the match immediately. How-
|
||||
ever, when there is no following literal this optimization cannot be
|
||||
used. You can see the difference by comparing the behaviour of
|
||||
|
||||
(a+)*\d
|
||||
|
||||
with the pattern above. The former gives a failure almost instantly
|
||||
when applied to a whole line of "a" characters, whereas the latter
|
||||
with the pattern above. The former gives a failure almost instantly
|
||||
when applied to a whole line of "a" characters, whereas the latter
|
||||
takes an appreciable time with strings longer than about 20 characters.
|
||||
|
||||
In many cases, the solution to this kind of performance issue is to use
|
||||
an atomic group or a possessive quantifier. This can often reduce mem-
|
||||
an atomic group or a possessive quantifier. This can often reduce mem-
|
||||
ory requirements as well. As another example, consider this pattern:
|
||||
|
||||
([^<]|<(?!inet))+
|
||||
|
||||
It matches from wherever it starts until it encounters "<inet" or the
|
||||
end of the data, and is the kind of pattern that might be used when
|
||||
It matches from wherever it starts until it encounters "<inet" or the
|
||||
end of the data, and is the kind of pattern that might be used when
|
||||
processing an XML file. Each iteration of the outer parentheses matches
|
||||
either one character that is not "<" or a "<" that is not followed by
|
||||
"inet". However, each time a parenthesis is processed, a backtracking
|
||||
position is passed, so this formulation uses a memory frame for each
|
||||
either one character that is not "<" or a "<" that is not followed by
|
||||
"inet". However, each time a parenthesis is processed, a backtracking
|
||||
position is passed, so this formulation uses a memory frame for each
|
||||
matched character. For a long string, a lot of memory is required. Con-
|
||||
sider now this rewritten pattern, which matches exactly the same
|
||||
sider now this rewritten pattern, which matches exactly the same
|
||||
strings:
|
||||
|
||||
([^<]++|<(?!inet))+
|
||||
|
||||
This runs much faster, because sequences of characters that do not con-
|
||||
tain "<" are "swallowed" in one item inside the parentheses, and a pos-
|
||||
sessive quantifier is used to stop any backtracking into the runs of
|
||||
non-"<" characters. This version also uses a lot less memory because
|
||||
entry to a new set of parentheses happens only when a "<" character
|
||||
that is not followed by "inet" is encountered (and we assume this is
|
||||
sessive quantifier is used to stop any backtracking into the runs of
|
||||
non-"<" characters. This version also uses a lot less memory because
|
||||
entry to a new set of parentheses happens only when a "<" character
|
||||
that is not followed by "inet" is encountered (and we assume this is
|
||||
relatively rare).
|
||||
|
||||
This example shows that one way of optimizing performance when matching
|
||||
long subject strings is to write repeated parenthesized subpatterns to
|
||||
long subject strings is to write repeated parenthesized subpatterns to
|
||||
match more than one character whenever possible.
|
||||
|
||||
SETTING RESOURCE LIMITS
|
||||
|
||||
You can set limits on the amount of processing that takes place when
|
||||
matching, and on the amount of heap memory that is used. The default
|
||||
You can set limits on the amount of processing that takes place when
|
||||
matching, and on the amount of heap memory that is used. The default
|
||||
values of the limits are very large, and unlikely ever to operate. They
|
||||
can be changed when PCRE2 is built, and they can also be set when
|
||||
pcre2_match() or pcre2_dfa_match() is called. For details of these in-
|
||||
terfaces, see the pcre2build documentation and the section entitled
|
||||
can be changed when PCRE2 is built, and they can also be set when
|
||||
pcre2_match() or pcre2_dfa_match() is called. For details of these in-
|
||||
terfaces, see the pcre2build documentation and the section entitled
|
||||
"The match context" in the pcre2api documentation.
|
||||
|
||||
The pcre2test test program has a modifier called "find_limits" which,
|
||||
if applied to a subject line, causes it to find the smallest limits
|
||||
The pcre2test test program has a modifier called "find_limits" which,
|
||||
if applied to a subject line, causes it to find the smallest limits
|
||||
that allow a pattern to match. This is done by repeatedly matching with
|
||||
different limits.
|
||||
|
||||
|
@ -9926,14 +9942,14 @@ PROCESSING TIME
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "27 July 2022" "PCRE2 10.41"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
|
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
|
@ -516,7 +525,7 @@ counter that is incremented each time around its main processing loop. If the
|
|||
value set by \fB--match-limit\fP is reached, an error occurs.
|
||||
.sp
|
||||
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
.sp
|
||||
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
|
||||
which indirectly limits the amount of memory that is used. The amount of memory
|
||||
|
@ -729,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -957,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 27 July 2022
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1111,7 +1111,8 @@ SUBJECT MODIFIERS
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1411,7 +1412,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits modifier is specified.
|
||||
find_limits or find_limits_noheap modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
|
@ -1419,8 +1420,12 @@ SUBJECT MODIFIERS
|
|||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
minimum values for each parameter that allows the match to complete
|
||||
without error. If JIT is being used, only the match limit is relevant.
|
||||
smallest value for each parameter that allows the match to complete
|
||||
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||
is used in the standard tests, because the minimum heap limit varies
|
||||
between systems. If JIT is being used, only the match limit is rele-
|
||||
vant, and the other two are automatically omitted.
|
||||
|
||||
When using this modifier, the pattern should not contain any limit set-
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
|
@ -1446,9 +1451,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching. A value of zero disables the use of any heap memory; many
|
||||
simple pattern matches can be done without using the heap, so zero is
|
||||
not an unreasonable setting.
|
||||
for matching.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
@ -1463,13 +1466,11 @@ SUBJECT MODIFIERS
|
|||
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
||||
quires a bigger vector than the default for remembering backtracking
|
||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
||||
In many cases there will be no heap memory used and therefore no addi-
|
||||
tional output. No heap memory is allocated during matching with JIT, so
|
||||
in that case the memory modifier never has any effect. For this modi-
|
||||
fier to work, the null_context modifier must not be set on both the
|
||||
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||
used only when a match requires more internal workspace than the de-
|
||||
fault allocation on the stack, so in many cases there will be no out-
|
||||
put. No heap memory is allocated during matching with JIT. For this
|
||||
modifier to work, the null_context modifier must not be set on both the
|
||||
pattern and the subject, though it can be set on one or the other.
|
||||
|
||||
Setting a starting offset
|
||||
|
@ -1518,45 +1519,46 @@ SUBJECT MODIFIERS
|
|||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits or substitute_callout modifiers.
|
||||
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||
fiers.
|
||||
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
passed as NULL, respectively, to the relevant functions.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
functions are described in the pcre2matching documentation.
|
||||
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
match.
|
||||
|
||||
|
||||
DEFAULT OUTPUT FROM pcre2test
|
||||
|
||||
This section describes the output when the normal matching function,
|
||||
This section describes the output when the normal matching function,
|
||||
pcre2_match(), is being used.
|
||||
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER-
|
||||
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
||||
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
||||
the entire substring that was inspected during the partial match; it
|
||||
may include characters before the actual match start if a lookbehind
|
||||
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
||||
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
||||
the entire substring that was inspected during the partial match; it
|
||||
may include characters before the actual match start if a lookbehind
|
||||
assertion, \K, \b, or \B was involved.)
|
||||
|
||||
For any other return, pcre2test outputs the PCRE2 negative error number
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the code unit offset of the start of the failing character is
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the code unit offset of the start of the failing character is
|
||||
also output. Here is an example of an interactive pcre2test run.
|
||||
|
||||
$ pcre2test
|
||||
|
@ -1572,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
Unset capturing substrings that are not followed by one that is set are
|
||||
not shown by pcre2test unless the allcaptures modifier is specified. In
|
||||
the following example, there are two capturing substrings, but when the
|
||||
first data line is matched, the second, unset substring is not shown.
|
||||
An "internal" unset substring is shown as "<unset>", as for the second
|
||||
first data line is matched, the second, unset substring is not shown.
|
||||
An "internal" unset substring is shown as "<unset>", as for the second
|
||||
data line.
|
||||
|
||||
re> /(a)|(b)/
|
||||
|
@ -1585,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
1: <unset>
|
||||
2: b
|
||||
|
||||
If the strings contain any non-printing characters, they are output as
|
||||
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
||||
If the strings contain any non-printing characters, they are output as
|
||||
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
||||
Otherwise they are output as \x{hh...} escapes. See below for the defi-
|
||||
nition of non-printing characters. If the aftertext modifier is set,
|
||||
the output for substring 0 is followed by the rest of the subject
|
||||
nition of non-printing characters. If the aftertext modifier is set,
|
||||
the output for substring 0 is followed by the rest of the subject
|
||||
string, identified by "0+" like this:
|
||||
|
||||
re> /cat/aftertext
|
||||
|
@ -1609,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
0: ipp
|
||||
1: pp
|
||||
|
||||
"No match" is output only if the first match attempt fails. Here is an
|
||||
example of a failure message (the offset 4 that is specified by the
|
||||
"No match" is output only if the first match attempt fails. Here is an
|
||||
example of a failure message (the offset 4 that is specified by the
|
||||
offset modifier is past the end of the subject string):
|
||||
|
||||
re> /xyz/
|
||||
|
@ -1618,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
Error -24 (bad offset value)
|
||||
|
||||
Note that whereas patterns can be continued over several lines (a plain
|
||||
">" prompt is used for continuations), subject lines may not. However
|
||||
">" prompt is used for continuations), subject lines may not. However
|
||||
newlines can be included in a subject by means of the \n escape (or \r,
|
||||
\r\n, etc., depending on the newline sequence setting).
|
||||
|
||||
|
@ -1626,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
When the alternative matching function, pcre2_dfa_match(), is used, the
|
||||
output consists of a list of all the matches that start at the first
|
||||
output consists of a list of all the matches that start at the first
|
||||
point in the subject where there is at least one match. For example:
|
||||
|
||||
re> /(tang|tangerine|tan)/
|
||||
|
@ -1635,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
|||
1: tang
|
||||
2: tan
|
||||
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero). Af-
|
||||
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero). Af-
|
||||
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
||||
lowed by the partially matching substring. Note that this is the entire
|
||||
substring that was inspected during the partial match; it may include
|
||||
substring that was inspected during the partial match; it may include
|
||||
characters before the actual match start if a lookbehind assertion, \b,
|
||||
or \B was involved. (\K is not supported for DFA matching.)
|
||||
|
||||
|
@ -1655,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
|||
1: tan
|
||||
0: tan
|
||||
|
||||
The alternative matching function does not support substring capture,
|
||||
so the modifiers that are concerned with captured substrings are not
|
||||
The alternative matching function does not support substring capture,
|
||||
so the modifiers that are concerned with captured substrings are not
|
||||
relevant.
|
||||
|
||||
|
||||
RESTARTING AFTER A PARTIAL MATCH
|
||||
|
||||
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
||||
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
||||
TIAL return, indicating that the subject partially matched the pattern,
|
||||
you can restart the match with additional subject data by means of the
|
||||
you can restart the match with additional subject data by means of the
|
||||
dfa_restart modifier. For example:
|
||||
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
|
@ -1673,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH
|
|||
data> n05\=dfa,dfa_restart
|
||||
0: n05
|
||||
|
||||
For further information about partial matching, see the pcre2partial
|
||||
For further information about partial matching, see the pcre2partial
|
||||
documentation.
|
||||
|
||||
|
||||
CALLOUTS
|
||||
|
||||
If the pattern contains any callout requests, pcre2test's callout func-
|
||||
tion is called during matching unless callout_none is specified. This
|
||||
tion is called during matching unless callout_none is specified. This
|
||||
works with both matching functions, and with JIT, though there are some
|
||||
differences in behaviour. The output for callouts with numerical argu-
|
||||
differences in behaviour. The output for callouts with numerical argu-
|
||||
ments and those with string arguments is slightly different.
|
||||
|
||||
Callouts with numerical arguments
|
||||
|
||||
By default, the callout function displays the callout number, the start
|
||||
and current positions in the subject text at the callout time, and the
|
||||
and current positions in the subject text at the callout time, and the
|
||||
next pattern item to be tested. For example:
|
||||
|
||||
--->pqrabcdef
|
||||
0 ^ ^ \d
|
||||
|
||||
This output indicates that callout number 0 occurred for a match at-
|
||||
tempt starting at the fourth character of the subject string, when the
|
||||
pointer was at the seventh character, and when the next pattern item
|
||||
was \d. Just one circumflex is output if the start and current posi-
|
||||
This output indicates that callout number 0 occurred for a match at-
|
||||
tempt starting at the fourth character of the subject string, when the
|
||||
pointer was at the seventh character, and when the next pattern item
|
||||
was \d. Just one circumflex is output if the start and current posi-
|
||||
tions are the same, or if the current position precedes the start posi-
|
||||
tion, which can happen if the callout is in a lookbehind assertion.
|
||||
|
||||
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
||||
a result of the auto_callout pattern modifier. In this case, instead of
|
||||
showing the callout number, the offset in the pattern, preceded by a
|
||||
showing the callout number, the offset in the pattern, preceded by a
|
||||
plus, is output. For example:
|
||||
|
||||
re> /\d?[A-E]\*/auto_callout
|
||||
|
@ -1730,17 +1732,17 @@ CALLOUTS
|
|||
+12 ^ ^
|
||||
0: abc
|
||||
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
output.
|
||||
|
||||
Callouts with string arguments
|
||||
|
||||
The output for a callout with a string argument is similar, except that
|
||||
instead of outputting a callout number before the position indicators,
|
||||
the callout string and its offset in the pattern string are output be-
|
||||
fore the reflection of the subject string, and the subject string is
|
||||
instead of outputting a callout number before the position indicators,
|
||||
the callout string and its offset in the pattern string are output be-
|
||||
fore the reflection of the subject string, and the subject string is
|
||||
reflected for each callout. For example:
|
||||
|
||||
re> /^ab(?C'first')cd(?C"second")ef/
|
||||
|
@ -1756,26 +1758,26 @@ CALLOUTS
|
|||
|
||||
Callout modifiers
|
||||
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line to
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line to
|
||||
change this and other parameters of the callout (see below).
|
||||
|
||||
If the callout_capture modifier is set, the current captured groups are
|
||||
output when a callout occurs. This is useful only for non-DFA matching,
|
||||
as pcre2_dfa_match() does not support capturing, so no captures are
|
||||
as pcre2_dfa_match() does not support capturing, so no captures are
|
||||
ever shown.
|
||||
|
||||
The normal callout output, showing the callout number or pattern offset
|
||||
(as described above) is suppressed if the callout_no_where modifier is
|
||||
(as described above) is suppressed if the callout_no_where modifier is
|
||||
set.
|
||||
|
||||
When using the interpretive matching function pcre2_match() without
|
||||
JIT, setting the callout_extra modifier causes additional output from
|
||||
pcre2test's callout function to be generated. For the first callout in
|
||||
a match attempt at a new starting position in the subject, "New match
|
||||
attempt" is output. If there has been a backtrack since the last call-
|
||||
When using the interpretive matching function pcre2_match() without
|
||||
JIT, setting the callout_extra modifier causes additional output from
|
||||
pcre2test's callout function to be generated. For the first callout in
|
||||
a match attempt at a new starting position in the subject, "New match
|
||||
attempt" is output. If there has been a backtrack since the last call-
|
||||
out (or start of matching if this is the first callout), "Backtrack" is
|
||||
output, followed by "No other matching paths" if the backtrack ended
|
||||
output, followed by "No other matching paths" if the backtrack ended
|
||||
the previous match attempt. For example:
|
||||
|
||||
re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
|
||||
|
@ -1812,86 +1814,86 @@ CALLOUTS
|
|||
+1 ^ a+
|
||||
No match
|
||||
|
||||
Notice that various optimizations must be turned off if you want all
|
||||
possible matching paths to be scanned. If no_start_optimize is not
|
||||
used, there is an immediate "no match", without any callouts, because
|
||||
the starting optimization fails to find "b" in the subject, which it
|
||||
knows must be present for any match. If no_auto_possess is not used,
|
||||
the "a+" item is turned into "a++", which reduces the number of back-
|
||||
Notice that various optimizations must be turned off if you want all
|
||||
possible matching paths to be scanned. If no_start_optimize is not
|
||||
used, there is an immediate "no match", without any callouts, because
|
||||
the starting optimization fails to find "b" in the subject, which it
|
||||
knows must be present for any match. If no_auto_possess is not used,
|
||||
the "a+" item is turned into "a++", which reduces the number of back-
|
||||
tracks.
|
||||
|
||||
The callout_extra modifier has no effect if used with the DFA matching
|
||||
The callout_extra modifier has no effect if used with the DFA matching
|
||||
function, or with JIT.
|
||||
|
||||
Return values from callouts
|
||||
|
||||
The default return from the callout function is zero, which allows
|
||||
The default return from the callout function is zero, which allows
|
||||
matching to continue. The callout_fail modifier can be given one or two
|
||||
numbers. If there is only one number, 1 is returned instead of 0 (caus-
|
||||
ing matching to backtrack) when a callout of that number is reached. If
|
||||
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
||||
reached and there have been at least <m> callouts. The callout_error
|
||||
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
||||
reached and there have been at least <m> callouts. The callout_error
|
||||
modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
|
||||
ing the entire matching process to be aborted. If both these modifiers
|
||||
are set for the same callout number, callout_error takes precedence.
|
||||
Note that callouts with string arguments are always given the number
|
||||
ing the entire matching process to be aborted. If both these modifiers
|
||||
are set for the same callout number, callout_error takes precedence.
|
||||
Note that callouts with string arguments are always given the number
|
||||
zero.
|
||||
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
function.
|
||||
|
||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||
cated regular expressions. For further information about callouts, see
|
||||
cated regular expressions. For further information about callouts, see
|
||||
the pcre2callout documentation.
|
||||
|
||||
|
||||
NON-PRINTING CHARACTERS
|
||||
|
||||
When pcre2test is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
and are therefore shown as hex escapes.
|
||||
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the locale modifier). In this case, the is-
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the locale modifier). In this case, the is-
|
||||
print() function is used to distinguish printing and non-printing char-
|
||||
acters.
|
||||
|
||||
|
||||
SAVING AND RESTORING COMPILED PATTERNS
|
||||
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
reload them later, subject to a number of restrictions. JIT data cannot
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
the same version of PCRE2, with the same code unit width, and must also
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character ta-
|
||||
bles. A single copy of the tables is included in the byte stream (its
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character ta-
|
||||
bles. A single copy of the tables is included in the byte stream (its
|
||||
size is 1088 bytes).
|
||||
|
||||
The functions whose names begin with pcre2_serialize_ are used for se-
|
||||
rializing and de-serializing. They are described in the pcre2serialize
|
||||
documentation. In this section we describe the features of pcre2test
|
||||
The functions whose names begin with pcre2_serialize_ are used for se-
|
||||
rializing and de-serializing. They are described in the pcre2serialize
|
||||
documentation. In this section we describe the features of pcre2test
|
||||
that can be used to test these functions.
|
||||
|
||||
Note that "serialization" in PCRE2 does not convert compiled patterns
|
||||
to an abstract format like Java or .NET. It just makes a reloadable
|
||||
Note that "serialization" in PCRE2 does not convert compiled patterns
|
||||
to an abstract format like Java or .NET. It just makes a reloadable
|
||||
byte code stream. Hence the restrictions on reloading mentioned above.
|
||||
|
||||
In pcre2test, when a pattern with push modifier is successfully com-
|
||||
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
||||
expects the next line to contain a new pattern (or command) instead of
|
||||
In pcre2test, when a pattern with push modifier is successfully com-
|
||||
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
||||
expects the next line to contain a new pattern (or command) instead of
|
||||
a subject line. By contrast, the pushcopy modifier causes a copy of the
|
||||
compiled pattern to be stacked, leaving the original available for im-
|
||||
mediate matching. By using push and/or pushcopy, a number of patterns
|
||||
can be compiled and retained. These modifiers are incompatible with
|
||||
compiled pattern to be stacked, leaving the original available for im-
|
||||
mediate matching. By using push and/or pushcopy, a number of patterns
|
||||
can be compiled and retained. These modifiers are incompatible with
|
||||
posix, and control modifiers that act at match time are ignored (with a
|
||||
message) for the stacked patterns. The jitverify modifier applies only
|
||||
message) for the stacked patterns. The jitverify modifier applies only
|
||||
at compile time.
|
||||
|
||||
The command
|
||||
|
@ -1899,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
#save <filename>
|
||||
|
||||
causes all the stacked patterns to be serialized and the result written
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
command
|
||||
|
||||
#load <filename>
|
||||
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
||||
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
||||
however, permitted. Here is an example that saves and reloads two pat-
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
||||
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
||||
however, permitted. Here is an example that saves and reloads two pat-
|
||||
terns.
|
||||
|
||||
/abc/push
|
||||
|
@ -1926,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
#pop jit,bincode
|
||||
abc
|
||||
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
which is different behaviour from when it is used on a pattern.
|
||||
|
||||
The #popcopy command is analogous to the pushcopy modifier in that it
|
||||
The #popcopy command is analogous to the pushcopy modifier in that it
|
||||
makes current a copy of the topmost stack pattern, leaving the original
|
||||
still on the stack.
|
||||
|
||||
|
@ -1949,5 +1951,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 12 January 2022
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -205,9 +205,6 @@ point. */
|
|||
* Global variables *
|
||||
*************************************************/
|
||||
|
||||
/* Jeffrey Friedl has some debugging requirements that are not part of the
|
||||
regular code. */
|
||||
|
||||
static const char *colour_string = "1;31";
|
||||
static const char *colour_option = NULL;
|
||||
static const char *dee_option = NULL;
|
||||
|
@ -220,6 +217,10 @@ static const char *output_text = NULL;
|
|||
|
||||
static char *main_buffer = NULL;
|
||||
|
||||
static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */
|
||||
static int printname_colon = ':'; /* Changed to 0 for -Z */
|
||||
static int printname_hyphen = '-'; /* Changed to 0 for -Z */
|
||||
|
||||
static int after_context = 0;
|
||||
static int before_context = 0;
|
||||
static int binary_files = BIN_BINARY;
|
||||
|
@ -483,6 +484,7 @@ static option_item optionlist[] = {
|
|||
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
||||
{ OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
|
||||
{ OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
|
||||
{ OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" },
|
||||
{ OP_NODATA, 0, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
|
@ -1773,7 +1775,7 @@ if (after_context > 0 && lastmatchnumber > 0)
|
|||
{
|
||||
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
|
||||
if (ellength == 0 && pp == main_buffer + bufsize) break;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
lastmatchrestart = pp;
|
||||
|
@ -2439,10 +2441,10 @@ if (pid == 0)
|
|||
}
|
||||
else if (pid > 0)
|
||||
{
|
||||
(void)fflush(stdout);
|
||||
(void)fflush(stdout);
|
||||
(void)waitpid(pid, &result, 0);
|
||||
(void)fflush(stdout);
|
||||
}
|
||||
(void)fflush(stdout);
|
||||
}
|
||||
#endif /* End Windows/VMS/other handling */
|
||||
|
||||
free(args);
|
||||
|
@ -2730,7 +2732,9 @@ while (ptr < endptr)
|
|||
|
||||
else if (filenames == FN_MATCH_ONLY)
|
||||
{
|
||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
||||
fprintf(stdout, "%s", printname);
|
||||
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||
else fprintf(stdout, "%s", printname_nl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2749,7 +2753,8 @@ while (ptr < endptr)
|
|||
{
|
||||
PCRE2_SIZE oldstartoffset;
|
||||
|
||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_colon);
|
||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||
|
||||
/* Handle --line-offsets */
|
||||
|
@ -2871,7 +2876,8 @@ while (ptr < endptr)
|
|||
while (lastmatchrestart < p)
|
||||
{
|
||||
char *pp = lastmatchrestart;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
|
@ -2912,7 +2918,8 @@ while (ptr < endptr)
|
|||
{
|
||||
int ellength;
|
||||
char *pp = p;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
FWRITE_IGNORE(p, 1, pp - p, stdout);
|
||||
|
@ -2926,7 +2933,8 @@ while (ptr < endptr)
|
|||
if (after_context > 0 || before_context > 0)
|
||||
endhyphenpending = TRUE;
|
||||
|
||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_colon);
|
||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||
|
||||
/* In multiline mode, or if colouring, we have to split the line(s) up
|
||||
|
@ -3131,7 +3139,9 @@ were none. If we found a match, we won't have got this far. */
|
|||
|
||||
if (filenames == FN_NOMATCH_ONLY)
|
||||
{
|
||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
||||
fprintf(stdout, "%s", printname);
|
||||
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||
else fprintf(stdout, "%s", printname_nl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -3142,7 +3152,7 @@ if (count_only && !quiet)
|
|||
if (count > 0 || !omit_zero_count)
|
||||
{
|
||||
if (printname != NULL && filenames != FN_NONE)
|
||||
fprintf(stdout, "%s:", printname);
|
||||
fprintf(stdout, "%s%c", printname, printname_colon);
|
||||
fprintf(stdout, "%lu" STDOUT_NL, count);
|
||||
counts_printed++;
|
||||
}
|
||||
|
@ -3528,8 +3538,6 @@ switch(letter)
|
|||
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
||||
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
|
||||
case 'v': invert = TRUE; break;
|
||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||
|
||||
case 'V':
|
||||
{
|
||||
|
@ -3540,6 +3548,10 @@ switch(letter)
|
|||
pcre2grep_exit(0);
|
||||
break;
|
||||
|
||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||
case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
|
||||
pcre2grep_exit(usage(2));
|
||||
|
@ -4259,8 +4271,6 @@ if (DEE_option != NULL)
|
|||
|
||||
(void)pcre2_set_compile_extra_options(compile_context, extra_options);
|
||||
|
||||
/* Check the values for Jeffrey Friedl's debugging options. */
|
||||
|
||||
/* If use_jit is set, check whether JIT is available. If not, do not try
|
||||
to use JIT. */
|
||||
|
||||
|
|
|
@ -991,3 +991,22 @@ RC=0
|
|||
---------------------------- Test 134 -----------------------------
|
||||
=AB3CD5=
|
||||
RC=0
|
||||
---------------------------- Test 135 -----------------------------
|
||||
./testdata/grepinputv@The word is cat in this line
|
||||
RC=0
|
||||
./testdata/grepinputv@./testdata/grepinputv@RC=0
|
||||
./testdata/grepinputv@This line contains \E and (regex) *meta* [characters].
|
||||
./testdata/grepinputv@The word is cat in this line
|
||||
./testdata/grepinputv@The caterpillar sat on the mat
|
||||
RC=0
|
||||
testdata/grepinputM |