Implement -Z in pcre2grep and update documentation

This commit is contained in:
Philip Hazel 2022-07-30 17:41:49 +01:00
parent cc5e121c8e
commit 8b133fa0ba
16 changed files with 994 additions and 868 deletions

View File

@ -49,6 +49,8 @@ tests.
tests run by 'make check', but can be run manually. The current output is from tests run by 'make check', but can be run manually. The current output is from
a 64-bit system. a 64-bit system.
13. Implemented -Z aka --null in pcre2grep.
Version 10.40 15-April-2022 Version 10.40 15-April-2022
--------------------------- ---------------------------

View File

@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u" diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub" diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
# Some tests involve NUL characters. It seems impossible to handle them easily
# in many operating systems. An earlier version of this script used sed to
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
# even when using GNU sed. A user suggested using tr instead, which
# necessitates translating to a single character. However, on (some versions
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
if [ -x /usr/xpg4/bin/tr ] ; then
tr=/usr/xpg4/bin/tr
else
tr=tr
fi
# If this test is being run from "make check", $srcdir will be set. If not, set # If this test is being run from "make check", $srcdir will be set. If not, set
# it to the current or parent directory, whichever one contains the test data. # it to the current or parent directory, whichever one contains the test data.
# Subsequently, we run most of the pcre2grep tests in the source directory so # Subsequently, we run most of the pcre2grep tests in the source directory so
@ -685,6 +701,16 @@ echo "---------------------------- Test 134 -----------------------------" >>tes
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1 (cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
echo "RC=$?" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
echo "RC=$?" >>testtrygrep
# Now compare the results. # Now compare the results.
$cf $srcdir/testdata/grepoutput testtrygrep $cf $srcdir/testdata/grepoutput testtrygrep
@ -759,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
# This next test involves NUL characters. It seems impossible to handle them
# easily in many operating systems. An earlier version of this script used sed
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
# even when using GNU sed. A user suggested using tr instead, which
# necessitates translating to a single character (@). However, on (some
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
if [ -x /usr/xpg4/bin/tr ] ; then
tr=/usr/xpg4/bin/tr
else
tr=tr
fi
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
printf 'abc\0def' >testNinputgrep printf 'abc\0def' >testNinputgrep
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep

View File

@ -121,6 +121,7 @@ environment, for example.
pcre2_substring.c pcre2_substring.c
pcre2_tables.c pcre2_tables.c
pcre2_ucd.c pcre2_ucd.c
pcre2_ucptables.c
pcre2_valid_utf.c pcre2_valid_utf.c
pcre2_xclass.c pcre2_xclass.c
@ -373,7 +374,7 @@ Otherwise:
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
have been created. have been created.
2. Edit RunTest.bat to indentify the full or relative location of 2. Edit RunTest.bat to identify the full or relative location of
the pcre2 source (in which the testdata folder resides), e.g.: the pcre2 source (in which the testdata folder resides), e.g.:
set srcdir=C:\pcre2\pcre2-10.00 set srcdir=C:\pcre2\pcre2-10.00

View File

@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
You can access the archives and also subscribe or manage your subscription You can access the archives and also subscribe or manage your subscription
here: here:
https://groups.google.com/pcre2-dev https://groups.google.com/g/pcre2-dev
Please read the NEWS file if you are upgrading from a previous release. The Please read the NEWS file if you are upgrading from a previous release. The
contents of this README file are: contents of this README file are:
@ -375,7 +375,8 @@ library. They are also documented in the pcre2build man page.
necessary to specify something like LIBS="-lncurses" as well. This is necessary to specify something like LIBS="-lncurses" as well. This is
because, to quote the readline INSTALL, "Readline uses the termcap functions, because, to quote the readline INSTALL, "Readline uses the termcap functions,
but does not link with the termcap or curses library itself, allowing but does not link with the termcap or curses library itself, allowing
applications which link with readline the to choose an appropriate library." applications which link with readline the option to choose an appropriate
library."
If you get error messages about missing functions tgetstr, tgetent, tputs, If you get error messages about missing functions tgetstr, tgetent, tputs,
tgetflag, or tgoto, this is the problem, and linking with the ncurses library tgetflag, or tgoto, this is the problem, and linking with the ncurses library
should fix it. should fix it.
@ -400,10 +401,10 @@ library. They are also documented in the pcre2build man page.
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
be created. This is normally run under valgrind or used when PCRE2 is be created. This is normally run under valgrind or used when PCRE2 is
compiled with address sanitizing enabled. It calls the fuzzing function and compiled with address sanitizing enabled. It calls the fuzzing function and
outputs information about it is doing. The input strings are specified by outputs information about what it is doing. The input strings are specified
arguments: if an argument starts with "=" the rest of it is a literal input by arguments: if an argument starts with "=" the rest of it is a literal
string. Otherwise, it is assumed to be a file name, and the contents of the input string. Otherwise, it is assumed to be a file name, and the contents
file are the test string. of the file are the test string.
. Releases before 10.30 could be compiled with --disable-stack-for-recursion, . Releases before 10.30 could be compiled with --disable-stack-for-recursion,
which caused pcre2_match() to use individual blocks on the heap for which caused pcre2_match() to use individual blocks on the heap for
@ -695,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
different code unit widths. different code unit widths.
Test 15 contains a number of tests that must not be run with JIT. They check, Test 15 contains a number of tests that must not be run with JIT. They check,
among other non-JIT things, the match-limiting features of the intepretive among other non-JIT things, the match-limiting features of the interpretive
matcher. matcher.
Test 16 is run only when JIT support is not available. It checks that an Test 16 is run only when JIT support is not available. It checks that an

View File

@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
documentation for more details). If the limit is reached, the negative error documentation for more details). If the limit is reached, the negative error
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2 code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
is built; if it is not, the default is set very large and is essentially is built; if it is not, the default is set very large and is essentially
"unlimited". unlimited.
</P> </P>
<P> <P>
A value for the heap limit may also be supplied by an item at the start of a A value for the heap limit may also be supplied by an item at the start of a
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
limit is set, less than the default. limit is set, less than the default.
</P> </P>
<P> <P>
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system The <b>pcre2_match()</b> function always needs some heap memory, so setting a
stack for recording backtracking points. The more nested backtracking points value of zero guarantees a "heap limit exceeded" error. Details of how
there are (that is, the deeper the search tree), the more memory is needed. <b>pcre2_match()</b> uses the heap are given in the
Heap memory is used only if the initial vector is too small. If the heap limit <a href="pcre2perform.html"><b>pcre2perform</b></a>
is set to a value less than 21 (in particular, zero) no heap memory will be documentation.
used. In this case, only patterns that do not have a lot of nested backtracking
can be successfully processed.
</P> </P>
<P> <P>
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
when processing pattern recursions, lookarounds, or atomic groups, and only if processing pattern recursions, lookarounds, or atomic groups, and only if this
this is not big enough is heap memory used. In this case, too, setting a value is not big enough is heap memory used. In this case, setting a value of zero
of zero disables the use of the heap. disables the use of the heap.
<br> <br>
<br> <br>
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b> <b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
<br> <br>
<br> <br>
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>. This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
Each time a nested backtracking point is passed, a new memory "frame" is used Each time a nested backtracking point is passed, a new memory frame is used
to remember the state of matching at that point. Thus, this parameter to remember the state of matching at that point. Thus, this parameter
indirectly limits the amount of memory that is used in a match. However, indirectly limits the amount of memory that is used in a match. However,
because the size of each memory "frame" depends on the number of capturing because the size of each memory frame depends on the number of capturing
parentheses, the actual memory limit varies from pattern to pattern. This limit parentheses, the actual memory limit varies from pattern to pattern. This limit
was more useful in versions before 10.30, where function recursion was used for was more useful in versions before 10.30, where function recursion was used for
backtracking. backtracking.
@ -3148,11 +3146,11 @@ The backtracking match limit was reached.
<pre> <pre>
PCRE2_ERROR_NOMEMORY PCRE2_ERROR_NOMEMORY
</pre> </pre>
If a pattern contains many nested backtracking points, heap memory is used to Heap memory is used to remember backtracking points. This error is given when
remember them. This error is given when the memory allocation function (default the memory allocation function (default or custom) fails. Note that a different
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
<pre> <pre>
PCRE2_ERROR_NULL PCRE2_ERROR_NULL
</pre> </pre>
@ -4020,9 +4018,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC42" href="#TOC1">REVISION</a><br> <br><a name="SEC42" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 14 December 2021 Last updated: 27 July 2022
<br> <br>
Copyright &copy; 1997-2021 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -284,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
counting is done differently). counting is done differently).
</P> </P>
<P> <P>
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system The <b>pcre2_match()</b> function uses heap memory to record backtracking
stack to record backtracking points. The more nested backtracking points there points. The more nested backtracking points there are (that is, the deeper the
are (that is, the deeper the search tree), the more memory is needed. If the search tree), the more memory is needed. There is an upper limit, specified in
initial vector is not large enough, heap memory is used, up to a certain limit, kibibytes (units of 1024 bytes). This limit can be changed at run time, as
which is specified in kibibytes (units of 1024 bytes). The limit can be changed described in the
at run time, as described in the
<a href="pcre2api.html"><b>pcre2api</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
documentation. The default limit (in effect unlimited) is 20 million. You can documentation. The default limit (in effect unlimited) is 20 million. You can
change this by a setting such as change this by a setting such as
@ -609,16 +608,16 @@ give a warning.
<P> <P>
Philip Hazel Philip Hazel
<br> <br>
University Computing Service Retired from University Computing Service
<br> <br>
Cambridge, England. Cambridge, England.
<br> <br>
</P> </P>
<br><a name="SEC26" href="#TOC1">REVISION</a><br> <br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 08 December 2021 Last updated: 27 July 2022
<br> <br>
Copyright &copy; 1997-2021 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -71,13 +71,15 @@ For example:
<pre> <pre>
pcre2grep some-pattern file1 - file3 pcre2grep some-pattern file1 - file3
</pre> </pre>
Input files are searched line by line. By default, each line that matches a By default, input files are searched line by line. Each line that matches a
pattern is copied to the standard output, and if there is more than one file, pattern is copied to the standard output, and if there is more than one file,
the file name is output at the start of each line, followed by a colon. the file name is output at the start of each line, followed by a colon.
However, there are options that can change how <b>pcre2grep</b> behaves. In However, there are options that can change how <b>pcre2grep</b> behaves. For
particular, the <b>-M</b> option makes it possible to search for strings that example, the <b>-M</b> option makes it possible to search for strings that span
span line boundaries. What defines a line boundary is controlled by the line boundaries. What defines a line boundary is controlled by the <b>-N</b>
<b>-N</b> (<b>--newline</b>) option. (<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
not file names are shown, and the <b>-Z</b> option changes the file name
terminator to a zero byte.
</P> </P>
<P> <P>
The amount of memory used for buffering files that are being scanned is The amount of memory used for buffering files that are being scanned is
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
lines are output if the next match or the end of the file is reached, or if the lines are output if the next match or the end of the file is reached, or if the
processing buffer size has been set too small. If file names and/or line processing buffer size has been set too small. If file names and/or line
numbers are being output, a hyphen separator is used instead of a colon for the numbers are being output, a hyphen separator is used instead of a colon for the
context lines. A line containing "--" is output between each group of lines, context lines (the <b>-Z</b> option can be used to change the file name
unless they are in fact contiguous in the input file. The value of <i>number</i> terminator to a zero byte). A line containing "--" is output between each group
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored. of lines, unless they are in fact contiguous in the input file. The value of
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
<b>-A</b> is ignored.
</P> </P>
<P> <P>
<b>-a</b>, <b>--text</b> <b>-a</b>, <b>--text</b>
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
lines are output if the previous match or the start of the file is within lines are output if the previous match or the start of the file is within
<i>number</i> lines, or if the processing buffer size has been set too small. If <i>number</i> lines, or if the processing buffer size has been set too small. If
file names and/or line numbers are being output, a hyphen separator is used file names and/or line numbers are being output, a hyphen separator is used
instead of a colon for the context lines. A line containing "--" is output instead of a colon for the context lines (the <b>-Z</b> option can be used to
between each group of lines, unless they are in fact contiguous in the input change the file name terminator to a zero byte). A line containing "--" is
file. The value of <i>number</i> is expected to be relatively small. When output between each group of lines, unless they are in fact contiguous in the
input file. The value of <i>number</i> is expected to be relatively small. When
<b>-c</b> is used, <b>-B</b> is ignored. <b>-c</b> is used, <b>-B</b> is ignored.
</P> </P>
<P> <P>
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
<P> <P>
<b>-H</b>, <b>--with-filename</b> <b>-H</b>, <b>--with-filename</b>
Force the inclusion of the file name at the start of output lines when Force the inclusion of the file name at the start of output lines when
searching a single file. By default, the file name is not shown in this case. searching a single file. The file name is not normally shown in this case.
For matching lines, the file name is followed by a colon; for context lines, a By default, for matching lines, the file name is followed by a colon; for
hyphen separator is used. If a line number is also being output, it follows the context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
file name. When the <b>-M</b> option causes a pattern to match more than one change the terminator to a zero byte. If a line number is also being output,
line, only the first is preceded by the file name. This option overrides any it follows the file name. When the <b>-M</b> option causes a pattern to match
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options. more than one line, only the first is preceded by the file name. This option
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
</P> </P>
<P> <P>
<b>-h</b>, <b>--no-filename</b> <b>-h</b>, <b>--no-filename</b>
Suppress the output file names when searching multiple files. By default, Suppress the output file names when searching multiple files. File names are
file names are shown when multiple files are searched. For matching lines, the normally shown when multiple files are searched. By default, for matching
file name is followed by a colon; for context lines, a hyphen separator is used. lines, the file name is followed by a colon; for context lines, a hyphen
If a line number is also being output, it follows the file name. This option separator is used. The <b>-Z</b> option can be used to change the terminator to
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options. a zero byte. If a line number is also being output, it follows the file name.
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
</P> </P>
<P> <P>
<b>--heap-limit</b>=<i>number</i> <b>--heap-limit</b>=<i>number</i>
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
<b>-L</b>, <b>--files-without-match</b> <b>-L</b>, <b>--files-without-match</b>
Instead of outputting lines from the files, just output the names of the files Instead of outputting lines from the files, just output the names of the files
that do not contain any lines that would have been output. Each file name is that do not contain any lines that would have been output. Each file name is
output once, on a separate line. This option overrides any previous <b>-H</b>, output once, on a separate line by default, but if the <b>-Z</b> option is set,
<b>-h</b>, or <b>-l</b> options. they are separated by zero bytes instead of newlines. This option overrides any
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
</P> </P>
<P> <P>
<b>-l</b>, <b>--files-with-matches</b> <b>-l</b>, <b>--files-with-matches</b>
Instead of outputting lines from the files, just output the names of the files Instead of outputting lines from the files, just output the names of the files
containing lines that would have been output. Each file name is output once, on containing lines that would have been output. Each file name is output once, on
a separate line. Searching normally stops as soon as a matching line is found a separate line, but if the <b>-Z</b> option is set, they are separated by zero
in a file. However, if the <b>-c</b> (count) option is also used, matching bytes instead of newlines. Searching normally stops as soon as a matching line
continues in order to obtain the correct count, and those files that have at is found in a file. However, if the <b>-c</b> (count) option is also used,
least one match are listed along with their counts. Using this option with matching continues in order to obtain the correct count, and those files that
<b>-c</b> is a way of suppressing the listing of files with no matches that have at least one match are listed along with their counts. Using this option
with <b>-c</b> is a way of suppressing the listing of files with no matches that
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>, occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
<b>-h</b>, or <b>-L</b> options. <b>-h</b>, or <b>-L</b> options.
</P> </P>
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
<br> <br>
<br> <br>
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
1024 bytes), the amount of heap memory that may be used for matching. Heap 1024 bytes), the maximum amount of heap memory that may be used for matching.
memory is needed only if matching the pattern requires a significant number of
nested backtracking points to be remembered. This parameter can be set to zero
to forbid the use of heap memory altogether.
<br> <br>
<br> <br>
The <b>--depth-limit</b> option limits the depth of nested backtracking points, The <b>--depth-limit</b> option limits the depth of nested backtracking points,
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
matched against the contents of files; it does not apply to patterns specified matched against the contents of files; it does not apply to patterns specified
by any of the <b>--include</b> or <b>--exclude</b> options. by any of the <b>--include</b> or <b>--exclude</b> options.
</P> </P>
<P>
<b>-Z</b>, <b>--null</b>
Terminate file names in the regular output with a zero byte (the NUL
character) instead of what would normally appear. This is useful when file
names contain unusual characters such as colons, hyphens, or even newlines. The
option does not apply to file names in error messages.
</P>
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br> <br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
<P> <P>
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
@ -1053,9 +1066,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC16" href="#TOC1">REVISION</a><br> <br><a name="SEC16" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 31 August 2021 Last updated: 30 July 2022
<br> <br>
Copyright &copy; 1997-2021 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
The maximum length of a string argument to a callout is the largest number a The maximum length of a string argument to a callout is the largest number a
32-bit unsigned integer can hold. 32-bit unsigned integer can hold.
</P> </P>
<P>
The maximum amount of heap memory used for matching is controlled by the heap
limit, which can be set in a pattern or in a match context. The default is a
very large number, effectively unlimited.
</P>
<br><b> <br><b>
AUTHOR AUTHOR
</b><br> </b><br>
<P> <P>
Philip Hazel Philip Hazel
<br> <br>
University Computing Service Retired from University Computing Service
<br> <br>
Cambridge, England. Cambridge, England.
<br> <br>
@ -86,9 +91,9 @@ Cambridge, England.
REVISION REVISION
</b><br> </b><br>
<P> <P>
Last updated: 02 February 2019 Last updated: 26 July 2022
<br> <br>
Copyright &copy; 1997-2019 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
uses very little system stack at run time. In earlier releases recursive uses very little system stack at run time. In earlier releases recursive
function calls could use a great deal of stack, and this could cause problems, function calls could use a great deal of stack, and this could cause problems,
but this usage has been eliminated. Backtracking positions are now explicitly but this usage has been eliminated. Backtracking positions are now explicitly
remembered in memory frames controlled by the code. An initial 20KiB vector of remembered in memory frames controlled by the code.
frames is allocated on the system stack (enough for about 100 frames for small </P>
patterns), but if this is insufficient, heap memory is used. The amount of heap <P>
memory can be limited; if the limit is set to zero, only the initial stack The size of each frame depends on the size of pointer variables and the number
vector is used. Rewriting patterns to be time-efficient, as described below, of capturing parenthesized groups in the pattern being matched. On a 64-bit
may also reduce the memory requirements. system the frame size for a pattern with no captures is 128 bytes. For each
capturing group the size increases by 16 bytes.
</P>
<P>
Until release 10.41, an initial 20KiB frames vector was allocated on the system
stack, but this still caused some issues for multi-thread applications where
each thread has a very small stack. From release 10.41 backtracking memory
frames are always held in heap memory. An initial heap allocation is obtained
the first time any match data block is passed to <b>pcre2_match()</b>. This is
remembered with the match data block and re-used if that block is used for
another match. It is freed when the match data block itself is freed.
</P>
<P>
The size of the initial block is the larger of 20KiB or ten times the pattern's
frame size, unless the heap limit is less than this, in which case the heap
limit is used. If the initial block proves to be too small during matching, it
is replaced by a larger block, subject to the heap limit. The heap limit is
checked only when a new block is to be allocated. Reducing the heap limit
between calls to <b>pcre2_match()</b> with the same match data block does not
affect the saved block.
</P> </P>
<P> <P>
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
<P> <P>
Philip Hazel Philip Hazel
<br> <br>
University Computing Service Retired from University Computing Service
<br> <br>
Cambridge, England. Cambridge, England.
<br> <br>
</P> </P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br> <br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 03 February 2019 Last updated: 27 July 2022
<br> <br>
Copyright &copy; 1997-2019 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -1241,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
copy=&#60;number or name&#62; copy captured substring copy=&#60;number or name&#62; copy captured substring
depth_limit=&#60;n&#62; set a depth limit depth_limit=&#60;n&#62; set a depth limit
dfa use <b>pcre2_dfa_match()</b> dfa use <b>pcre2_dfa_match()</b>
find_limits find match and depth limits find_limits find heap, match and depth limits
find_limits_noheap find match and depth limits
get=&#60;number or name&#62; extract captured substring get=&#60;number or name&#62; extract captured substring
getall extract all captured substrings getall extract all captured substrings
/g global global matching /g global global matching
@ -1564,7 +1565,7 @@ Setting heap, match, and depth limits
<P> <P>
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
the appropriate limits in the match context. These values are ignored when the the appropriate limits in the match context. These values are ignored when the
<b>find_limits</b> modifier is specified. <b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
</P> </P>
<br><b> <br><b>
Finding minimum limits Finding minimum limits
@ -1574,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
calls the relevant matching function several times, setting different values in calls the relevant matching function several times, setting different values in
the match context via <b>pcre2_set_heap_limit()</b>, the match context via <b>pcre2_set_heap_limit()</b>,
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds <b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
the minimum values for each parameter that allows the match to complete without the smallest value for each parameter that allows the match to complete without
error. If JIT is being used, only the match limit is relevant. a "limit exceeded" error. The match itself may succeed or fail. An alternative
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
standard tests, because the minimum heap limit varies between systems. If JIT
is being used, only the match limit is relevant, and the other two are
automatically omitted.
</P> </P>
<P> <P>
When using this modifier, the pattern should not contain any limit settings When using this modifier, the pattern should not contain any limit settings
@ -1603,9 +1608,7 @@ overall amount of computing resource that is used.
</P> </P>
<P> <P>
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
(units of 1024 bytes), limits the amount of heap memory used for matching. A (units of 1024 bytes), limits the amount of heap memory used for matching.
value of zero disables the use of any heap memory; many simple pattern matches
can be done without using the heap, so zero is not an unreasonable setting.
</P> </P>
<br><b> <br><b>
Showing MARK names Showing MARK names
@ -1623,12 +1626,10 @@ Showing memory usage
<P> <P>
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
memory allocation and freeing calls that occur during a call to memory allocation and freeing calls that occur during a call to
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
requires a bigger vector than the default for remembering backtracking points is used only when a match requires more internal workspace than the default
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In allocation on the stack, so in many cases there will be no output. No heap
many cases there will be no heap memory used and therefore no additional memory is allocated during matching with JIT. For this modifier to work, the
output. No heap memory is allocated during matching with JIT, so in that case
the <b>memory</b> modifier never has any effect. For this modifier to work, the
<b>null_context</b> modifier must not be set on both the pattern and the <b>null_context</b> modifier must not be set on both the pattern and the
subject, though it can be set on one or the other. subject, though it can be set on one or the other.
</P> </P>
@ -1690,7 +1691,8 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
testing that the matching and substitution functions behave correctly in this testing that the matching and substitution functions behave correctly in this
case (they use default values). This modifier cannot be used with the case (they use default values). This modifier cannot be used with the
<b>find_limits</b> or <b>substitute_callout</b> modifiers. <b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
modifiers.
</P> </P>
<P> <P>
Similarly, for testing purposes, if the <b>null_subject</b> or Similarly, for testing purposes, if the <b>null_subject</b> or
@ -2141,7 +2143,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br> <br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 12 January 2022 Last updated: 27 July 2022
<br> <br>
Copyright &copy; 1997-2022 University of Cambridge. Copyright &copy; 1997-2022 University of Cambridge.
<br> <br>

View File

@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS
pcre2jit documentation for more details). If the limit is reached, the pcre2jit documentation for more details). If the limit is reached, the
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
limit can be set when PCRE2 is built; if it is not, the default is set limit can be set when PCRE2 is built; if it is not, the default is set
very large and is essentially "unlimited". very large and is essentially unlimited.
A value for the heap limit may also be supplied by an item at the start A value for the heap limit may also be supplied by an item at the start
of a pattern of the form of a pattern of the form
@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS
less ddd is less than the limit set by the caller of pcre2_match() or, less ddd is less than the limit set by the caller of pcre2_match() or,
if no such limit is set, less than the default. if no such limit is set, less than the default.
The pcre2_match() function starts out using a 20KiB vector on the sys- The pcre2_match() function always needs some heap memory, so setting a
tem stack for recording backtracking points. The more nested backtrack- value of zero guarantees a "heap limit exceeded" error. Details of how
ing points there are (that is, the deeper the search tree), the more pcre2_match() uses the heap are given in the pcre2perform documenta-
memory is needed. Heap memory is used only if the initial vector is tion.
too small. If the heap limit is set to a value less than 21 (in partic-
ular, zero) no heap memory will be used. In this case, only patterns
that do not have a lot of nested backtracking can be successfully pro-
cessed.
Similarly, for pcre2_dfa_match(), a vector on the system stack is used For pcre2_dfa_match(), a vector on the system stack is used when pro-
when processing pattern recursions, lookarounds, or atomic groups, and cessing pattern recursions, lookarounds, or atomic groups, and only if
only if this is not big enough is heap memory used. In this case, too, this is not big enough is heap memory used. In this case, setting a
setting a value of zero disables the use of the heap. value of zero disables the use of the heap.
int pcre2_set_match_limit(pcre2_match_context *mcontext, int pcre2_set_match_limit(pcre2_match_context *mcontext,
uint32_t value); uint32_t value);
@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS
This parameter limits the depth of nested backtracking in This parameter limits the depth of nested backtracking in
pcre2_match(). Each time a nested backtracking point is passed, a new pcre2_match(). Each time a nested backtracking point is passed, a new
memory "frame" is used to remember the state of matching at that point. memory frame is used to remember the state of matching at that point.
Thus, this parameter indirectly limits the amount of memory that is Thus, this parameter indirectly limits the amount of memory that is
used in a match. However, because the size of each memory "frame" de- used in a match. However, because the size of each memory frame depends
pends on the number of capturing parentheses, the actual memory limit on the number of capturing parentheses, the actual memory limit varies
varies from pattern to pattern. This limit was more useful in versions from pattern to pattern. This limit was more useful in versions before
before 10.30, where function recursion was used for backtracking. 10.30, where function recursion was used for backtracking.
The depth limit is not relevant, and is ignored, when matching is done The depth limit is not relevant, and is ignored, when matching is done
using JIT compiled code. However, it is supported by pcre2_dfa_match(), using JIT compiled code. However, it is supported by pcre2_dfa_match(),
@ -3051,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match()
PCRE2_ERROR_NOMEMORY PCRE2_ERROR_NOMEMORY
If a pattern contains many nested backtracking points, heap memory is Heap memory is used to remember backtracking points. This error is
used to remember them. This error is given when the memory allocation given when the memory allocation function (default or custom) fails.
function (default or custom) fails. Note that a different error, Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the
PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. tion fails.
PCRE2_ERROR_NULL PCRE2_ERROR_NULL
@ -3860,8 +3856,8 @@ AUTHOR
REVISION REVISION
Last updated: 14 December 2021 Last updated: 27 July 2022
Copyright (c) 1997-2021 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -4118,41 +4114,40 @@ LIMITING PCRE2 RESOURCE USAGE
pcre2_dfa_match() matching function, and to JIT matching (though the pcre2_dfa_match() matching function, and to JIT matching (though the
counting is done differently). counting is done differently).
The pcre2_match() function starts out using a 20KiB vector on the sys- The pcre2_match() function uses heap memory to record backtracking
tem stack to record backtracking points. The more nested backtracking points. The more nested backtracking points there are (that is, the
points there are (that is, the deeper the search tree), the more memory deeper the search tree), the more memory is needed. There is an upper
is needed. If the initial vector is not large enough, heap memory is limit, specified in kibibytes (units of 1024 bytes). This limit can be
used, up to a certain limit, which is specified in kibibytes (units of changed at run time, as described in the pcre2api documentation. The
1024 bytes). The limit can be changed at run time, as described in the default limit (in effect unlimited) is 20 million. You can change this
pcre2api documentation. The default limit (in effect unlimited) is 20 by a setting such as
million. You can change this by a setting such as
--with-heap-limit=500 --with-heap-limit=500
which limits the amount of heap to 500 KiB. This limit applies only to which limits the amount of heap to 500 KiB. This limit applies only to
interpretive matching in pcre2_match() and pcre2_dfa_match(), which may interpretive matching in pcre2_match() and pcre2_dfa_match(), which may
also use the heap for internal workspace when processing complicated also use the heap for internal workspace when processing complicated
patterns. This limit does not apply when JIT (which has its own memory patterns. This limit does not apply when JIT (which has its own memory
arrangements) is used. arrangements) is used.
You can also explicitly limit the depth of nested backtracking in the You can also explicitly limit the depth of nested backtracking in the
pcre2_match() interpreter. This limit defaults to the value that is set pcre2_match() interpreter. This limit defaults to the value that is set
for --with-match-limit. You can set a lower default limit by adding, for --with-match-limit. You can set a lower default limit by adding,
for example, for example,
--with-match-limit-depth=10000 --with-match-limit-depth=10000
to the configure command. This value can be overridden at run time. to the configure command. This value can be overridden at run time.
This depth limit indirectly limits the amount of heap memory that is This depth limit indirectly limits the amount of heap memory that is
used, but because the size of each backtracking "frame" depends on the used, but because the size of each backtracking "frame" depends on the
number of capturing parentheses in a pattern, the amount of heap that number of capturing parentheses in a pattern, the amount of heap that
is used before the limit is reached varies from pattern to pattern. is used before the limit is reached varies from pattern to pattern.
This limit was more useful in versions before 10.30, where function re- This limit was more useful in versions before 10.30, where function re-
cursion was used for backtracking. cursion was used for backtracking.
As well as applying to pcre2_match(), the depth limit also controls the As well as applying to pcre2_match(), the depth limit also controls the
depth of recursive function calls in pcre2_dfa_match(). These are used depth of recursive function calls in pcre2_dfa_match(). These are used
for lookaround assertions, atomic groups, and recursion within pat- for lookaround assertions, atomic groups, and recursion within pat-
terns. The limit does not apply to JIT matching. terns. The limit does not apply to JIT matching.
@ -4160,67 +4155,67 @@ CREATING CHARACTER TABLES AT BUILD TIME
PCRE2 uses fixed tables for processing characters whose code points are PCRE2 uses fixed tables for processing characters whose code points are
less than 256. By default, PCRE2 is built with a set of tables that are less than 256. By default, PCRE2 is built with a set of tables that are
distributed in the file src/pcre2_chartables.c.dist. These tables are distributed in the file src/pcre2_chartables.c.dist. These tables are
for ASCII codes only. If you add for ASCII codes only. If you add
--enable-rebuild-chartables --enable-rebuild-chartables
to the configure command, the distributed tables are no longer used. to the configure command, the distributed tables are no longer used.
Instead, a program called pcre2_dftables is compiled and run. This out- Instead, a program called pcre2_dftables is compiled and run. This out-
puts the source for a new set of tables, created in the default locale of puts the source for a new set of tables, created in the default locale of
your C run-time system. This method of replacing the tables does not your C run-time system. This method of replacing the tables does not
work if you are cross compiling, because pcre2_dftables needs to be run work if you are cross compiling, because pcre2_dftables needs to be run
on the local host and therefore not compiled with the cross compiler. on the local host and therefore not compiled with the cross compiler.
If you need to create alternative tables when cross compiling, you will If you need to create alternative tables when cross compiling, you will
have to do so "by hand". There may also be other reasons for creating have to do so "by hand". There may also be other reasons for creating
tables manually. To cause pcre2_dftables to be built on the local tables manually. To cause pcre2_dftables to be built on the local
host, run a normal compiling command, and then run the program with the host, run a normal compiling command, and then run the program with the
output file as its argument, for example: output file as its argument, for example:
cc src/pcre2_dftables.c -o pcre2_dftables cc src/pcre2_dftables.c -o pcre2_dftables
./pcre2_dftables src/pcre2_chartables.c ./pcre2_dftables src/pcre2_chartables.c
This builds the tables in the default locale of the local host. If you This builds the tables in the default locale of the local host. If you
want to specify a locale, you must use the -L option: want to specify a locale, you must use the -L option:
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
You can also specify -b (with or without -L). This causes the tables to You can also specify -b (with or without -L). This causes the tables to
be written in binary instead of as source code. A set of binary tables be written in binary instead of as source code. A set of binary tables
can be loaded into memory by an application and passed to pcre2_com- can be loaded into memory by an application and passed to pcre2_com-
pile() in the same way as tables created by calling pcre2_maketables(). pile() in the same way as tables created by calling pcre2_maketables().
The tables are just a string of bytes, independent of hardware charac- The tables are just a string of bytes, independent of hardware charac-
teristics such as endianness. This means they can be bundled with an teristics such as endianness. This means they can be bundled with an
application that runs in different environments, to ensure consistent application that runs in different environments, to ensure consistent
behaviour. behaviour.
USING EBCDIC CODE USING EBCDIC CODE
PCRE2 assumes by default that it will run in an environment where the PCRE2 assumes by default that it will run in an environment where the
character code is ASCII or Unicode, which is a superset of ASCII. This character code is ASCII or Unicode, which is a superset of ASCII. This
is the case for most computer operating systems. PCRE2 can, however, be is the case for most computer operating systems. PCRE2 can, however, be
compiled to run in an 8-bit EBCDIC environment by adding compiled to run in an 8-bit EBCDIC environment by adding
--enable-ebcdic --disable-unicode --enable-ebcdic --disable-unicode
to the configure command. This setting implies --enable-rebuild-charta- to the configure command. This setting implies --enable-rebuild-charta-
bles. You should only use it if you know that you are in an EBCDIC en- bles. You should only use it if you know that you are in an EBCDIC en-
vironment (for example, an IBM mainframe operating system). vironment (for example, an IBM mainframe operating system).
It is not possible to support both EBCDIC and UTF-8 codes in the same It is not possible to support both EBCDIC and UTF-8 codes in the same
version of the library. Consequently, --enable-unicode and --enable- version of the library. Consequently, --enable-unicode and --enable-
ebcdic are mutually exclusive. ebcdic are mutually exclusive.
The EBCDIC character that corresponds to an ASCII LF is assumed to have The EBCDIC character that corresponds to an ASCII LF is assumed to have
the value 0x15 by default. However, in some EBCDIC environments, 0x25 the value 0x15 by default. However, in some EBCDIC environments, 0x25
is used. In such an environment you should use is used. In such an environment you should use
--enable-ebcdic-nl25 --enable-ebcdic-nl25
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
0x25 is not chosen as LF is made to correspond to the Unicode NEL char- 0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
acter (which, in Unicode, is 0x85). acter (which, in Unicode, is 0x85).
@ -4232,47 +4227,47 @@ USING EBCDIC CODE
PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
By default pcre2grep supports the use of callouts with string arguments By default pcre2grep supports the use of callouts with string arguments
within the patterns it is matching. There are two kinds: one that gen- within the patterns it is matching. There are two kinds: one that gen-
erates output using local code, and another that calls an external pro- erates output using local code, and another that calls an external pro-
gram or script. If --disable-pcre2grep-callout-fork is added to the gram or script. If --disable-pcre2grep-callout-fork is added to the
configure command, only the first kind of callout is supported; if configure command, only the first kind of callout is supported; if
--disable-pcre2grep-callout is used, all callouts are completely ig- --disable-pcre2grep-callout is used, all callouts are completely ig-
nored. For more details of pcre2grep callouts, see the pcre2grep docu- nored. For more details of pcre2grep callouts, see the pcre2grep docu-
mentation. mentation.
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
By default, pcre2grep reads all files as plain text. You can build it By default, pcre2grep reads all files as plain text. You can build it
so that it recognizes files whose names end in .gz or .bz2, and reads so that it recognizes files whose names end in .gz or .bz2, and reads
them with libz or libbz2, respectively, by adding one or both of them with libz or libbz2, respectively, by adding one or both of
--enable-pcre2grep-libz --enable-pcre2grep-libz
--enable-pcre2grep-libbz2 --enable-pcre2grep-libbz2
to the configure command. These options naturally require that the rel- to the configure command. These options naturally require that the rel-
evant libraries are installed on your system. Configuration will fail evant libraries are installed on your system. Configuration will fail
if they are not. if they are not.
PCRE2GREP BUFFER SIZE PCRE2GREP BUFFER SIZE
pcre2grep uses an internal buffer to hold a "window" on the file it is pcre2grep uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when scanning, in order to be able to output "before" and "after" lines when
it finds a match. The default starting size of the buffer is 20KiB. The it finds a match. The default starting size of the buffer is 20KiB. The
buffer itself is three times this size, but because of the way it is buffer itself is three times this size, but because of the way it is
used for holding "before" lines, the longest line that is guaranteed to used for holding "before" lines, the longest line that is guaranteed to
be processable is the notional buffer size. If a longer line is encoun- be processable is the notional buffer size. If a longer line is encoun-
tered, pcre2grep automatically expands the buffer, up to a specified tered, pcre2grep automatically expands the buffer, up to a specified
maximum size, whose default is 1MiB or the starting size, whichever is maximum size, whose default is 1MiB or the starting size, whichever is
the larger. You can change the default parameter values by adding, for the larger. You can change the default parameter values by adding, for
example, example,
--with-pcre2grep-bufsize=51200 --with-pcre2grep-bufsize=51200
--with-pcre2grep-max-bufsize=2097152 --with-pcre2grep-max-bufsize=2097152
to the configure command. The caller of pcre2grep can override these to the configure command. The caller of pcre2grep can override these
values by using --buffer-size and --max-buffer-size on the command values by using --buffer-size and --max-buffer-size on the command
line. line.
@ -4283,26 +4278,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
--enable-pcre2test-libreadline --enable-pcre2test-libreadline
--enable-pcre2test-libedit --enable-pcre2test-libedit
to the configure command, pcre2test is linked with the libreadline or to the configure command, pcre2test is linked with the libreadline or
libedit library, respectively, and when its input is from a terminal, libedit library, respectively, and when its input is from a terminal,
it reads it using the readline() function. This provides line-editing it reads it using the readline() function. This provides line-editing
and history facilities. Note that libreadline is GPL-licensed, so if and history facilities. Note that libreadline is GPL-licensed, so if
you distribute a binary of pcre2test linked in this way, there may be you distribute a binary of pcre2test linked in this way, there may be
licensing issues. These can be avoided by linking instead with libedit, licensing issues. These can be avoided by linking instead with libedit,
which has a BSD licence. which has a BSD licence.
Setting --enable-pcre2test-libreadline causes the -lreadline option to Setting --enable-pcre2test-libreadline causes the -lreadline option to
be added to the pcre2test build. In many operating environments with a be added to the pcre2test build. In many operating environments with a
system-installed readline library this is sufficient. However, in some system-installed readline library this is sufficient. However, in some
environments (e.g. if an unmodified distribution version of readline is environments (e.g. if an unmodified distribution version of readline is
in use), some extra configuration may be necessary. The INSTALL file in use), some extra configuration may be necessary. The INSTALL file
for libreadline says this: for libreadline says this:
"Readline uses the termcap functions, but does not link with "Readline uses the termcap functions, but does not link with
the termcap or curses library itself, allowing applications the termcap or curses library itself, allowing applications
which link with readline the to choose an appropriate library." which link with readline the to choose an appropriate library."
If your environment has not been set up so that an appropriate library If your environment has not been set up so that an appropriate library
is automatically included, you may need to add something like is automatically included, you may need to add something like
LIBS="-lncurses"
@ -4316,7 +4311,7 @@ INCLUDING DEBUGGING CODE
--enable-debug --enable-debug
to the configure command, additional debugging code is included in the to the configure command, additional debugging code is included in the
build. This feature is intended for use by the PCRE2 maintainers. build. This feature is intended for use by the PCRE2 maintainers.
@ -4326,14 +4321,14 @@ DEBUGGING WITH VALGRIND SUPPORT
--enable-valgrind --enable-valgrind
to the configure command, PCRE2 will use valgrind annotations to mark to the configure command, PCRE2 will use valgrind annotations to mark
certain memory regions as unaddressable. This allows it to detect in- certain memory regions as unaddressable. This allows it to detect in-
valid memory accesses, and is mostly useful for debugging PCRE2 itself. valid memory accesses, and is mostly useful for debugging PCRE2 itself.
CODE COVERAGE REPORTING CODE COVERAGE REPORTING
If your C compiler is gcc, you can build a version of PCRE2 that can If your C compiler is gcc, you can build a version of PCRE2 that can
generate a code coverage report for its test suite. To enable this, you generate a code coverage report for its test suite. To enable this, you
must install lcov version 1.6 or above. Then specify must install lcov version 1.6 or above. Then specify
@ -4342,20 +4337,20 @@ CODE COVERAGE REPORTING
to the configure command and build PCRE2 in the usual way. to the configure command and build PCRE2 in the usual way.
Note that using ccache (a caching C compiler) is incompatible with code Note that using ccache (a caching C compiler) is incompatible with code
coverage reporting. If you have configured ccache to run automatically coverage reporting. If you have configured ccache to run automatically
on your system, you must set the environment variable on your system, you must set the environment variable
CCACHE_DISABLE=1 CCACHE_DISABLE=1
before running make to build PCRE2, so that ccache is not used. before running make to build PCRE2, so that ccache is not used.
When --enable-coverage is used, the following additional targets are When --enable-coverage is used, the following additional targets are
added to the Makefile: added to the Makefile:
make coverage make coverage
This creates a fresh coverage report for the PCRE2 test suite. It is This creates a fresh coverage report for the PCRE2 test suite. It is
equivalent to running "make coverage-reset", "make coverage-baseline", equivalent to running "make coverage-reset", "make coverage-baseline",
"make check", and then "make coverage-report". "make check", and then "make coverage-report".
make coverage-reset make coverage-reset
@ -4372,73 +4367,73 @@ CODE COVERAGE REPORTING
make coverage-clean-report make coverage-clean-report
This removes the generated coverage report without cleaning the cover- This removes the generated coverage report without cleaning the cover-
age data itself. age data itself.
make coverage-clean-data make coverage-clean-data
This removes the captured coverage data without removing the coverage This removes the captured coverage data without removing the coverage
files created at compile time (*.gcno). files created at compile time (*.gcno).
make coverage-clean make coverage-clean
This cleans all coverage data including the generated coverage report. This cleans all coverage data including the generated coverage report.
For more information about code coverage, see the gcov and lcov docu- For more information about code coverage, see the gcov and lcov docu-
mentation. mentation.
DISABLING THE Z AND T FORMATTING MODIFIERS DISABLING THE Z AND T FORMATTING MODIFIERS
The C99 standard defines formatting modifiers z and t for size_t and The C99 standard defines formatting modifiers z and t for size_t and
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
in environments other than old versions of Microsoft Visual Studio when in environments other than old versions of Microsoft Visual Studio when
__STDC_VERSION__ is defined and has a value greater than or equal to __STDC_VERSION__ is defined and has a value greater than or equal to
199901L (indicating support for C99). However, there is at least one 199901L (indicating support for C99). However, there is at least one
environment that claims to be C99 but does not support these modifiers. environment that claims to be C99 but does not support these modifiers.
If If
--disable-percent-zt --disable-percent-zt
is specified, no use is made of the z or t modifiers. Instead of %td or is specified, no use is made of the z or t modifiers. Instead of %td or
%zu, a suitable format is used depending on the size of long for the %zu, a suitable format is used depending on the size of long for the
platform. platform.
SUPPORT FOR FUZZERS SUPPORT FOR FUZZERS
There is a special option for use by people who want to run fuzzing There is a special option for use by people who want to run fuzzing
tests on PCRE2: tests on PCRE2:
--enable-fuzz-support --enable-fuzz-support
At present this applies only to the 8-bit library. If set, it causes an At present this applies only to the 8-bit library. If set, it causes an
extra library called libpcre2-fuzzsupport.a to be built, but not in- extra library called libpcre2-fuzzsupport.a to be built, but not in-
stalled. This contains a single function called LLVMFuzzerTestOneIn- stalled. This contains a single function called LLVMFuzzerTestOneIn-
put() whose arguments are a pointer to a string and the length of the put() whose arguments are a pointer to a string and the length of the
string. When called, this function tries to compile the string as a string. When called, this function tries to compile the string as a
pattern, and if that succeeds, to match it. This is done both with no pattern, and if that succeeds, to match it. This is done both with no
options and with some random option bits that are generated from the options and with some random option bits that are generated from the
string. string.
Setting --enable-fuzz-support also causes a binary called pcre2fuz- Setting --enable-fuzz-support also causes a binary called pcre2fuz-
zcheck to be created. This is normally run under valgrind or used when zcheck to be created. This is normally run under valgrind or used when
PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
function and outputs information about what it is doing. The input function and outputs information about what it is doing. The input
strings are specified by arguments: if an argument starts with "=" the strings are specified by arguments: if an argument starts with "=" the
rest of it is a literal input string. Otherwise, it is assumed to be a rest of it is a literal input string. Otherwise, it is assumed to be a
file name, and the contents of the file are the test string. file name, and the contents of the file are the test string.
OBSOLETE OPTION OBSOLETE OPTION
In versions of PCRE2 prior to 10.30, there were two ways of handling In versions of PCRE2 prior to 10.30, there were two ways of handling
backtracking in the pcre2_match() function. The default was to use the backtracking in the pcre2_match() function. The default was to use the
system stack, but if system stack, but if
--disable-stack-for-recursion --disable-stack-for-recursion
was set, memory on the heap was used. From release 10.30 onwards this was set, memory on the heap was used. From release 10.30 onwards this
has changed (the stack is no longer used) and this option now does has changed (the stack is no longer used) and this option now does
nothing except give a warning. nothing except give a warning.
@ -4450,14 +4445,14 @@ SEE ALSO
AUTHOR AUTHOR
Philip Hazel Philip Hazel
University Computing Service Retired from University Computing Service
Cambridge, England. Cambridge, England.
REVISION REVISION
Last updated: 08 December 2021 Last updated: 27 July 2022
Copyright (c) 1997-2021 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -5596,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS
The maximum length of a string argument to a callout is the largest The maximum length of a string argument to a callout is the largest
number a 32-bit unsigned integer can hold. number a 32-bit unsigned integer can hold.
The maximum amount of heap memory used for matching is controlled by
the heap limit, which can be set in a pattern or in a match context.
The default is a very large number, effectively unlimited.
AUTHOR AUTHOR
Philip Hazel Philip Hazel
University Computing Service Retired from University Computing Service
Cambridge, England. Cambridge, England.
REVISION REVISION
Last updated: 02 February 2019 Last updated: 26 July 2022
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -9773,152 +9772,169 @@ STACK AND HEAP USAGE AT RUN TIME
sive function calls could use a great deal of stack, and this could sive function calls could use a great deal of stack, and this could
cause problems, but this usage has been eliminated. Backtracking posi- cause problems, but this usage has been eliminated. Backtracking posi-
tions are now explicitly remembered in memory frames controlled by the tions are now explicitly remembered in memory frames controlled by the
code. An initial 20KiB vector of frames is allocated on the system code.
stack (enough for about 100 frames for small patterns), but if this is
insufficient, heap memory is used. The amount of heap memory can be
limited; if the limit is set to zero, only the initial stack vector is
used. Rewriting patterns to be time-efficient, as described below, may
also reduce the memory requirements.
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive The size of each frame depends on the size of pointer variables and the
function calls, but only for processing atomic groups, lookaround as- number of capturing parenthesized groups in the pattern being matched.
On a 64-bit system the frame size for a pattern with no captures is 128
bytes. For each capturing group the size increases by 16 bytes.
Until release 10.41, an initial 20KiB frames vector was allocated on
the system stack, but this still caused some issues for multi-thread
applications where each thread has a very small stack. From release
10.41 backtracking memory frames are always held in heap memory. An
initial heap allocation is obtained the first time any match data block
is passed to pcre2_match(). This is remembered with the match data
block and re-used if that block is used for another match. It is freed
when the match data block itself is freed.
The size of the initial block is the larger of 20KiB or ten times the
pattern's frame size, unless the heap limit is less than this, in which
case the heap limit is used. If the initial block proves to be too
small during matching, it is replaced by a larger block, subject to the
heap limit. The heap limit is checked only when a new block is to be
allocated. Reducing the heap limit between calls to pcre2_match() with
the same match data block does not affect the saved block.
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
function calls, but only for processing atomic groups, lookaround as-
sertions, and recursion within the pattern. The original version of the sertions, and recursion within the pattern. The original version of the
code used to allocate quite large internal workspace vectors on the code used to allocate quite large internal workspace vectors on the
stack, which caused some problems for some patterns in environments stack, which caused some problems for some patterns in environments
with small stacks. From release 10.32 the code for pcre2_dfa_match() with small stacks. From release 10.32 the code for pcre2_dfa_match()
has been re-factored to use heap memory when necessary for internal has been re-factored to use heap memory when necessary for internal
workspace when recursing, though recursive function calls are still workspace when recursing, though recursive function calls are still
used. used.
The "match depth" parameter can be used to limit the depth of function The "match depth" parameter can be used to limit the depth of function
recursion, and the "match heap" parameter to limit heap memory in recursion, and the "match heap" parameter to limit heap memory in
pcre2_dfa_match(). pcre2_dfa_match().
PROCESSING TIME PROCESSING TIME
Certain items in regular expression patterns are processed more effi- Certain items in regular expression patterns are processed more effi-
ciently than others. It is more efficient to use a character class like ciently than others. It is more efficient to use a character class like
[aeiou] than a set of single-character alternatives such as [aeiou] than a set of single-character alternatives such as
(a|e|i|o|u). In general, the simplest construction that provides the (a|e|i|o|u). In general, the simplest construction that provides the
required behaviour is usually the most efficient. Jeffrey Friedl's book required behaviour is usually the most efficient. Jeffrey Friedl's book
contains a lot of useful general discussion about optimizing regular contains a lot of useful general discussion about optimizing regular
expressions for efficient performance. This document contains a few ob- expressions for efficient performance. This document contains a few ob-
servations about PCRE2. servations about PCRE2.
Using Unicode character properties (the \p, \P, and \X escapes) is Using Unicode character properties (the \p, \P, and \X escapes) is
slow, because PCRE2 has to use a multi-stage table lookup whenever it slow, because PCRE2 has to use a multi-stage table lookup whenever it
needs a character's property. If you can find an alternative pattern needs a character's property. If you can find an alternative pattern
that does not use character properties, it will probably be faster. that does not use character properties, it will probably be faster.
By default, the escape sequences \b, \d, \s, and \w, and the POSIX By default, the escape sequences \b, \d, \s, and \w, and the POSIX
character classes such as [:alpha:] do not use Unicode properties, character classes such as [:alpha:] do not use Unicode properties,
partly for backwards compatibility, and partly for performance reasons. partly for backwards compatibility, and partly for performance reasons.
However, you can set the PCRE2_UCP option or start the pattern with However, you can set the PCRE2_UCP option or start the pattern with
(*UCP) if you want Unicode character properties to be used. This can (*UCP) if you want Unicode character properties to be used. This can
double the matching time for items such as \d, when matched with double the matching time for items such as \d, when matched with
pcre2_match(); the performance loss is less with a DFA matching func- pcre2_match(); the performance loss is less with a DFA matching func-
tion, and in both cases there is not much difference for \b. tion, and in both cases there is not much difference for \b.
When a pattern begins with .* not in atomic parentheses, nor in paren- When a pattern begins with .* not in atomic parentheses, nor in paren-
theses that are the subject of a backreference, and the PCRE2_DOTALL theses that are the subject of a backreference, and the PCRE2_DOTALL
option is set, the pattern is implicitly anchored by PCRE2, since it option is set, the pattern is implicitly anchored by PCRE2, since it
can match only at the start of a subject string. If the pattern has can match only at the start of a subject string. If the pattern has
multiple top-level branches, they must all be anchorable. The optimiza- multiple top-level branches, they must all be anchorable. The optimiza-
tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au- tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
tomatically disabled if the pattern contains (*PRUNE) or (*SKIP). tomatically disabled if the pattern contains (*PRUNE) or (*SKIP).
If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be- If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be-
cause the dot metacharacter does not then match a newline, and if the cause the dot metacharacter does not then match a newline, and if the
subject string contains newlines, the pattern may match from the char- subject string contains newlines, the pattern may match from the char-
acter immediately following one of them instead of from the very start. acter immediately following one of them instead of from the very start.
For example, the pattern For example, the pattern
.*second .*second
matches the subject "first\nand second" (where \n stands for a newline matches the subject "first\nand second" (where \n stands for a newline
character), with the match starting at the seventh character. In order character), with the match starting at the seventh character. In order
to do this, PCRE2 has to retry the match starting after every newline to do this, PCRE2 has to retry the match starting after every newline
in the subject. in the subject.
If you are using such a pattern with subject strings that do not con- If you are using such a pattern with subject strings that do not con-
tain newlines, the best performance is obtained by setting tain newlines, the best performance is obtained by setting
PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex- PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex-
plicit anchoring. That saves PCRE2 from having to scan along the sub- plicit anchoring. That saves PCRE2 from having to scan along the sub-
ject looking for a newline to restart at. ject looking for a newline to restart at.
Beware of patterns that contain nested indefinite repeats. These can Beware of patterns that contain nested indefinite repeats. These can
take a long time to run when applied to a string that does not match. take a long time to run when applied to a string that does not match.
Consider the pattern fragment Consider the pattern fragment
^(a+)* ^(a+)*
This can match "aaaa" in 16 different ways, and this number increases This can match "aaaa" in 16 different ways, and this number increases
very rapidly as the string gets longer. (The * repeat can match 0, 1, very rapidly as the string gets longer. (The * repeat can match 0, 1,
2, 3, or 4 times, and for each of those cases other than 0 or 4, the + 2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
repeats can match different numbers of times.) When the remainder of repeats can match different numbers of times.) When the remainder of
the pattern is such that the entire match is going to fail, PCRE2 has the pattern is such that the entire match is going to fail, PCRE2 has
in principle to try every possible variation, and this can take an ex- in principle to try every possible variation, and this can take an ex-
tremely long time, even for relatively short strings. tremely long time, even for relatively short strings.
An optimization catches some of the more simple cases such as An optimization catches some of the more simple cases such as
(a+)*b (a+)*b
where a literal character follows. Before embarking on the standard where a literal character follows. Before embarking on the standard
matching procedure, PCRE2 checks that there is a "b" later in the sub- matching procedure, PCRE2 checks that there is a "b" later in the sub-
ject string, and if there is not, it fails the match immediately. How- ject string, and if there is not, it fails the match immediately. How-
ever, when there is no following literal this optimization cannot be ever, when there is no following literal this optimization cannot be
used. You can see the difference by comparing the behaviour of used. You can see the difference by comparing the behaviour of
(a+)*\d (a+)*\d
with the pattern above. The former gives a failure almost instantly with the pattern above. The former gives a failure almost instantly
when applied to a whole line of "a" characters, whereas the latter when applied to a whole line of "a" characters, whereas the latter
takes an appreciable time with strings longer than about 20 characters. takes an appreciable time with strings longer than about 20 characters.
In many cases, the solution to this kind of performance issue is to use In many cases, the solution to this kind of performance issue is to use
an atomic group or a possessive quantifier. This can often reduce mem- an atomic group or a possessive quantifier. This can often reduce mem-
ory requirements as well. As another example, consider this pattern: ory requirements as well. As another example, consider this pattern:
([^<]|<(?!inet))+ ([^<]|<(?!inet))+
It matches from wherever it starts until it encounters "<inet" or the It matches from wherever it starts until it encounters "<inet" or the
end of the data, and is the kind of pattern that might be used when end of the data, and is the kind of pattern that might be used when
processing an XML file. Each iteration of the outer parentheses matches processing an XML file. Each iteration of the outer parentheses matches
either one character that is not "<" or a "<" that is not followed by either one character that is not "<" or a "<" that is not followed by
"inet". However, each time a parenthesis is processed, a backtracking "inet". However, each time a parenthesis is processed, a backtracking
position is passed, so this formulation uses a memory frame for each position is passed, so this formulation uses a memory frame for each
matched character. For a long string, a lot of memory is required. Con- matched character. For a long string, a lot of memory is required. Con-
sider now this rewritten pattern, which matches exactly the same sider now this rewritten pattern, which matches exactly the same
strings: strings:
([^<]++|<(?!inet))+ ([^<]++|<(?!inet))+
This runs much faster, because sequences of characters that do not con- This runs much faster, because sequences of characters that do not con-
tain "<" are "swallowed" in one item inside the parentheses, and a pos- tain "<" are "swallowed" in one item inside the parentheses, and a pos-
sessive quantifier is used to stop any backtracking into the runs of sessive quantifier is used to stop any backtracking into the runs of
non-"<" characters. This version also uses a lot less memory because non-"<" characters. This version also uses a lot less memory because
entry to a new set of parentheses happens only when a "<" character entry to a new set of parentheses happens only when a "<" character
that is not followed by "inet" is encountered (and we assume this is that is not followed by "inet" is encountered (and we assume this is
relatively rare). relatively rare).
This example shows that one way of optimizing performance when matching This example shows that one way of optimizing performance when matching
long subject strings is to write repeated parenthesized subpatterns to long subject strings is to write repeated parenthesized subpatterns to
match more than one character whenever possible. match more than one character whenever possible.
SETTING RESOURCE LIMITS SETTING RESOURCE LIMITS
You can set limits on the amount of processing that takes place when You can set limits on the amount of processing that takes place when
matching, and on the amount of heap memory that is used. The default matching, and on the amount of heap memory that is used. The default
values of the limits are very large, and unlikely ever to operate. They values of the limits are very large, and unlikely ever to operate. They
can be changed when PCRE2 is built, and they can also be set when can be changed when PCRE2 is built, and they can also be set when
pcre2_match() or pcre2_dfa_match() is called. For details of these in- pcre2_match() or pcre2_dfa_match() is called. For details of these in-
terfaces, see the pcre2build documentation and the section entitled terfaces, see the pcre2build documentation and the section entitled
"The match context" in the pcre2api documentation. "The match context" in the pcre2api documentation.
The pcre2test test program has a modifier called "find_limits" which, The pcre2test test program has a modifier called "find_limits" which,
if applied to a subject line, causes it to find the smallest limits if applied to a subject line, causes it to find the smallest limits
that allow a pattern to match. This is done by repeatedly matching with that allow a pattern to match. This is done by repeatedly matching with
different limits. different limits.
@ -9926,14 +9942,14 @@ PROCESSING TIME
AUTHOR AUTHOR
Philip Hazel Philip Hazel
University Computing Service Retired from University Computing Service
Cambridge, England. Cambridge, England.
REVISION REVISION
Last updated: 03 February 2019 Last updated: 27 July 2022
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "27 July 2022" "PCRE2 10.41" .TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
.SH NAME .SH NAME
pcre2grep - a grep with Perl-compatible regular expressions. pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -43,13 +43,15 @@ For example:
.sp .sp
pcre2grep some-pattern file1 - file3 pcre2grep some-pattern file1 - file3
.sp .sp
Input files are searched line by line. By default, each line that matches a By default, input files are searched line by line. Each line that matches a
pattern is copied to the standard output, and if there is more than one file, pattern is copied to the standard output, and if there is more than one file,
the file name is output at the start of each line, followed by a colon. the file name is output at the start of each line, followed by a colon.
However, there are options that can change how \fBpcre2grep\fP behaves. In However, there are options that can change how \fBpcre2grep\fP behaves. For
particular, the \fB-M\fP option makes it possible to search for strings that example, the \fB-M\fP option makes it possible to search for strings that span
span line boundaries. What defines a line boundary is controlled by the line boundaries. What defines a line boundary is controlled by the \fB-N\fP
\fB-N\fP (\fB--newline\fP) option. (\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
not file names are shown, and the \fB-Z\fP option changes the file name
terminator to a zero byte.
.P .P
The amount of memory used for buffering files that are being scanned is The amount of memory used for buffering files that are being scanned is
controlled by parameters that can be set by the \fB--buffer-size\fP and controlled by parameters that can be set by the \fB--buffer-size\fP and
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
lines are output if the next match or the end of the file is reached, or if the lines are output if the next match or the end of the file is reached, or if the
processing buffer size has been set too small. If file names and/or line processing buffer size has been set too small. If file names and/or line
numbers are being output, a hyphen separator is used instead of a colon for the numbers are being output, a hyphen separator is used instead of a colon for the
context lines. A line containing "--" is output between each group of lines, context lines (the \fB-Z\fP option can be used to change the file name
unless they are in fact contiguous in the input file. The value of \fInumber\fP terminator to a zero byte). A line containing "--" is output between each group
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored. of lines, unless they are in fact contiguous in the input file. The value of
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
\fB-A\fP is ignored.
.TP .TP
\fB-a\fP, \fB--text\fP \fB-a\fP, \fB--text\fP
Treat binary files as text. This is equivalent to Treat binary files as text. This is equivalent to
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
lines are output if the previous match or the start of the file is within lines are output if the previous match or the start of the file is within
\fInumber\fP lines, or if the processing buffer size has been set too small. If \fInumber\fP lines, or if the processing buffer size has been set too small. If
file names and/or line numbers are being output, a hyphen separator is used file names and/or line numbers are being output, a hyphen separator is used
instead of a colon for the context lines. A line containing "--" is output instead of a colon for the context lines (the \fB-Z\fP option can be used to
between each group of lines, unless they are in fact contiguous in the input change the file name terminator to a zero byte). A line containing "--" is
file. The value of \fInumber\fP is expected to be relatively small. When output between each group of lines, unless they are in fact contiguous in the
input file. The value of \fInumber\fP is expected to be relatively small. When
\fB-c\fP is used, \fB-B\fP is ignored. \fB-c\fP is used, \fB-B\fP is ignored.
.TP .TP
\fB--binary-files=\fP\fIword\fP \fB--binary-files=\fP\fIword\fP
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
.TP .TP
\fB-H\fP, \fB--with-filename\fP \fB-H\fP, \fB--with-filename\fP
Force the inclusion of the file name at the start of output lines when Force the inclusion of the file name at the start of output lines when
searching a single file. By default, the file name is not shown in this case. searching a single file. The file name is not normally shown in this case.
For matching lines, the file name is followed by a colon; for context lines, a By default, for matching lines, the file name is followed by a colon; for
hyphen separator is used. If a line number is also being output, it follows the context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
file name. When the \fB-M\fP option causes a pattern to match more than one change the terminator to a zero byte. If a line number is also being output,
line, only the first is preceded by the file name. This option overrides any it follows the file name. When the \fB-M\fP option causes a pattern to match
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options. more than one line, only the first is preceded by the file name. This option
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
.TP .TP
\fB-h\fP, \fB--no-filename\fP \fB-h\fP, \fB--no-filename\fP
Suppress the output of file names when searching multiple files. By default, Suppress the output of file names when searching multiple files. File names are
file names are shown when multiple files are searched. For matching lines, the normally shown when multiple files are searched. By default, for matching
file name is followed by a colon; for context lines, a hyphen separator is used. lines, the file name is followed by a colon; for context lines, a hyphen
If a line number is also being output, it follows the file name. This option separator is used. The \fB-Z\fP option can be used to change the terminator to
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options. a zero byte. If a line number is also being output, it follows the file name.
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
.TP .TP
\fB--heap-limit\fP=\fInumber\fP \fB--heap-limit\fP=\fInumber\fP
See \fB--match-limit\fP below. See \fB--match-limit\fP below.
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
\fB-L\fP, \fB--files-without-match\fP \fB-L\fP, \fB--files-without-match\fP
Instead of outputting lines from the files, just output the names of the files Instead of outputting lines from the files, just output the names of the files
that do not contain any lines that would have been output. Each file name is that do not contain any lines that would have been output. Each file name is
output once, on a separate line. This option overrides any previous \fB-H\fP, output once, on a separate line by default, but if the \fB-Z\fP option is set,
\fB-h\fP, or \fB-l\fP options. they are separated by zero bytes instead of newlines. This option overrides any
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
.TP .TP
\fB-l\fP, \fB--files-with-matches\fP \fB-l\fP, \fB--files-with-matches\fP
Instead of outputting lines from the files, just output the names of the files Instead of outputting lines from the files, just output the names of the files
containing lines that would have been output. Each file name is output once, on containing lines that would have been output. Each file name is output once, on
a separate line. Searching normally stops as soon as a matching line is found a separate line, but if the \fB-Z\fP option is set, they are separated by zero
in a file. However, if the \fB-c\fP (count) option is also used, matching bytes instead of newlines. Searching normally stops as soon as a matching line
continues in order to obtain the correct count, and those files that have at is found in a file. However, if the \fB-c\fP (count) option is also used,
least one match are listed along with their counts. Using this option with matching continues in order to obtain the correct count, and those files that
\fB-c\fP is a way of suppressing the listing of files with no matches that have at least one match are listed along with their counts. Using this option
with \fB-c\fP is a way of suppressing the listing of files with no matches that
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP, occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
\fB-h\fP, or \fB-L\fP options. \fB-h\fP, or \fB-L\fP options.
.TP .TP
@ -516,7 +525,7 @@ counter that is incremented each time around its main processing loop. If the
value set by \fB--match-limit\fP is reached, an error occurs. value set by \fB--match-limit\fP is reached, an error occurs.
.sp .sp
The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of The \fB--heap-limit\fP option specifies, as a number of kibibytes (units of
1024 bytes), the maximum amount of heap memory that may be used for matching. 1024 bytes), the maximum amount of heap memory that may be used for matching.
.sp .sp
The \fB--depth-limit\fP option limits the depth of nested backtracking points, The \fB--depth-limit\fP option limits the depth of nested backtracking points,
which indirectly limits the amount of memory that is used. The amount of memory which indirectly limits the amount of memory that is used. The amount of memory
@ -729,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
pattern and ")$" at the end. This option applies only to the patterns that are pattern and ")$" at the end. This option applies only to the patterns that are
matched against the contents of files; it does not apply to patterns specified matched against the contents of files; it does not apply to patterns specified
by any of the \fB--include\fP or \fB--exclude\fP options. by any of the \fB--include\fP or \fB--exclude\fP options.
.TP
\fB-Z\fP, \fB--null\fP
Terminate file names in the regular output with a zero byte (the NUL
character) instead of what would normally appear. This is useful when file
names contain unusual characters such as colons, hyphens, or even newlines. The
option does not apply to file names in error messages.
. .
. .
.SH "ENVIRONMENT VARIABLES" .SH "ENVIRONMENT VARIABLES"
@ -957,6 +972,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 27 July 2022 Last updated: 30 July 2022
Copyright (c) 1997-2022 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.
.fi .fi

File diff suppressed because it is too large Load Diff

View File

@ -1111,7 +1111,8 @@ SUBJECT MODIFIERS
copy=<number or name> copy captured substring copy=<number or name> copy captured substring
depth_limit=<n> set a depth limit depth_limit=<n> set a depth limit
dfa use pcre2_dfa_match() dfa use pcre2_dfa_match()
find_limits find match and depth limits find_limits find heap, match and depth limits
find_limits_noheap find match and depth limits
get=<number or name> extract captured substring get=<number or name> extract captured substring
getall extract all captured substrings getall extract all captured substrings
/g global global matching /g global global matching
@ -1411,7 +1412,7 @@ SUBJECT MODIFIERS
The heap_limit, match_limit, and depth_limit modifiers set the appro- The heap_limit, match_limit, and depth_limit modifiers set the appro-
priate limits in the match context. These values are ignored when the priate limits in the match context. These values are ignored when the
find_limits modifier is specified. find_limits or find_limits_noheap modifier is specified.
Finding minimum limits Finding minimum limits
@ -1419,8 +1420,12 @@ SUBJECT MODIFIERS
calls the relevant matching function several times, setting different calls the relevant matching function several times, setting different
values in the match context via pcre2_set_heap_limit(), values in the match context via pcre2_set_heap_limit(),
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
minimum values for each parameter that allows the match to complete smallest value for each parameter that allows the match to complete
without error. If JIT is being used, only the match limit is relevant. without a "limit exceeded" error. The match itself may succeed or fail.
An alternative modifier, find_limits_noheap, omits the heap limit. This
is used in the standard tests, because the minimum heap limit varies
between systems. If JIT is being used, only the match limit is rele-
vant, and the other two are automatically omitted.
When using this modifier, the pattern should not contain any limit set- When using this modifier, the pattern should not contain any limit set-
tings such as (*LIMIT_MATCH=...) within it. If such a setting is tings such as (*LIMIT_MATCH=...) within it. If such a setting is
@ -1446,9 +1451,7 @@ SUBJECT MODIFIERS
For both kinds of matching, the heap_limit number, which is in For both kinds of matching, the heap_limit number, which is in
kibibytes (units of 1024 bytes), limits the amount of heap memory used kibibytes (units of 1024 bytes), limits the amount of heap memory used
for matching. A value of zero disables the use of any heap memory; many for matching.
simple pattern matches can be done without using the heap, so zero is
not an unreasonable setting.
Showing MARK names Showing MARK names
@ -1463,13 +1466,11 @@ SUBJECT MODIFIERS
The memory modifier causes pcre2test to log the sizes of all heap mem- The memory modifier causes pcre2test to log the sizes of all heap mem-
ory allocation and freeing calls that occur during a call to ory allocation and freeing calls that occur during a call to
pcre2_match() or pcre2_dfa_match(). These occur only when a match re- pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
quires a bigger vector than the default for remembering backtracking used only when a match requires more internal workspace than the de-
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()). fault allocation on the stack, so in many cases there will be no out-
In many cases there will be no heap memory used and therefore no addi- put. No heap memory is allocated during matching with JIT. For this
tional output. No heap memory is allocated during matching with JIT, so modifier to work, the null_context modifier must not be set on both the
in that case the memory modifier never has any effect. For this modi-
fier to work, the null_context modifier must not be set on both the
pattern and the subject, though it can be set on one or the other. pattern and the subject, though it can be set on one or the other.
Setting a starting offset Setting a starting offset
@ -1518,45 +1519,46 @@ SUBJECT MODIFIERS
null_context modifier is set, however, NULL is passed. This is for null_context modifier is set, however, NULL is passed. This is for
testing that the matching and substitution functions behave correctly testing that the matching and substitution functions behave correctly
in this case (they use default values). This modifier cannot be used in this case (they use default values). This modifier cannot be used
with the find_limits or substitute_callout modifiers. with the find_limits, find_limits_noheap, or substitute_callout modi-
fiers.
Similarly, for testing purposes, if the null_subject or null_replace- Similarly, for testing purposes, if the null_subject or null_replace-
ment modifier is set, the subject or replacement string pointers are ment modifier is set, the subject or replacement string pointers are
passed as NULL, respectively, to the relevant functions. passed as NULL, respectively, to the relevant functions.
THE ALTERNATIVE MATCHING FUNCTION THE ALTERNATIVE MATCHING FUNCTION
By default, pcre2test uses the standard PCRE2 matching function, By default, pcre2test uses the standard PCRE2 matching function,
pcre2_match() to match each subject line. PCRE2 also supports an alter- pcre2_match() to match each subject line. PCRE2 also supports an alter-
native matching function, pcre2_dfa_match(), which operates in a dif- native matching function, pcre2_dfa_match(), which operates in a dif-
ferent way, and has some restrictions. The differences between the two ferent way, and has some restrictions. The differences between the two
functions are described in the pcre2matching documentation. functions are described in the pcre2matching documentation.
If the dfa modifier is set, the alternative matching function is used. If the dfa modifier is set, the alternative matching function is used.
This function finds all possible matches at a given point in the sub- This function finds all possible matches at a given point in the sub-
ject. If, however, the dfa_shortest modifier is set, processing stops ject. If, however, the dfa_shortest modifier is set, processing stops
after the first match is found. This is always the shortest possible after the first match is found. This is always the shortest possible
match. match.
DEFAULT OUTPUT FROM pcre2test DEFAULT OUTPUT FROM pcre2test
This section describes the output when the normal matching function, This section describes the output when the normal matching function,
pcre2_match(), is being used. pcre2_match(), is being used.
When a match succeeds, pcre2test outputs the list of captured sub- When a match succeeds, pcre2test outputs the list of captured sub-
strings, starting with number 0 for the string that matched the whole strings, starting with number 0 for the string that matched the whole
pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER- pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER-
ROR_NOMATCH, or "Partial match:" followed by the partially matching ROR_NOMATCH, or "Partial match:" followed by the partially matching
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
the entire substring that was inspected during the partial match; it the entire substring that was inspected during the partial match; it
may include characters before the actual match start if a lookbehind may include characters before the actual match start if a lookbehind
assertion, \K, \b, or \B was involved.) assertion, \K, \b, or \B was involved.)
For any other return, pcre2test outputs the PCRE2 negative error number For any other return, pcre2test outputs the PCRE2 negative error number
and a short descriptive phrase. If the error is a failed UTF string and a short descriptive phrase. If the error is a failed UTF string
check, the code unit offset of the start of the failing character is check, the code unit offset of the start of the failing character is
also output. Here is an example of an interactive pcre2test run. also output. Here is an example of an interactive pcre2test run.
$ pcre2test $ pcre2test
@ -1572,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test
Unset capturing substrings that are not followed by one that is set are Unset capturing substrings that are not followed by one that is set are
not shown by pcre2test unless the allcaptures modifier is specified. In not shown by pcre2test unless the allcaptures modifier is specified. In
the following example, there are two capturing substrings, but when the the following example, there are two capturing substrings, but when the
first data line is matched, the second, unset substring is not shown. first data line is matched, the second, unset substring is not shown.
An "internal" unset substring is shown as "<unset>", as for the second An "internal" unset substring is shown as "<unset>", as for the second
data line. data line.
re> /(a)|(b)/ re> /(a)|(b)/
@ -1585,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test
1: <unset> 1: <unset>
2: b 2: b
If the strings contain any non-printing characters, they are output as If the strings contain any non-printing characters, they are output as
\xhh escapes if the value is less than 256 and UTF mode is not set. \xhh escapes if the value is less than 256 and UTF mode is not set.
Otherwise they are output as \x{hh...} escapes. See below for the defi- Otherwise they are output as \x{hh...} escapes. See below for the defi-
nition of non-printing characters. If the aftertext modifier is set, nition of non-printing characters. If the aftertext modifier is set,
the output for substring 0 is followed by the rest of the subject the output for substring 0 is followed by the rest of the subject
string, identified by "0+" like this: string, identified by "0+" like this:
re> /cat/aftertext re> /cat/aftertext
@ -1609,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test
0: ipp 0: ipp
1: pp 1: pp
"No match" is output only if the first match attempt fails. Here is an "No match" is output only if the first match attempt fails. Here is an
example of a failure message (the offset 4 that is specified by the example of a failure message (the offset 4 that is specified by the
offset modifier is past the end of the subject string): offset modifier is past the end of the subject string):
re> /xyz/ re> /xyz/
@ -1618,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test
Error -24 (bad offset value) Error -24 (bad offset value)
Note that whereas patterns can be continued over several lines (a plain Note that whereas patterns can be continued over several lines (a plain
">" prompt is used for continuations), subject lines may not. However ">" prompt is used for continuations), subject lines may not. However
newlines can be included in a subject by means of the \n escape (or \r, newlines can be included in a subject by means of the \n escape (or \r,
\r\n, etc., depending on the newline sequence setting). \r\n, etc., depending on the newline sequence setting).
@ -1626,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
When the alternative matching function, pcre2_dfa_match(), is used, the When the alternative matching function, pcre2_dfa_match(), is used, the
output consists of a list of all the matches that start at the first output consists of a list of all the matches that start at the first
point in the subject where there is at least one match. For example: point in the subject where there is at least one match. For example:
re> /(tang|tangerine|tan)/ re> /(tang|tangerine|tan)/
@ -1635,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tang 1: tang
2: tan 2: tan
Using the normal matching function on this data finds only "tang". The Using the normal matching function on this data finds only "tang". The
longest matching string is always given first (and numbered zero). Af- longest matching string is always given first (and numbered zero). Af-
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol- ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
lowed by the partially matching substring. Note that this is the entire lowed by the partially matching substring. Note that this is the entire
substring that was inspected during the partial match; it may include substring that was inspected during the partial match; it may include
characters before the actual match start if a lookbehind assertion, \b, characters before the actual match start if a lookbehind assertion, \b,
or \B was involved. (\K is not supported for DFA matching.) or \B was involved. (\K is not supported for DFA matching.)
@ -1655,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
1: tan 1: tan
0: tan 0: tan
The alternative matching function does not support substring capture, The alternative matching function does not support substring capture,
so the modifiers that are concerned with captured substrings are not so the modifiers that are concerned with captured substrings are not
relevant. relevant.
RESTARTING AFTER A PARTIAL MATCH RESTARTING AFTER A PARTIAL MATCH
When the alternative matching function has given the PCRE2_ERROR_PAR- When the alternative matching function has given the PCRE2_ERROR_PAR-
TIAL return, indicating that the subject partially matched the pattern, TIAL return, indicating that the subject partially matched the pattern,
you can restart the match with additional subject data by means of the you can restart the match with additional subject data by means of the
dfa_restart modifier. For example: dfa_restart modifier. For example:
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@ -1673,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH
data> n05\=dfa,dfa_restart data> n05\=dfa,dfa_restart
0: n05 0: n05
For further information about partial matching, see the pcre2partial For further information about partial matching, see the pcre2partial
documentation. documentation.
CALLOUTS CALLOUTS
If the pattern contains any callout requests, pcre2test's callout func- If the pattern contains any callout requests, pcre2test's callout func-
tion is called during matching unless callout_none is specified. This tion is called during matching unless callout_none is specified. This
works with both matching functions, and with JIT, though there are some works with both matching functions, and with JIT, though there are some
differences in behaviour. The output for callouts with numerical argu- differences in behaviour. The output for callouts with numerical argu-
ments and those with string arguments is slightly different. ments and those with string arguments is slightly different.
Callouts with numerical arguments Callouts with numerical arguments
By default, the callout function displays the callout number, the start By default, the callout function displays the callout number, the start
and current positions in the subject text at the callout time, and the and current positions in the subject text at the callout time, and the
next pattern item to be tested. For example: next pattern item to be tested. For example:
--->pqrabcdef --->pqrabcdef
0 ^ ^ \d 0 ^ ^ \d
This output indicates that callout number 0 occurred for a match at- This output indicates that callout number 0 occurred for a match at-
tempt starting at the fourth character of the subject string, when the tempt starting at the fourth character of the subject string, when the
pointer was at the seventh character, and when the next pattern item pointer was at the seventh character, and when the next pattern item
was \d. Just one circumflex is output if the start and current posi- was \d. Just one circumflex is output if the start and current posi-
tions are the same, or if the current position precedes the start posi- tions are the same, or if the current position precedes the start posi-
tion, which can happen if the callout is in a lookbehind assertion. tion, which can happen if the callout is in a lookbehind assertion.
Callouts numbered 255 are assumed to be automatic callouts, inserted as Callouts numbered 255 are assumed to be automatic callouts, inserted as
a result of the auto_callout pattern modifier. In this case, instead of a result of the auto_callout pattern modifier. In this case, instead of
showing the callout number, the offset in the pattern, preceded by a showing the callout number, the offset in the pattern, preceded by a
plus, is output. For example: plus, is output. For example:
re> /\d?[A-E]\*/auto_callout re> /\d?[A-E]\*/auto_callout
@ -1730,17 +1732,17 @@ CALLOUTS
+12 ^ ^ +12 ^ ^
0: abc 0: abc
The mark changes between matching "a" and "b", but stays the same for The mark changes between matching "a" and "b", but stays the same for
the rest of the match, so nothing more is output. If, as a result of the rest of the match, so nothing more is output. If, as a result of
backtracking, the mark reverts to being unset, the text "<unset>" is backtracking, the mark reverts to being unset, the text "<unset>" is
output. output.
Callouts with string arguments Callouts with string arguments
The output for a callout with a string argument is similar, except that The output for a callout with a string argument is similar, except that
instead of outputting a callout number before the position indicators, instead of outputting a callout number before the position indicators,
the callout string and its offset in the pattern string are output be- the callout string and its offset in the pattern string are output be-
fore the reflection of the subject string, and the subject string is fore the reflection of the subject string, and the subject string is
reflected for each callout. For example: reflected for each callout. For example:
re> /^ab(?C'first')cd(?C"second")ef/ re> /^ab(?C'first')cd(?C"second")ef/
@ -1756,26 +1758,26 @@ CALLOUTS
Callout modifiers Callout modifiers
The callout function in pcre2test returns zero (carry on matching) by The callout function in pcre2test returns zero (carry on matching) by
default, but you can use a callout_fail modifier in a subject line to default, but you can use a callout_fail modifier in a subject line to
change this and other parameters of the callout (see below). change this and other parameters of the callout (see below).
If the callout_capture modifier is set, the current captured groups are If the callout_capture modifier is set, the current captured groups are
output when a callout occurs. This is useful only for non-DFA matching, output when a callout occurs. This is useful only for non-DFA matching,
as pcre2_dfa_match() does not support capturing, so no captures are as pcre2_dfa_match() does not support capturing, so no captures are
ever shown. ever shown.
The normal callout output, showing the callout number or pattern offset The normal callout output, showing the callout number or pattern offset
(as described above) is suppressed if the callout_no_where modifier is (as described above) is suppressed if the callout_no_where modifier is
set. set.
When using the interpretive matching function pcre2_match() without When using the interpretive matching function pcre2_match() without
JIT, setting the callout_extra modifier causes additional output from JIT, setting the callout_extra modifier causes additional output from
pcre2test's callout function to be generated. For the first callout in pcre2test's callout function to be generated. For the first callout in
a match attempt at a new starting position in the subject, "New match a match attempt at a new starting position in the subject, "New match
attempt" is output. If there has been a backtrack since the last call- attempt" is output. If there has been a backtrack since the last call-
out (or start of matching if this is the first callout), "Backtrack" is out (or start of matching if this is the first callout), "Backtrack" is
output, followed by "No other matching paths" if the backtrack ended output, followed by "No other matching paths" if the backtrack ended
the previous match attempt. For example: the previous match attempt. For example:
re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
@ -1812,86 +1814,86 @@ CALLOUTS
+1 ^ a+ +1 ^ a+
No match No match
Notice that various optimizations must be turned off if you want all Notice that various optimizations must be turned off if you want all
possible matching paths to be scanned. If no_start_optimize is not possible matching paths to be scanned. If no_start_optimize is not
used, there is an immediate "no match", without any callouts, because used, there is an immediate "no match", without any callouts, because
the starting optimization fails to find "b" in the subject, which it the starting optimization fails to find "b" in the subject, which it
knows must be present for any match. If no_auto_possess is not used, knows must be present for any match. If no_auto_possess is not used,
the "a+" item is turned into "a++", which reduces the number of back- the "a+" item is turned into "a++", which reduces the number of back-
tracks. tracks.
The callout_extra modifier has no effect if used with the DFA matching The callout_extra modifier has no effect if used with the DFA matching
function, or with JIT. function, or with JIT.
Return values from callouts Return values from callouts
The default return from the callout function is zero, which allows The default return from the callout function is zero, which allows
matching to continue. The callout_fail modifier can be given one or two matching to continue. The callout_fail modifier can be given one or two
numbers. If there is only one number, 1 is returned instead of 0 (caus- numbers. If there is only one number, 1 is returned instead of 0 (caus-
ing matching to backtrack) when a callout of that number is reached. If ing matching to backtrack) when a callout of that number is reached. If
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
reached and there have been at least <m> callouts. The callout_error reached and there have been at least <m> callouts. The callout_error
modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus- modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
ing the entire matching process to be aborted. If both these modifiers ing the entire matching process to be aborted. If both these modifiers
are set for the same callout number, callout_error takes precedence. are set for the same callout number, callout_error takes precedence.
Note that callouts with string arguments are always given the number Note that callouts with string arguments are always given the number
zero. zero.
The callout_data modifier can be given an unsigned or a negative num- The callout_data modifier can be given an unsigned or a negative num-
ber. This is set as the "user data" that is passed to the matching ber. This is set as the "user data" that is passed to the matching
function, and passed back when the callout function is invoked. Any function, and passed back when the callout function is invoked. Any
value other than zero is used as a return from pcre2test's callout value other than zero is used as a return from pcre2test's callout
function. function.
Inserting callouts can be helpful when using pcre2test to check compli- Inserting callouts can be helpful when using pcre2test to check compli-
cated regular expressions. For further information about callouts, see cated regular expressions. For further information about callouts, see
the pcre2callout documentation. the pcre2callout documentation.
NON-PRINTING CHARACTERS NON-PRINTING CHARACTERS
When pcre2test is outputting text in the compiled version of a pattern, When pcre2test is outputting text in the compiled version of a pattern,
bytes other than 32-126 are always treated as non-printing characters bytes other than 32-126 are always treated as non-printing characters
and are therefore shown as hex escapes. and are therefore shown as hex escapes.
When pcre2test is outputting text that is a matched part of a subject When pcre2test is outputting text that is a matched part of a subject
string, it behaves in the same way, unless a different locale has been string, it behaves in the same way, unless a different locale has been
set for the pattern (using the locale modifier). In this case, the is- set for the pattern (using the locale modifier). In this case, the is-
print() function is used to distinguish printing and non-printing char- print() function is used to distinguish printing and non-printing char-
acters. acters.
SAVING AND RESTORING COMPILED PATTERNS SAVING AND RESTORING COMPILED PATTERNS
It is possible to save compiled patterns on disc or elsewhere, and It is possible to save compiled patterns on disc or elsewhere, and
reload them later, subject to a number of restrictions. JIT data cannot reload them later, subject to a number of restrictions. JIT data cannot
be saved. The host on which the patterns are reloaded must be running be saved. The host on which the patterns are reloaded must be running
the same version of PCRE2, with the same code unit width, and must also the same version of PCRE2, with the same code unit width, and must also
have the same endianness, pointer width and PCRE2_SIZE type. Before have the same endianness, pointer width and PCRE2_SIZE type. Before
compiled patterns can be saved they must be serialized, that is, con- compiled patterns can be saved they must be serialized, that is, con-
verted to a stream of bytes. A single byte stream may contain any num- verted to a stream of bytes. A single byte stream may contain any num-
ber of compiled patterns, but they must all use the same character ta- ber of compiled patterns, but they must all use the same character ta-
bles. A single copy of the tables is included in the byte stream (its bles. A single copy of the tables is included in the byte stream (its
size is 1088 bytes). size is 1088 bytes).
The functions whose names begin with pcre2_serialize_ are used for se- The functions whose names begin with pcre2_serialize_ are used for se-
rializing and de-serializing. They are described in the pcre2serialize rializing and de-serializing. They are described in the pcre2serialize
documentation. In this section we describe the features of pcre2test documentation. In this section we describe the features of pcre2test
that can be used to test these functions. that can be used to test these functions.
Note that "serialization" in PCRE2 does not convert compiled patterns Note that "serialization" in PCRE2 does not convert compiled patterns
to an abstract format like Java or .NET. It just makes a reloadable to an abstract format like Java or .NET. It just makes a reloadable
byte code stream. Hence the restrictions on reloading mentioned above. byte code stream. Hence the restrictions on reloading mentioned above.
In pcre2test, when a pattern with the push modifier is successfully com- In pcre2test, when a pattern with the push modifier is successfully com-
piled, it is pushed onto a stack of compiled patterns, and pcre2test piled, it is pushed onto a stack of compiled patterns, and pcre2test
expects the next line to contain a new pattern (or command) instead of expects the next line to contain a new pattern (or command) instead of
a subject line. By contrast, the pushcopy modifier causes a copy of the a subject line. By contrast, the pushcopy modifier causes a copy of the
compiled pattern to be stacked, leaving the original available for im- compiled pattern to be stacked, leaving the original available for im-
mediate matching. By using push and/or pushcopy, a number of patterns mediate matching. By using push and/or pushcopy, a number of patterns
can be compiled and retained. These modifiers are incompatible with can be compiled and retained. These modifiers are incompatible with
posix, and control modifiers that act at match time are ignored (with a posix, and control modifiers that act at match time are ignored (with a
message) for the stacked patterns. The jitverify modifier applies only message) for the stacked patterns. The jitverify modifier applies only
at compile time. at compile time.
The command The command
@ -1899,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS
#save <filename> #save <filename>
causes all the stacked patterns to be serialized and the result written causes all the stacked patterns to be serialized and the result written
to the named file. Afterwards, all the stacked patterns are freed. The to the named file. Afterwards, all the stacked patterns are freed. The
command command
#load <filename> #load <filename>
reads the data in the file, and then arranges for it to be de-serial- reads the data in the file, and then arranges for it to be de-serial-
ized, with the resulting compiled patterns added to the pattern stack. ized, with the resulting compiled patterns added to the pattern stack.
The pattern on the top of the stack can be retrieved by the #pop com- The pattern on the top of the stack can be retrieved by the #pop com-
mand, which must be followed by lines of subjects that are to be mand, which must be followed by lines of subjects that are to be
matched with the pattern, terminated as usual by an empty line or end matched with the pattern, terminated as usual by an empty line or end
of file. This command may be followed by a modifier list containing of file. This command may be followed by a modifier list containing
only control modifiers that act after a pattern has been compiled. In only control modifiers that act after a pattern has been compiled. In
particular, hex, posix, posix_nosub, push, and pushcopy are not al- particular, hex, posix, posix_nosub, push, and pushcopy are not al-
lowed, nor are any option-setting modifiers. The JIT modifiers are, lowed, nor are any option-setting modifiers. The JIT modifiers are,
however, permitted. Here is an example that saves and reloads two pat- however, permitted. Here is an example that saves and reloads two pat-
terns. terns.
/abc/push /abc/push
@ -1926,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS
#pop jit,bincode #pop jit,bincode
abc abc
If jitverify is used with #pop, it does not automatically imply jit, If jitverify is used with #pop, it does not automatically imply jit,
which is different behaviour from when it is used on a pattern. which is different behaviour from when it is used on a pattern.
The #popcopy command is analogous to the pushcopy modifier in that it The #popcopy command is analogous to the pushcopy modifier in that it
makes current a copy of the topmost stack pattern, leaving the original makes current a copy of the topmost stack pattern, leaving the original
still on the stack. still on the stack.
@ -1949,5 +1951,5 @@ AUTHOR
REVISION REVISION
Last updated: 12 January 2022 Last updated: 27 July 2022
Copyright (c) 1997-2022 University of Cambridge. Copyright (c) 1997-2022 University of Cambridge.

View File

@ -205,9 +205,6 @@ point. */
* Global variables * * Global variables *
*************************************************/ *************************************************/
/* Jeffrey Friedl has some debugging requirements that are not part of the
regular code. */
static const char *colour_string = "1;31"; static const char *colour_string = "1;31";
static const char *colour_option = NULL; static const char *colour_option = NULL;
static const char *dee_option = NULL; static const char *dee_option = NULL;
@ -220,6 +217,10 @@ static const char *output_text = NULL;
static char *main_buffer = NULL; static char *main_buffer = NULL;
static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */
static int printname_colon = ':'; /* Changed to 0 for -Z */
static int printname_hyphen = '-'; /* Changed to 0 for -Z */
static int after_context = 0; static int after_context = 0;
static int before_context = 0; static int before_context = 0;
static int binary_files = BIN_BINARY; static int binary_files = BIN_BINARY;
@ -483,6 +484,7 @@ static option_item optionlist[] = {
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" }, { OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
{ OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" }, { OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
{ OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" }, { OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
{ OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" },
{ OP_NODATA, 0, NULL, NULL, NULL } { OP_NODATA, 0, NULL, NULL, NULL }
}; };
@ -1773,7 +1775,7 @@ if (after_context > 0 && lastmatchnumber > 0)
{ {
char *pp = end_of_line(lastmatchrestart, endptr, &ellength); char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
if (ellength == 0 && pp == main_buffer + bufsize) break; if (ellength == 0 && pp == main_buffer + bufsize) break;
if (printname != NULL) fprintf(stdout, "%s-", printname); if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
if (number) fprintf(stdout, "%lu-", lastmatchnumber++); if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
lastmatchrestart = pp; lastmatchrestart = pp;
@ -2439,10 +2441,10 @@ if (pid == 0)
} }
else if (pid > 0) else if (pid > 0)
{ {
(void)fflush(stdout); (void)fflush(stdout);
(void)waitpid(pid, &result, 0); (void)waitpid(pid, &result, 0);
(void)fflush(stdout); (void)fflush(stdout);
} }
#endif /* End Windows/VMS/other handling */ #endif /* End Windows/VMS/other handling */
free(args); free(args);
@ -2730,7 +2732,9 @@ while (ptr < endptr)
else if (filenames == FN_MATCH_ONLY) else if (filenames == FN_MATCH_ONLY)
{ {
fprintf(stdout, "%s" STDOUT_NL, printname); fprintf(stdout, "%s", printname);
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
else fprintf(stdout, "%s", printname_nl);
return 0; return 0;
} }
@ -2749,7 +2753,8 @@ while (ptr < endptr)
{ {
PCRE2_SIZE oldstartoffset; PCRE2_SIZE oldstartoffset;
if (printname != NULL) fprintf(stdout, "%s:", printname); if (printname != NULL) fprintf(stdout, "%s%c", printname,
printname_colon);
if (number) fprintf(stdout, "%lu:", linenumber); if (number) fprintf(stdout, "%lu:", linenumber);
/* Handle --line-offsets */ /* Handle --line-offsets */
@ -2871,7 +2876,8 @@ while (ptr < endptr)
while (lastmatchrestart < p) while (lastmatchrestart < p)
{ {
char *pp = lastmatchrestart; char *pp = lastmatchrestart;
if (printname != NULL) fprintf(stdout, "%s-", printname); if (printname != NULL) fprintf(stdout, "%s%c", printname,
printname_hyphen);
if (number) fprintf(stdout, "%lu-", lastmatchnumber++); if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
pp = end_of_line(pp, endptr, &ellength); pp = end_of_line(pp, endptr, &ellength);
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
@ -2912,7 +2918,8 @@ while (ptr < endptr)
{ {
int ellength; int ellength;
char *pp = p; char *pp = p;
if (printname != NULL) fprintf(stdout, "%s-", printname); if (printname != NULL) fprintf(stdout, "%s%c", printname,
printname_hyphen);
if (number) fprintf(stdout, "%lu-", linenumber - linecount--); if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
pp = end_of_line(pp, endptr, &ellength); pp = end_of_line(pp, endptr, &ellength);
FWRITE_IGNORE(p, 1, pp - p, stdout); FWRITE_IGNORE(p, 1, pp - p, stdout);
@ -2926,7 +2933,8 @@ while (ptr < endptr)
if (after_context > 0 || before_context > 0) if (after_context > 0 || before_context > 0)
endhyphenpending = TRUE; endhyphenpending = TRUE;
if (printname != NULL) fprintf(stdout, "%s:", printname); if (printname != NULL) fprintf(stdout, "%s%c", printname,
printname_colon);
if (number) fprintf(stdout, "%lu:", linenumber); if (number) fprintf(stdout, "%lu:", linenumber);
/* In multiline mode, or if colouring, we have to split the line(s) up /* In multiline mode, or if colouring, we have to split the line(s) up
@ -3131,7 +3139,9 @@ were none. If we found a match, we won't have got this far. */
if (filenames == FN_NOMATCH_ONLY) if (filenames == FN_NOMATCH_ONLY)
{ {
fprintf(stdout, "%s" STDOUT_NL, printname); fprintf(stdout, "%s", printname);
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
else fprintf(stdout, "%s", printname_nl);
return 0; return 0;
} }
@ -3142,7 +3152,7 @@ if (count_only && !quiet)
if (count > 0 || !omit_zero_count) if (count > 0 || !omit_zero_count)
{ {
if (printname != NULL && filenames != FN_NONE) if (printname != NULL && filenames != FN_NONE)
fprintf(stdout, "%s:", printname); fprintf(stdout, "%s%c", printname, printname_colon);
fprintf(stdout, "%lu" STDOUT_NL, count); fprintf(stdout, "%lu" STDOUT_NL, count);
counts_printed++; counts_printed++;
} }
@ -3528,8 +3538,6 @@ switch(letter)
case 'u': options |= PCRE2_UTF; utf = TRUE; break; case 'u': options |= PCRE2_UTF; utf = TRUE; break;
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break; case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
case 'v': invert = TRUE; break; case 'v': invert = TRUE; break;
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
case 'V': case 'V':
{ {
@ -3540,6 +3548,10 @@ switch(letter)
pcre2grep_exit(0); pcre2grep_exit(0);
break; break;
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
default: default:
fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter); fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
pcre2grep_exit(usage(2)); pcre2grep_exit(usage(2));
@ -4259,8 +4271,6 @@ if (DEE_option != NULL)
(void)pcre2_set_compile_extra_options(compile_context, extra_options); (void)pcre2_set_compile_extra_options(compile_context, extra_options);
/* Check the values for Jeffrey Friedl's debugging options. */
/* If use_jit is set, check whether JIT is available. If not, do not try /* If use_jit is set, check whether JIT is available. If not, do not try
to use JIT. */ to use JIT. */

19
testdata/grepoutput vendored
View File

@ -991,3 +991,22 @@ RC=0
---------------------------- Test 134 ----------------------------- ---------------------------- Test 134 -----------------------------
=AB3CD5= =AB3CD5=
RC=0 RC=0
---------------------------- Test 135 -----------------------------
./testdata/grepinputv@The word is cat in this line
RC=0
./testdata/grepinputv@./testdata/grepinputv@RC=0
./testdata/grepinputv@This line contains \E and (regex) *meta* [characters].
./testdata/grepinputv@The word is cat in this line
./testdata/grepinputv@The caterpillar sat on the mat
RC=0
testdata/grepinputM3:start end in between start
end and following
testdata/grepinputM7:start end in between start
end and following start
end other stuff
testdata/grepinputM11:start end in between start
end
testdata/grepinputM16:start end in between start
end
RC=0