Implement -Z in pcre2grep and update documentation
This commit is contained in:
parent
cc5e121c8e
commit
8b133fa0ba
|
@ -49,6 +49,8 @@ tests.
|
||||||
tests run by 'make check', but can be run manually. The current output is from
|
tests run by 'make check', but can be run manually. The current output is from
|
||||||
a 64-bit system.
|
a 64-bit system.
|
||||||
|
|
||||||
|
13. Implemented -Z aka --null in pcre2grep.
|
||||||
|
|
||||||
|
|
||||||
Version 10.40 15-April-2022
|
Version 10.40 15-April-2022
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
42
RunGrepTest
42
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
||||||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||||
|
|
||||||
|
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||||
|
# in many operating systems. An earlier version of this script used sed to
|
||||||
|
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||||
|
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||||
|
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||||
|
# even when using GNU sed. A user suggested using tr instead, which
|
||||||
|
# necessitates translating to a single character. However, on (some versions
|
||||||
|
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||||
|
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||||
|
|
||||||
|
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||||
|
tr=/usr/xpg4/bin/tr
|
||||||
|
else
|
||||||
|
tr=tr
|
||||||
|
fi
|
||||||
|
|
||||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||||
# it to the current or parent directory, whichever one contains the test data.
|
# it to the current or parent directory, whichever one contains the test data.
|
||||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||||
|
@ -685,6 +701,16 @@ echo "---------------------------- Test 134 -----------------------------" >>tes
|
||||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||||
echo "RC=$?" >>testtrygrep
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
# Now compare the results.
|
# Now compare the results.
|
||||||
|
|
||||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||||
|
@ -759,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
||||||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||||
|
|
||||||
# This next test involves NUL characters. It seems impossible to handle them
|
|
||||||
# easily in many operating systems. An earlier version of this script used sed
|
|
||||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
|
||||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
|
||||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
|
||||||
# even when using GNU sed. A user suggested using tr instead, which
|
|
||||||
# necessitates translating to a single character (@). However, on (some
|
|
||||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
|
||||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
|
||||||
|
|
||||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
|
||||||
tr=/usr/xpg4/bin/tr
|
|
||||||
else
|
|
||||||
tr=tr
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||||
printf 'abc\0def' >testNinputgrep
|
printf 'abc\0def' >testNinputgrep
|
||||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
||||||
pcre2_substring.c
|
pcre2_substring.c
|
||||||
pcre2_tables.c
|
pcre2_tables.c
|
||||||
pcre2_ucd.c
|
pcre2_ucd.c
|
||||||
|
pcre2_ucptables.c
|
||||||
pcre2_valid_utf.c
|
pcre2_valid_utf.c
|
||||||
pcre2_xclass.c
|
pcre2_xclass.c
|
||||||
|
|
||||||
|
@ -373,7 +374,7 @@ Otherwise:
|
||||||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||||
have been created.
|
have been created.
|
||||||
|
|
||||||
2. Edit RunTest.bat to indentify the full or relative location of
|
2. Edit RunTest.bat to identify the full or relative location of
|
||||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||||
|
|
||||||
set srcdir=C:\pcre2\pcre2-10.00
|
set srcdir=C:\pcre2\pcre2-10.00
|
||||||
|
|
|
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
||||||
You can access the archives and also subscribe or manage your subscription
|
You can access the archives and also subscribe or manage your subscription
|
||||||
here:
|
here:
|
||||||
|
|
||||||
https://groups.google.com/pcre2-dev
|
https://groups.google.com/g/pcre2-dev
|
||||||
|
|
||||||
Please read the NEWS file if you are upgrading from a previous release. The
|
Please read the NEWS file if you are upgrading from a previous release. The
|
||||||
contents of this README file are:
|
contents of this README file are:
|
||||||
|
@ -375,7 +375,8 @@ library. They are also documented in the pcre2build man page.
|
||||||
necessary to specify something like LIBS="-lncurses" as well. This is
|
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||||
but does not link with the termcap or curses library itself, allowing
|
but does not link with the termcap or curses library itself, allowing
|
||||||
applications which link with readline the to choose an appropriate library."
|
applications which link with readline the option to choose an appropriate
|
||||||
|
library."
|
||||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||||
should fix it.
|
should fix it.
|
||||||
|
@ -400,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
||||||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||||
be created. This is normally run under valgrind or used when PCRE2 is
|
be created. This is normally run under valgrind or used when PCRE2 is
|
||||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||||
outputs information about it is doing. The input strings are specified by
|
outputs information about what it is doing. The input strings are specified
|
||||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||||
file are the test string.
|
of the file are the test string.
|
||||||
|
|
||||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||||
which caused pcre2_match() to use individual blocks on the heap for
|
which caused pcre2_match() to use individual blocks on the heap for
|
||||||
|
@ -695,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
||||||
different code unit widths.
|
different code unit widths.
|
||||||
|
|
||||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||||
among other non-JIT things, the match-limiting features of the intepretive
|
among other non-JIT things, the match-limiting features of the interpretive
|
||||||
matcher.
|
matcher.
|
||||||
|
|
||||||
Test 16 is run only when JIT support is not available. It checks that an
|
Test 16 is run only when JIT support is not available. It checks that an
|
||||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
||||||
documentation for more details). If the limit is reached, the negative error
|
documentation for more details). If the limit is reached, the negative error
|
||||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||||
is built; if it is not, the default is set very large and is essentially
|
is built; if it is not, the default is set very large and is essentially
|
||||||
"unlimited".
|
unlimited.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A value for the heap limit may also be supplied by an item at the start of a
|
A value for the heap limit may also be supplied by an item at the start of a
|
||||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
||||||
limit is set, less than the default.
|
limit is set, less than the default.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||||
stack for recording backtracking points. The more nested backtracking points
|
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||||
there are (that is, the deeper the search tree), the more memory is needed.
|
<b>pcre2_match()</b> uses the heap are given in the
|
||||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
documentation.
|
||||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
|
||||||
can be successfully processed.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||||
this is not big enough is heap memory used. In this case, too, setting a value
|
is not big enough is heap memory used. In this case, setting a value of zero
|
||||||
of zero disables the use of the heap.
|
disables the use of the heap.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
Each time a nested backtracking point is passed, a new memory frame is used
|
||||||
to remember the state of matching at that point. Thus, this parameter
|
to remember the state of matching at that point. Thus, this parameter
|
||||||
indirectly limits the amount of memory that is used in a match. However,
|
indirectly limits the amount of memory that is used in a match. However,
|
||||||
because the size of each memory "frame" depends on the number of capturing
|
because the size of each memory frame depends on the number of capturing
|
||||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||||
was more useful in versions before 10.30, where function recursion was used for
|
was more useful in versions before 10.30, where function recursion was used for
|
||||||
backtracking.
|
backtracking.
|
||||||
|
@ -3148,11 +3146,11 @@ The backtracking match limit was reached.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_ERROR_NOMEMORY
|
PCRE2_ERROR_NOMEMORY
|
||||||
</pre>
|
</pre>
|
||||||
If a pattern contains many nested backtracking points, heap memory is used to
|
Heap memory is used to remember backtracking points. This error is given when
|
||||||
remember them. This error is given when the memory allocation function (default
|
the memory allocation function (default or custom) fails. Note that a different
|
||||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_ERROR_NULL
|
PCRE2_ERROR_NULL
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -4020,9 +4018,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 14 December 2021
|
Last updated: 27 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2021 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -284,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
||||||
counting is done differently).
|
counting is done differently).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||||
stack to record backtracking points. The more nested backtracking points there
|
points. The more nested backtracking points there are (that is, the deeper the
|
||||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
search tree), the more memory is needed. There is an upper limit, specified in
|
||||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
described in the
|
||||||
at run time, as described in the
|
|
||||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||||
change this by a setting such as
|
change this by a setting such as
|
||||||
|
@ -609,16 +608,16 @@ give a warning.
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
<br>
|
<br>
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 08 December 2021
|
Last updated: 27 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2021 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
||||||
<pre>
|
<pre>
|
||||||
pcre2grep some-pattern file1 - file3
|
pcre2grep some-pattern file1 - file3
|
||||||
</pre>
|
</pre>
|
||||||
Input files are searched line by line. By default, each line that matches a
|
By default, input files are searched line by line. Each line that matches a
|
||||||
pattern is copied to the standard output, and if there is more than one file,
|
pattern is copied to the standard output, and if there is more than one file,
|
||||||
the file name is output at the start of each line, followed by a colon.
|
the file name is output at the start of each line, followed by a colon.
|
||||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||||
span line boundaries. What defines a line boundary is controlled by the
|
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||||
<b>-N</b> (<b>--newline</b>) option.
|
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||||
|
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||||
|
terminator to a zero byte.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The amount of memory used for buffering files that are being scanned is
|
The amount of memory used for buffering files that are being scanned is
|
||||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
||||||
lines are output if the next match or the end of the file is reached, or if the
|
lines are output if the next match or the end of the file is reached, or if the
|
||||||
processing buffer size has been set too small. If file names and/or line
|
processing buffer size has been set too small. If file names and/or line
|
||||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||||
context lines. A line containing "--" is output between each group of lines,
|
context lines (the <b>-Z</b> option can be used to change the file name
|
||||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
terminator to a zero byte). A line containing "--" is output between each group
|
||||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
of lines, unless they are in fact contiguous in the input file. The value of
|
||||||
|
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||||
|
<b>-A</b> is ignored.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-a</b>, <b>--text</b>
|
<b>-a</b>, <b>--text</b>
|
||||||
|
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||||
lines are output if the previous match or the start of the file is within
|
lines are output if the previous match or the start of the file is within
|
||||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||||
file names and/or line numbers are being output, a hyphen separator is used
|
file names and/or line numbers are being output, a hyphen separator is used
|
||||||
instead of a colon for the context lines. A line containing "--" is output
|
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||||
between each group of lines, unless they are in fact contiguous in the input
|
change the file name terminator to a zero byte). A line containing "--" is
|
||||||
file. The value of <i>number</i> is expected to be relatively small. When
|
output between each group of lines, unless they are in fact contiguous in the
|
||||||
|
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
||||||
<P>
|
<P>
|
||||||
<b>-H</b>, <b>--with-filename</b>
|
<b>-H</b>, <b>--with-filename</b>
|
||||||
Force the inclusion of the file name at the start of output lines when
|
Force the inclusion of the file name at the start of output lines when
|
||||||
searching a single file. By default, the file name is not shown in this case.
|
searching a single file. The file name is not normally shown in this case.
|
||||||
For matching lines, the file name is followed by a colon; for context lines, a
|
By default, for matching lines, the file name is followed by a colon; for
|
||||||
hyphen separator is used. If a line number is also being output, it follows the
|
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
change the terminator to a zero byte. If a line number is also being output,
|
||||||
line, only the first is preceded by the file name. This option overrides any
|
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
more than one line, only the first is preceded by the file name. This option
|
||||||
|
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-h</b>, <b>--no-filename</b>
|
<b>-h</b>, <b>--no-filename</b>
|
||||||
Suppress the output file names when searching multiple files. By default,
|
Suppress the output of file names when searching multiple files. File names are
|
||||||
file names are shown when multiple files are searched. For matching lines, the
|
normally shown when multiple files are searched. By default, for matching
|
||||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||||
If a line number is also being output, it follows the file name. This option
|
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
a zero byte. If a line number is also being output, it follows the file name.
|
||||||
|
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--heap-limit</b>=<i>number</i>
|
<b>--heap-limit</b>=<i>number</i>
|
||||||
|
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
||||||
<b>-L</b>, <b>--files-without-match</b>
|
<b>-L</b>, <b>--files-without-match</b>
|
||||||
Instead of outputting lines from the files, just output the names of the files
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
that do not contain any lines that would have been output. Each file name is
|
that do not contain any lines that would have been output. Each file name is
|
||||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||||
<b>-h</b>, or <b>-l</b> options.
|
they are separated by zero bytes instead of newlines. This option overrides any
|
||||||
|
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-l</b>, <b>--files-with-matches</b>
|
<b>-l</b>, <b>--files-with-matches</b>
|
||||||
Instead of outputting lines from the files, just output the names of the files
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
containing lines that would have been output. Each file name is output once, on
|
containing lines that would have been output. Each file name is output once, on
|
||||||
a separate line. Searching normally stops as soon as a matching line is found
|
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||||
continues in order to obtain the correct count, and those files that have at
|
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||||
least one match are listed along with their counts. Using this option with
|
matching continues in order to obtain the correct count, and those files that
|
||||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
have at least one match are listed along with their counts. Using this option
|
||||||
|
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||||
<b>-h</b>, or <b>-L</b> options.
|
<b>-h</b>, or <b>-L</b> options.
|
||||||
</P>
|
</P>
|
||||||
|
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||||
memory is needed only if matching the pattern requires a significant number of
|
|
||||||
nested backtracking points to be remembered. This parameter can be set to zero
|
|
||||||
to forbid the use of heap memory altogether.
|
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||||
|
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
||||||
matched against the contents of files; it does not apply to patterns specified
|
matched against the contents of files; it does not apply to patterns specified
|
||||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||||
</P>
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-Z</b>, <b>--null</b>
|
||||||
|
Terminate file names in the regular output with a zero byte (the NUL
|
||||||
|
character) instead of what would normally appear. This is useful when file
|
||||||
|
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||||
|
option does not apply to file names in error messages.
|
||||||
|
</P>
|
||||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||||
<P>
|
<P>
|
||||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||||
|
@ -1053,9 +1066,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 31 August 2021
|
Last updated: 30 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2021 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
||||||
The maximum length of a string argument to a callout is the largest number a
|
The maximum length of a string argument to a callout is the largest number a
|
||||||
32-bit unsigned integer can hold.
|
32-bit unsigned integer can hold.
|
||||||
</P>
|
</P>
|
||||||
|
<P>
|
||||||
|
The maximum amount of heap memory used for matching is controlled by the heap
|
||||||
|
limit, which can be set in a pattern or in a match context. The default is a
|
||||||
|
very large number, effectively unlimited.
|
||||||
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
AUTHOR
|
AUTHOR
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
<br>
|
<br>
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 02 February 2019
|
Last updated: 26 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2019 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
||||||
uses very little system stack at run time. In earlier releases recursive
|
uses very little system stack at run time. In earlier releases recursive
|
||||||
function calls could use a great deal of stack, and this could cause problems,
|
function calls could use a great deal of stack, and this could cause problems,
|
||||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
remembered in memory frames controlled by the code.
|
||||||
frames is allocated on the system stack (enough for about 100 frames for small
|
</P>
|
||||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
<P>
|
||||||
memory can be limited; if the limit is set to zero, only the initial stack
|
The size of each frame depends on the size of pointer variables and the number
|
||||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||||
may also reduce the memory requirements.
|
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||||
|
capturing group the size increases by 16 bytes.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||||
|
stack, but this still caused some issues for multi-thread applications where
|
||||||
|
each thread has a very small stack. From release 10.41 backtracking memory
|
||||||
|
frames are always held in heap memory. An initial heap allocation is obtained
|
||||||
|
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||||
|
remembered with the match data block and re-used if that block is used for
|
||||||
|
another match. It is freed when the match data block itself is freed.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||||
|
frame size, unless the heap limit is less than this, in which case the heap
|
||||||
|
limit is used. If the initial block proves to be too small during matching, it
|
||||||
|
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||||
|
checked only when a new block is to be allocated. Reducing the heap limit
|
||||||
|
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||||
|
affect the saved block.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
<br>
|
<br>
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 03 February 2019
|
Last updated: 27 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2019 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -1241,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
||||||
copy=<number or name> copy captured substring
|
copy=<number or name> copy captured substring
|
||||||
depth_limit=<n> set a depth limit
|
depth_limit=<n> set a depth limit
|
||||||
dfa use <b>pcre2_dfa_match()</b>
|
dfa use <b>pcre2_dfa_match()</b>
|
||||||
find_limits find match and depth limits
|
find_limits find heap, match and depth limits
|
||||||
|
find_limits_noheap find match and depth limits
|
||||||
get=<number or name> extract captured substring
|
get=<number or name> extract captured substring
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
|
@ -1564,7 +1565,7 @@ Setting heap, match, and depth limits
|
||||||
<P>
|
<P>
|
||||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||||
the appropriate limits in the match context. These values are ignored when the
|
the appropriate limits in the match context. These values are ignored when the
|
||||||
<b>find_limits</b> modifier is specified.
|
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Finding minimum limits
|
Finding minimum limits
|
||||||
|
@ -1574,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
||||||
calls the relevant matching function several times, setting different values in
|
calls the relevant matching function several times, setting different values in
|
||||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||||
the minimum values for each parameter that allows the match to complete without
|
the smallest value for each parameter that allows the match to complete without
|
||||||
error. If JIT is being used, only the match limit is relevant.
|
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||||
|
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||||
|
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||||
|
is being used, only the match limit is relevant, and the other two are
|
||||||
|
automatically omitted.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
When using this modifier, the pattern should not contain any limit settings
|
When using this modifier, the pattern should not contain any limit settings
|
||||||
|
@ -1603,9 +1608,7 @@ overall amount of computing resource that is used.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||||
value of zero disables the use of any heap memory; many simple pattern matches
|
|
||||||
can be done without using the heap, so zero is not an unreasonable setting.
|
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Showing MARK names
|
Showing MARK names
|
||||||
|
@ -1623,12 +1626,10 @@ Showing memory usage
|
||||||
<P>
|
<P>
|
||||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||||
memory allocation and freeing calls that occur during a call to
|
memory allocation and freeing calls that occur during a call to
|
||||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||||
requires a bigger vector than the default for remembering backtracking points
|
is used only when a match requires more internal workspace than the default
|
||||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
allocation on the stack, so in many cases there will be no output. No heap
|
||||||
many cases there will be no heap memory used and therefore no additional
|
memory is allocated during matching with JIT. For this modifier to work, the
|
||||||
output. No heap memory is allocated during matching with JIT, so in that case
|
|
||||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
|
||||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||||
subject, though it can be set on one or the other.
|
subject, though it can be set on one or the other.
|
||||||
</P>
|
</P>
|
||||||
|
@ -1690,7 +1691,8 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||||
testing that the matching and substitution functions behave correctly in this
|
testing that the matching and substitution functions behave correctly in this
|
||||||
case (they use default values). This modifier cannot be used with the
|
case (they use default values). This modifier cannot be used with the
|
||||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||||
|
modifiers.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||||
|
@ -2141,7 +2143,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 12 January 2022
|
Last updated: 27 July 2022
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2022 University of Cambridge.
|
Copyright © 1997-2022 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
430
doc/pcre2.txt
430
doc/pcre2.txt
|
@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS
|
||||||
pcre2jit documentation for more details). If the limit is reached, the
|
pcre2jit documentation for more details). If the limit is reached, the
|
||||||
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
|
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
|
||||||
limit can be set when PCRE2 is built; if it is not, the default is set
|
limit can be set when PCRE2 is built; if it is not, the default is set
|
||||||
very large and is essentially "unlimited".
|
very large and is essentially unlimited.
|
||||||
|
|
||||||
A value for the heap limit may also be supplied by an item at the start
|
A value for the heap limit may also be supplied by an item at the start
|
||||||
of a pattern of the form
|
of a pattern of the form
|
||||||
|
@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS
|
||||||
less ddd is less than the limit set by the caller of pcre2_match() or,
|
less ddd is less than the limit set by the caller of pcre2_match() or,
|
||||||
if no such limit is set, less than the default.
|
if no such limit is set, less than the default.
|
||||||
|
|
||||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
The pcre2_match() function always needs some heap memory, so setting a
|
||||||
tem stack for recording backtracking points. The more nested backtrack-
|
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||||
ing points there are (that is, the deeper the search tree), the more
|
pcre2_match() uses the heap are given in the pcre2perform documenta-
|
||||||
memory is needed. Heap memory is used only if the initial vector is
|
tion.
|
||||||
too small. If the heap limit is set to a value less than 21 (in partic-
|
|
||||||
ular, zero) no heap memory will be used. In this case, only patterns
|
|
||||||
that do not have a lot of nested backtracking can be successfully pro-
|
|
||||||
cessed.
|
|
||||||
|
|
||||||
Similarly, for pcre2_dfa_match(), a vector on the system stack is used
|
For pcre2_dfa_match(), a vector on the system stack is used when pro-
|
||||||
when processing pattern recursions, lookarounds, or atomic groups, and
|
cessing pattern recursions, lookarounds, or atomic groups, and only if
|
||||||
only if this is not big enough is heap memory used. In this case, too,
|
this is not big enough is heap memory used. In this case, setting a
|
||||||
setting a value of zero disables the use of the heap.
|
value of zero disables the use of the heap.
|
||||||
|
|
||||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS
|
||||||
|
|
||||||
This parameter limits the depth of nested backtracking in
|
This parameter limits the depth of nested backtracking in
|
||||||
pcre2_match(). Each time a nested backtracking point is passed, a new
|
pcre2_match(). Each time a nested backtracking point is passed, a new
|
||||||
memory "frame" is used to remember the state of matching at that point.
|
memory frame is used to remember the state of matching at that point.
|
||||||
Thus, this parameter indirectly limits the amount of memory that is
|
Thus, this parameter indirectly limits the amount of memory that is
|
||||||
used in a match. However, because the size of each memory "frame" de-
|
used in a match. However, because the size of each memory frame depends
|
||||||
pends on the number of capturing parentheses, the actual memory limit
|
on the number of capturing parentheses, the actual memory limit varies
|
||||||
varies from pattern to pattern. This limit was more useful in versions
|
from pattern to pattern. This limit was more useful in versions before
|
||||||
before 10.30, where function recursion was used for backtracking.
|
10.30, where function recursion was used for backtracking.
|
||||||
|
|
||||||
The depth limit is not relevant, and is ignored, when matching is done
|
The depth limit is not relevant, and is ignored, when matching is done
|
||||||
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
|
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
|
||||||
|
@ -3051,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match()
|
||||||
|
|
||||||
PCRE2_ERROR_NOMEMORY
|
PCRE2_ERROR_NOMEMORY
|
||||||
|
|
||||||
If a pattern contains many nested backtracking points, heap memory is
|
Heap memory is used to remember backtracking points. This error is
|
||||||
used to remember them. This error is given when the memory allocation
|
given when the memory allocation function (default or custom) fails.
|
||||||
function (default or custom) fails. Note that a different error,
|
Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the
|
||||||
PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
|
||||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
tion fails.
|
||||||
|
|
||||||
PCRE2_ERROR_NULL
|
PCRE2_ERROR_NULL
|
||||||
|
|
||||||
|
@ -3860,8 +3856,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 14 December 2021
|
Last updated: 27 July 2022
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -4118,41 +4114,40 @@ LIMITING PCRE2 RESOURCE USAGE
|
||||||
pcre2_dfa_match() matching function, and to JIT matching (though the
|
pcre2_dfa_match() matching function, and to JIT matching (though the
|
||||||
counting is done differently).
|
counting is done differently).
|
||||||
|
|
||||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
The pcre2_match() function uses heap memory to record backtracking
|
||||||
tem stack to record backtracking points. The more nested backtracking
|
points. The more nested backtracking points there are (that is, the
|
||||||
points there are (that is, the deeper the search tree), the more memory
|
deeper the search tree), the more memory is needed. There is an upper
|
||||||
is needed. If the initial vector is not large enough, heap memory is
|
limit, specified in kibibytes (units of 1024 bytes). This limit can be
|
||||||
used, up to a certain limit, which is specified in kibibytes (units of
|
changed at run time, as described in the pcre2api documentation. The
|
||||||
1024 bytes). The limit can be changed at run time, as described in the
|
default limit (in effect unlimited) is 20 million. You can change this
|
||||||
pcre2api documentation. The default limit (in effect unlimited) is 20
|
by a setting such as
|
||||||
million. You can change this by a setting such as
|
|
||||||
|
|
||||||
--with-heap-limit=500
|
--with-heap-limit=500
|
||||||
|
|
||||||
which limits the amount of heap to 500 KiB. This limit applies only to
|
which limits the amount of heap to 500 KiB. This limit applies only to
|
||||||
interpretive matching in pcre2_match() and pcre2_dfa_match(), which may
|
interpretive matching in pcre2_match() and pcre2_dfa_match(), which may
|
||||||
also use the heap for internal workspace when processing complicated
|
also use the heap for internal workspace when processing complicated
|
||||||
patterns. This limit does not apply when JIT (which has its own memory
|
patterns. This limit does not apply when JIT (which has its own memory
|
||||||
arrangements) is used.
|
arrangements) is used.
|
||||||
|
|
||||||
You can also explicitly limit the depth of nested backtracking in the
|
You can also explicitly limit the depth of nested backtracking in the
|
||||||
pcre2_match() interpreter. This limit defaults to the value that is set
|
pcre2_match() interpreter. This limit defaults to the value that is set
|
||||||
for --with-match-limit. You can set a lower default limit by adding,
|
for --with-match-limit. You can set a lower default limit by adding,
|
||||||
for example,
|
for example,
|
||||||
|
|
||||||
--with-match-limit-depth=10000
|
--with-match-limit-depth=10000
|
||||||
|
|
||||||
to the configure command. This value can be overridden at run time.
|
to the configure command. This value can be overridden at run time.
|
||||||
This depth limit indirectly limits the amount of heap memory that is
|
This depth limit indirectly limits the amount of heap memory that is
|
||||||
used, but because the size of each backtracking "frame" depends on the
|
used, but because the size of each backtracking "frame" depends on the
|
||||||
number of capturing parentheses in a pattern, the amount of heap that
|
number of capturing parentheses in a pattern, the amount of heap that
|
||||||
is used before the limit is reached varies from pattern to pattern.
|
is used before the limit is reached varies from pattern to pattern.
|
||||||
This limit was more useful in versions before 10.30, where function re-
|
This limit was more useful in versions before 10.30, where function re-
|
||||||
cursion was used for backtracking.
|
cursion was used for backtracking.
|
||||||
|
|
||||||
As well as applying to pcre2_match(), the depth limit also controls the
|
As well as applying to pcre2_match(), the depth limit also controls the
|
||||||
depth of recursive function calls in pcre2_dfa_match(). These are used
|
depth of recursive function calls in pcre2_dfa_match(). These are used
|
||||||
for lookaround assertions, atomic groups, and recursion within pat-
|
for lookaround assertions, atomic groups, and recursion within pat-
|
||||||
terns. The limit does not apply to JIT matching.
|
terns. The limit does not apply to JIT matching.
|
||||||
|
|
||||||
|
|
||||||
|
@ -4160,67 +4155,67 @@ CREATING CHARACTER TABLES AT BUILD TIME
|
||||||
|
|
||||||
PCRE2 uses fixed tables for processing characters whose code points are
|
PCRE2 uses fixed tables for processing characters whose code points are
|
||||||
less than 256. By default, PCRE2 is built with a set of tables that are
|
less than 256. By default, PCRE2 is built with a set of tables that are
|
||||||
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
distributed in the file src/pcre2_chartables.c.dist. These tables are
|
||||||
for ASCII codes only. If you add
|
for ASCII codes only. If you add
|
||||||
|
|
||||||
--enable-rebuild-chartables
|
--enable-rebuild-chartables
|
||||||
|
|
||||||
to the configure command, the distributed tables are no longer used.
|
to the configure command, the distributed tables are no longer used.
|
||||||
Instead, a program called pcre2_dftables is compiled and run. This out-
|
Instead, a program called pcre2_dftables is compiled and run. This out-
|
||||||
puts the source for a new set of tables, created in the default locale of
|
puts the source for a new set of tables, created in the default locale of
|
||||||
your C run-time system. This method of replacing the tables does not
|
your C run-time system. This method of replacing the tables does not
|
||||||
work if you are cross compiling, because pcre2_dftables needs to be run
|
work if you are cross compiling, because pcre2_dftables needs to be run
|
||||||
on the local host and therefore not compiled with the cross compiler.
|
on the local host and therefore not compiled with the cross compiler.
|
||||||
|
|
||||||
If you need to create alternative tables when cross compiling, you will
|
If you need to create alternative tables when cross compiling, you will
|
||||||
have to do so "by hand". There may also be other reasons for creating
|
have to do so "by hand". There may also be other reasons for creating
|
||||||
tables manually. To cause pcre2_dftables to be built on the local
|
tables manually. To cause pcre2_dftables to be built on the local
|
||||||
host, run a normal compiling command, and then run the program with the
|
host, run a normal compiling command, and then run the program with the
|
||||||
output file as its argument, for example:
|
output file as its argument, for example:
|
||||||
|
|
||||||
cc src/pcre2_dftables.c -o pcre2_dftables
|
cc src/pcre2_dftables.c -o pcre2_dftables
|
||||||
./pcre2_dftables src/pcre2_chartables.c
|
./pcre2_dftables src/pcre2_chartables.c
|
||||||
|
|
||||||
This builds the tables in the default locale of the local host. If you
|
This builds the tables in the default locale of the local host. If you
|
||||||
want to specify a locale, you must use the -L option:
|
want to specify a locale, you must use the -L option:
|
||||||
|
|
||||||
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c
|
||||||
|
|
||||||
You can also specify -b (with or without -L). This causes the tables to
|
You can also specify -b (with or without -L). This causes the tables to
|
||||||
be written in binary instead of as source code. A set of binary tables
|
be written in binary instead of as source code. A set of binary tables
|
||||||
can be loaded into memory by an application and passed to pcre2_com-
|
can be loaded into memory by an application and passed to pcre2_com-
|
||||||
pile() in the same way as tables created by calling pcre2_maketables().
|
pile() in the same way as tables created by calling pcre2_maketables().
|
||||||
The tables are just a string of bytes, independent of hardware charac-
|
The tables are just a string of bytes, independent of hardware charac-
|
||||||
teristics such as endianness. This means they can be bundled with an
|
teristics such as endianness. This means they can be bundled with an
|
||||||
application that runs in different environments, to ensure consistent
|
application that runs in different environments, to ensure consistent
|
||||||
behaviour.
|
behaviour.
|
||||||
|
|
||||||
|
|
||||||
USING EBCDIC CODE
|
USING EBCDIC CODE
|
||||||
|
|
||||||
PCRE2 assumes by default that it will run in an environment where the
|
PCRE2 assumes by default that it will run in an environment where the
|
||||||
character code is ASCII or Unicode, which is a superset of ASCII. This
|
character code is ASCII or Unicode, which is a superset of ASCII. This
|
||||||
is the case for most computer operating systems. PCRE2 can, however, be
|
is the case for most computer operating systems. PCRE2 can, however, be
|
||||||
compiled to run in an 8-bit EBCDIC environment by adding
|
compiled to run in an 8-bit EBCDIC environment by adding
|
||||||
|
|
||||||
--enable-ebcdic --disable-unicode
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting implies --enable-rebuild-charta-
|
to the configure command. This setting implies --enable-rebuild-charta-
|
||||||
bles. You should only use it if you know that you are in an EBCDIC en-
|
bles. You should only use it if you know that you are in an EBCDIC en-
|
||||||
vironment (for example, an IBM mainframe operating system).
|
vironment (for example, an IBM mainframe operating system).
|
||||||
|
|
||||||
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||||
version of the library. Consequently, --enable-unicode and --enable-
|
version of the library. Consequently, --enable-unicode and --enable-
|
||||||
ebcdic are mutually exclusive.
|
ebcdic are mutually exclusive.
|
||||||
|
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||||
is used. In such an environment you should use
|
is used. In such an environment you should use
|
||||||
|
|
||||||
--enable-ebcdic-nl25
|
--enable-ebcdic-nl25
|
||||||
|
|
||||||
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR
|
||||||
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and
|
||||||
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
0x25 is not chosen as LF is made to correspond to the Unicode NEL char-
|
||||||
acter (which, in Unicode, is 0x85).
|
acter (which, in Unicode, is 0x85).
|
||||||
|
|
||||||
|
@ -4232,47 +4227,47 @@ USING EBCDIC CODE
|
||||||
PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
|
PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
|
||||||
|
|
||||||
By default pcre2grep supports the use of callouts with string arguments
|
By default pcre2grep supports the use of callouts with string arguments
|
||||||
within the patterns it is matching. There are two kinds: one that gen-
|
within the patterns it is matching. There are two kinds: one that gen-
|
||||||
erates output using local code, and another that calls an external pro-
|
erates output using local code, and another that calls an external pro-
|
||||||
gram or script. If --disable-pcre2grep-callout-fork is added to the
|
gram or script. If --disable-pcre2grep-callout-fork is added to the
|
||||||
configure command, only the first kind of callout is supported; if
|
configure command, only the first kind of callout is supported; if
|
||||||
--disable-pcre2grep-callout is used, all callouts are completely ig-
|
--disable-pcre2grep-callout is used, all callouts are completely ig-
|
||||||
nored. For more details of pcre2grep callouts, see the pcre2grep docu-
|
nored. For more details of pcre2grep callouts, see the pcre2grep docu-
|
||||||
mentation.
|
mentation.
|
||||||
|
|
||||||
|
|
||||||
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
|
||||||
|
|
||||||
By default, pcre2grep reads all files as plain text. You can build it
|
By default, pcre2grep reads all files as plain text. You can build it
|
||||||
so that it recognizes files whose names end in .gz or .bz2, and reads
|
so that it recognizes files whose names end in .gz or .bz2, and reads
|
||||||
them with libz or libbz2, respectively, by adding one or both of
|
them with libz or libbz2, respectively, by adding one or both of
|
||||||
|
|
||||||
--enable-pcre2grep-libz
|
--enable-pcre2grep-libz
|
||||||
--enable-pcre2grep-libbz2
|
--enable-pcre2grep-libbz2
|
||||||
|
|
||||||
to the configure command. These options naturally require that the rel-
|
to the configure command. These options naturally require that the rel-
|
||||||
evant libraries are installed on your system. Configuration will fail
|
evant libraries are installed on your system. Configuration will fail
|
||||||
if they are not.
|
if they are not.
|
||||||
|
|
||||||
|
|
||||||
PCRE2GREP BUFFER SIZE
|
PCRE2GREP BUFFER SIZE
|
||||||
|
|
||||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||||
scanning, in order to be able to output "before" and "after" lines when
|
scanning, in order to be able to output "before" and "after" lines when
|
||||||
it finds a match. The default starting size of the buffer is 20KiB. The
|
it finds a match. The default starting size of the buffer is 20KiB. The
|
||||||
buffer itself is three times this size, but because of the way it is
|
buffer itself is three times this size, but because of the way it is
|
||||||
used for holding "before" lines, the longest line that is guaranteed to
|
used for holding "before" lines, the longest line that is guaranteed to
|
||||||
be processable is the notional buffer size. If a longer line is encoun-
|
be processable is the notional buffer size. If a longer line is encoun-
|
||||||
tered, pcre2grep automatically expands the buffer, up to a specified
|
tered, pcre2grep automatically expands the buffer, up to a specified
|
||||||
maximum size, whose default is 1MiB or the starting size, whichever is
|
maximum size, whose default is 1MiB or the starting size, whichever is
|
||||||
the larger. You can change the default parameter values by adding, for
|
the larger. You can change the default parameter values by adding, for
|
||||||
example,
|
example,
|
||||||
|
|
||||||
--with-pcre2grep-bufsize=51200
|
--with-pcre2grep-bufsize=51200
|
||||||
--with-pcre2grep-max-bufsize=2097152
|
--with-pcre2grep-max-bufsize=2097152
|
||||||
|
|
||||||
to the configure command. The caller of pcre2grep can override these
|
to the configure command. The caller of pcre2grep can override these
|
||||||
values by using --buffer-size and --max-buffer-size on the command
|
values by using --buffer-size and --max-buffer-size on the command
|
||||||
line.
|
line.
|
||||||
|
|
||||||
|
|
||||||
|
@ -4283,26 +4278,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
||||||
--enable-pcre2test-libreadline
|
--enable-pcre2test-libreadline
|
||||||
--enable-pcre2test-libedit
|
--enable-pcre2test-libedit
|
||||||
|
|
||||||
to the configure command, pcre2test is linked with the libreadline or-
|
to the configure command, pcre2test is linked with the libreadline or-
|
||||||
libedit library, respectively, and when its input is from a terminal,
|
libedit library, respectively, and when its input is from a terminal,
|
||||||
it reads it using the readline() function. This provides line-editing
|
it reads it using the readline() function. This provides line-editing
|
||||||
and history facilities. Note that libreadline is GPL-licensed, so if
|
and history facilities. Note that libreadline is GPL-licensed, so if
|
||||||
you distribute a binary of pcre2test linked in this way, there may be
|
you distribute a binary of pcre2test linked in this way, there may be
|
||||||
licensing issues. These can be avoided by linking instead with libedit,
|
licensing issues. These can be avoided by linking instead with libedit,
|
||||||
which has a BSD licence.
|
which has a BSD licence.
|
||||||
|
|
||||||
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
Setting --enable-pcre2test-libreadline causes the -lreadline option to
|
||||||
be added to the pcre2test build. In many operating environments with a
|
be added to the pcre2test build. In many operating environments with a
|
||||||
system-installed readline library this is sufficient. However, in some
|
system-installed readline library this is sufficient. However, in some
|
||||||
environments (e.g. if an unmodified distribution version of readline is
|
environments (e.g. if an unmodified distribution version of readline is
|
||||||
in use), some extra configuration may be necessary. The INSTALL file
|
in use), some extra configuration may be necessary. The INSTALL file
|
||||||
for libreadline says this:
|
for libreadline says this:
|
||||||
|
|
||||||
"Readline uses the termcap functions, but does not link with
|
"Readline uses the termcap functions, but does not link with
|
||||||
the termcap or curses library itself, allowing applications
|
the termcap or curses library itself, allowing applications
|
||||||
which link with readline the to choose an appropriate library."
|
which link with readline the to choose an appropriate library."
|
||||||
|
|
||||||
If your environment has not been set up so that an appropriate library
|
If your environment has not been set up so that an appropriate library
|
||||||
is automatically included, you may need to add something like
|
is automatically included, you may need to add something like
|
||||||
|
|
||||||
LIBS="-lncurses"
|
LIBS="-lncurses"
|
||||||
|
@ -4316,7 +4311,7 @@ INCLUDING DEBUGGING CODE
|
||||||
|
|
||||||
--enable-debug
|
--enable-debug
|
||||||
|
|
||||||
to the configure command, additional debugging code is included in the
|
to the configure command, additional debugging code is included in the
|
||||||
build. This feature is intended for use by the PCRE2 maintainers.
|
build. This feature is intended for use by the PCRE2 maintainers.
|
||||||
|
|
||||||
|
|
||||||
|
@ -4326,14 +4321,14 @@ DEBUGGING WITH VALGRIND SUPPORT
|
||||||
|
|
||||||
--enable-valgrind
|
--enable-valgrind
|
||||||
|
|
||||||
to the configure command, PCRE2 will use valgrind annotations to mark
|
to the configure command, PCRE2 will use valgrind annotations to mark
|
||||||
certain memory regions as unaddressable. This allows it to detect in-
|
certain memory regions as unaddressable. This allows it to detect in-
|
||||||
valid memory accesses, and is mostly useful for debugging PCRE2 itself.
|
valid memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||||
|
|
||||||
|
|
||||||
CODE COVERAGE REPORTING
|
CODE COVERAGE REPORTING
|
||||||
|
|
||||||
If your C compiler is gcc, you can build a version of PCRE2 that can
|
If your C compiler is gcc, you can build a version of PCRE2 that can
|
||||||
generate a code coverage report for its test suite. To enable this, you
|
generate a code coverage report for its test suite. To enable this, you
|
||||||
must install lcov version 1.6 or above. Then specify
|
must install lcov version 1.6 or above. Then specify
|
||||||
|
|
||||||
|
@ -4342,20 +4337,20 @@ CODE COVERAGE REPORTING
|
||||||
to the configure command and build PCRE2 in the usual way.
|
to the configure command and build PCRE2 in the usual way.
|
||||||
|
|
||||||
Note that using ccache (a caching C compiler) is incompatible with code
|
Note that using ccache (a caching C compiler) is incompatible with code
|
||||||
coverage reporting. If you have configured ccache to run automatically
|
coverage reporting. If you have configured ccache to run automatically
|
||||||
on your system, you must set the environment variable
|
on your system, you must set the environment variable
|
||||||
|
|
||||||
CCACHE_DISABLE=1
|
CCACHE_DISABLE=1
|
||||||
|
|
||||||
before running make to build PCRE2, so that ccache is not used.
|
before running make to build PCRE2, so that ccache is not used.
|
||||||
|
|
||||||
When --enable-coverage is used, the following additional targets are
|
When --enable-coverage is used, the following additional targets are
|
||||||
added to the Makefile:
|
added to the Makefile:
|
||||||
|
|
||||||
make coverage
|
make coverage
|
||||||
|
|
||||||
This creates a fresh coverage report for the PCRE2 test suite. It is
|
This creates a fresh coverage report for the PCRE2 test suite. It is
|
||||||
equivalent to running "make coverage-reset", "make coverage-baseline",
|
equivalent to running "make coverage-reset", "make coverage-baseline",
|
||||||
"make check", and then "make coverage-report".
|
"make check", and then "make coverage-report".
|
||||||
|
|
||||||
make coverage-reset
|
make coverage-reset
|
||||||
|
@ -4372,73 +4367,73 @@ CODE COVERAGE REPORTING
|
||||||
|
|
||||||
make coverage-clean-report
|
make coverage-clean-report
|
||||||
|
|
||||||
This removes the generated coverage report without cleaning the cover-
|
This removes the generated coverage report without cleaning the cover-
|
||||||
age data itself.
|
age data itself.
|
||||||
|
|
||||||
make coverage-clean-data
|
make coverage-clean-data
|
||||||
|
|
||||||
This removes the captured coverage data without removing the coverage
|
This removes the captured coverage data without removing the coverage
|
||||||
files created at compile time (*.gcno).
|
files created at compile time (*.gcno).
|
||||||
|
|
||||||
make coverage-clean
|
make coverage-clean
|
||||||
|
|
||||||
This cleans all coverage data including the generated coverage report.
|
This cleans all coverage data including the generated coverage report.
|
||||||
For more information about code coverage, see the gcov and lcov docu-
|
For more information about code coverage, see the gcov and lcov docu-
|
||||||
mentation.
|
mentation.
|
||||||
|
|
||||||
|
|
||||||
DISABLING THE Z AND T FORMATTING MODIFIERS
|
DISABLING THE Z AND T FORMATTING MODIFIERS
|
||||||
|
|
||||||
The C99 standard defines formatting modifiers z and t for size_t and
|
The C99 standard defines formatting modifiers z and t for size_t and
|
||||||
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
|
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
|
||||||
in environments other than old versions of Microsoft Visual Studio when
|
in environments other than old versions of Microsoft Visual Studio when
|
||||||
__STDC_VERSION__ is defined and has a value greater than or equal to
|
__STDC_VERSION__ is defined and has a value greater than or equal to
|
||||||
199901L (indicating support for C99). However, there is at least one
|
199901L (indicating support for C99). However, there is at least one
|
||||||
environment that claims to be C99 but does not support these modifiers.
|
environment that claims to be C99 but does not support these modifiers.
|
||||||
If
|
If
|
||||||
|
|
||||||
--disable-percent-zt
|
--disable-percent-zt
|
||||||
|
|
||||||
is specified, no use is made of the z or t modifiers. Instead of %td or
|
is specified, no use is made of the z or t modifiers. Instead of %td or
|
||||||
%zu, a suitable format is used depending on the size of long for the
|
%zu, a suitable format is used depending on the size of long for the
|
||||||
platform.
|
platform.
|
||||||
|
|
||||||
|
|
||||||
SUPPORT FOR FUZZERS
|
SUPPORT FOR FUZZERS
|
||||||
|
|
||||||
There is a special option for use by people who want to run fuzzing
|
There is a special option for use by people who want to run fuzzing
|
||||||
tests on PCRE2:
|
tests on PCRE2:
|
||||||
|
|
||||||
--enable-fuzz-support
|
--enable-fuzz-support
|
||||||
|
|
||||||
At present this applies only to the 8-bit library. If set, it causes an
|
At present this applies only to the 8-bit library. If set, it causes an
|
||||||
extra library called libpcre2-fuzzsupport.a to be built, but not in-
|
extra library called libpcre2-fuzzsupport.a to be built, but not in-
|
||||||
stalled. This contains a single function called LLVMFuzzerTestOneIn-
|
stalled. This contains a single function called LLVMFuzzerTestOneIn-
|
||||||
put() whose arguments are a pointer to a string and the length of the
|
put() whose arguments are a pointer to a string and the length of the
|
||||||
string. When called, this function tries to compile the string as a
|
string. When called, this function tries to compile the string as a
|
||||||
pattern, and if that succeeds, to match it. This is done both with no
|
pattern, and if that succeeds, to match it. This is done both with no
|
||||||
options and with some random option bits that are generated from the
|
options and with some random option bits that are generated from the
|
||||||
string.
|
string.
|
||||||
|
|
||||||
Setting --enable-fuzz-support also causes a binary called pcre2fuz-
|
Setting --enable-fuzz-support also causes a binary called pcre2fuz-
|
||||||
zcheck to be created. This is normally run under valgrind or used when
|
zcheck to be created. This is normally run under valgrind or used when
|
||||||
PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
|
PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
|
||||||
function and outputs information about what it is doing. The input
|
function and outputs information about what it is doing. The input
|
||||||
strings are specified by arguments: if an argument starts with "=" the
|
strings are specified by arguments: if an argument starts with "=" the
|
||||||
rest of it is a literal input string. Otherwise, it is assumed to be a
|
rest of it is a literal input string. Otherwise, it is assumed to be a
|
||||||
file name, and the contents of the file are the test string.
|
file name, and the contents of the file are the test string.
|
||||||
|
|
||||||
|
|
||||||
OBSOLETE OPTION
|
OBSOLETE OPTION
|
||||||
|
|
||||||
In versions of PCRE2 prior to 10.30, there were two ways of handling
|
In versions of PCRE2 prior to 10.30, there were two ways of handling
|
||||||
backtracking in the pcre2_match() function. The default was to use the
|
backtracking in the pcre2_match() function. The default was to use the
|
||||||
system stack, but if
|
system stack, but if
|
||||||
|
|
||||||
--disable-stack-for-recursion
|
--disable-stack-for-recursion
|
||||||
|
|
||||||
was set, memory on the heap was used. From release 10.30 onwards this
|
was set, memory on the heap was used. From release 10.30 onwards this
|
||||||
has changed (the stack is no longer used) and this option now does
|
has changed (the stack is no longer used) and this option now does
|
||||||
nothing except give a warning.
|
nothing except give a warning.
|
||||||
|
|
||||||
|
|
||||||
|
@ -4450,14 +4445,14 @@ SEE ALSO
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
|
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 08 December 2021
|
Last updated: 27 July 2022
|
||||||
Copyright (c) 1997-2021 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -5596,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS
|
||||||
The maximum length of a string argument to a callout is the largest
|
The maximum length of a string argument to a callout is the largest
|
||||||
number a 32-bit unsigned integer can hold.
|
number a 32-bit unsigned integer can hold.
|
||||||
|
|
||||||
|
The maximum amount of heap memory used for matching is controlled by
|
||||||
|
the heap limit, which can be set in a pattern or in a match context.
|
||||||
|
The default is a very large number, effectively unlimited.
|
||||||
|
|
||||||
|
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
|
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 02 February 2019
|
Last updated: 26 July 2022
|
||||||
Copyright (c) 1997-2019 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -9773,152 +9772,169 @@ STACK AND HEAP USAGE AT RUN TIME
|
||||||
sive function calls could use a great deal of stack, and this could
|
sive function calls could use a great deal of stack, and this could
|
||||||
cause problems, but this usage has been eliminated. Backtracking posi-
|
cause problems, but this usage has been eliminated. Backtracking posi-
|
||||||
tions are now explicitly remembered in memory frames controlled by the
|
tions are now explicitly remembered in memory frames controlled by the
|
||||||
code. An initial 20KiB vector of frames is allocated on the system
|
code.
|
||||||
stack (enough for about 100 frames for small patterns), but if this is
|
|
||||||
insufficient, heap memory is used. The amount of heap memory can be
|
|
||||||
limited; if the limit is set to zero, only the initial stack vector is
|
|
||||||
used. Rewriting patterns to be time-efficient, as described below, may
|
|
||||||
also reduce the memory requirements.
|
|
||||||
|
|
||||||
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
The size of each frame depends on the size of pointer variables and the
|
||||||
function calls, but only for processing atomic groups, lookaround as-
|
number of capturing parenthesized groups in the pattern being matched.
|
||||||
|
On a 64-bit system the frame size for a pattern with no captures is 128
|
||||||
|
bytes. For each capturing group the size increases by 16 bytes.
|
||||||
|
|
||||||
|
Until release 10.41, an initial 20KiB frames vector was allocated on
|
||||||
|
the system stack, but this still caused some issues for multi-thread
|
||||||
|
applications where each thread has a very small stack. From release
|
||||||
|
10.41 backtracking memory frames are always held in heap memory. An
|
||||||
|
initial heap allocation is obtained the first time any match data block
|
||||||
|
is passed to pcre2_match(). This is remembered with the match data
|
||||||
|
block and re-used if that block is used for another match. It is freed
|
||||||
|
when the match data block itself is freed.
|
||||||
|
|
||||||
|
The size of the initial block is the larger of 20KiB or ten times the
|
||||||
|
pattern's frame size, unless the heap limit is less than this, in which
|
||||||
|
case the heap limit is used. If the initial block proves to be too
|
||||||
|
small during matching, it is replaced by a larger block, subject to the
|
||||||
|
heap limit. The heap limit is checked only when a new block is to be
|
||||||
|
allocated. Reducing the heap limit between calls to pcre2_match() with
|
||||||
|
the same match data block does not affect the saved block.
|
||||||
|
|
||||||
|
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
||||||
|
function calls, but only for processing atomic groups, lookaround as-
|
||||||
sertions, and recursion within the pattern. The original version of the
|
sertions, and recursion within the pattern. The original version of the
|
||||||
code used to allocate quite large internal workspace vectors on the
|
code used to allocate quite large internal workspace vectors on the
|
||||||
stack, which caused some problems for some patterns in environments
|
stack, which caused some problems for some patterns in environments
|
||||||
with small stacks. From release 10.32 the code for pcre2_dfa_match()
|
with small stacks. From release 10.32 the code for pcre2_dfa_match()
|
||||||
has been re-factored to use heap memory when necessary for internal
|
has been re-factored to use heap memory when necessary for internal
|
||||||
workspace when recursing, though recursive function calls are still
|
workspace when recursing, though recursive function calls are still
|
||||||
used.
|
used.
|
||||||
|
|
||||||
The "match depth" parameter can be used to limit the depth of function
|
The "match depth" parameter can be used to limit the depth of function
|
||||||
recursion, and the "match heap" parameter to limit heap memory in
|
recursion, and the "match heap" parameter to limit heap memory in
|
||||||
pcre2_dfa_match().
|
pcre2_dfa_match().
|
||||||
|
|
||||||
|
|
||||||
PROCESSING TIME
|
PROCESSING TIME
|
||||||
|
|
||||||
Certain items in regular expression patterns are processed more effi-
|
Certain items in regular expression patterns are processed more effi-
|
||||||
ciently than others. It is more efficient to use a character class like
|
ciently than others. It is more efficient to use a character class like
|
||||||
[aeiou] than a set of single-character alternatives such as
|
[aeiou] than a set of single-character alternatives such as
|
||||||
(a|e|i|o|u). In general, the simplest construction that provides the
|
(a|e|i|o|u). In general, the simplest construction that provides the
|
||||||
required behaviour is usually the most efficient. Jeffrey Friedl's book
|
required behaviour is usually the most efficient. Jeffrey Friedl's book
|
||||||
contains a lot of useful general discussion about optimizing regular
|
contains a lot of useful general discussion about optimizing regular
|
||||||
expressions for efficient performance. This document contains a few ob-
|
expressions for efficient performance. This document contains a few ob-
|
||||||
servations about PCRE2.
|
servations about PCRE2.
|
||||||
|
|
||||||
Using Unicode character properties (the \p, \P, and \X escapes) is
|
Using Unicode character properties (the \p, \P, and \X escapes) is
|
||||||
slow, because PCRE2 has to use a multi-stage table lookup whenever it
|
slow, because PCRE2 has to use a multi-stage table lookup whenever it
|
||||||
needs a character's property. If you can find an alternative pattern
|
needs a character's property. If you can find an alternative pattern
|
||||||
that does not use character properties, it will probably be faster.
|
that does not use character properties, it will probably be faster.
|
||||||
|
|
||||||
By default, the escape sequences \b, \d, \s, and \w, and the POSIX
|
By default, the escape sequences \b, \d, \s, and \w, and the POSIX
|
||||||
character classes such as [:alpha:] do not use Unicode properties,
|
character classes such as [:alpha:] do not use Unicode properties,
|
||||||
partly for backwards compatibility, and partly for performance reasons.
|
partly for backwards compatibility, and partly for performance reasons.
|
||||||
However, you can set the PCRE2_UCP option or start the pattern with
|
However, you can set the PCRE2_UCP option or start the pattern with
|
||||||
(*UCP) if you want Unicode character properties to be used. This can
|
(*UCP) if you want Unicode character properties to be used. This can
|
||||||
double the matching time for items such as \d, when matched with
|
double the matching time for items such as \d, when matched with
|
||||||
pcre2_match(); the performance loss is less with a DFA matching func-
|
pcre2_match(); the performance loss is less with a DFA matching func-
|
||||||
tion, and in both cases there is not much difference for \b.
|
tion, and in both cases there is not much difference for \b.
|
||||||
|
|
||||||
When a pattern begins with .* not in atomic parentheses, nor in paren-
|
When a pattern begins with .* not in atomic parentheses, nor in paren-
|
||||||
theses that are the subject of a backreference, and the PCRE2_DOTALL
|
theses that are the subject of a backreference, and the PCRE2_DOTALL
|
||||||
option is set, the pattern is implicitly anchored by PCRE2, since it
|
option is set, the pattern is implicitly anchored by PCRE2, since it
|
||||||
can match only at the start of a subject string. If the pattern has
|
can match only at the start of a subject string. If the pattern has
|
||||||
multiple top-level branches, they must all be anchorable. The optimiza-
|
multiple top-level branches, they must all be anchorable. The optimiza-
|
||||||
tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
|
tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au-
|
||||||
tomatically disabled if the pattern contains (*PRUNE) or (*SKIP).
|
tomatically disabled if the pattern contains (*PRUNE) or (*SKIP).
|
||||||
|
|
||||||
If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be-
|
If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be-
|
||||||
cause the dot metacharacter does not then match a newline, and if the
|
cause the dot metacharacter does not then match a newline, and if the
|
||||||
subject string contains newlines, the pattern may match from the char-
|
subject string contains newlines, the pattern may match from the char-
|
||||||
acter immediately following one of them instead of from the very start.
|
acter immediately following one of them instead of from the very start.
|
||||||
For example, the pattern
|
For example, the pattern
|
||||||
|
|
||||||
.*second
|
.*second
|
||||||
|
|
||||||
matches the subject "first\nand second" (where \n stands for a newline
|
matches the subject "first\nand second" (where \n stands for a newline
|
||||||
character), with the match starting at the seventh character. In order
|
character), with the match starting at the seventh character. In order
|
||||||
to do this, PCRE2 has to retry the match starting after every newline
|
to do this, PCRE2 has to retry the match starting after every newline
|
||||||
in the subject.
|
in the subject.
|
||||||
|
|
||||||
If you are using such a pattern with subject strings that do not con-
|
If you are using such a pattern with subject strings that do not con-
|
||||||
tain newlines, the best performance is obtained by setting
|
tain newlines, the best performance is obtained by setting
|
||||||
PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex-
|
PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex-
|
||||||
plicit anchoring. That saves PCRE2 from having to scan along the sub-
|
plicit anchoring. That saves PCRE2 from having to scan along the sub-
|
||||||
ject looking for a newline to restart at.
|
ject looking for a newline to restart at.
|
||||||
|
|
||||||
Beware of patterns that contain nested indefinite repeats. These can
|
Beware of patterns that contain nested indefinite repeats. These can
|
||||||
take a long time to run when applied to a string that does not match.
|
take a long time to run when applied to a string that does not match.
|
||||||
Consider the pattern fragment
|
Consider the pattern fragment
|
||||||
|
|
||||||
^(a+)*
|
^(a+)*
|
||||||
|
|
||||||
This can match "aaaa" in 16 different ways, and this number increases
|
This can match "aaaa" in 16 different ways, and this number increases
|
||||||
very rapidly as the string gets longer. (The * repeat can match 0, 1,
|
very rapidly as the string gets longer. (The * repeat can match 0, 1,
|
||||||
2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
|
2, 3, or 4 times, and for each of those cases other than 0 or 4, the +
|
||||||
repeats can match different numbers of times.) When the remainder of
|
repeats can match different numbers of times.) When the remainder of
|
||||||
the pattern is such that the entire match is going to fail, PCRE2 has
|
the pattern is such that the entire match is going to fail, PCRE2 has
|
||||||
in principle to try every possible variation, and this can take an ex-
|
in principle to try every possible variation, and this can take an ex-
|
||||||
tremely long time, even for relatively short strings.
|
tremely long time, even for relatively short strings.
|
||||||
|
|
||||||
An optimization catches some of the more simple cases such as
|
An optimization catches some of the more simple cases such as
|
||||||
|
|
||||||
(a+)*b
|
(a+)*b
|
||||||
|
|
||||||
where a literal character follows. Before embarking on the standard
|
where a literal character follows. Before embarking on the standard
|
||||||
matching procedure, PCRE2 checks that there is a "b" later in the sub-
|
matching procedure, PCRE2 checks that there is a "b" later in the sub-
|
||||||
ject string, and if there is not, it fails the match immediately. How-
|
ject string, and if there is not, it fails the match immediately. How-
|
||||||
ever, when there is no following literal this optimization cannot be
|
ever, when there is no following literal this optimization cannot be
|
||||||
used. You can see the difference by comparing the behaviour of
|
used. You can see the difference by comparing the behaviour of
|
||||||
|
|
||||||
(a+)*\d
|
(a+)*\d
|
||||||
|
|
||||||
with the pattern above. The earlier pattern, (a+)*b, fails almost instantly
|
with the pattern above. The earlier pattern, (a+)*b, fails almost instantly
|
||||||
when applied to a whole line of "a" characters, whereas (a+)*\d
|
when applied to a whole line of "a" characters, whereas (a+)*\d
|
||||||
takes an appreciable time with strings longer than about 20 characters.
|
takes an appreciable time with strings longer than about 20 characters.
|
||||||
|
|
||||||
In many cases, the solution to this kind of performance issue is to use
|
In many cases, the solution to this kind of performance issue is to use
|
||||||
an atomic group or a possessive quantifier. This can often reduce mem-
|
an atomic group or a possessive quantifier. This can often reduce mem-
|
||||||
ory requirements as well. As another example, consider this pattern:
|
ory requirements as well. As another example, consider this pattern:
|
||||||
|
|
||||||
([^<]|<(?!inet))+
|
([^<]|<(?!inet))+
|
||||||
|
|
||||||
It matches from wherever it starts until it encounters "<inet" or the
|
It matches from wherever it starts until it encounters "<inet" or the
|
||||||
end of the data, and is the kind of pattern that might be used when
|
end of the data, and is the kind of pattern that might be used when
|
||||||
processing an XML file. Each iteration of the outer parentheses matches
|
processing an XML file. Each iteration of the outer parentheses matches
|
||||||
either one character that is not "<" or a "<" that is not followed by
|
either one character that is not "<" or a "<" that is not followed by
|
||||||
"inet". However, each time a parenthesis is processed, a backtracking
|
"inet". However, each time a parenthesis is processed, a backtracking
|
||||||
position is passed, so this formulation uses a memory frame for each
|
position is passed, so this formulation uses a memory frame for each
|
||||||
matched character. For a long string, a lot of memory is required. Con-
|
matched character. For a long string, a lot of memory is required. Con-
|
||||||
sider now this rewritten pattern, which matches exactly the same
|
sider now this rewritten pattern, which matches exactly the same
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
([^<]++|<(?!inet))+
|
([^<]++|<(?!inet))+
|
||||||
|
|
||||||
This runs much faster, because sequences of characters that do not con-
|
This runs much faster, because sequences of characters that do not con-
|
||||||
tain "<" are "swallowed" in one item inside the parentheses, and a pos-
|
tain "<" are "swallowed" in one item inside the parentheses, and a pos-
|
||||||
sessive quantifier is used to stop any backtracking into the runs of
|
sessive quantifier is used to stop any backtracking into the runs of
|
||||||
non-"<" characters. This version also uses a lot less memory because
|
non-"<" characters. This version also uses a lot less memory because
|
||||||
entry to a new set of parentheses happens only when a "<" character
|
entry to a new set of parentheses happens only when a "<" character
|
||||||
that is not followed by "inet" is encountered (and we assume this is
|
that is not followed by "inet" is encountered (and we assume this is
|
||||||
relatively rare).
|
relatively rare).
|
||||||
|
|
||||||
This example shows that one way of optimizing performance when matching
|
This example shows that one way of optimizing performance when matching
|
||||||
long subject strings is to write repeated parenthesized subpatterns to
|
long subject strings is to write repeated parenthesized subpatterns to
|
||||||
match more than one character whenever possible.
|
match more than one character whenever possible.
|
||||||
|
|
||||||
SETTING RESOURCE LIMITS
|
SETTING RESOURCE LIMITS
|
||||||
|
|
||||||
You can set limits on the amount of processing that takes place when
|
You can set limits on the amount of processing that takes place when
|
||||||
matching, and on the amount of heap memory that is used. The default
|
matching, and on the amount of heap memory that is used. The default
|
||||||
values of the limits are very large, and unlikely ever to operate. They
|
values of the limits are very large, and unlikely ever to operate. They
|
||||||
can be changed when PCRE2 is built, and they can also be set when
|
can be changed when PCRE2 is built, and they can also be set when
|
||||||
pcre2_match() or pcre2_dfa_match() is called. For details of these in-
|
pcre2_match() or pcre2_dfa_match() is called. For details of these in-
|
||||||
terfaces, see the pcre2build documentation and the section entitled
|
terfaces, see the pcre2build documentation and the section entitled
|
||||||
"The match context" in the pcre2api documentation.
|
"The match context" in the pcre2api documentation.
|
||||||
|
|
||||||
The pcre2test test program has a modifier called "find_limits" which,
|
The pcre2test test program has a modifier called "find_limits" which,
|
||||||
if applied to a subject line, causes it to find the smallest limits
|
if applied to a subject line, causes it to find the smallest limits
|
||||||
that allow a pattern to match. This is done by repeatedly matching with
|
that allow a pattern to match. This is done by repeatedly matching with
|
||||||
different limits.
|
different limits.
|
||||||
|
|
||||||
|
@ -9926,14 +9942,14 @@ PROCESSING TIME
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
University Computing Service
|
Retired from University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
|
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 03 February 2019
|
Last updated: 27 July 2022
|
||||||
Copyright (c) 1997-2019 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2GREP 1 "27 July 2022" "PCRE2 10.41"
|
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -43,13 +43,15 @@ For example:
|
||||||
.sp
|
.sp
|
||||||
pcre2grep some-pattern file1 - file3
|
pcre2grep some-pattern file1 - file3
|
||||||
.sp
|
.sp
|
||||||
Input files are searched line by line. By default, each line that matches a
|
By default, input files are searched line by line. Each line that matches a
|
||||||
pattern is copied to the standard output, and if there is more than one file,
|
pattern is copied to the standard output, and if there is more than one file,
|
||||||
the file name is output at the start of each line, followed by a colon.
|
the file name is output at the start of each line, followed by a colon.
|
||||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||||
span line boundaries. What defines a line boundary is controlled by the
|
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||||
\fB-N\fP (\fB--newline\fP) option.
|
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||||
|
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||||
|
terminator to a zero byte.
|
||||||
.P
|
.P
|
||||||
The amount of memory used for buffering files that are being scanned is
|
The amount of memory used for buffering files that are being scanned is
|
||||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||||
|
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
||||||
lines are output if the next match or the end of the file is reached, or if the
|
lines are output if the next match or the end of the file is reached, or if the
|
||||||
processing buffer size has been set too small. If file names and/or line
|
processing buffer size has been set too small. If file names and/or line
|
||||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||||
context lines. A line containing "--" is output between each group of lines,
|
context lines (the \fB-Z\fP option can be used to change the file name
|
||||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
terminator to a zero byte). A line containing "--" is output between each group
|
||||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
of lines, unless they are in fact contiguous in the input file. The value of
|
||||||
|
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||||
|
\fB-A\fP is ignored.
|
||||||
.TP
|
.TP
|
||||||
\fB-a\fP, \fB--text\fP
|
\fB-a\fP, \fB--text\fP
|
||||||
Treat binary files as text. This is equivalent to
|
Treat binary files as text. This is equivalent to
|
||||||
|
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
|
||||||
lines are output if the previous match or the start of the file is within
|
lines are output if the previous match or the start of the file is within
|
||||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||||
file names and/or line numbers are being output, a hyphen separator is used
|
file names and/or line numbers are being output, a hyphen separator is used
|
||||||
instead of a colon for the context lines. A line containing "--" is output
|
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||||
between each group of lines, unless they are in fact contiguous in the input
|
change the file name terminator to a zero byte). A line containing "--" is
|
||||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
output between each group of lines, unless they are in fact contiguous in the
|
||||||
|
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||||
.TP
|
.TP
|
||||||
\fB--binary-files=\fP\fIword\fP
|
\fB--binary-files=\fP\fIword\fP
|
||||||
|
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
||||||
.TP
|
.TP
|
||||||
\fB-H\fP, \fB--with-filename\fP
|
\fB-H\fP, \fB--with-filename\fP
|
||||||
Force the inclusion of the file name at the start of output lines when
|
Force the inclusion of the file name at the start of output lines when
|
||||||
searching a single file. By default, the file name is not shown in this case.
|
searching a single file. The file name is not normally shown in this case.
|
||||||
For matching lines, the file name is followed by a colon; for context lines, a
|
By default, for matching lines, the file name is followed by a colon; for
|
||||||
hyphen separator is used. If a line number is also being output, it follows the
|
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
change the terminator to a zero byte. If a line number is also being output,
|
||||||
line, only the first is preceded by the file name. This option overrides any
|
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
more than one line, only the first is preceded by the file name. This option
|
||||||
|
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||||
.TP
|
.TP
|
||||||
\fB-h\fP, \fB--no-filename\fP
|
\fB-h\fP, \fB--no-filename\fP
|
||||||
Suppress the output file names when searching multiple files. By default,
|
Suppress the output of file names when searching multiple files. File names are
|
||||||
file names are shown when multiple files are searched. For matching lines, the
|
normally shown when multiple files are searched. By default, for matching
|
||||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||||
If a line number is also being output, it follows the file name. This option
|
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
a zero byte. If a line number is also being output, it follows the file name.
|
||||||
|
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||||
.TP
|
.TP
|
||||||
\fB--heap-limit\fP=\fInumber\fP
|
\fB--heap-limit\fP=\fInumber\fP
|
||||||
See \fB--match-limit\fP below.
|
See \fB--match-limit\fP below.
|
||||||
|
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
||||||
\fB-L\fP, \fB--files-without-match\fP
|
\fB-L\fP, \fB--files-without-match\fP
|
||||||
Instead of outputting lines from the files, just output the names of the files
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
that do not contain any lines that would have been output. Each file name is
|
that do not contain any lines that would have been output. Each file name is
|
||||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||||
\fB-h\fP, or \fB-l\fP options.
|
they are separated by zero bytes instead of newlines. This option overrides any
|
||||||
|
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||||
.TP
|
.TP
|
||||||
\fB-l\fP, \fB--files-with-matches\fP
|
\fB-l\fP, \fB--files-with-matches\fP
|
||||||
Instead of outputting lines from the files, just output the names of the files
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
containing lines that would have been output. Each file name is output once, on
|
containing lines that would have been output. Each file name is output once, on
|
||||||
a separate line. Searching normally stops as soon as a matching line is found
|
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||||
continues in order to obtain the correct count, and those files that have at
|
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||||
least one match are listed along with their counts. Using this option with
|
matching continues in order to obtain the correct count, and those files that
|
||||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
have at least one match are listed along with their counts. Using this option
|
||||||
|
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||||
\fB-h\fP, or \fB-L\fP options.
|
\fB-h\fP, or \fB-L\fP options.
|
||||||
.TP
|
.TP
|
||||||
|
@ -729,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
||||||
pattern and ")$" at the end. This option applies only to the patterns that are
|
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||||
matched against the contents of files; it does not apply to patterns specified
|
matched against the contents of files; it does not apply to patterns specified
|
||||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||||
|
.TP
|
||||||
|
\fB-Z\fP, \fB--null\fP
|
||||||
|
Terminate file names in the regular output with a zero byte (the NUL
|
||||||
|
character) instead of what would normally appear. This is useful when file
|
||||||
|
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||||
|
option does not apply to file names in error messages.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "ENVIRONMENT VARIABLES"
|
.SH "ENVIRONMENT VARIABLES"
|
||||||
|
@ -957,6 +972,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 27 July 2022
|
Last updated: 30 July 2022
|
||||||
Copyright (c) 1997-2022 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1111,7 +1111,8 @@ SUBJECT MODIFIERS
|
||||||
copy=<number or name> copy captured substring
|
copy=<number or name> copy captured substring
|
||||||
depth_limit=<n> set a depth limit
|
depth_limit=<n> set a depth limit
|
||||||
dfa use pcre2_dfa_match()
|
dfa use pcre2_dfa_match()
|
||||||
find_limits find match and depth limits
|
find_limits find heap, match and depth limits
|
||||||
|
find_limits_noheap find match and depth limits
|
||||||
get=<number or name> extract captured substring
|
get=<number or name> extract captured substring
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
|
@ -1411,7 +1412,7 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||||
priate limits in the match context. These values are ignored when the
|
priate limits in the match context. These values are ignored when the
|
||||||
find_limits modifier is specified.
|
find_limits or find_limits_noheap modifier is specified.
|
||||||
|
|
||||||
Finding minimum limits
|
Finding minimum limits
|
||||||
|
|
||||||
|
@ -1419,8 +1420,12 @@ SUBJECT MODIFIERS
|
||||||
calls the relevant matching function several times, setting different
|
calls the relevant matching function several times, setting different
|
||||||
values in the match context via pcre2_set_heap_limit(),
|
values in the match context via pcre2_set_heap_limit(),
|
||||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||||
minimum values for each parameter that allows the match to complete
|
smallest value for each parameter that allows the match to complete
|
||||||
without error. If JIT is being used, only the match limit is relevant.
|
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||||
|
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||||
|
is used in the standard tests, because the minimum heap limit varies
|
||||||
|
between systems. If JIT is being used, only the match limit is rele-
|
||||||
|
vant, and the other two are automatically omitted.
|
||||||
|
|
||||||
When using this modifier, the pattern should not contain any limit set-
|
When using this modifier, the pattern should not contain any limit set-
|
||||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||||
|
@ -1446,9 +1451,7 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
For both kinds of matching, the heap_limit number, which is in
|
For both kinds of matching, the heap_limit number, which is in
|
||||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||||
for matching. A value of zero disables the use of any heap memory; many
|
for matching.
|
||||||
simple pattern matches can be done without using the heap, so zero is
|
|
||||||
not an unreasonable setting.
|
|
||||||
|
|
||||||
Showing MARK names
|
Showing MARK names
|
||||||
|
|
||||||
|
@ -1463,13 +1466,11 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||||
ory allocation and freeing calls that occur during a call to
|
ory allocation and freeing calls that occur during a call to
|
||||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||||
quires a bigger vector than the default for remembering backtracking
|
used only when a match requires more internal workspace than the de-
|
||||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
fault allocation on the stack, so in many cases there will be no out-
|
||||||
In many cases there will be no heap memory used and therefore no addi-
|
put. No heap memory is allocated during matching with JIT. For this
|
||||||
tional output. No heap memory is allocated during matching with JIT, so
|
modifier to work, the null_context modifier must not be set on both the
|
||||||
in that case the memory modifier never has any effect. For this modi-
|
|
||||||
fier to work, the null_context modifier must not be set on both the
|
|
||||||
pattern and the subject, though it can be set on one or the other.
|
pattern and the subject, though it can be set on one or the other.
|
||||||
|
|
||||||
Setting a starting offset
|
Setting a starting offset
|
||||||
|
@ -1518,45 +1519,46 @@ SUBJECT MODIFIERS
|
||||||
null_context modifier is set, however, NULL is passed. This is for
|
null_context modifier is set, however, NULL is passed. This is for
|
||||||
testing that the matching and substitution functions behave correctly
|
testing that the matching and substitution functions behave correctly
|
||||||
in this case (they use default values). This modifier cannot be used
|
in this case (they use default values). This modifier cannot be used
|
||||||
with the find_limits or substitute_callout modifiers.
|
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||||
|
fiers.
|
||||||
|
|
||||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||||
ment modifier is set, the subject or replacement string pointers are
|
ment modifier is set, the subject or replacement string pointers are
|
||||||
passed as NULL, respectively, to the relevant functions.
|
passed as NULL, respectively, to the relevant functions.
|
||||||
|
|
||||||
|
|
||||||
THE ALTERNATIVE MATCHING FUNCTION
|
THE ALTERNATIVE MATCHING FUNCTION
|
||||||
|
|
||||||
By default, pcre2test uses the standard PCRE2 matching function,
|
By default, pcre2test uses the standard PCRE2 matching function,
|
||||||
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
||||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||||
ferent way, and has some restrictions. The differences between the two
|
ferent way, and has some restrictions. The differences between the two
|
||||||
functions are described in the pcre2matching documentation.
|
functions are described in the pcre2matching documentation.
|
||||||
|
|
||||||
If the dfa modifier is set, the alternative matching function is used.
|
If the dfa modifier is set, the alternative matching function is used.
|
||||||
This function finds all possible matches at a given point in the sub-
|
This function finds all possible matches at a given point in the sub-
|
||||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||||
after the first match is found. This is always the shortest possible
|
after the first match is found. This is always the shortest possible
|
||||||
match.
|
match.
|
||||||
|
|
||||||
|
|
||||||
DEFAULT OUTPUT FROM pcre2test
|
DEFAULT OUTPUT FROM pcre2test
|
||||||
|
|
||||||
This section describes the output when the normal matching function,
|
This section describes the output when the normal matching function,
|
||||||
pcre2_match(), is being used.
|
pcre2_match(), is being used.
|
||||||
|
|
||||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||||
strings, starting with number 0 for the string that matched the whole
|
strings, starting with number 0 for the string that matched the whole
|
||||||
pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER-
|
pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER-
|
||||||
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
ROR_NOMATCH, or "Partial match:" followed by the partially matching
|
||||||
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is
|
||||||
the entire substring that was inspected during the partial match; it
|
the entire substring that was inspected during the partial match; it
|
||||||
may include characters before the actual match start if a lookbehind
|
may include characters before the actual match start if a lookbehind
|
||||||
assertion, \K, \b, or \B was involved.)
|
assertion, \K, \b, or \B was involved.)
|
||||||
|
|
||||||
For any other return, pcre2test outputs the PCRE2 negative error number
|
For any other return, pcre2test outputs the PCRE2 negative error number
|
||||||
and a short descriptive phrase. If the error is a failed UTF string
|
and a short descriptive phrase. If the error is a failed UTF string
|
||||||
check, the code unit offset of the start of the failing character is
|
check, the code unit offset of the start of the failing character is
|
||||||
also output. Here is an example of an interactive pcre2test run.
|
also output. Here is an example of an interactive pcre2test run.
|
||||||
|
|
||||||
$ pcre2test
|
$ pcre2test
|
||||||
|
@ -1572,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
Unset capturing substrings that are not followed by one that is set are
|
Unset capturing substrings that are not followed by one that is set are
|
||||||
not shown by pcre2test unless the allcaptures modifier is specified. In
|
not shown by pcre2test unless the allcaptures modifier is specified. In
|
||||||
the following example, there are two capturing substrings, but when the
|
the following example, there are two capturing substrings, but when the
|
||||||
first data line is matched, the second, unset substring is not shown.
|
first data line is matched, the second, unset substring is not shown.
|
||||||
An "internal" unset substring is shown as "<unset>", as for the second
|
An "internal" unset substring is shown as "<unset>", as for the second
|
||||||
data line.
|
data line.
|
||||||
|
|
||||||
re> /(a)|(b)/
|
re> /(a)|(b)/
|
||||||
|
@ -1585,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
1: <unset>
|
1: <unset>
|
||||||
2: b
|
2: b
|
||||||
|
|
||||||
If the strings contain any non-printing characters, they are output as
|
If the strings contain any non-printing characters, they are output as
|
||||||
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
\xhh escapes if the value is less than 256 and UTF mode is not set.
|
||||||
Otherwise they are output as \x{hh...} escapes. See below for the defi-
|
Otherwise they are output as \x{hh...} escapes. See below for the defi-
|
||||||
nition of non-printing characters. If the aftertext modifier is set,
|
nition of non-printing characters. If the aftertext modifier is set,
|
||||||
the output for substring 0 is followed by the rest of the subject
|
the output for substring 0 is followed by the rest of the subject
|
||||||
string, identified by "0+" like this:
|
string, identified by "0+" like this:
|
||||||
|
|
||||||
re> /cat/aftertext
|
re> /cat/aftertext
|
||||||
|
@ -1609,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
0: ipp
|
0: ipp
|
||||||
1: pp
|
1: pp
|
||||||
|
|
||||||
"No match" is output only if the first match attempt fails. Here is an
|
"No match" is output only if the first match attempt fails. Here is an
|
||||||
example of a failure message (the offset 4 that is specified by the
|
example of a failure message (the offset 4 that is specified by the
|
||||||
offset modifier is past the end of the subject string):
|
offset modifier is past the end of the subject string):
|
||||||
|
|
||||||
re> /xyz/
|
re> /xyz/
|
||||||
|
@ -1618,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
Error -24 (bad offset value)
|
Error -24 (bad offset value)
|
||||||
|
|
||||||
Note that whereas patterns can be continued over several lines (a plain
|
Note that whereas patterns can be continued over several lines (a plain
|
||||||
">" prompt is used for continuations), subject lines may not. However
|
">" prompt is used for continuations), subject lines may not. However
|
||||||
newlines can be included in a subject by means of the \n escape (or \r,
|
newlines can be included in a subject by means of the \n escape (or \r,
|
||||||
\r\n, etc., depending on the newline sequence setting).
|
\r\n, etc., depending on the newline sequence setting).
|
||||||
|
|
||||||
|
@ -1626,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||||
|
|
||||||
When the alternative matching function, pcre2_dfa_match(), is used, the
|
When the alternative matching function, pcre2_dfa_match(), is used, the
|
||||||
output consists of a list of all the matches that start at the first
|
output consists of a list of all the matches that start at the first
|
||||||
point in the subject where there is at least one match. For example:
|
point in the subject where there is at least one match. For example:
|
||||||
|
|
||||||
re> /(tang|tangerine|tan)/
|
re> /(tang|tangerine|tan)/
|
||||||
|
@ -1635,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||||
1: tang
|
1: tang
|
||||||
2: tan
|
2: tan
|
||||||
|
|
||||||
Using the normal matching function on this data finds only "tang". The
|
Using the normal matching function on this data finds only "tang". The
|
||||||
longest matching string is always given first (and numbered zero). Af-
|
longest matching string is always given first (and numbered zero). Af-
|
||||||
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol-
|
||||||
lowed by the partially matching substring. Note that this is the entire
|
lowed by the partially matching substring. Note that this is the entire
|
||||||
substring that was inspected during the partial match; it may include
|
substring that was inspected during the partial match; it may include
|
||||||
characters before the actual match start if a lookbehind assertion, \b,
|
characters before the actual match start if a lookbehind assertion, \b,
|
||||||
or \B was involved. (\K is not supported for DFA matching.)
|
or \B was involved. (\K is not supported for DFA matching.)
|
||||||
|
|
||||||
|
@ -1655,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||||
1: tan
|
1: tan
|
||||||
0: tan
|
0: tan
|
||||||
|
|
||||||
The alternative matching function does not support substring capture,
|
The alternative matching function does not support substring capture,
|
||||||
so the modifiers that are concerned with captured substrings are not
|
so the modifiers that are concerned with captured substrings are not
|
||||||
relevant.
|
relevant.
|
||||||
|
|
||||||
|
|
||||||
RESTARTING AFTER A PARTIAL MATCH
|
RESTARTING AFTER A PARTIAL MATCH
|
||||||
|
|
||||||
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
When the alternative matching function has given the PCRE2_ERROR_PAR-
|
||||||
TIAL return, indicating that the subject partially matched the pattern,
|
TIAL return, indicating that the subject partially matched the pattern,
|
||||||
you can restart the match with additional subject data by means of the
|
you can restart the match with additional subject data by means of the
|
||||||
dfa_restart modifier. For example:
|
dfa_restart modifier. For example:
|
||||||
|
|
||||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||||
|
@ -1673,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH
|
||||||
data> n05\=dfa,dfa_restart
|
data> n05\=dfa,dfa_restart
|
||||||
0: n05
|
0: n05
|
||||||
|
|
||||||
For further information about partial matching, see the pcre2partial
|
For further information about partial matching, see the pcre2partial
|
||||||
documentation.
|
documentation.
|
||||||
|
|
||||||
|
|
||||||
CALLOUTS
|
CALLOUTS
|
||||||
|
|
||||||
If the pattern contains any callout requests, pcre2test's callout func-
|
If the pattern contains any callout requests, pcre2test's callout func-
|
||||||
tion is called during matching unless callout_none is specified. This
|
tion is called during matching unless callout_none is specified. This
|
||||||
works with both matching functions, and with JIT, though there are some
|
works with both matching functions, and with JIT, though there are some
|
||||||
differences in behaviour. The output for callouts with numerical argu-
|
differences in behaviour. The output for callouts with numerical argu-
|
||||||
ments and those with string arguments is slightly different.
|
ments and those with string arguments is slightly different.
|
||||||
|
|
||||||
Callouts with numerical arguments
|
Callouts with numerical arguments
|
||||||
|
|
||||||
By default, the callout function displays the callout number, the start
|
By default, the callout function displays the callout number, the start
|
||||||
and current positions in the subject text at the callout time, and the
|
and current positions in the subject text at the callout time, and the
|
||||||
next pattern item to be tested. For example:
|
next pattern item to be tested. For example:
|
||||||
|
|
||||||
--->pqrabcdef
|
--->pqrabcdef
|
||||||
0 ^ ^ \d
|
0 ^ ^ \d
|
||||||
|
|
||||||
This output indicates that callout number 0 occurred for a match at-
|
This output indicates that callout number 0 occurred for a match at-
|
||||||
tempt starting at the fourth character of the subject string, when the
|
tempt starting at the fourth character of the subject string, when the
|
||||||
pointer was at the seventh character, and when the next pattern item
|
pointer was at the seventh character, and when the next pattern item
|
||||||
was \d. Just one circumflex is output if the start and current posi-
|
was \d. Just one circumflex is output if the start and current posi-
|
||||||
tions are the same, or if the current position precedes the start posi-
|
tions are the same, or if the current position precedes the start posi-
|
||||||
tion, which can happen if the callout is in a lookbehind assertion.
|
tion, which can happen if the callout is in a lookbehind assertion.
|
||||||
|
|
||||||
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
||||||
a result of the auto_callout pattern modifier. In this case, instead of
|
a result of the auto_callout pattern modifier. In this case, instead of
|
||||||
showing the callout number, the offset in the pattern, preceded by a
|
showing the callout number, the offset in the pattern, preceded by a
|
||||||
plus, is output. For example:
|
plus, is output. For example:
|
||||||
|
|
||||||
re> /\d?[A-E]\*/auto_callout
|
re> /\d?[A-E]\*/auto_callout
|
||||||
|
@ -1730,17 +1732,17 @@ CALLOUTS
|
||||||
+12 ^ ^
|
+12 ^ ^
|
||||||
0: abc
|
0: abc
|
||||||
|
|
||||||
The mark changes between matching "a" and "b", but stays the same for
|
The mark changes between matching "a" and "b", but stays the same for
|
||||||
the rest of the match, so nothing more is output. If, as a result of
|
the rest of the match, so nothing more is output. If, as a result of
|
||||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||||
output.
|
output.
|
||||||
|
|
||||||
Callouts with string arguments
|
Callouts with string arguments
|
||||||
|
|
||||||
The output for a callout with a string argument is similar, except that
|
The output for a callout with a string argument is similar, except that
|
||||||
instead of outputting a callout number before the position indicators,
|
instead of outputting a callout number before the position indicators,
|
||||||
the callout string and its offset in the pattern string are output be-
|
the callout string and its offset in the pattern string are output be-
|
||||||
fore the reflection of the subject string, and the subject string is
|
fore the reflection of the subject string, and the subject string is
|
||||||
reflected for each callout. For example:
|
reflected for each callout. For example:
|
||||||
|
|
||||||
re> /^ab(?C'first')cd(?C"second")ef/
|
re> /^ab(?C'first')cd(?C"second")ef/
|
||||||
|
@ -1756,26 +1758,26 @@ CALLOUTS
|
||||||
|
|
||||||
Callout modifiers
|
Callout modifiers
|
||||||
|
|
||||||
The callout function in pcre2test returns zero (carry on matching) by
|
The callout function in pcre2test returns zero (carry on matching) by
|
||||||
default, but you can use a callout_fail modifier in a subject line to
|
default, but you can use a callout_fail modifier in a subject line to
|
||||||
change this and other parameters of the callout (see below).
|
change this and other parameters of the callout (see below).
|
||||||
|
|
||||||
If the callout_capture modifier is set, the current captured groups are
|
If the callout_capture modifier is set, the current captured groups are
|
||||||
output when a callout occurs. This is useful only for non-DFA matching,
|
output when a callout occurs. This is useful only for non-DFA matching,
|
||||||
as pcre2_dfa_match() does not support capturing, so no captures are
|
as pcre2_dfa_match() does not support capturing, so no captures are
|
||||||
ever shown.
|
ever shown.
|
||||||
|
|
||||||
The normal callout output, showing the callout number or pattern offset
|
The normal callout output, showing the callout number or pattern offset
|
||||||
(as described above) is suppressed if the callout_no_where modifier is
|
(as described above) is suppressed if the callout_no_where modifier is
|
||||||
set.
|
set.
|
||||||
|
|
||||||
When using the interpretive matching function pcre2_match() without
|
When using the interpretive matching function pcre2_match() without
|
||||||
JIT, setting the callout_extra modifier causes additional output from
|
JIT, setting the callout_extra modifier causes additional output from
|
||||||
pcre2test's callout function to be generated. For the first callout in
|
pcre2test's callout function to be generated. For the first callout in
|
||||||
a match attempt at a new starting position in the subject, "New match
|
a match attempt at a new starting position in the subject, "New match
|
||||||
attempt" is output. If there has been a backtrack since the last call-
|
attempt" is output. If there has been a backtrack since the last call-
|
||||||
out (or start of matching if this is the first callout), "Backtrack" is
|
out (or start of matching if this is the first callout), "Backtrack" is
|
||||||
output, followed by "No other matching paths" if the backtrack ended
|
output, followed by "No other matching paths" if the backtrack ended
|
||||||
the previous match attempt. For example:
|
the previous match attempt. For example:
|
||||||
|
|
||||||
re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
|
re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess
|
||||||
|
@ -1812,86 +1814,86 @@ CALLOUTS
|
||||||
+1 ^ a+
|
+1 ^ a+
|
||||||
No match
|
No match
|
||||||
|
|
||||||
Notice that various optimizations must be turned off if you want all
|
Notice that various optimizations must be turned off if you want all
|
||||||
possible matching paths to be scanned. If no_start_optimize is not
|
possible matching paths to be scanned. If no_start_optimize is not
|
||||||
used, there is an immediate "no match", without any callouts, because
|
used, there is an immediate "no match", without any callouts, because
|
||||||
the starting optimization fails to find "b" in the subject, which it
|
the starting optimization fails to find "b" in the subject, which it
|
||||||
knows must be present for any match. If no_auto_possess is not used,
|
knows must be present for any match. If no_auto_possess is not used,
|
||||||
the "a+" item is turned into "a++", which reduces the number of back-
|
the "a+" item is turned into "a++", which reduces the number of back-
|
||||||
tracks.
|
tracks.
|
||||||
|
|
||||||
The callout_extra modifier has no effect if used with the DFA matching
|
The callout_extra modifier has no effect if used with the DFA matching
|
||||||
function, or with JIT.
|
function, or with JIT.
|
||||||
|
|
||||||
Return values from callouts
|
Return values from callouts
|
||||||
|
|
||||||
The default return from the callout function is zero, which allows
|
The default return from the callout function is zero, which allows
|
||||||
matching to continue. The callout_fail modifier can be given one or two
|
matching to continue. The callout_fail modifier can be given one or two
|
||||||
numbers. If there is only one number, 1 is returned instead of 0 (caus-
|
numbers. If there is only one number, 1 is returned instead of 0 (caus-
|
||||||
ing matching to backtrack) when a callout of that number is reached. If
|
ing matching to backtrack) when a callout of that number is reached. If
|
||||||
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
two numbers (<n>:<m>) are given, 1 is returned when callout <n> is
|
||||||
reached and there have been at least <m> callouts. The callout_error
|
reached and there have been at least <m> callouts. The callout_error
|
||||||
modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
|
modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus-
|
||||||
ing the entire matching process to be aborted. If both these modifiers
|
ing the entire matching process to be aborted. If both these modifiers
|
||||||
are set for the same callout number, callout_error takes precedence.
|
are set for the same callout number, callout_error takes precedence.
|
||||||
Note that callouts with string arguments are always given the number
|
Note that callouts with string arguments are always given the number
|
||||||
zero.
|
zero.
|
||||||
|
|
||||||
The callout_data modifier can be given an unsigned or a negative num-
|
The callout_data modifier can be given an unsigned or a negative num-
|
||||||
ber. This is set as the "user data" that is passed to the matching
|
ber. This is set as the "user data" that is passed to the matching
|
||||||
function, and passed back when the callout function is invoked. Any
|
function, and passed back when the callout function is invoked. Any
|
||||||
value other than zero is used as a return from pcre2test's callout
|
value other than zero is used as a return from pcre2test's callout
|
||||||
function.
|
function.
|
||||||
|
|
||||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||||
cated regular expressions. For further information about callouts, see
|
cated regular expressions. For further information about callouts, see
|
||||||
the pcre2callout documentation.
|
the pcre2callout documentation.
|
||||||
|
|
||||||
|
|
||||||
NON-PRINTING CHARACTERS
|
NON-PRINTING CHARACTERS
|
||||||
|
|
||||||
When pcre2test is outputting text in the compiled version of a pattern,
|
When pcre2test is outputting text in the compiled version of a pattern,
|
||||||
bytes other than 32-126 are always treated as non-printing characters
|
bytes other than 32-126 are always treated as non-printing characters
|
||||||
and are therefore shown as hex escapes.
|
and are therefore shown as hex escapes.
|
||||||
|
|
||||||
When pcre2test is outputting text that is a matched part of a subject
|
When pcre2test is outputting text that is a matched part of a subject
|
||||||
string, it behaves in the same way, unless a different locale has been
|
string, it behaves in the same way, unless a different locale has been
|
||||||
set for the pattern (using the locale modifier). In this case, the is-
|
set for the pattern (using the locale modifier). In this case, the is-
|
||||||
print() function is used to distinguish printing and non-printing char-
|
print() function is used to distinguish printing and non-printing char-
|
||||||
acters.
|
acters.
|
||||||
|
|
||||||
|
|
||||||
SAVING AND RESTORING COMPILED PATTERNS
|
SAVING AND RESTORING COMPILED PATTERNS
|
||||||
|
|
||||||
It is possible to save compiled patterns on disc or elsewhere, and
|
It is possible to save compiled patterns on disc or elsewhere, and
|
||||||
reload them later, subject to a number of restrictions. JIT data cannot
|
reload them later, subject to a number of restrictions. JIT data cannot
|
||||||
be saved. The host on which the patterns are reloaded must be running
|
be saved. The host on which the patterns are reloaded must be running
|
||||||
the same version of PCRE2, with the same code unit width, and must also
|
the same version of PCRE2, with the same code unit width, and must also
|
||||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||||
compiled patterns can be saved they must be serialized, that is, con-
|
compiled patterns can be saved they must be serialized, that is, con-
|
||||||
verted to a stream of bytes. A single byte stream may contain any num-
|
verted to a stream of bytes. A single byte stream may contain any num-
|
||||||
ber of compiled patterns, but they must all use the same character ta-
|
ber of compiled patterns, but they must all use the same character ta-
|
||||||
bles. A single copy of the tables is included in the byte stream (its
|
bles. A single copy of the tables is included in the byte stream (its
|
||||||
size is 1088 bytes).
|
size is 1088 bytes).
|
||||||
|
|
||||||
The functions whose names begin with pcre2_serialize_ are used for se-
|
The functions whose names begin with pcre2_serialize_ are used for se-
|
||||||
rializing and de-serializing. They are described in the pcre2serialize
|
rializing and de-serializing. They are described in the pcre2serialize
|
||||||
documentation. In this section we describe the features of pcre2test
|
documentation. In this section we describe the features of pcre2test
|
||||||
that can be used to test these functions.
|
that can be used to test these functions.
|
||||||
|
|
||||||
Note that "serialization" in PCRE2 does not convert compiled patterns
|
Note that "serialization" in PCRE2 does not convert compiled patterns
|
||||||
to an abstract format like Java or .NET. It just makes a reloadable
|
to an abstract format like Java or .NET. It just makes a reloadable
|
||||||
byte code stream. Hence the restrictions on reloading mentioned above.
|
byte code stream. Hence the restrictions on reloading mentioned above.
|
||||||
|
|
||||||
In pcre2test, when a pattern with the push modifier is successfully com-
|
In pcre2test, when a pattern with the push modifier is successfully com-
|
||||||
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
piled, it is pushed onto a stack of compiled patterns, and pcre2test
|
||||||
expects the next line to contain a new pattern (or command) instead of
|
expects the next line to contain a new pattern (or command) instead of
|
||||||
a subject line. By contrast, the pushcopy modifier causes a copy of the
|
a subject line. By contrast, the pushcopy modifier causes a copy of the
|
||||||
compiled pattern to be stacked, leaving the original available for im-
|
compiled pattern to be stacked, leaving the original available for im-
|
||||||
mediate matching. By using push and/or pushcopy, a number of patterns
|
mediate matching. By using push and/or pushcopy, a number of patterns
|
||||||
can be compiled and retained. These modifiers are incompatible with
|
can be compiled and retained. These modifiers are incompatible with
|
||||||
posix, and control modifiers that act at match time are ignored (with a
|
posix, and control modifiers that act at match time are ignored (with a
|
||||||
message) for the stacked patterns. The jitverify modifier applies only
|
message) for the stacked patterns. The jitverify modifier applies only
|
||||||
at compile time.
|
at compile time.
|
||||||
|
|
||||||
The command
|
The command
|
||||||
|
@ -1899,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS
|
||||||
#save <filename>
|
#save <filename>
|
||||||
|
|
||||||
causes all the stacked patterns to be serialized and the result written
|
causes all the stacked patterns to be serialized and the result written
|
||||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||||
command
|
command
|
||||||
|
|
||||||
#load <filename>
|
#load <filename>
|
||||||
|
|
||||||
reads the data in the file, and then arranges for it to be de-serial-
|
reads the data in the file, and then arranges for it to be de-serial-
|
||||||
ized, with the resulting compiled patterns added to the pattern stack.
|
ized, with the resulting compiled patterns added to the pattern stack.
|
||||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||||
mand, which must be followed by lines of subjects that are to be
|
mand, which must be followed by lines of subjects that are to be
|
||||||
matched with the pattern, terminated as usual by an empty line or end
|
matched with the pattern, terminated as usual by an empty line or end
|
||||||
of file. This command may be followed by a modifier list containing
|
of file. This command may be followed by a modifier list containing
|
||||||
only control modifiers that act after a pattern has been compiled. In
|
only control modifiers that act after a pattern has been compiled. In
|
||||||
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
particular, hex, posix, posix_nosub, push, and pushcopy are not al-
|
||||||
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
lowed, nor are any option-setting modifiers. The JIT modifiers are,
|
||||||
however, permitted. Here is an example that saves and reloads two pat-
|
however, permitted. Here is an example that saves and reloads two pat-
|
||||||
terns.
|
terns.
|
||||||
|
|
||||||
/abc/push
|
/abc/push
|
||||||
|
@ -1926,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS
|
||||||
#pop jit,bincode
|
#pop jit,bincode
|
||||||
abc
|
abc
|
||||||
|
|
||||||
If jitverify is used with #pop, it does not automatically imply jit,
|
If jitverify is used with #pop, it does not automatically imply jit,
|
||||||
which is different behaviour from when it is used on a pattern.
|
which is different behaviour from when it is used on a pattern.
|
||||||
|
|
||||||
The #popcopy command is analogous to the pushcopy modifier in that it
|
The #popcopy command is analogous to the pushcopy modifier in that it
|
||||||
makes current a copy of the topmost stack pattern, leaving the original
|
makes current a copy of the topmost stack pattern, leaving the original
|
||||||
still on the stack.
|
still on the stack.
|
||||||
|
|
||||||
|
@ -1949,5 +1951,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 12 January 2022
|
Last updated: 27 July 2022
|
||||||
Copyright (c) 1997-2022 University of Cambridge.
|
Copyright (c) 1997-2022 University of Cambridge.
|
||||||
|
|
|
@ -205,9 +205,6 @@ point. */
|
||||||
* Global variables *
|
* Global variables *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* Jeffrey Friedl has some debugging requirements that are not part of the
|
|
||||||
regular code. */
|
|
||||||
|
|
||||||
static const char *colour_string = "1;31";
|
static const char *colour_string = "1;31";
|
||||||
static const char *colour_option = NULL;
|
static const char *colour_option = NULL;
|
||||||
static const char *dee_option = NULL;
|
static const char *dee_option = NULL;
|
||||||
|
@ -220,6 +217,10 @@ static const char *output_text = NULL;
|
||||||
|
|
||||||
static char *main_buffer = NULL;
|
static char *main_buffer = NULL;
|
||||||
|
|
||||||
|
static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */
|
||||||
|
static int printname_colon = ':'; /* Changed to 0 for -Z */
|
||||||
|
static int printname_hyphen = '-'; /* Changed to 0 for -Z */
|
||||||
|
|
||||||
static int after_context = 0;
|
static int after_context = 0;
|
||||||
static int before_context = 0;
|
static int before_context = 0;
|
||||||
static int binary_files = BIN_BINARY;
|
static int binary_files = BIN_BINARY;
|
||||||
|
@ -483,6 +484,7 @@ static option_item optionlist[] = {
|
||||||
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
||||||
{ OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
|
{ OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
|
||||||
{ OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
|
{ OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
|
||||||
|
{ OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" },
|
||||||
{ OP_NODATA, 0, NULL, NULL, NULL }
|
{ OP_NODATA, 0, NULL, NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1773,7 +1775,7 @@ if (after_context > 0 && lastmatchnumber > 0)
|
||||||
{
|
{
|
||||||
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
|
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
|
||||||
if (ellength == 0 && pp == main_buffer + bufsize) break;
|
if (ellength == 0 && pp == main_buffer + bufsize) break;
|
||||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
|
||||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||||
lastmatchrestart = pp;
|
lastmatchrestart = pp;
|
||||||
|
@ -2730,7 +2732,9 @@ while (ptr < endptr)
|
||||||
|
|
||||||
else if (filenames == FN_MATCH_ONLY)
|
else if (filenames == FN_MATCH_ONLY)
|
||||||
{
|
{
|
||||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
fprintf(stdout, "%s", printname);
|
||||||
|
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||||
|
else fprintf(stdout, "%s", printname_nl);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2749,7 +2753,8 @@ while (ptr < endptr)
|
||||||
{
|
{
|
||||||
PCRE2_SIZE oldstartoffset;
|
PCRE2_SIZE oldstartoffset;
|
||||||
|
|
||||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||||
|
printname_colon);
|
||||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||||
|
|
||||||
/* Handle --line-offsets */
|
/* Handle --line-offsets */
|
||||||
|
@ -2871,7 +2876,8 @@ while (ptr < endptr)
|
||||||
while (lastmatchrestart < p)
|
while (lastmatchrestart < p)
|
||||||
{
|
{
|
||||||
char *pp = lastmatchrestart;
|
char *pp = lastmatchrestart;
|
||||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||||
|
printname_hyphen);
|
||||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||||
pp = end_of_line(pp, endptr, &ellength);
|
pp = end_of_line(pp, endptr, &ellength);
|
||||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||||
|
@ -2912,7 +2918,8 @@ while (ptr < endptr)
|
||||||
{
|
{
|
||||||
int ellength;
|
int ellength;
|
||||||
char *pp = p;
|
char *pp = p;
|
||||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||||
|
printname_hyphen);
|
||||||
if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
|
if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
|
||||||
pp = end_of_line(pp, endptr, &ellength);
|
pp = end_of_line(pp, endptr, &ellength);
|
||||||
FWRITE_IGNORE(p, 1, pp - p, stdout);
|
FWRITE_IGNORE(p, 1, pp - p, stdout);
|
||||||
|
@ -2926,7 +2933,8 @@ while (ptr < endptr)
|
||||||
if (after_context > 0 || before_context > 0)
|
if (after_context > 0 || before_context > 0)
|
||||||
endhyphenpending = TRUE;
|
endhyphenpending = TRUE;
|
||||||
|
|
||||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||||
|
printname_colon);
|
||||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||||
|
|
||||||
/* In multiline mode, or if colouring, we have to split the line(s) up
|
/* In multiline mode, or if colouring, we have to split the line(s) up
|
||||||
|
@ -3131,7 +3139,9 @@ were none. If we found a match, we won't have got this far. */
|
||||||
|
|
||||||
if (filenames == FN_NOMATCH_ONLY)
|
if (filenames == FN_NOMATCH_ONLY)
|
||||||
{
|
{
|
||||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
fprintf(stdout, "%s", printname);
|
||||||
|
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||||
|
else fprintf(stdout, "%s", printname_nl);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3142,7 +3152,7 @@ if (count_only && !quiet)
|
||||||
if (count > 0 || !omit_zero_count)
|
if (count > 0 || !omit_zero_count)
|
||||||
{
|
{
|
||||||
if (printname != NULL && filenames != FN_NONE)
|
if (printname != NULL && filenames != FN_NONE)
|
||||||
fprintf(stdout, "%s:", printname);
|
fprintf(stdout, "%s%c", printname, printname_colon);
|
||||||
fprintf(stdout, "%lu" STDOUT_NL, count);
|
fprintf(stdout, "%lu" STDOUT_NL, count);
|
||||||
counts_printed++;
|
counts_printed++;
|
||||||
}
|
}
|
||||||
|
@ -3528,8 +3538,6 @@ switch(letter)
|
||||||
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
||||||
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
|
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
|
||||||
case 'v': invert = TRUE; break;
|
case 'v': invert = TRUE; break;
|
||||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
|
||||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
|
||||||
|
|
||||||
case 'V':
|
case 'V':
|
||||||
{
|
{
|
||||||
|
@ -3540,6 +3548,10 @@ switch(letter)
|
||||||
pcre2grep_exit(0);
|
pcre2grep_exit(0);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||||
|
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||||
|
case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
|
fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
|
||||||
pcre2grep_exit(usage(2));
|
pcre2grep_exit(usage(2));
|
||||||
|
@ -4259,8 +4271,6 @@ if (DEE_option != NULL)
|
||||||
|
|
||||||
(void)pcre2_set_compile_extra_options(compile_context, extra_options);
|
(void)pcre2_set_compile_extra_options(compile_context, extra_options);
|
||||||
|
|
||||||
/* Check the values for Jeffrey Friedl's debugging options. */
|
|
||||||
|
|
||||||
/* If use_jit is set, check whether JIT is available. If not, do not try
|
/* If use_jit is set, check whether JIT is available. If not, do not try
|
||||||
to use JIT. */
|
to use JIT. */
|
||||||
|
|
||||||
|
|
|
@ -991,3 +991,22 @@ RC=0
|
||||||
---------------------------- Test 134 -----------------------------
|
---------------------------- Test 134 -----------------------------
|
||||||
=AB3CD5=
|
=AB3CD5=
|
||||||
RC=0
|
RC=0
|
||||||
|
---------------------------- Test 135 -----------------------------
|
||||||
|
./testdata/grepinputv@The word is cat in this line
|
||||||
|
RC=0
|
||||||
|
./testdata/grepinputv@./testdata/grepinputv@RC=0
|
||||||
|
./testdata/grepinputv@This line contains \E and (regex) *meta* [characters].
|
||||||
|
./testdata/grepinputv@The word is cat in this line
|
||||||
|
./testdata/grepinputv@The caterpillar sat on the mat
|
||||||
|
RC=0
|
||||||
|
testdata/grepinputM |