Implement -Z in pcre2grep and update documentation
This commit is contained in:
parent
cc5e121c8e
commit
8b133fa0ba
|
@ -49,6 +49,8 @@ tests.
|
|||
tests run by 'make check', but can be run manually. The current output is from
|
||||
a 64-bit system.
|
||||
|
||||
13. Implemented -Z aka --null in pcre2grep.
|
||||
|
||||
|
||||
Version 10.40 15-April-2022
|
||||
---------------------------
|
||||
|
|
42
RunGrepTest
42
RunGrepTest
|
@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b"
|
|||
diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u"
|
||||
diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub"
|
||||
|
||||
# Some tests involve NUL characters. It seems impossible to handle them easily
|
||||
# in many operating systems. An earlier version of this script used sed to
|
||||
# translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character. However, on (some versions
|
||||
# of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
# If this test is being run from "make check", $srcdir will be set. If not, set
|
||||
# it to the current or parent directory, whichever one contains the test data.
|
||||
# Subsequently, we run most of the pcre2grep tests in the source directory so
|
||||
|
@ -685,6 +701,16 @@ echo "---------------------------- Test 134 -----------------------------" >>tes
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 135 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -759,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep >
|
|||
printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep
|
||||
|
||||
# This next test involves NUL characters. It seems impossible to handle them
|
||||
# easily in many operating systems. An earlier version of this script used sed
|
||||
# to translate NUL into the string ZERO, but this didn't work on Solaris (aka
|
||||
# SunOS), where the version of sed explicitly doesn't like them, and also MacOS
|
||||
# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine,
|
||||
# even when using GNU sed. A user suggested using tr instead, which
|
||||
# necessitates translating to a single character (@). However, on (some
|
||||
# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if
|
||||
# /usr/xpg4/bin/tr is available, it can do so, so test for that.
|
||||
|
||||
if [ -x /usr/xpg4/bin/tr ] ; then
|
||||
tr=/usr/xpg4/bin/tr
|
||||
else
|
||||
tr=tr
|
||||
fi
|
||||
|
||||
printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep
|
||||
printf 'abc\0def' >testNinputgrep
|
||||
$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep
|
||||
|
|
|
@ -121,6 +121,7 @@ environment, for example.
|
|||
pcre2_substring.c
|
||||
pcre2_tables.c
|
||||
pcre2_ucd.c
|
||||
pcre2_ucptables.c
|
||||
pcre2_valid_utf.c
|
||||
pcre2_xclass.c
|
||||
|
||||
|
@ -373,7 +374,7 @@ Otherwise:
|
|||
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||
have been created.
|
||||
|
||||
2. Edit RunTest.bat to indentify the full or relative location of
|
||||
2. Edit RunTest.bat to identify the full or relative location of
|
||||
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||
|
||||
set srcdir=C:\pcre2\pcre2-10.00
|
||||
|
|
|
@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com.
|
|||
You can access the archives and also subscribe or manage your subscription
|
||||
here:
|
||||
|
||||
https://groups.google.com/pcre2-dev
|
||||
https://groups.google.com/g/pcre2-dev
|
||||
|
||||
Please read the NEWS file if you are upgrading from a previous release. The
|
||||
contents of this README file are:
|
||||
|
@ -375,7 +375,8 @@ library. They are also documented in the pcre2build man page.
|
|||
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||
but does not link with the termcap or curses library itself, allowing
|
||||
applications which link with readline the to choose an appropriate library."
|
||||
applications which link with readline the option to choose an appropriate
|
||||
library."
|
||||
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
@ -400,10 +401,10 @@ library. They are also documented in the pcre2build man page.
|
|||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
outputs information about what it is doing. The input strings are specified
|
||||
by arguments: if an argument starts with "=" the rest of it is a literal
|
||||
input string. Otherwise, it is assumed to be a file name, and the contents
|
||||
of the file are the test string.
|
||||
|
||||
. Releases before 10.30 could be compiled with --disable-stack-for-recursion,
|
||||
which caused pcre2_match() to use individual blocks on the heap for
|
||||
|
@ -695,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for
|
|||
different code unit widths.
|
||||
|
||||
Test 15 contains a number of tests that must not be run with JIT. They check,
|
||||
among other non-JIT things, the match-limiting features of the intepretive
|
||||
among other non-JIT things, the match-limiting features of the interpretive
|
||||
matcher.
|
||||
|
||||
Test 16 is run only when JIT support is not available. It checks that an
|
||||
|
|
|
@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the
|
|||
documentation for more details). If the limit is reached, the negative error
|
||||
code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2
|
||||
is built; if it is not, the default is set very large and is essentially
|
||||
"unlimited".
|
||||
unlimited.
|
||||
</P>
|
||||
<P>
|
||||
A value for the heap limit may also be supplied by an item at the start of a
|
||||
|
@ -1030,19 +1030,17 @@ less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
|||
limit is set, less than the default.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack for recording backtracking points. The more nested backtracking points
|
||||
there are (that is, the deeper the search tree), the more memory is needed.
|
||||
Heap memory is used only if the initial vector is too small. If the heap limit
|
||||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
The <b>pcre2_match()</b> function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
<b>pcre2_match()</b> uses the heap are given in the
|
||||
<a href="pcre2perform.html"><b>pcre2perform</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for <b>pcre2_dfa_match()</b>, a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, too, setting a value
|
||||
of zero disables the use of the heap.
|
||||
For <b>pcre2_dfa_match()</b>, a vector on the system stack is used when
|
||||
processing pattern recursions, lookarounds, or atomic groups, and only if this
|
||||
is not big enough is heap memory used. In this case, setting a value of zero
|
||||
disables the use of the heap.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
|
@ -1089,10 +1087,10 @@ less than the limit set by the caller of <b>pcre2_match()</b> or
|
|||
<br>
|
||||
<br>
|
||||
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
|
||||
Each time a nested backtracking point is passed, a new memory "frame" is used
|
||||
Each time a nested backtracking point is passed, a new memory frame is used
|
||||
to remember the state of matching at that point. Thus, this parameter
|
||||
indirectly limits the amount of memory that is used in a match. However,
|
||||
because the size of each memory "frame" depends on the number of capturing
|
||||
because the size of each memory frame depends on the number of capturing
|
||||
parentheses, the actual memory limit varies from pattern to pattern. This limit
|
||||
was more useful in versions before 10.30, where function recursion was used for
|
||||
backtracking.
|
||||
|
@ -3148,11 +3146,11 @@ The backtracking match limit was reached.
|
|||
<pre>
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
</pre>
|
||||
If a pattern contains many nested backtracking points, heap memory is used to
|
||||
remember them. This error is given when the memory allocation function (default
|
||||
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
|
||||
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is given when
|
||||
the memory allocation function (default or custom) fails. Note that a different
|
||||
error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL
|
||||
</pre>
|
||||
|
@ -4020,9 +4018,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 December 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -284,12 +284,11 @@ to the <b>configure</b> command. This setting also applies to the
|
|||
counting is done differently).
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_match()</b> function starts out using a 20KiB vector on the system
|
||||
stack to record backtracking points. The more nested backtracking points there
|
||||
are (that is, the deeper the search tree), the more memory is needed. If the
|
||||
initial vector is not large enough, heap memory is used, up to a certain limit,
|
||||
which is specified in kibibytes (units of 1024 bytes). The limit can be changed
|
||||
at run time, as described in the
|
||||
The <b>pcre2_match()</b> function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the deeper the
|
||||
search tree), the more memory is needed. There is an upper limit, specified in
|
||||
kibibytes (units of 1024 bytes). This limit can be changed at run time, as
|
||||
described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation. The default limit (in effect unlimited) is 20 million. You can
|
||||
change this by a setting such as
|
||||
|
@ -609,16 +608,16 @@ give a warning.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 08 December 2021
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,15 @@ For example:
|
|||
<pre>
|
||||
pcre2grep some-pattern file1 - file3
|
||||
</pre>
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. In
|
||||
particular, the <b>-M</b> option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
<b>-N</b> (<b>--newline</b>) option.
|
||||
However, there are options that can change how <b>pcre2grep</b> behaves. For
|
||||
example, the <b>-M</b> option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the <b>-N</b>
|
||||
(<b>--newline</b>) option. The <b>-h</b> and <b>-H</b> options control whether or
|
||||
not file names are shown, and the <b>-Z</b> option changes the file name
|
||||
terminator to a zero byte.
|
||||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
|
@ -178,9 +180,11 @@ Output up to <i>number</i> lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
context lines (the <b>-Z</b> option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
<i>number</i> is expected to be relatively small. When <b>-c</b> is used,
|
||||
<b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -199,9 +203,10 @@ Output up to <i>number</i> lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the <b>-Z</b> option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with <b>--output</b>,
|
|||
<P>
|
||||
<b>-H</b>, <b>--with-filename</b>
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the <b>-M</b> option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The <b>-Z</b> option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the <b>-M</b> option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous <b>-h</b>, <b>-l</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-h</b>, <b>--no-filename</b>
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
Suppress the output of file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The <b>-Z</b> option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous <b>-H</b>, <b>-L</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--heap-limit</b>=<i>number</i>
|
||||
|
@ -481,18 +488,20 @@ given any number of times. If a directory matches both <b>--include-dir</b> and
|
|||
<b>-L</b>, <b>--files-without-match</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-l</b> options.
|
||||
output once, on a separate line by default, but if the <b>-Z</b> option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous <b>-H</b>, <b>-h</b>, or <b>-l</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-l</b>, <b>--files-with-matches</b>
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the <b>-Z</b> option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with <b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
|
@ -592,10 +601,7 @@ value set by <b>--match-limit</b> is reached, an error occurs.
|
|||
<br>
|
||||
<br>
|
||||
The <b>--heap-limit</b> option specifies, as a number of kibibytes (units of
|
||||
1024 bytes), the amount of heap memory that may be used for matching. Heap
|
||||
memory is needed only if matching the pattern requires a significant number of
|
||||
nested backtracking points to be remembered. This parameter can be set to zero
|
||||
to forbid the use of heap memory altogether.
|
||||
1024 bytes), the maximum amount of heap memory that may be used for matching.
|
||||
<br>
|
||||
<br>
|
||||
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
|
||||
|
@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are
|
|||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>-Z</b>, <b>--null</b>
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||
|
@ -1053,9 +1066,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 31 August 2021
|
||||
Last updated: 30 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2021 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
|||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<P>
|
||||
The maximum amount of heap memory used for matching is controlled by the heap
|
||||
limit, which can be set in a pattern or in a match context. The default is a
|
||||
very large number, effectively unlimited.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
|
@ -86,9 +91,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 02 February 2019
|
||||
Last updated: 26 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
|
|||
uses very little system stack at run time. In earlier releases recursive
|
||||
function calls could use a great deal of stack, and this could cause problems,
|
||||
but this usage has been eliminated. Backtracking positions are now explicitly
|
||||
remembered in memory frames controlled by the code. An initial 20KiB vector of
|
||||
frames is allocated on the system stack (enough for about 100 frames for small
|
||||
patterns), but if this is insufficient, heap memory is used. The amount of heap
|
||||
memory can be limited; if the limit is set to zero, only the initial stack
|
||||
vector is used. Rewriting patterns to be time-efficient, as described below,
|
||||
may also reduce the memory requirements.
|
||||
remembered in memory frames controlled by the code.
|
||||
</P>
|
||||
<P>
|
||||
The size of each frame depends on the size of pointer variables and the number
|
||||
of capturing parenthesized groups in the pattern being matched. On a 64-bit
|
||||
system the frame size for a pattern with no captures is 128 bytes. For each
|
||||
capturing group the size increases by 16 bytes.
|
||||
</P>
|
||||
<P>
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on the system
|
||||
stack, but this still caused some issues for multi-thread applications where
|
||||
each thread has a very small stack. From release 10.41 backtracking memory
|
||||
frames are always held in heap memory. An initial heap allocation is obtained
|
||||
the first time any match data block is passed to <b>pcre2_match()</b>. This is
|
||||
remembered with the match data block and re-used if that block is used for
|
||||
another match. It is freed when the match data block itself is freed.
|
||||
</P>
|
||||
<P>
|
||||
The size of the initial block is the larger of 20KiB or ten times the pattern's
|
||||
frame size, unless the heap limit is less than this, in which case the heap
|
||||
limit is used. If the initial block proves to be too small during matching, it
|
||||
is replaced by a larger block, subject to the heap limit. The heap limit is
|
||||
checked only when a new block is to be allocated. Reducing the heap limit
|
||||
between calls to <b>pcre2_match()</b> with the same match data block does not
|
||||
affect the saved block.
|
||||
</P>
|
||||
<P>
|
||||
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
|
||||
|
@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.
|
|||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
<br>
|
||||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 03 February 2019
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2019 University of Cambridge.
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -1241,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject.
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1564,7 +1565,7 @@ Setting heap, match, and depth limits
|
|||
<P>
|
||||
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
|
||||
the appropriate limits in the match context. These values are ignored when the
|
||||
<b>find_limits</b> modifier is specified.
|
||||
<b>find_limits</b> or <b>find_limits_noheap</b> modifier is specified.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding minimum limits
|
||||
|
@ -1574,8 +1575,12 @@ If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b
|
|||
calls the relevant matching function several times, setting different values in
|
||||
the match context via <b>pcre2_set_heap_limit()</b>,
|
||||
<b>pcre2_set_match_limit()</b>, or <b>pcre2_set_depth_limit()</b> until it finds
|
||||
the minimum values for each parameter that allows the match to complete without
|
||||
error. If JIT is being used, only the match limit is relevant.
|
||||
the smallest value for each parameter that allows the match to complete without
|
||||
a "limit exceeded" error. The match itself may succeed or fail. An alternative
|
||||
modifier, <b>find_limits_noheap</b>, omits the heap limit. This is used in the
|
||||
standard tests, because the minimum heap limit varies between systems. If JIT
|
||||
is being used, only the match limit is relevant, and the other two are
|
||||
automatically omitted.
|
||||
</P>
|
||||
<P>
|
||||
When using this modifier, the pattern should not contain any limit settings
|
||||
|
@ -1603,9 +1608,7 @@ overall amount of computing resource that is used.
|
|||
</P>
|
||||
<P>
|
||||
For both kinds of matching, the <i>heap_limit</i> number, which is in kibibytes
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching. A
|
||||
value of zero disables the use of any heap memory; many simple pattern matches
|
||||
can be done without using the heap, so zero is not an unreasonable setting.
|
||||
(units of 1024 bytes), limits the amount of heap memory used for matching.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing MARK names
|
||||
|
@ -1623,12 +1626,10 @@ Showing memory usage
|
|||
<P>
|
||||
The <b>memory</b> modifier causes <b>pcre2test</b> to log the sizes of all heap
|
||||
memory allocation and freeing calls that occur during a call to
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. These occur only when a match
|
||||
requires a bigger vector than the default for remembering backtracking points
|
||||
(<b>pcre2_match()</b>) or for internal workspace (<b>pcre2_dfa_match()</b>). In
|
||||
many cases there will be no heap memory used and therefore no additional
|
||||
output. No heap memory is allocated during matching with JIT, so in that case
|
||||
the <b>memory</b> modifier never has any effect. For this modifier to work, the
|
||||
<b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. In the latter case, heap memory
|
||||
is used only when a match requires more internal workspace than the default
|
||||
allocation on the stack, so in many cases there will be no output. No heap
|
||||
memory is allocated during matching with JIT. For this modifier to work, the
|
||||
<b>null_context</b> modifier must not be set on both the pattern and the
|
||||
subject, though it can be set on one or the other.
|
||||
</P>
|
||||
|
@ -1690,7 +1691,8 @@ Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
|||
If the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly in this
|
||||
case (they use default values). This modifier cannot be used with the
|
||||
<b>find_limits</b> or <b>substitute_callout</b> modifiers.
|
||||
<b>find_limits</b>, <b>find_limits_noheap</b>, or <b>substitute_callout</b>
|
||||
modifiers.
|
||||
</P>
|
||||
<P>
|
||||
Similarly, for testing purposes, if the <b>null_subject</b> or
|
||||
|
@ -2141,7 +2143,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 12 January 2022
|
||||
Last updated: 27 July 2022
|
||||
<br>
|
||||
Copyright © 1997-2022 University of Cambridge.
|
||||
<br>
|
||||
|
|
114
doc/pcre2.txt
114
doc/pcre2.txt
|
@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS
|
|||
pcre2jit documentation for more details). If the limit is reached, the
|
||||
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
|
||||
limit can be set when PCRE2 is built; if it is not, the default is set
|
||||
very large and is essentially "unlimited".
|
||||
very large and is essentially unlimited.
|
||||
|
||||
A value for the heap limit may also be supplied by an item at the start
|
||||
of a pattern of the form
|
||||
|
@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS
|
|||
less ddd is less than the limit set by the caller of pcre2_match() or,
|
||||
if no such limit is set, less than the default.
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack for recording backtracking points. The more nested backtrack-
|
||||
ing points there are (that is, the deeper the search tree), the more
|
||||
memory is needed. Heap memory is used only if the initial vector is
|
||||
too small. If the heap limit is set to a value less than 21 (in partic-
|
||||
ular, zero) no heap memory will be used. In this case, only patterns
|
||||
that do not have a lot of nested backtracking can be successfully pro-
|
||||
cessed.
|
||||
The pcre2_match() function always needs some heap memory, so setting a
|
||||
value of zero guarantees a "heap limit exceeded" error. Details of how
|
||||
pcre2_match() uses the heap are given in the pcre2perform documenta-
|
||||
tion.
|
||||
|
||||
Similarly, for pcre2_dfa_match(), a vector on the system stack is used
|
||||
when processing pattern recursions, lookarounds, or atomic groups, and
|
||||
only if this is not big enough is heap memory used. In this case, too,
|
||||
setting a value of zero disables the use of the heap.
|
||||
For pcre2_dfa_match(), a vector on the system stack is used when pro-
|
||||
cessing pattern recursions, lookarounds, or atomic groups, and only if
|
||||
this is not big enough is heap memory used. In this case, setting a
|
||||
value of zero disables the use of the heap.
|
||||
|
||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||
uint32_t value);
|
||||
|
@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS
|
|||
|
||||
This parameter limits the depth of nested backtracking in
|
||||
pcre2_match(). Each time a nested backtracking point is passed, a new
|
||||
memory "frame" is used to remember the state of matching at that point.
|
||||
memory frame is used to remember the state of matching at that point.
|
||||
Thus, this parameter indirectly limits the amount of memory that is
|
||||
used in a match. However, because the size of each memory "frame" de-
|
||||
pends on the number of capturing parentheses, the actual memory limit
|
||||
varies from pattern to pattern. This limit was more useful in versions
|
||||
before 10.30, where function recursion was used for backtracking.
|
||||
used in a match. However, because the size of each memory frame depends
|
||||
on the number of capturing parentheses, the actual memory limit varies
|
||||
from pattern to pattern. This limit was more useful in versions before
|
||||
10.30, where function recursion was used for backtracking.
|
||||
|
||||
The depth limit is not relevant, and is ignored, when matching is done
|
||||
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
|
||||
|
@ -3051,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match()
|
|||
|
||||
PCRE2_ERROR_NOMEMORY
|
||||
|
||||
If a pattern contains many nested backtracking points, heap memory is
|
||||
used to remember them. This error is given when the memory allocation
|
||||
function (default or custom) fails. Note that a different error,
|
||||
PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds
|
||||
the heap limit. PCRE2_ERROR_NOMEMORY is also returned if
|
||||
PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
|
||||
Heap memory is used to remember backtracking points. This error is
|
||||
given when the memory allocation function (default or custom) fails.
|
||||
Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the
|
||||
amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
|
||||
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca-
|
||||
tion fails.
|
||||
|
||||
PCRE2_ERROR_NULL
|
||||
|
||||
|
@ -3860,8 +3856,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 14 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -4118,14 +4114,13 @@ LIMITING PCRE2 RESOURCE USAGE
|
|||
pcre2_dfa_match() matching function, and to JIT matching (though the
|
||||
counting is done differently).
|
||||
|
||||
The pcre2_match() function starts out using a 20KiB vector on the sys-
|
||||
tem stack to record backtracking points. The more nested backtracking
|
||||
points there are (that is, the deeper the search tree), the more memory
|
||||
is needed. If the initial vector is not large enough, heap memory is
|
||||
used, up to a certain limit, which is specified in kibibytes (units of
|
||||
1024 bytes). The limit can be changed at run time, as described in the
|
||||
pcre2api documentation. The default limit (in effect unlimited) is 20
|
||||
million. You can change this by a setting such as
|
||||
The pcre2_match() function uses heap memory to record backtracking
|
||||
points. The more nested backtracking points there are (that is, the
|
||||
deeper the search tree), the more memory is needed. There is an upper
|
||||
limit, specified in kibibytes (units of 1024 bytes). This limit can be
|
||||
changed at run time, as described in the pcre2api documentation. The
|
||||
default limit (in effect unlimited) is 20 million. You can change this
|
||||
by a setting such as
|
||||
|
||||
--with-heap-limit=500
|
||||
|
||||
|
@ -4450,14 +4445,14 @@ SEE ALSO
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 08 December 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -5596,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS
|
|||
The maximum length of a string argument to a callout is the largest
|
||||
number a 32-bit unsigned integer can hold.
|
||||
|
||||
The maximum amount of heap memory used for matching is controlled by
|
||||
the heap limit, which can be set in a pattern or in a match context.
|
||||
The default is a very large number, effectively unlimited.
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 02 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 26 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -9773,12 +9772,29 @@ STACK AND HEAP USAGE AT RUN TIME
|
|||
sive function calls could use a great deal of stack, and this could
|
||||
cause problems, but this usage has been eliminated. Backtracking posi-
|
||||
tions are now explicitly remembered in memory frames controlled by the
|
||||
code. An initial 20KiB vector of frames is allocated on the system
|
||||
stack (enough for about 100 frames for small patterns), but if this is
|
||||
insufficient, heap memory is used. The amount of heap memory can be
|
||||
limited; if the limit is set to zero, only the initial stack vector is
|
||||
used. Rewriting patterns to be time-efficient, as described below, may
|
||||
also reduce the memory requirements.
|
||||
code.
|
||||
|
||||
The size of each frame depends on the size of pointer variables and the
|
||||
number of capturing parenthesized groups in the pattern being matched.
|
||||
On a 64-bit system the frame size for a pattern with no captures is 128
|
||||
bytes. For each capturing group the size increases by 16 bytes.
|
||||
|
||||
Until release 10.41, an initial 20KiB frames vector was allocated on
|
||||
the system stack, but this still caused some issues for multi-thread
|
||||
applications where each thread has a very small stack. From release
|
||||
10.41 backtracking memory frames are always held in heap memory. An
|
||||
initial heap allocation is obtained the first time any match data block
|
||||
is passed to pcre2_match(). This is remembered with the match data
|
||||
block and re-used if that block is used for another match. It is freed
|
||||
when the match data block itself is freed.
|
||||
|
||||
The size of the initial block is the larger of 20KiB or ten times the
|
||||
pattern's frame size, unless the heap limit is less than this, in which
|
||||
case the heap limit is used. If the initial block proves to be too
|
||||
small during matching, it is replaced by a larger block, subject to the
|
||||
heap limit. The heap limit is checked only when a new block is to be
|
||||
allocated. Reducing the heap limit between calls to pcre2_match() with
|
||||
the same match data block does not affect the saved block.
|
||||
|
||||
In contrast to pcre2_match(), pcre2_dfa_match() does use recursive
|
||||
function calls, but only for processing atomic groups, lookaround as-
|
||||
|
@ -9926,14 +9942,14 @@ PROCESSING TIME
|
|||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Retired from University Computing Service
|
||||
Cambridge, England.
|
||||
|
||||
|
||||
REVISION
|
||||
|
||||
Last updated: 03 February 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "27 July 2022" "PCRE2 10.41"
|
||||
.TH PCRE2GREP 1 "30 July 2022" "PCRE2 10.41"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -43,13 +43,15 @@ For example:
|
|||
.sp
|
||||
pcre2grep some-pattern file1 - file3
|
||||
.sp
|
||||
Input files are searched line by line. By default, each line that matches a
|
||||
By default, input files are searched line by line. Each line that matches a
|
||||
pattern is copied to the standard output, and if there is more than one file,
|
||||
the file name is output at the start of each line, followed by a colon.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. In
|
||||
particular, the \fB-M\fP option makes it possible to search for strings that
|
||||
span line boundaries. What defines a line boundary is controlled by the
|
||||
\fB-N\fP (\fB--newline\fP) option.
|
||||
However, there are options that can change how \fBpcre2grep\fP behaves. For
|
||||
example, the \fB-M\fP option makes it possible to search for strings that span
|
||||
line boundaries. What defines a line boundary is controlled by the \fB-N\fP
|
||||
(\fB--newline\fP) option. The \fB-h\fP and \fB-H\fP options control whether or
|
||||
not file names are shown, and the \fB-Z\fP option changes the file name
|
||||
terminator to a zero byte.
|
||||
.P
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the \fB--buffer-size\fP and
|
||||
|
@ -149,9 +151,11 @@ Output up to \fInumber\fP lines of context after each matching line. Fewer
|
|||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of \fInumber\fP
|
||||
is expected to be relatively small. When \fB-c\fP is used, \fB-A\fP is ignored.
|
||||
context lines (the \fB-Z\fP option can be used to change the file name
|
||||
terminator to a zero byte). A line containing "--" is output between each group
|
||||
of lines, unless they are in fact contiguous in the input file. The value of
|
||||
\fInumber\fP is expected to be relatively small. When \fB-c\fP is used,
|
||||
\fB-A\fP is ignored.
|
||||
.TP
|
||||
\fB-a\fP, \fB--text\fP
|
||||
Treat binary files as text. This is equivalent to
|
||||
|
@ -167,9 +171,10 @@ Output up to \fInumber\fP lines of context before each matching line. Fewer
|
|||
lines are output if the previous match or the start of the file is within
|
||||
\fInumber\fP lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
instead of a colon for the context lines (the \fB-Z\fP option can be used to
|
||||
change the file name terminator to a zero byte). A line containing "--" is
|
||||
output between each group of lines, unless they are in fact contiguous in the
|
||||
input file. The value of \fInumber\fP is expected to be relatively small. When
|
||||
\fB-c\fP is used, \fB-B\fP is ignored.
|
||||
.TP
|
||||
\fB--binary-files=\fP\fIword\fP
|
||||
|
@ -356,19 +361,21 @@ shown separately. This option is mutually exclusive with \fB--output\fP,
|
|||
.TP
|
||||
\fB-H\fP, \fB--with-filename\fP
|
||||
Force the inclusion of the file name at the start of output lines when
|
||||
searching a single file. By default, the file name is not shown in this case.
|
||||
For matching lines, the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. If a line number is also being output, it follows the
|
||||
file name. When the \fB-M\fP option causes a pattern to match more than one
|
||||
line, only the first is preceded by the file name. This option overrides any
|
||||
previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
searching a single file. The file name is not normally shown in this case.
|
||||
By default, for matching lines, the file name is followed by a colon; for
|
||||
context lines, a hyphen separator is used. The \fB-Z\fP option can be used to
|
||||
change the terminator to a zero byte. If a line number is also being output,
|
||||
it follows the file name. When the \fB-M\fP option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file name. This option
|
||||
overrides any previous \fB-h\fP, \fB-l\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB-h\fP, \fB--no-filename\fP
|
||||
Suppress the output file names when searching multiple files. By default,
|
||||
file names are shown when multiple files are searched. For matching lines, the
|
||||
file name is followed by a colon; for context lines, a hyphen separator is used.
|
||||
If a line number is also being output, it follows the file name. This option
|
||||
overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
Suppress the output file names when searching multiple files. File names are
|
||||
normally shown when multiple files are searched. By default, for matching
|
||||
lines, the file name is followed by a colon; for context lines, a hyphen
|
||||
separator is used. The \fB-Z\fP option can be used to change the terminator to
|
||||
a zero byte. If a line number is also being output, it follows the file name.
|
||||
This option overrides any previous \fB-H\fP, \fB-L\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB--heap-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP below.
|
||||
|
@ -417,17 +424,19 @@ given any number of times. If a directory matches both \fB--include-dir\fP and
|
|||
\fB-L\fP, \fB--files-without-match\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
that do not contain any lines that would have been output. Each file name is
|
||||
output once, on a separate line. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-l\fP options.
|
||||
output once, on a separate line by default, but if the \fB-Z\fP option is set,
|
||||
they are separated by zero bytes instead of newlines. This option overrides any
|
||||
previous \fB-H\fP, \fB-h\fP, or \fB-l\fP options.
|
||||
.TP
|
||||
\fB-l\fP, \fB--files-with-matches\fP
|
||||
Instead of outputting lines from the files, just output the names of the files
|
||||
containing lines that would have been output. Each file name is output once, on
|
||||
a separate line. Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
a separate line, but if the \fB-Z\fP option is set, they are separated by zero
|
||||
bytes instead of newlines. Searching normally stops as soon as a matching line
|
||||
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and those files that
|
||||
have at least one match are listed along with their counts. Using this option
|
||||
with \fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
|
@ -729,6 +738,12 @@ be more than one line. This is equivalent to having "^(?:" at the start of each
|
|||
pattern and ")$" at the end. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||
.TP
|
||||
\fB-Z\fP, \fB--null\fP
|
||||
Terminate file names in the regular output with a zero byte (the NUL
|
||||
character) instead of what would normally appear. This is useful when file
|
||||
names contain unusual characters such as colons, hyphens, or even newlines. The
|
||||
option does not apply to file names in error messages.
|
||||
.
|
||||
.
|
||||
.SH "ENVIRONMENT VARIABLES"
|
||||
|
@ -957,6 +972,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 27 July 2022
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -42,13 +42,15 @@ DESCRIPTION
|
|||
|
||||
pcre2grep some-pattern file1 - file3
|
||||
|
||||
Input files are searched line by line. By default, each line that
|
||||
By default, input files are searched line by line. Each line that
|
||||
matches a pattern is copied to the standard output, and if there is
|
||||
more than one file, the file name is output at the start of each line,
|
||||
followed by a colon. However, there are options that can change how
|
||||
pcre2grep behaves. In particular, the -M option makes it possible to
|
||||
pcre2grep behaves. For example, the -M option makes it possible to
|
||||
search for strings that span line boundaries. What defines a line
|
||||
boundary is controlled by the -N (--newline) option.
|
||||
boundary is controlled by the -N (--newline) option. The -h and -H op-
|
||||
tions control whether or not file names are shown, and the -Z option
|
||||
changes the file name terminator to a zero byte.
|
||||
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by parameters that can be set by the --buffer-size and
|
||||
|
@ -149,10 +151,12 @@ OPTIONS
|
|||
the file is reached, or if the processing buffer size has
|
||||
been set too small. If file names and/or line numbers are be-
|
||||
ing output, a hyphen separator is used instead of a colon for
|
||||
the context lines. A line containing "--" is output between
|
||||
each group of lines, unless they are in fact contiguous in
|
||||
the input file. The value of number is expected to be rela-
|
||||
tively small. When -c is used, -A is ignored.
|
||||
the context lines (the -Z option can be used to change the
|
||||
file name terminator to a zero byte). A line containing "--"
|
||||
is output between each group of lines, unless they are in
|
||||
fact contiguous in the input file. The value of number is ex-
|
||||
pected to be relatively small. When -c is used, -A is ig-
|
||||
nored.
|
||||
|
||||
-a, --text
|
||||
Treat binary files as text. This is equivalent to --binary-
|
||||
|
@ -170,11 +174,12 @@ OPTIONS
|
|||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
line numbers are being output, a hyphen separator is used in-
|
||||
stead of a colon for the context lines. A line containing
|
||||
"--" is output between each group of lines, unless they are
|
||||
in fact contiguous in the input file. The value of number is
|
||||
expected to be relatively small. When -c is used, -B is ig-
|
||||
nored.
|
||||
stead of a colon for the context lines (the -Z option can be
|
||||
used to change the file name terminator to a zero byte). A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. When -c
|
||||
is used, -B is ignored.
|
||||
|
||||
--binary-files=word
|
||||
Specify how binary files are to be processed. If the word is
|
||||
|
@ -387,22 +392,25 @@ OPTIONS
|
|||
|
||||
-H, --with-filename
|
||||
Force the inclusion of the file name at the start of output
|
||||
lines when searching a single file. By default, the file name
|
||||
is not shown in this case. For matching lines, the file name
|
||||
is followed by a colon; for context lines, a hyphen separator
|
||||
is used. If a line number is also being output, it follows
|
||||
the file name. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by the file
|
||||
name. This option overrides any previous -h, -l, or -L op-
|
||||
tions.
|
||||
lines when searching a single file. The file name is not nor-
|
||||
mally shown in this case. By default, for matching lines,
|
||||
the file name is followed by a colon; for context lines, a
|
||||
hyphen separator is used. The -Z option can be used to change
|
||||
the terminator to a zero byte. If a line number is also being
|
||||
output, it follows the file name. When the -M option causes a
|
||||
pattern to match more than one line, only the first is pre-
|
||||
ceded by the file name. This option overrides any previous
|
||||
-h, -l, or -L options.
|
||||
|
||||
-h, --no-filename
|
||||
Suppress the output file names when searching multiple files.
|
||||
By default, file names are shown when multiple files are
|
||||
searched. For matching lines, the file name is followed by a
|
||||
colon; for context lines, a hyphen separator is used. If a
|
||||
line number is also being output, it follows the file name.
|
||||
This option overrides any previous -H, -L, or -l options.
|
||||
File names are normally shown when multiple files are
|
||||
searched. By default, for matching lines, the file name is
|
||||
followed by a colon; for context lines, a hyphen separator is
|
||||
used. The -Z option can be used to change the terminator to a
|
||||
zero byte. If a line number is also being output, it follows
|
||||
the file name. This option overrides any previous -H, -L, or
|
||||
-l options.
|
||||
|
||||
--heap-limit=number
|
||||
See --match-limit below.
|
||||
|
@ -455,21 +463,23 @@ OPTIONS
|
|||
Instead of outputting lines from the files, just output the
|
||||
names of the files that do not contain any lines that would
|
||||
have been output. Each file name is output once, on a sepa-
|
||||
rate line. This option overrides any previous -H, -h, or -l
|
||||
options.
|
||||
rate line by default, but if the -Z option is set, they are
|
||||
separated by zero bytes instead of newlines. This option
|
||||
overrides any previous -H, -h, or -l options.
|
||||
|
||||
-l, --files-with-matches
|
||||
Instead of outputting lines from the files, just output the
|
||||
names of the files containing lines that would have been out-
|
||||
put. Each file name is output once, on a separate line.
|
||||
Searching normally stops as soon as a matching line is found
|
||||
in a file. However, if the -c (count) option is also used,
|
||||
matching continues in order to obtain the correct count, and
|
||||
those files that have at least one match are listed along
|
||||
with their counts. Using this option with -c is a way of sup-
|
||||
pressing the listing of files with no matches that occurs
|
||||
with -c on its own. This option overrides any previous -H,
|
||||
-h, or -L options.
|
||||
put. Each file name is output once, on a separate line, but
|
||||
if the -Z option is set, they are separated by zero bytes in-
|
||||
stead of newlines. Searching normally stops as soon as a
|
||||
matching line is found in a file. However, if the -c (count)
|
||||
option is also used, matching continues in order to obtain
|
||||
the correct count, and those files that have at least one
|
||||
match are listed along with their counts. Using this option
|
||||
with -c is a way of suppressing the listing of files with no
|
||||
matches that occurs with -c on its own. This option overrides
|
||||
any previous -H, -h, or -L options.
|
||||
|
||||
--label=name
|
||||
This option supplies a name to be used for the standard input
|
||||
|
@ -571,11 +581,8 @@ OPTIONS
|
|||
an error occurs.
|
||||
|
||||
The --heap-limit option specifies, as a number of kibibytes
|
||||
(units of 1024 bytes), the amount of heap memory that may be
|
||||
used for matching. Heap memory is needed only if matching the
|
||||
pattern requires a significant number of nested backtracking
|
||||
points to be remembered. This parameter can be set to zero to
|
||||
forbid the use of heap memory altogether.
|
||||
(units of 1024 bytes), the maximum amount of heap memory that
|
||||
may be used for matching.
|
||||
|
||||
The --depth-limit option limits the depth of nested back-
|
||||
tracking points, which indirectly limits the amount of memory
|
||||
|
@ -812,6 +819,13 @@ OPTIONS
|
|||
does not apply to patterns specified by any of the --include
|
||||
or --exclude options.
|
||||
|
||||
-Z, --null
|
||||
Terminate file names in the regular output with a zero byte
|
||||
(the NUL character) instead of what would normally appear.
|
||||
This is useful when file names contain unusual characters
|
||||
such as colons, hyphens, or even newlines. The option does
|
||||
not apply to file names in error messages.
|
||||
|
||||
|
||||
ENVIRONMENT VARIABLES
|
||||
|
||||
|
@ -1022,5 +1036,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 31 August 2021
|
||||
Copyright (c) 1997-2021 University of Cambridge.
|
||||
Last updated: 30 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -1111,7 +1111,8 @@ SUBJECT MODIFIERS
|
|||
copy=<number or name> copy captured substring
|
||||
depth_limit=<n> set a depth limit
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and depth limits
|
||||
find_limits find heap, match and depth limits
|
||||
find_limits_noheap find match and depth limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
|
@ -1411,7 +1412,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
The heap_limit, match_limit, and depth_limit modifiers set the appro-
|
||||
priate limits in the match context. These values are ignored when the
|
||||
find_limits modifier is specified.
|
||||
find_limits or find_limits_noheap modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
|
@ -1419,8 +1420,12 @@ SUBJECT MODIFIERS
|
|||
calls the relevant matching function several times, setting different
|
||||
values in the match context via pcre2_set_heap_limit(),
|
||||
pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the
|
||||
minimum values for each parameter that allows the match to complete
|
||||
without error. If JIT is being used, only the match limit is relevant.
|
||||
smallest value for each parameter that allows the match to complete
|
||||
without a "limit exceeded" error. The match itself may succeed or fail.
|
||||
An alternative modifier, find_limits_noheap, omits the heap limit. This
|
||||
is used in the standard tests, because the minimum heap limit varies
|
||||
between systems. If JIT is being used, only the match limit is rele-
|
||||
vant, and the other two are automatically omitted.
|
||||
|
||||
When using this modifier, the pattern should not contain any limit set-
|
||||
tings such as (*LIMIT_MATCH=...) within it. If such a setting is
|
||||
|
@ -1446,9 +1451,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
For both kinds of matching, the heap_limit number, which is in
|
||||
kibibytes (units of 1024 bytes), limits the amount of heap memory used
|
||||
for matching. A value of zero disables the use of any heap memory; many
|
||||
simple pattern matches can be done without using the heap, so zero is
|
||||
not an unreasonable setting.
|
||||
for matching.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
@ -1463,13 +1466,11 @@ SUBJECT MODIFIERS
|
|||
|
||||
The memory modifier causes pcre2test to log the sizes of all heap mem-
|
||||
ory allocation and freeing calls that occur during a call to
|
||||
pcre2_match() or pcre2_dfa_match(). These occur only when a match re-
|
||||
quires a bigger vector than the default for remembering backtracking
|
||||
points (pcre2_match()) or for internal workspace (pcre2_dfa_match()).
|
||||
In many cases there will be no heap memory used and therefore no addi-
|
||||
tional output. No heap memory is allocated during matching with JIT, so
|
||||
in that case the memory modifier never has any effect. For this modi-
|
||||
fier to work, the null_context modifier must not be set on both the
|
||||
pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is
|
||||
used only when a match requires more internal workspace than the de-
|
||||
fault allocation on the stack, so in many cases there will be no out-
|
||||
put. No heap memory is allocated during matching with JIT. For this
|
||||
modifier to work, the null_context modifier must not be set on both the
|
||||
pattern and the subject, though it can be set on one or the other.
|
||||
|
||||
Setting a starting offset
|
||||
|
@ -1518,7 +1519,8 @@ SUBJECT MODIFIERS
|
|||
null_context modifier is set, however, NULL is passed. This is for
|
||||
testing that the matching and substitution functions behave correctly
|
||||
in this case (they use default values). This modifier cannot be used
|
||||
with the find_limits or substitute_callout modifiers.
|
||||
with the find_limits, find_limits_noheap, or substitute_callout modi-
|
||||
fiers.
|
||||
|
||||
Similarly, for testing purposes, if the null_subject or null_replace-
|
||||
ment modifier is set, the subject or replacement string pointers are
|
||||
|
@ -1949,5 +1951,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 12 January 2022
|
||||
Last updated: 27 July 2022
|
||||
Copyright (c) 1997-2022 University of Cambridge.
|
||||
|
|
|
@ -205,9 +205,6 @@ point. */
|
|||
* Global variables *
|
||||
*************************************************/
|
||||
|
||||
/* Jeffrey Friedl has some debugging requirements that are not part of the
|
||||
regular code. */
|
||||
|
||||
static const char *colour_string = "1;31";
|
||||
static const char *colour_option = NULL;
|
||||
static const char *dee_option = NULL;
|
||||
|
@ -220,6 +217,10 @@ static const char *output_text = NULL;
|
|||
|
||||
static char *main_buffer = NULL;
|
||||
|
||||
static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */
|
||||
static int printname_colon = ':'; /* Changed to 0 for -Z */
|
||||
static int printname_hyphen = '-'; /* Changed to 0 for -Z */
|
||||
|
||||
static int after_context = 0;
|
||||
static int before_context = 0;
|
||||
static int binary_files = BIN_BINARY;
|
||||
|
@ -483,6 +484,7 @@ static option_item optionlist[] = {
|
|||
{ OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
|
||||
{ OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
|
||||
{ OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
|
||||
{ OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" },
|
||||
{ OP_NODATA, 0, NULL, NULL, NULL }
|
||||
};
|
||||
|
||||
|
@ -1773,7 +1775,7 @@ if (after_context > 0 && lastmatchnumber > 0)
|
|||
{
|
||||
char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
|
||||
if (ellength == 0 && pp == main_buffer + bufsize) break;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
lastmatchrestart = pp;
|
||||
|
@ -2730,7 +2732,9 @@ while (ptr < endptr)
|
|||
|
||||
else if (filenames == FN_MATCH_ONLY)
|
||||
{
|
||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
||||
fprintf(stdout, "%s", printname);
|
||||
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||
else fprintf(stdout, "%s", printname_nl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2749,7 +2753,8 @@ while (ptr < endptr)
|
|||
{
|
||||
PCRE2_SIZE oldstartoffset;
|
||||
|
||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_colon);
|
||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||
|
||||
/* Handle --line-offsets */
|
||||
|
@ -2871,7 +2876,8 @@ while (ptr < endptr)
|
|||
while (lastmatchrestart < p)
|
||||
{
|
||||
char *pp = lastmatchrestart;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
||||
|
@ -2912,7 +2918,8 @@ while (ptr < endptr)
|
|||
{
|
||||
int ellength;
|
||||
char *pp = p;
|
||||
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_hyphen);
|
||||
if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
|
||||
pp = end_of_line(pp, endptr, &ellength);
|
||||
FWRITE_IGNORE(p, 1, pp - p, stdout);
|
||||
|
@ -2926,7 +2933,8 @@ while (ptr < endptr)
|
|||
if (after_context > 0 || before_context > 0)
|
||||
endhyphenpending = TRUE;
|
||||
|
||||
if (printname != NULL) fprintf(stdout, "%s:", printname);
|
||||
if (printname != NULL) fprintf(stdout, "%s%c", printname,
|
||||
printname_colon);
|
||||
if (number) fprintf(stdout, "%lu:", linenumber);
|
||||
|
||||
/* In multiline mode, or if colouring, we have to split the line(s) up
|
||||
|
@ -3131,7 +3139,9 @@ were none. If we found a match, we won't have got this far. */
|
|||
|
||||
if (filenames == FN_NOMATCH_ONLY)
|
||||
{
|
||||
fprintf(stdout, "%s" STDOUT_NL, printname);
|
||||
fprintf(stdout, "%s", printname);
|
||||
if (printname_nl == NULL) fprintf(stdout, "%c", 0);
|
||||
else fprintf(stdout, "%s", printname_nl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -3142,7 +3152,7 @@ if (count_only && !quiet)
|
|||
if (count > 0 || !omit_zero_count)
|
||||
{
|
||||
if (printname != NULL && filenames != FN_NONE)
|
||||
fprintf(stdout, "%s:", printname);
|
||||
fprintf(stdout, "%s%c", printname, printname_colon);
|
||||
fprintf(stdout, "%lu" STDOUT_NL, count);
|
||||
counts_printed++;
|
||||
}
|
||||
|
@ -3528,8 +3538,6 @@ switch(letter)
|
|||
case 'u': options |= PCRE2_UTF; utf = TRUE; break;
|
||||
case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break;
|
||||
case 'v': invert = TRUE; break;
|
||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||
|
||||
case 'V':
|
||||
{
|
||||
|
@ -3540,6 +3548,10 @@ switch(letter)
|
|||
pcre2grep_exit(0);
|
||||
break;
|
||||
|
||||
case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
|
||||
case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
|
||||
case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
|
||||
|
||||
default:
|
||||
fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
|
||||
pcre2grep_exit(usage(2));
|
||||
|
@ -4259,8 +4271,6 @@ if (DEE_option != NULL)
|
|||
|
||||
(void)pcre2_set_compile_extra_options(compile_context, extra_options);
|
||||
|
||||
/* Check the values for Jeffrey Friedl's debugging options. */
|
||||
|
||||
/* If use_jit is set, check whether JIT is available. If not, do not try
|
||||
to use JIT. */
|
||||
|
||||
|
|
|
@ -991,3 +991,22 @@ RC=0
|
|||
---------------------------- Test 134 -----------------------------
|
||||
=AB3CD5=
|
||||
RC=0
|
||||
---------------------------- Test 135 -----------------------------
|
||||
./testdata/grepinputv@The word is cat in this line
|
||||
RC=0
|
||||
./testdata/grepinputv@./testdata/grepinputv@RC=0
|
||||
./testdata/grepinputv@This line contains \E and (regex) *meta* [characters].
|
||||
./testdata/grepinputv@The word is cat in this line
|
||||
./testdata/grepinputv@The caterpillar sat on the mat
|
||||
RC=0
|
||||
testdata/grepinputM |