pcre2grep update: -m and $x{..}, $o{..} escapes. Also some doc updates.

This commit is contained in:
Philip.Hazel 2020-10-04 16:34:31 +00:00
parent 3bdc76e4f3
commit 81da2b97e3
15 changed files with 1522 additions and 1270 deletions

View File

@ -76,6 +76,16 @@ the subject \xe5A. Fixes Bugzilla #2642.
14. Fixed a bug in character set matching when JIT is enabled and both unicode 14. Fixed a bug in character set matching when JIT is enabled and both unicode
scripts and unicode classes are present at the same time. scripts and unicode classes are present at the same time.
15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
16. Refactored substitution processing in pcre2grep strings, both for the -O
option and when dealing with callouts. There is now a single function that
handles $ expansion in all cases (instead of multiple copies of almost
identical code). This means that the same escape sequences are available
everywhere, which was not previously the case. At the same time, the escape
sequences $x{...} and $o{...} have been introduced, to allow for characters
whose code points are greater than 255 in Unicode mode.
Version 10.35 09-May-2020 Version 10.35 09-May-2020
--------------------------- ---------------------------

6
README
View File

@ -892,6 +892,6 @@ The distribution should contain the files listed below.
) environments ) environments
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: Philip.Hazel
Email domain: cam.ac.uk Email domain: gmail.com
Last updated: 20 March 2020 Last updated: 22 September 2020

View File

@ -661,6 +661,26 @@ echo "---------------------------- Test 128 -----------------------------" >>tes
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1 (cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <testdata/grepinput >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
# Now compare the results. # Now compare the results.
$cf $srcdir/testdata/grepoutput testtrygrep $cf $srcdir/testdata/grepoutput testtrygrep
@ -694,6 +714,10 @@ if [ $utf8 -ne 0 ] ; then
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep (cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
echo "RC=$?" >>testtrygrep echo "RC=$?" >>testtrygrep
echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
echo "RC=$?" >>testtrygrep
$cf $srcdir/testdata/grepoutput8 testtrygrep $cf $srcdir/testdata/grepoutput8 testtrygrep
if [ $? != 0 ] ; then exit 1; fi if [ $? != 0 ] ; then exit 1; fi
@ -764,6 +788,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep $valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep $valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep $valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
$valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
$cf $srcdir/testdata/grepoutputCN testtrygrep $cf $srcdir/testdata/grepoutputCN testtrygrep

View File

@ -892,6 +892,6 @@ The distribution should contain the files listed below.
) environments ) environments
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: Philip.Hazel
Email domain: cam.ac.uk Email domain: gmail.com
Last updated: 20 March 2020 Last updated: 22 September 2020

View File

@ -111,8 +111,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
(either shown literally, or as an offset), scanning resumes immediately (either shown literally, or as an offset), scanning resumes immediately
following the match, so that further matches on the same line can be found. If following the match, so that further matches on the same line can be found. If
there are multiple patterns, they are all tried on the remainder of the line, there are multiple patterns, they are all tried on the remainder of the line,
but patterns that follow the one that matched are not tried on the earlier part but patterns that follow the one that matched are not tried on the earlier
of the line. matched part of the line.
</P> </P>
<P> <P>
This behaviour means that the order in which multiple patterns are specified This behaviour means that the order in which multiple patterns are specified
@ -146,11 +146,10 @@ ignored.
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br> <br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
<P> <P>
By default, a file that contains a binary zero byte within the first 1024 bytes By default, a file that contains a binary zero byte within the first 1024 bytes
is identified as a binary file, and is processed specially. (GNU grep is identified as a binary file, and is processed specially. However, if the
identifies binary files in this manner.) However, if the newline type is newline type is specified as NUL, that is, the line terminator is a binary
specified as NUL, that is, the line terminator is a binary zero, the test for zero, the test for a binary file is not applied. See the <b>--binary-files</b>
a binary file is not applied. See the <b>--binary-files</b> option for a means option for a means of changing the way binary files are handled.
of changing the way binary files are handled.
</P> </P>
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br> <br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
<P> <P>
@ -443,8 +442,8 @@ Ignore upper/lower case distinctions during comparisons.
<P> <P>
<b>--include</b>=<i>pattern</i> <b>--include</b>=<i>pattern</i>
If any <b>--include</b> patterns are specified, the only files that are If any <b>--include</b> patterns are specified, the only files that are
processed are those that match one of the patterns (and do not match an processed are those whose names match one of the patterns and do not match an
<b>--exclude</b> pattern). This option does not affect directories, but it <b>--exclude</b> pattern. This option does not affect directories, but it
applies to all files, whether listed on the command line, obtained from applies to all files, whether listed on the command line, obtained from
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular <b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
expression, and is matched against the final component of the file name, not expression, and is matched against the final component of the file name, not
@ -463,8 +462,8 @@ may be given any number of times; all the files are read.
<P> <P>
<b>--include-dir</b>=<i>pattern</i> <b>--include-dir</b>=<i>pattern</i>
If any <b>--include-dir</b> patterns are specified, the only directories that If any <b>--include-dir</b> patterns are specified, the only directories that
are processed are those that match one of the patterns (and do not match an are processed are those whose names match one of the patterns and do not match
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
on the command line, obtained from <b>--file-list</b>, or by scanning a parent on the command line, obtained from <b>--file-list</b>, or by scanning a parent
directory. The pattern is a PCRE2 regular expression, and is matched against directory. The pattern is a PCRE2 regular expression, and is matched against
the final component of the directory name, not the entire path. The <b>-F</b>, the final component of the directory name, not the entire path. The <b>-F</b>,
@ -487,8 +486,9 @@ a separate line. Searching normally stops as soon as a matching line is found
in a file. However, if the <b>-c</b> (count) option is also used, matching in a file. However, if the <b>-c</b> (count) option is also used, matching
continues in order to obtain the correct count, and those files that have at continues in order to obtain the correct count, and those files that have at
least one match are listed along with their counts. Using this option with least one match are listed along with their counts. Using this option with
<b>-c</b> is a way of suppressing the listing of files with no matches. This <b>-c</b> is a way of suppressing the listing of files with no matches that
option overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options. occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
<b>-h</b>, or <b>-L</b> options.
</P> </P>
<P> <P>
<b>--label</b>=<i>name</i> <b>--label</b>=<i>name</i>
@ -501,8 +501,8 @@ short form for this option.
When this option is given, non-compressed input is read and processed line by When this option is given, non-compressed input is read and processed line by
line, and the output is flushed after each write. By default, input is read in line, and the output is flushed after each write. By default, input is read in
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
terminal (which is currently possible only in Unix-like environments or terminal, which is currently possible only in Unix-like environments or
Windows). Output to terminal is normally automatically flushed by the operating Windows. Output to terminal is normally automatically flushed by the operating
system. This option can be useful when the input or output is attached to a system. This option can be useful when the input or output is attached to a
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data. pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
However, its use will affect performance, and the <b>-M</b> (multiline) option However, its use will affect performance, and the <b>-M</b> (multiline) option
@ -528,6 +528,49 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
used. There is no short form for this option. used. There is no short form for this option.
</P> </P>
<P> <P>
<b>-M</b>, <b>--multiline</b>
Allow patterns to match more than one line. When this option is set, the PCRE2
library is called in "multiline" mode. This allows a matched string to extend
past the end of a line and continue on one or more subsequent lines. Patterns
used with <b>-M</b> may usefully contain literal newline characters and internal
occurrences of ^ and $ characters. The output for a successful match may
consist of more than one line. The first line is the line in which the match
started, and the last line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the end of that line.
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
match has been handled, scanning restarts at the beginning of the line after
the one in which the match ended.
<br>
<br>
The newline sequence that separates multiple lines must be matched as part of
the pattern. For example, to find the phrase "regular expression" in a file
where "regular" might be at the end of a line and "expression" at the start of
the next line, you could use this command:
<pre>
pcre2grep -M 'regular\s+expression' &#60;file&#62;
</pre>
The \s escape sequence matches any white space character, including newlines,
and is followed by + so as to match trailing white space on the first line as
well as possibly handling a two-character newline sequence.
<br>
<br>
There is a limit to the number of lines that can be matched, imposed by the way
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
large processing buffer, this should not be a problem, but the <b>-M</b> option
does not work when input is read line by line (see <b>--line-buffered</b>).
</P>
<P>
<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
Stop processing after finding <i>number</i> matching lines, or non-matching
lines if <b>-v</b> is also set. Any trailing context lines are output after the
final match. In multiline mode, each multiline match counts as just one line
for this purpose. If this limit is reached when reading the standard input from
a regular file, the file is left positioned just after the last matching line.
If <b>-c</b> is also set, the count that is output is never greater than
<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
<b>-q</b>, or when just checking for a match in a binary file.
</P>
<P>
<b>--match-limit</b>=<i>number</i> <b>--match-limit</b>=<i>number</i>
Processing some regular expression patterns may take a very long time to search Processing some regular expression patterns may take a very long time to search
for all possible matching strings. Others may require a very large amount of for all possible matching strings. Others may require a very large amount of
@ -568,38 +611,6 @@ set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
smaller than the starting buffer size. smaller than the starting buffer size.
</P> </P>
<P> <P>
<b>-M</b>, <b>--multiline</b>
Allow patterns to match more than one line. When this option is set, the PCRE2
library is called in "multiline" mode. This allows a matched string to extend
past the end of a line and continue on one or more subsequent lines. Patterns
used with <b>-M</b> may usefully contain literal newline characters and internal
occurrences of ^ and $ characters. The output for a successful match may
consist of more than one line. The first line is the line in which the match
started, and the last line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the end of that line.
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
match has been handled, scanning restarts at the beginning of the line after
the one in which the match ended.
<br>
<br>
The newline sequence that separates multiple lines must be matched as part of
the pattern. For example, to find the phrase "regular expression" in a file
where "regular" might be at the end of a line and "expression" at the start of
the next line, you could use this command:
<pre>
pcre2grep -M 'regular\s+expression' &#60;file&#62;
</pre>
The \s escape sequence matches any white space character, including newlines,
and is followed by + so as to match trailing white space on the first line as
well as possibly handling a two-character newline sequence.
<br>
<br>
There is a limit to the number of lines that can be matched, imposed by the way
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
large processing buffer, this should not be a problem, but the <b>-M</b> option
does not work when input is read line by line (see <b>--line-buffered</b>.)
</P>
<P>
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i> <b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
Six different conventions for indicating the ends of lines in scanned files are Six different conventions for indicating the ends of lines in scanned files are
supported. For example: supported. For example:
@ -648,31 +659,41 @@ It should never be needed in normal use.
</P> </P>
<P> <P>
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i> <b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
When there is a match, instead of outputting the whole line that matched, When there is a match, instead of outputting the line that matched, output just
output just the given text, followed by an operating-system standard newline. the text specified in this option, followed by an operating-system standard
The <b>--newline</b> option has no effect on this option, which is mutually newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
exclusive with <b>--only-matching</b>, <b>--file-offsets</b>, and and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
<b>--line-offsets</b>. Escape sequences starting with a dollar character may be this option, which is mutually exclusive with <b>--only-matching</b>,
used to insert the contents of the matched part of the line and/or captured <b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
substrings into the text. <b>--only-matching</b>, if there is more than one match in a line, each of them
causes a line of output.
<br> <br>
<br> <br>
$&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the captured Escape sequences starting with a dollar character may be used to insert the
substring of the given decimal number; zero substitutes the whole match. If contents of the matched part of the line and/or captured substrings into the
the number is greater than the number of capturing substrings, or if the text.
capture is unset, the replacement is empty. <br>
<br>
$&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the captured substring of the given
decimal number; zero substitutes the whole match. If the number is greater than
the number of capturing substrings, or if the capture is unset, the replacement
is empty.
<br> <br>
<br> <br>
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
newline; $r by carriage return; $t by tab; $v by vertical tab. newline; $r by carriage return; $t by tab; $v by vertical tab.
<br> <br>
<br> <br>
$o&#60;digits&#62; is replaced by the character represented by the given octal $o&#60;digits&#62; or $o{&#60;digits&#62;} is replaced by the character whose code point is the
number; up to three digits are processed. given octal number. In the first form, up to three octal digits are processed.
When more digits are needed in Unicode mode to specify a wide character, the
second form must be used.
<br> <br>
<br> <br>
$x&#60;digits&#62; is replaced by the character represented by the given hexadecimal $x&#60;digits&#62; or $x{&#60;digits&#62;} is replaced by the character represented by the
number; up to two digits are processed. given hexadecimal number. In the first form, up to two hexadecimal digits are
processed. When more digits are needed in Unicode mode to specify a wide
character, the second form must be used.
<br> <br>
<br> <br>
Any other character is substituted by itself. In particular, $$ is replaced by Any other character is substituted by itself. In particular, $$ is replaced by
@ -741,7 +762,8 @@ option to "recurse".
</P> </P>
<P> <P>
<b>--recursion-limit</b>=<i>number</i> <b>--recursion-limit</b>=<i>number</i>
See <b>--match-limit</b> above. This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
above for details.
</P> </P>
<P> <P>
<b>-s</b>, <b>--no-messages</b> <b>-s</b>, <b>--no-messages</b>
@ -765,15 +787,18 @@ total would always be zero.
<b>-u</b>, <b>--utf</b> <b>-u</b>, <b>--utf</b>
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
<b>--include</b> options) and all subject lines that are scanned must be valid <b>--include</b> options) and all lines that are scanned must be valid strings
strings of UTF-8 characters. of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
occurs.
</P> </P>
<P> <P>
<b>-U</b>, <b>--utf-allow-invalid</b> <b>-U</b>, <b>--utf-allow-invalid</b>
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
unit sequences. These can never form part of any pattern match. This facility unit sequences. These can never form part of any pattern match. Patterns
allows valid UTF-8 strings to be sought in executable or other binary files. themselves, however, must still be valid UTF-8 strings. This facility allows
For more details about matching in non-valid UTF-8 strings, see the valid UTF-8 strings to be sought within arbitrary byte sequences in executable
or other binary files. For more details about matching in non-valid UTF-8
strings, see the
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a> <a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
documentation. documentation.
</P> </P>
@ -786,7 +811,9 @@ ignored.
<P> <P>
<b>-v</b>, <b>--invert-match</b> <b>-v</b>, <b>--invert-match</b>
Invert the sense of the match, so that lines which do <i>not</i> match any of Invert the sense of the match, so that lines which do <i>not</i> match any of
the patterns are the ones that are found. the patterns are the ones that are found. When this option is set, options such
as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
that are to be output, are ignored.
</P> </P>
<P> <P>
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b> <b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
@ -909,12 +936,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
only callouts with string arguments are useful. only callouts with string arguments are useful.
</P> </P>
<br><b> <br><b>
Echoing a specific string
</b><br>
<P>
Starting the callout string with a pipe character invokes an echoing facility
that avoids calling an external program or script. This facility is always
available, provided that callouts were not completely disabled when
<b>pcre2grep</b> was built. The rest of the callout string is processed as a
zero-terminated string, which means it should not contain any internal binary
zeros. It is written to the output, having first been passed through the same
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
above). However, $0 cannot be used to insert a matched substring because the
match is still in progress. Instead, the single character '0' is inserted. Any
syntax errors in the string (for example, a dollar not followed by another
character) cause the callout to be ignored. No terminator is added to the
output string, so if you want a newline, you must include it explicitly using
the escape $n. For example:
<pre>
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' &#60;some file&#62;
</pre>
Matching continues normally after the string is output. If you want to see only
the callout output but not any output from an actual match, you should end the
pattern with (*FAIL).
</P>
<br><b>
Calling external programs or scripts Calling external programs or scripts
</b><br> </b><br>
<P> <P>
This facility can be independently disabled when <b>pcre2grep</b> is built. It This facility can be independently disabled when <b>pcre2grep</b> is built. It
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS, is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where where <b>lib$spawn()</b> is used, and for any Unix-like environment where
<b>fork()</b> and <b>execv()</b> are available. <b>fork()</b> and <b>execv()</b> are available.
</P> </P>
<P> <P>
@ -926,14 +977,11 @@ arguments:
executable_name|arg1|arg2|... executable_name|arg1|arg2|...
</pre> </pre>
Any substring (including the executable name) may contain escape sequences Any substring (including the executable name) may contain escape sequences
started by a dollar character: $&#60;digits&#62; or ${&#60;digits&#62;} is replaced by the started by a dollar character. These are the same as for the <b>--output</b>
captured substring of the given decimal number, which must be greater than (<b>-O</b>) option documented above, except that $0 cannot insert the matched
zero. If the number is greater than the number of capturing substrings, or if string because the match is still in progress. Instead, the character '0'
the capture is unset, the replacement is empty. is inserted. If you need a literal dollar or pipe character in any
</P> substring, use $$ or $| respectively. Here is an example:
<P>
Any other character is substituted by itself. In particular, $$ is replaced by
a single dollar and $| is replaced by a pipe character. Here is an example:
<pre> <pre>
echo -e "abcde\n12345" | pcre2grep \ echo -e "abcde\n12345" | pcre2grep \
'(?x)(.)(..(.)) '(?x)(.)(..(.))
@ -946,28 +994,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
Arg1: [1] [234] [4] Arg2: |1| () Arg1: [1] [234] [4] Arg2: |1| ()
12345 12345
</pre> </pre>
The parameters for the system call that is used to run the The parameters for the system call that is used to run the program or script
program or script are zero-terminated strings. This means that binary zero are zero-terminated strings. This means that binary zero characters in the
characters in the callout argument will cause premature termination of their callout argument will cause premature termination of their substrings, and
substrings, and therefore should not be present. Any syntax errors in the therefore should not be present. Any syntax errors in the string (for example,
string (for example, a dollar not followed by another character) cause the a dollar not followed by another character) cause the callout to be ignored.
callout to be ignored. If running the program fails for any reason (including If running the program fails for any reason (including the non-existence of the
the non-existence of the executable), a local matching failure occurs and the executable), a local matching failure occurs and the matcher backtracks in the
matcher backtracks in the normal way. normal way.
</P>
<br><b>
Echoing a specific string
</b><br>
<P>
This facility is always available, provided that callouts were not completely
disabled when <b>pcre2grep</b> was built. If the callout string starts with a
pipe (vertical bar) character, the rest of the string is written to the output,
having been passed through the same escape processing as text from the --output
option. This provides a simple echoing facility that avoids calling an external
program or script. No terminator is added to the string, so if you want a
newline, you must include it explicitly. Matching continues normally after the
string is output. If you want to see only the callout output but not any output
from an actual match, you should end the relevant pattern with (*FAIL).
</P> </P>
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br> <br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
<P> <P>
@ -999,7 +1033,8 @@ because VMS does not distinguish between exit(0) and exit(1).
</P> </P>
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br> <br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
<P> <P>
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3). <b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
<b>pcre2unicode</b>(3).
</P> </P>
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br> <br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
<P> <P>
@ -1012,7 +1047,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC16" href="#TOC1">REVISION</a><br> <br><a name="SEC16" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 25 January 2020 Last updated: 04 October 2020
<br> <br>
Copyright &copy; 1997-2020 University of Cambridge. Copyright &copy; 1997-2020 University of Cambridge.
<br> <br>

View File

@ -323,7 +323,7 @@ test data, command lines that begin with # may appear. This file format, with
some restrictions, can also be processed by the <b>perltest.sh</b> script that some restrictions, can also be processed by the <b>perltest.sh</b> script that
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2 is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
and Perl is the same. For a specification of <b>perltest.sh</b>, see the and Perl is the same. For a specification of <b>perltest.sh</b>, see the
comments near its beginning. comments near its beginning. See also the #perltest command below.
</P> </P>
<P> <P>
When the input is a terminal, <b>pcre2test</b> prompts for each line of input, When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
@ -420,14 +420,20 @@ patterns. Modifiers on a pattern can change these settings.
<pre> <pre>
#perltest #perltest
</pre> </pre>
The appearance of this line causes all subsequent modifier settings to be This line is used in test files that can also be processed by <b>perltest.sh</b>
checked for compatibility with the <b>perltest.sh</b> script, which is used to to confirm that Perl gives the same results as PCRE2. Subsequent tests are
confirm that Perl gives the same results as PCRE2. Also, apart from comment checked for the use of <b>pcre2test</b> features that are incompatible with the
lines, #pattern commands, and #subject commands that set or unset "mark", no <b>perltest.sh</b> script.
command lines are permitted, because they and many of the modifiers are </P>
specific to <b>pcre2test</b>, and should not be used in test files that are also <P>
processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests Patterns must use '/' as their delimiter, and only certain modifiers are
that are accidentally put in the wrong file. supported. Comment lines, #pattern commands, and #subject commands that set or
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
#newline_default commands, which are needed in the relevant pcre2test files,
are silently ignored. All other command lines are ignored, but give a warning
message. The <b>#perltest</b> command helps detect tests that are accidentally
put in the wrong file or use the wrong delimiter. For more details of the
<b>perltest.sh</b> script see the comments it contains.
<pre> <pre>
#pop [&#60;modifiers&#62;] #pop [&#60;modifiers&#62;]
#popcopy [&#60;modifiers&#62;] #popcopy [&#60;modifiers&#62;]
@ -2113,7 +2119,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br> <br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 20 March 2020 Last updated: 14 September 2020
<br> <br>
Copyright &copy; 1997-2020 University of Cambridge. Copyright &copy; 1997-2020 University of Cambridge.
<br> <br>

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "25 January 2020" "PCRE2 10.35" .TH PCRE2GREP 1 "04 October 2020" "PCRE2 10.36"
.SH NAME .SH NAME
pcre2grep - a grep with Perl-compatible regular expressions. pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -79,8 +79,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
(either shown literally, or as an offset), scanning resumes immediately (either shown literally, or as an offset), scanning resumes immediately
following the match, so that further matches on the same line can be found. If following the match, so that further matches on the same line can be found. If
there are multiple patterns, they are all tried on the remainder of the line, there are multiple patterns, they are all tried on the remainder of the line,
but patterns that follow the one that matched are not tried on the earlier part but patterns that follow the one that matched are not tried on the earlier
of the line. matched part of the line.
.P .P
This behaviour means that the order in which multiple patterns are specified This behaviour means that the order in which multiple patterns are specified
can affect the output when one of the above options is used. This is no longer can affect the output when one of the above options is used. This is no longer
@ -115,11 +115,10 @@ ignored.
.rs .rs
.sp .sp
By default, a file that contains a binary zero byte within the first 1024 bytes By default, a file that contains a binary zero byte within the first 1024 bytes
is identified as a binary file, and is processed specially. (GNU grep is identified as a binary file, and is processed specially. However, if the
identifies binary files in this manner.) However, if the newline type is newline type is specified as NUL, that is, the line terminator is a binary
specified as NUL, that is, the line terminator is a binary zero, the test for zero, the test for a binary file is not applied. See the \fB--binary-files\fP
a binary file is not applied. See the \fB--binary-files\fP option for a means option for a means of changing the way binary files are handled.
of changing the way binary files are handled.
. .
. .
.SH "BINARY ZEROS IN PATTERNS" .SH "BINARY ZEROS IN PATTERNS"
@ -383,8 +382,8 @@ Ignore upper/lower case distinctions during comparisons.
.TP .TP
\fB--include\fP=\fIpattern\fP \fB--include\fP=\fIpattern\fP
If any \fB--include\fP patterns are specified, the only files that are If any \fB--include\fP patterns are specified, the only files that are
processed are those that match one of the patterns (and do not match an processed are those whose names match one of the patterns and do not match an
\fB--exclude\fP pattern). This option does not affect directories, but it \fB--exclude\fP pattern. This option does not affect directories, but it
applies to all files, whether listed on the command line, obtained from applies to all files, whether listed on the command line, obtained from
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular \fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
expression, and is matched against the final component of the file name, not expression, and is matched against the final component of the file name, not
@ -401,8 +400,8 @@ may be given any number of times; all the files are read.
.TP .TP
\fB--include-dir\fP=\fIpattern\fP \fB--include-dir\fP=\fIpattern\fP
If any \fB--include-dir\fP patterns are specified, the only directories that If any \fB--include-dir\fP patterns are specified, the only directories that
are processed are those that match one of the patterns (and do not match an are processed are those whose names match one of the patterns and do not match
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
on the command line, obtained from \fB--file-list\fP, or by scanning a parent on the command line, obtained from \fB--file-list\fP, or by scanning a parent
directory. The pattern is a PCRE2 regular expression, and is matched against directory. The pattern is a PCRE2 regular expression, and is matched against
the final component of the directory name, not the entire path. The \fB-F\fP, the final component of the directory name, not the entire path. The \fB-F\fP,
@ -423,8 +422,9 @@ a separate line. Searching normally stops as soon as a matching line is found
in a file. However, if the \fB-c\fP (count) option is also used, matching in a file. However, if the \fB-c\fP (count) option is also used, matching
continues in order to obtain the correct count, and those files that have at continues in order to obtain the correct count, and those files that have at
least one match are listed along with their counts. Using this option with least one match are listed along with their counts. Using this option with
\fB-c\fP is a way of suppressing the listing of files with no matches. This \fB-c\fP is a way of suppressing the listing of files with no matches that
opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options. occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
\fB-h\fP, or \fB-L\fP options.
.TP .TP
\fB--label\fP=\fIname\fP \fB--label\fP=\fIname\fP
This option supplies a name to be used for the standard input when file names This option supplies a name to be used for the standard input when file names
@ -435,8 +435,8 @@ short form for this option.
When this option is given, non-compressed input is read and processed line by When this option is given, non-compressed input is read and processed line by
line, and the output is flushed after each write. By default, input is read in line, and the output is flushed after each write. By default, input is read in
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
terminal (which is currently possible only in Unix-like environments or terminal, which is currently possible only in Unix-like environments or
Windows). Output to terminal is normally automatically flushed by the operating Windows. Output to terminal is normally automatically flushed by the operating
system. This option can be useful when the input or output is attached to a system. This option can be useful when the input or output is attached to a
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data. pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
However, its use will affect performance, and the \fB-M\fP (multiline) option However, its use will affect performance, and the \fB-M\fP (multiline) option
@ -459,6 +459,45 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
locale is specified, the PCRE2 library's default (usually the "C" locale) is locale is specified, the PCRE2 library's default (usually the "C" locale) is
used. There is no short form for this option. used. There is no short form for this option.
.TP .TP
\fB-M\fP, \fB--multiline\fP
Allow patterns to match more than one line. When this option is set, the PCRE2
library is called in "multiline" mode. This allows a matched string to extend
past the end of a line and continue on one or more subsequent lines. Patterns
used with \fB-M\fP may usefully contain literal newline characters and internal
occurrences of ^ and $ characters. The output for a successful match may
consist of more than one line. The first line is the line in which the match
started, and the last line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the end of that line.
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
match has been handled, scanning restarts at the beginning of the line after
the one in which the match ended.
.sp
The newline sequence that separates multiple lines must be matched as part of
the pattern. For example, to find the phrase "regular expression" in a file
where "regular" might be at the end of a line and "expression" at the start of
the next line, you could use this command:
.sp
pcre2grep -M 'regular\es+expression' <file>
.sp
The \es escape sequence matches any white space character, including newlines,
and is followed by + so as to match trailing white space on the first line as
well as possibly handling a two-character newline sequence.
.sp
There is a limit to the number of lines that can be matched, imposed by the way
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
large processing buffer, this should not be a problem, but the \fB-M\fP option
does not work when input is read line by line (see \fB--line-buffered\fP).
.TP
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
Stop processing after finding \fInumber\fP matching lines, or non-matching
lines if \fB-v\fP is also set. Any trailing context lines are output after the
final match. In multiline mode, each multiline match counts as just one line
for this purpose. If this limit is reached when reading the standard input from
a regular file, the file is left positioned just after the last matching line.
If \fB-c\fP is also set, the count that is output is never greater than
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
\fB-q\fP, or when just checking for a match in a binary file.
.TP
\fB--match-limit\fP=\fInumber\fP \fB--match-limit\fP=\fInumber\fP
Processing some regular expression patterns may take a very long time to search Processing some regular expression patterns may take a very long time to search
for all possible matching strings. Others may require a very large amount of for all possible matching strings. Others may require a very large amount of
@ -493,35 +532,6 @@ This limits the expansion of the processing buffer, whose initial size can be
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
smaller than the starting buffer size. smaller than the starting buffer size.
.TP .TP
\fB-M\fP, \fB--multiline\fP
Allow patterns to match more than one line. When this option is set, the PCRE2
library is called in "multiline" mode. This allows a matched string to extend
past the end of a line and continue on one or more subsequent lines. Patterns
used with \fB-M\fP may usefully contain literal newline characters and internal
occurrences of ^ and $ characters. The output for a successful match may
consist of more than one line. The first line is the line in which the match
started, and the last line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the end of that line.
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
match has been handled, scanning restarts at the beginning of the line after
the one in which the match ended.
.sp
The newline sequence that separates multiple lines must be matched as part of
the pattern. For example, to find the phrase "regular expression" in a file
where "regular" might be at the end of a line and "expression" at the start of
the next line, you could use this command:
.sp
pcre2grep -M 'regular\es+expression' <file>
.sp
The \es escape sequence matches any white space character, including newlines,
and is followed by + so as to match trailing white space on the first line as
well as possibly handling a two-character newline sequence.
.sp
There is a limit to the number of lines that can be matched, imposed by the way
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
large processing buffer, this should not be a problem, but the \fB-M\fP option
does not work when input is read line by line (see \fB--line-buffered\fP.)
.TP
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP \fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
Six different conventions for indicating the ends of lines in scanned files are Six different conventions for indicating the ends of lines in scanned files are
supported. For example: supported. For example:
@ -565,27 +575,36 @@ use of JIT at run time. It is provided for testing and working round problems.
It should never be needed in normal use. It should never be needed in normal use.
.TP .TP
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP \fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
When there is a match, instead of outputting the whole line that matched, When there is a match, instead of outputting the line that matched, output just
output just the given text, followed by an operating-system standard newline. the text specified in this option, followed by an operating-system standard
The \fB--newline\fP option has no effect on this option, which is mutually newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
\fB--line-offsets\fP. Escape sequences starting with a dollar character may be this option, which is mutually exclusive with \fB--only-matching\fP,
used to insert the contents of the matched part of the line and/or captured \fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
substrings into the text. \fB--only-matching\fP, if there is more than one match in a line, each of them
causes a line of output.
.sp .sp
$<digits> or ${<digits>} is replaced by the captured Escape sequences starting with a dollar character may be used to insert the
substring of the given decimal number; zero substitutes the whole match. If contents of the matched part of the line and/or captured substrings into the
the number is greater than the number of capturing substrings, or if the text.
capture is unset, the replacement is empty. .sp
$<digits> or ${<digits>} is replaced by the captured substring of the given
decimal number; zero substitutes the whole match. If the number is greater than
the number of capturing substrings, or if the capture is unset, the replacement
is empty.
.sp .sp
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by $a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
newline; $r by carriage return; $t by tab; $v by vertical tab. newline; $r by carriage return; $t by tab; $v by vertical tab.
.sp .sp
$o<digits> is replaced by the character represented by the given octal $o<digits> or $o{<digits>} is replaced by the character whose code point is the
number; up to three digits are processed. given octal number. In the first form, up to three octal digits are processed.
When more digits are needed in Unicode mode to specify a wide character, the
second form must be used.
.sp .sp
$x<digits> is replaced by the character represented by the given hexadecimal $x<digits> or $x{<digits>} is replaced by the character represented by the
number; up to two digits are processed. given hexadecimal number. In the first form, up to two hexadecimal digits are
processed. When more digits are needed in Unicode mode to specify a wide
character, the second form must be used.
.sp .sp
Any other character is substituted by itself. In particular, $$ is replaced by Any other character is substituted by itself. In particular, $$ is replaced by
a single dollar. a single dollar.
@ -644,7 +663,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
option to "recurse". option to "recurse".
.TP .TP
\fB--recursion-limit\fP=\fInumber\fP \fB--recursion-limit\fP=\fInumber\fP
See \fB--match-limit\fP above. This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
above for details.
.TP .TP
\fB-s\fP, \fB--no-messages\fP \fB-s\fP, \fB--no-messages\fP
Suppress error messages about non-existent or unreadable files. Such files are Suppress error messages about non-existent or unreadable files. Such files are
@ -665,14 +685,17 @@ total would always be zero.
\fB-u\fP, \fB--utf\fP \fB-u\fP, \fB--utf\fP
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
\fB--include\fP options) and all subject lines that are scanned must be valid \fB--include\fP options) and all lines that are scanned must be valid strings
strings of UTF-8 characters. of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
occurs.
.TP .TP
\fB-U\fP, \fB--utf-allow-invalid\fP \fB-U\fP, \fB--utf-allow-invalid\fP
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
unit sequences. These can never form part of any pattern match. This facility unit sequences. These can never form part of any pattern match. Patterns
allows valid UTF-8 strings to be sought in executable or other binary files. themselves, however, must still be valid UTF-8 strings. This facility allows
For more details about matching in non-valid UTF-8 strings, see the valid UTF-8 strings to be sought within arbitrary byte sequences in executable
or other binary files. For more details about matching in non-valid UTF-8
strings, see the
.\" HREF .\" HREF
\fBpcre2unicode\fP(3) \fBpcre2unicode\fP(3)
.\" .\"
@ -685,7 +708,9 @@ ignored.
.TP .TP
\fB-v\fP, \fB--invert-match\fP \fB-v\fP, \fB--invert-match\fP
Invert the sense of the match, so that lines which do \fInot\fP match any of Invert the sense of the match, so that lines which do \fInot\fP match any of
the patterns are the ones that are found. the patterns are the ones that are found. When this option is set, options such
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
that are to be output, are ignored.
.TP .TP
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP \fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
Force the patterns only to match "words". That is, there must be a word Force the patterns only to match "words". That is, there must be a word
@ -812,12 +837,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
only callouts with string arguments are useful. only callouts with string arguments are useful.
. .
. .
.SS "Echoing a specific string"
.rs
.sp
Starting the callout string with a pipe character invokes an echoing facility
that avoids calling an external program or script. This facility is always
available, provided that callouts were not completely disabled when
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
zero-terminated string, which means it should not contain any internal binary
zeros. It is written to the output, having first been passed through the same
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
above). However, $0 cannot be used to insert a matched substring because the
match is still in progress. Instead, the single character '0' is inserted. Any
syntax errors in the string (for example, a dollar not followed by another
character) cause the callout to be ignored. No terminator is added to the
output string, so if you want a newline, you must include it explicitly using
the escape $n. For example:
.sp
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
.sp
Matching continues normally after the string is output. If you want to see only
the callout output but not any output from an actual match, you should end the
pattern with (*FAIL).
.
.
.SS "Calling external programs or scripts" .SS "Calling external programs or scripts"
.rs .rs
.sp .sp
This facility can be independently disabled when \fBpcre2grep\fP is built. It This facility can be independently disabled when \fBpcre2grep\fP is built. It
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS, is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where where \fBlib$spawn()\fP is used, and for any Unix-like environment where
\fBfork()\fP and \fBexecv()\fP are available. \fBfork()\fP and \fBexecv()\fP are available.
.P .P
If the callout string does not start with a pipe (vertical bar) character, it If the callout string does not start with a pipe (vertical bar) character, it
@ -828,13 +877,11 @@ arguments:
executable_name|arg1|arg2|... executable_name|arg1|arg2|...
.sp .sp
Any substring (including the executable name) may contain escape sequences Any substring (including the executable name) may contain escape sequences
started by a dollar character: $<digits> or ${<digits>} is replaced by the started by a dollar character. These are the same as for the \fB--output\fP
captured substring of the given decimal number, which must be greater than (\fB-O\fP) option documented above, except that $0 cannot insert the matched
zero. If the number is greater than the number of capturing substrings, or if string because the match is still in progress. Instead, the character '0'
the capture is unset, the replacement is empty. is inserted. If you need a literal dollar or pipe character in any
.P substring, use $$ or $| respectively. Here is an example:
Any other character is substituted by itself. In particular, $$ is replaced by
a single dollar and $| is replaced by a pipe character. Here is an example:
.sp .sp
echo -e "abcde\en12345" | pcre2grep \e echo -e "abcde\en12345" | pcre2grep \e
'(?x)(.)(..(.)) '(?x)(.)(..(.))
@ -847,28 +894,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
Arg1: [1] [234] [4] Arg2: |1| () Arg1: [1] [234] [4] Arg2: |1| ()
12345 12345
.sp .sp
The parameters for the system call that is used to run the The parameters for the system call that is used to run the program or script
program or script are zero-terminated strings. This means that binary zero are zero-terminated strings. This means that binary zero characters in the
characters in the callout argument will cause premature termination of their callout argument will cause premature termination of their substrings, and
substrings, and therefore should not be present. Any syntax errors in the therefore should not be present. Any syntax errors in the string (for example,
string (for example, a dollar not followed by another character) cause the a dollar not followed by another character) cause the callout to be ignored.
callout to be ignored. If running the program fails for any reason (including If running the program fails for any reason (including the non-existence of the
the non-existence of the executable), a local matching failure occurs and the executable), a local matching failure occurs and the matcher backtracks in the
matcher backtracks in the normal way. normal way.
.
.
.SS "Echoing a specific string"
.rs
.sp
This facility is always available, provided that callouts were not completely
disabled when \fBpcre2grep\fP was built. If the callout string starts with a
pipe (vertical bar) character, the rest of the string is written to the output,
having been passed through the same escape processing as text from the --output
option. This provides a simple echoing facility that avoids calling an external
program or script. No terminator is added to the string, so if you want a
newline, you must include it explicitly. Matching continues normally after the
string is output. If you want to see only the callout output but not any output
from an actual match, you should end the relevant pattern with (*FAIL).
. .
. .
.SH "MATCHING ERRORS" .SH "MATCHING ERRORS"
@ -904,7 +937,8 @@ because VMS does not distinguish between exit(0) and exit(1).
.SH "SEE ALSO" .SH "SEE ALSO"
.rs .rs
.sp .sp
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3). \fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
\fBpcre2unicode\fP(3).
. .
. .
.SH AUTHOR .SH AUTHOR
@ -921,6 +955,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 25 January 2020 Last updated: 04 October 2020
Copyright (c) 1997-2020 University of Cambridge. Copyright (c) 1997-2020 University of Cambridge.
.fi .fi

View File

@ -80,7 +80,7 @@ DESCRIPTION
following the match, so that further matches on the same line can be following the match, so that further matches on the same line can be
found. If there are multiple patterns, they are all tried on the re- found. If there are multiple patterns, they are all tried on the re-
mainder of the line, but patterns that follow the one that matched are mainder of the line, but patterns that follow the one that matched are
not tried on the earlier part of the line. not tried on the earlier matched part of the line.
This behaviour means that the order in which multiple patterns are This behaviour means that the order in which multiple patterns are
specified can affect the output when one of the above options is used. specified can affect the output when one of the above options is used.
@ -115,10 +115,10 @@ BINARY FILES
By default, a file that contains a binary zero byte within the first By default, a file that contains a binary zero byte within the first
1024 bytes is identified as a binary file, and is processed specially. 1024 bytes is identified as a binary file, and is processed specially.
(GNU grep identifies binary files in this manner.) However, if the new- However, if the newline type is specified as NUL, that is, the line
line type is specified as NUL, that is, the line terminator is a binary terminator is a binary zero, the test for a binary file is not applied.
zero, the test for a binary file is not applied. See the --binary-files See the --binary-files option for a means of changing the way binary
option for a means of changing the way binary files are handled. files are handled.
BINARY ZEROS IN PATTERNS BINARY ZEROS IN PATTERNS
@ -413,17 +413,17 @@ OPTIONS
--include=pattern --include=pattern
If any --include patterns are specified, the only files that If any --include patterns are specified, the only files that
are processed are those that match one of the patterns (and are processed are those whose names match one of the patterns
do not match an --exclude pattern). This option does not af- and do not match an --exclude pattern. This option does not
fect directories, but it applies to all files, whether listed affect directories, but it applies to all files, whether
on the command line, obtained from --file-list, or by scan- listed on the command line, obtained from --file-list, or by
ning a directory. The pattern is a PCRE2 regular expression, scanning a directory. The pattern is a PCRE2 regular expres-
and is matched against the final component of the file name, sion, and is matched against the final component of the file
not the entire path. The -F, -w, and -x options do not apply name, not the entire path. The -F, -w, and -x options do not
to this pattern. The option may be given any number of times. apply to this pattern. The option may be given any number of
If a file name matches both an --include and an --exclude times. If a file name matches both an --include and an --ex-
pattern, it is excluded. There is no short form for this op- clude pattern, it is excluded. There is no short form for
tion. this option.
--include-from=filename --include-from=filename
Treat each non-empty line of the file as the data for an Treat each non-empty line of the file as the data for an
@ -434,8 +434,8 @@ OPTIONS
--include-dir=pattern --include-dir=pattern
If any --include-dir patterns are specified, the only direc- If any --include-dir patterns are specified, the only direc-
tories that are processed are those that match one of the tories that are processed are those whose names match one of
patterns (and do not match an --exclude-dir pattern). This the patterns and do not match an --exclude-dir pattern. This
applies to all directories, whether listed on the command applies to all directories, whether listed on the command
line, obtained from --file-list, or by scanning a parent di- line, obtained from --file-list, or by scanning a parent di-
rectory. The pattern is a PCRE2 regular expression, and is rectory. The pattern is a PCRE2 regular expression, and is
@ -461,8 +461,9 @@ OPTIONS
matching continues in order to obtain the correct count, and matching continues in order to obtain the correct count, and
those files that have at least one match are listed along those files that have at least one match are listed along
with their counts. Using this option with -c is a way of sup- with their counts. Using this option with -c is a way of sup-
pressing the listing of files with no matches. This opeion pressing the listing of files with no matches that occurs
overrides any previous -H, -h, or -L options. with -c on its own. This option overrides any previous -H,
-h, or -L options.
--label=name --label=name
This option supplies a name to be used for the standard input This option supplies a name to be used for the standard input
@ -470,37 +471,84 @@ OPTIONS
input)" is used. There is no short form for this option. input)" is used. There is no short form for this option.
--line-buffered --line-buffered
When this option is given, non-compressed input is read and When this option is given, non-compressed input is read and
processed line by line, and the output is flushed after each processed line by line, and the output is flushed after each
write. By default, input is read in large chunks, unless write. By default, input is read in large chunks, unless
pcre2grep can determine that it is reading from a terminal pcre2grep can determine that it is reading from a terminal,
(which is currently possible only in Unix-like environments which is currently possible only in Unix-like environments or
or Windows). Output to terminal is normally automatically Windows. Output to terminal is normally automatically flushed
flushed by the operating system. This option can be useful by the operating system. This option can be useful when the
when the input or output is attached to a pipe and you do not input or output is attached to a pipe and you do not want
want pcre2grep to buffer up large amounts of data. However, pcre2grep to buffer up large amounts of data. However, its
its use will affect performance, and the -M (multiline) op- use will affect performance, and the -M (multiline) option
tion ceases to work. When input is from a compressed .gz or ceases to work. When input is from a compressed .gz or .bz2
.bz2 file, --line-buffered is ignored. file, --line-buffered is ignored.
--line-offsets --line-offsets
Instead of showing lines or parts of lines that match, show Instead of showing lines or parts of lines that match, show
each match as a line number, the offset from the start of the each match as a line number, the offset from the start of the
line, and a length. The line number is terminated by a colon line, and a length. The line number is terminated by a colon
(as usual; see the -n option), and the offset and length are (as usual; see the -n option), and the offset and length are
separated by a comma. In this mode, no context is shown. separated by a comma. In this mode, no context is shown.
That is, the -A, -B, and -C options are ignored. If there is That is, the -A, -B, and -C options are ignored. If there is
more than one match in a line, each of them is shown sepa- more than one match in a line, each of them is shown sepa-
rately. This option is mutually exclusive with --output, rately. This option is mutually exclusive with --output,
--file-offsets, and --only-matching. --file-offsets, and --only-matching.
--locale=locale-name --locale=locale-name
This option specifies a locale to be used for pattern match- This option specifies a locale to be used for pattern match-
ing. It overrides the value in the LC_ALL or LC_CTYPE envi- ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
ronment variables. If no locale is specified, the PCRE2 li- ronment variables. If no locale is specified, the PCRE2 li-
brary's default (usually the "C" locale) is used. There is no brary's default (usually the "C" locale) is used. There is no
short form for this option. short form for this option.
-M, --multiline
Allow patterns to match more than one line. When this option
is set, the PCRE2 library is called in "multiline" mode. This
allows a matched string to extend past the end of a line and
continue on one or more subsequent lines. Patterns used with
-M may usefully contain literal newline characters and inter-
nal occurrences of ^ and $ characters. The output for a suc-
cessful match may consist of more than one line. The first
line is the line in which the match started, and the last
line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the
end of that line. If -v is set, none of the lines in a
multi-line match are output. Once a match has been handled,
scanning restarts at the beginning of the line after the one
in which the match ended.
The newline sequence that separates multiple lines must be
matched as part of the pattern. For example, to find the
phrase "regular expression" in a file where "regular" might
be at the end of a line and "expression" at the start of the
next line, you could use this command:
pcre2grep -M 'regular\s+expression' <file>
The \s escape sequence matches any white space character, in-
cluding newlines, and is followed by + so as to match trail-
ing white space on the first line as well as possibly han-
dling a two-character newline sequence.
There is a limit to the number of lines that can be matched,
imposed by the way that pcre2grep buffers the input file as
it scans it. With a sufficiently large processing buffer,
this should not be a problem, but the -M option does not work
when input is read line by line (see --line-buffered.)
-m number, --max-count=number
Stop processing after finding number matching lines, or non-
matching lines if -v is also set. Any trailing context lines
are output after the final match. In multiline mode, each
multiline match counts as just one line for this purpose. If
this limit is reached when reading the standard input from a
regular file, the file is left positioned just after the last
matching line. If -c is also set, the count that is output
is never greater than number. This option has no effect if
used with -L, -l, or -q, or when just checking for a match in
a binary file.
--match-limit=number --match-limit=number
Processing some regular expression patterns may take a very Processing some regular expression patterns may take a very
long time to search for all possible matching strings. Others long time to search for all possible matching strings. Others
@ -542,41 +590,6 @@ OPTIONS
size is silently forced to be no smaller than the starting size is silently forced to be no smaller than the starting
buffer size. buffer size.
-M, --multiline
Allow patterns to match more than one line. When this option
is set, the PCRE2 library is called in "multiline" mode. This
allows a matched string to extend past the end of a line and
continue on one or more subsequent lines. Patterns used with
-M may usefully contain literal newline characters and inter-
nal occurrences of ^ and $ characters. The output for a suc-
cessful match may consist of more than one line. The first
line is the line in which the match started, and the last
line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the
end of that line. If -v is set, none of the lines in a
multi-line match are output. Once a match has been handled,
scanning restarts at the beginning of the line after the one
in which the match ended.
The newline sequence that separates multiple lines must be
matched as part of the pattern. For example, to find the
phrase "regular expression" in a file where "regular" might
be at the end of a line and "expression" at the start of the
next line, you could use this command:
pcre2grep -M 'regular\s+expression' <file>
The \s escape sequence matches any white space character, in-
cluding newlines, and is followed by + so as to match trail-
ing white space on the first line as well as possibly han-
dling a two-character newline sequence.
There is a limit to the number of lines that can be matched,
imposed by the way that pcre2grep buffers the input file as
it scans it. With a sufficiently large processing buffer,
this should not be a problem, but the -M option does not work
when input is read line by line (see --line-buffered.)
-N newline-type, --newline=newline-type -N newline-type, --newline=newline-type
Six different conventions for indicating the ends of lines in Six different conventions for indicating the ends of lines in
scanned files are supported. For example: scanned files are supported. For example:
@ -625,97 +638,109 @@ OPTIONS
lems. It should never be needed in normal use. lems. It should never be needed in normal use.
-O text, --output=text -O text, --output=text
When there is a match, instead of outputting the whole line When there is a match, instead of outputting the line that
that matched, output just the given text, followed by an op- matched, output just the text specified in this option, fol-
erating-system standard newline. The --newline option has no lowed by an operating-system standard newline. In this mode,
effect on this option, which is mutually exclusive with no context is shown. That is, the -A, -B, and -C options are
--only-matching, --file-offsets, and --line-offsets. Escape ignored. The --newline option has no effect on this option,
sequences starting with a dollar character may be used to in- which is mutually exclusive with --only-matching, --file-off-
sert the contents of the matched part of the line and/or cap- sets, and --line-offsets. However, like --only-matching, if
tured substrings into the text. there is more than one match in a line, each of them causes a
line of output.
$<digits> or ${<digits>} is replaced by the captured sub- Escape sequences starting with a dollar character may be used
string of the given decimal number; zero substitutes the to insert the contents of the matched part of the line and/or
captured substrings into the text.
$<digits> or ${<digits>} is replaced by the captured sub-
string of the given decimal number; zero substitutes the
whole match. If the number is greater than the number of cap- whole match. If the number is greater than the number of cap-
turing substrings, or if the capture is unset, the replace- turing substrings, or if the capture is unset, the replace-
ment is empty. ment is empty.
$a is replaced by bell; $b by backspace; $e by escape; $f by $a is replaced by bell; $b by backspace; $e by escape; $f by
form feed; $n by newline; $r by carriage return; $t by tab; form feed; $n by newline; $r by carriage return; $t by tab;
$v by vertical tab. $v by vertical tab.
$o<digits> is replaced by the character represented by the $o<digits> or $o{<digits>} is replaced by the character whose
given octal number; up to three digits are processed. code point is the given octal number. In the first form, up
to three octal digits are processed. When more digits are
needed in Unicode mode to specify a wide character, the sec-
ond form must be used.
$x<digits> is replaced by the character represented by the $x<digits> or $x{<digits>} is replaced by the character rep-
given hexadecimal number; up to two digits are processed. resented by the given hexadecimal number. In the first form,
up to two hexadecimal digits are processed. When more digits
are needed in Unicode mode to specify a wide character, the
second form must be used.
Any other character is substituted by itself. In particular, Any other character is substituted by itself. In particular,
$$ is replaced by a single dollar. $$ is replaced by a single dollar.
-o, --only-matching -o, --only-matching
Show only the part of the line that matched a pattern instead Show only the part of the line that matched a pattern instead
of the whole line. In this mode, no context is shown. That of the whole line. In this mode, no context is shown. That
is, the -A, -B, and -C options are ignored. If there is more is, the -A, -B, and -C options are ignored. If there is more
than one match in a line, each of them is shown separately, than one match in a line, each of them is shown separately,
on a separate line of output. If -o is combined with -v (in- on a separate line of output. If -o is combined with -v (in-
vert the sense of the match to find non-matching lines), no vert the sense of the match to find non-matching lines), no
output is generated, but the return code is set appropri- output is generated, but the return code is set appropri-
ately. If the matched portion of the line is empty, nothing ately. If the matched portion of the line is empty, nothing
is output unless the file name or line number are being is output unless the file name or line number are being
printed, in which case they are shown on an otherwise empty printed, in which case they are shown on an otherwise empty
line. This option is mutually exclusive with --output, line. This option is mutually exclusive with --output,
--file-offsets and --line-offsets. --file-offsets and --line-offsets.
-onumber, --only-matching=number -onumber, --only-matching=number
Show only the part of the line that matched the capturing Show only the part of the line that matched the capturing
parentheses of the given number. Up to 50 capturing parenthe- parentheses of the given number. Up to 50 capturing parenthe-
ses are supported by default. This limit can be changed via ses are supported by default. This limit can be changed via
the --om-capture option. A pattern may contain any number of the --om-capture option. A pattern may contain any number of
capturing parentheses, but only those whose number is within capturing parentheses, but only those whose number is within
the limit can be accessed by -o. An error occurs if the num- the limit can be accessed by -o. An error occurs if the num-
ber specified by -o is greater than the limit. ber specified by -o is greater than the limit.
-o0 is the same as -o without a number. Because these options -o0 is the same as -o without a number. Because these options
can be given without an argument (see above), if an argument can be given without an argument (see above), if an argument
is present, it must be given in the same shell item, for ex- is present, it must be given in the same shell item, for ex-
ample, -o3 or --only-matching=2. The comments given for the ample, -o3 or --only-matching=2. The comments given for the
non-argument case above also apply to this option. If the non-argument case above also apply to this option. If the
specified capturing parentheses do not exist in the pattern, specified capturing parentheses do not exist in the pattern,
or were not set in the match, nothing is output unless the or were not set in the match, nothing is output unless the
file name or line number are being output. file name or line number are being output.
If this option is given multiple times, multiple substrings If this option is given multiple times, multiple substrings
are output for each match, in the order the options are are output for each match, in the order the options are
given, and all on one line. For example, -o3 -o1 -o3 causes given, and all on one line. For example, -o3 -o1 -o3 causes
the substrings matched by capturing parentheses 3 and 1 and the substrings matched by capturing parentheses 3 and 1 and
then 3 again to be output. By default, there is no separator then 3 again to be output. By default, there is no separator
(but see the next but one option). (but see the next but one option).
--om-capture=number --om-capture=number
Set the number of capturing parentheses that can be accessed Set the number of capturing parentheses that can be accessed
by -o. The default is 50. by -o. The default is 50.
--om-separator=text --om-separator=text
Specify a separating string for multiple occurrences of -o. Specify a separating string for multiple occurrences of -o.
The default is an empty string. Separating strings are never The default is an empty string. Separating strings are never
coloured. coloured.
-q, --quiet -q, --quiet
Work quietly, that is, display nothing except error messages. Work quietly, that is, display nothing except error messages.
The exit status indicates whether or not any matches were The exit status indicates whether or not any matches were
found. found.
-r, --recursive -r, --recursive
If any given path is a directory, recursively scan the files If any given path is a directory, recursively scan the files
it contains, taking note of any --include and --exclude set- it contains, taking note of any --include and --exclude set-
tings. By default, a directory is read as a normal file; in tings. By default, a directory is read as a normal file; in
some operating systems this gives an immediate end-of-file. some operating systems this gives an immediate end-of-file.
This option is a shorthand for setting the -d option to "re- This option is a shorthand for setting the -d option to "re-
curse". curse".
--recursion-limit=number --recursion-limit=number
See --match-limit above. This is an obsolete synonym for --depth-limit. See --match-
limit above for details.
-s, --no-messages -s, --no-messages
Suppress error messages about non-existent or unreadable Suppress error messages about non-existent or unreadable
@ -737,26 +762,30 @@ OPTIONS
-u, --utf Operate in UTF-8 mode. This option is available only if PCRE2 -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
has been compiled with UTF-8 support. All patterns (including has been compiled with UTF-8 support. All patterns (including
those for any --exclude and --include options) and all sub- those for any --exclude and --include options) and all lines
ject lines that are scanned must be valid strings of UTF-8 that are scanned must be valid strings of UTF-8 characters.
characters. If an invalid UTF-8 string is encountered, an error occurs.
-U, --utf-allow-invalid -U, --utf-allow-invalid
As --utf, but in addition subject lines may contain invalid As --utf, but in addition subject lines may contain invalid
UTF-8 code unit sequences. These can never form part of any UTF-8 code unit sequences. These can never form part of any
pattern match. This facility allows valid UTF-8 strings to be pattern match. Patterns themselves, however, must still be
sought in executable or other binary files. For more details valid UTF-8 strings. This facility allows valid UTF-8 strings
about matching in non-valid UTF-8 strings, see the pcre2uni- to be sought within arbitrary byte sequences in executable or
code(3) documentation. other binary files. For more details about matching in non-
valid UTF-8 strings, see the pcre2unicode(3) documentation.
-V, --version -V, --version
Write the version numbers of pcre2grep and the PCRE2 library Write the version numbers of pcre2grep and the PCRE2 library
to the standard output and then exit. Anything else on the to the standard output and then exit. Anything else on the
command line is ignored. command line is ignored.
-v, --invert-match -v, --invert-match
Invert the sense of the match, so that lines which do not Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found. match any of the patterns are the ones that are found. When
this option is set, options such as --only-matching and
--output, which specify parts of a match that are to be out-
put, are ignored.
-w, --word-regex, --word-regexp -w, --word-regex, --word-regexp
Force the patterns only to match "words". That is, there must Force the patterns only to match "words". That is, there must
@ -878,30 +907,49 @@ USING PCRE2'S CALLOUT FACILITY
mentation for details). Numbered callouts are ignored by pcre2grep; mentation for details). Numbered callouts are ignored by pcre2grep;
only callouts with string arguments are useful. only callouts with string arguments are useful.
Echoing a specific string
Starting the callout string with a pipe character invokes an echoing
facility that avoids calling an external program or script. This facil-
ity is always available, provided that callouts were not completely
disabled when pcre2grep was built. The rest of the callout string is
processed as a zero-terminated string, which means it should not con-
tain any internal binary zeros. It is written to the output, having
first been passed through the same escape processing as text from the
--output (-O) option (see above). However, $0 cannot be used to insert
a matched substring because the match is still in progress. Instead,
the single character '0' is inserted. Any syntax errors in the string
(for example, a dollar not followed by another character) cause the
callout to be ignored. No terminator is added to the output string, so
if you want a newline, you must include it explicitly using the escape
$n. For example:
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
Matching continues normally after the string is output. If you want to
see only the callout output but not any output from an actual match,
you should end the pattern with (*FAIL).
Calling external programs or scripts Calling external programs or scripts
This facility can be independently disabled when pcre2grep is built. It This facility can be independently disabled when pcre2grep is built. It
is supported for Windows, where a call to _spawnvp() is used, for VMS, is supported for Windows, where a call to _spawnvp() is used, for VMS,
where lib$spawn() is used, and for any other Unix-like environment where lib$spawn() is used, and for any Unix-like environment where
where fork() and execv() are available. fork() and execv() are available.
If the callout string does not start with a pipe (vertical bar) charac- If the callout string does not start with a pipe (vertical bar) charac-
ter, it is parsed into a list of substrings separated by pipe charac- ter, it is parsed into a list of substrings separated by pipe charac-
ters. The first substring must be an executable name, with the follow- ters. The first substring must be an executable name, with the follow-
ing substrings specifying arguments: ing substrings specifying arguments:
executable_name|arg1|arg2|... executable_name|arg1|arg2|...
Any substring (including the executable name) may contain escape se- Any substring (including the executable name) may contain escape se-
quences started by a dollar character: $<digits> or ${<digits>} is re- quences started by a dollar character. These are the same as for the
placed by the captured substring of the given decimal number, which --output (-O) option documented above, except that $0 cannot insert the
must be greater than zero. If the number is greater than the number of matched string because the match is still in progress. Instead, the
capturing substrings, or if the capture is unset, the replacement is character '0' is inserted. If you need a literal dollar or pipe charac-
empty. ter in any substring, use $$ or $| respectively. Here is an example:
Any other character is substituted by itself. In particular, $$ is re-
placed by a single dollar and $| is replaced by a pipe character. Here
is an example:
echo -e "abcde\n12345" | pcre2grep \ echo -e "abcde\n12345" | pcre2grep \
'(?x)(.)(..(.)) '(?x)(.)(..(.))
@ -914,28 +962,15 @@ USING PCRE2'S CALLOUT FACILITY
Arg1: [1] [234] [4] Arg2: |1| () Arg1: [1] [234] [4] Arg2: |1| ()
12345 12345
The parameters for the system call that is used to run the program or The parameters for the system call that is used to run the program or
script are zero-terminated strings. This means that binary zero charac- script are zero-terminated strings. This means that binary zero charac-
ters in the callout argument will cause premature termination of their ters in the callout argument will cause premature termination of their
substrings, and therefore should not be present. Any syntax errors in substrings, and therefore should not be present. Any syntax errors in
the string (for example, a dollar not followed by another character) the string (for example, a dollar not followed by another character)
cause the callout to be ignored. If running the program fails for any cause the callout to be ignored. If running the program fails for any
reason (including the non-existence of the executable), a local match- reason (including the non-existence of the executable), a local match-
ing failure occurs and the matcher backtracks in the normal way. ing failure occurs and the matcher backtracks in the normal way.
Echoing a specific string
This facility is always available, provided that callouts were not com-
pletely disabled when pcre2grep was built. If the callout string starts
with a pipe (vertical bar) character, the rest of the string is written
to the output, having been passed through the same escape processing as
text from the --output option. This provides a simple echoing facility
that avoids calling an external program or script. No terminator is
added to the string, so if you want a newline, you must include it ex-
plicitly. Matching continues normally after the string is output. If
you want to see only the callout output but not any output from an ac-
tual match, you should end the relevant pattern with (*FAIL).
MATCHING ERRORS MATCHING ERRORS
@ -969,7 +1004,7 @@ DIAGNOSTICS
SEE ALSO SEE ALSO
pcre2pattern(3), pcre2syntax(3), pcre2callout(3). pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).
AUTHOR AUTHOR
@ -981,5 +1016,5 @@ AUTHOR
REVISION REVISION
Last updated: 25 January 2020 Last updated: 04 October 2020
Copyright (c) 1997-2020 University of Cambridge. Copyright (c) 1997-2020 University of Cambridge.

File diff suppressed because it is too large Load Diff

View File

@ -164,6 +164,10 @@ enum { DEE_READ, DEE_SKIP };
enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT }; enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
/* Return values from decode_dollar_escape() */
enum { DDE_ERROR, DDE_CAPTURE, DDE_CHAR };
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some /* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
environments), a warning is issued if the value of fwrite() is ignored. environments), a warning is issued if the value of fwrite() is ignored.
Unfortunately, casting to (void) does not suppress the warning. To get round Unfortunately, casting to (void) does not suppress the warning. To get round
@ -179,13 +183,21 @@ handled by using STDOUT_NL as the newline string. We also use a normal double
quote for the example, as single quotes aren't usually available. */ quote for the example, as single quotes aren't usually available. */
#ifdef WIN32 #ifdef WIN32
#define STDOUT_NL "\r\n" #define STDOUT_NL "\r\n"
#define QUOT "\"" #define STDOUT_NL_LEN 2
#define QUOT "\""
#else #else
#define STDOUT_NL "\n" #define STDOUT_NL "\n"
#define QUOT "'" #define STDOUT_NL_LEN 1
#define QUOT "'"
#endif #endif
/* This code is returned from decode_dollar_escape() when $n is encountered,
and used to mean "output STDOUT_NL". It is, of course, not a valid Unicode code
point. */
#define STDOUT_NL_CODE 0x7fffffffu
/************************************************* /*************************************************
@ -224,8 +236,9 @@ static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
static int bufsize = 3*PCRE2GREP_BUFSIZE; static int bufsize = 3*PCRE2GREP_BUFSIZE;
static int endlinetype; static int endlinetype;
static unsigned long int total_count = 0; static int count_limit = -1; /* Not long, so that it works with OP_NUMBER */
static unsigned long int counts_printed = 0; static unsigned long int counts_printed = 0;
static unsigned long int total_count = 0;
#ifdef WIN32 #ifdef WIN32
static int dee_action = dee_SKIP; static int dee_action = dee_SKIP;
@ -277,6 +290,9 @@ static BOOL show_total_count = FALSE;
static BOOL silent = FALSE; static BOOL silent = FALSE;
static BOOL utf = FALSE; static BOOL utf = FALSE;
static uint8_t utf8_buffer[8];
/* Structure for list of --only-matching capturing numbers. */ /* Structure for list of --only-matching capturing numbers. */
typedef struct omstr { typedef struct omstr {
@ -443,6 +459,7 @@ static option_item optionlist[] = {
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" }, { OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" }, { OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" }, { OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
{ OP_NUMBER, 'm', &count_limit, "max-count=number", "stop after <number> matched lines" },
{ OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" }, { OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" },
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" }, { OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
#ifdef SUPPORT_PCRE2GREP_JIT #ifdef SUPPORT_PCRE2GREP_JIT
@ -482,8 +499,13 @@ of PCRE2_NEWLINE_xx in pcre2.h. */
static const char *newlines[] = { static const char *newlines[] = {
"DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" }; "DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" };
/* UTF-8 tables - used only when the newline setting is "any". */ /* UTF-8 tables */
const int utf8_table1[] =
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
const int utf8_table1_size = sizeof(utf8_table1) / sizeof(int);
const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
const char utf8_table4[] = { const char utf8_table4[] = {
@ -531,6 +553,32 @@ else
#endif /* not VPCOMPAT && not HAVE_MEMMOVE */ #endif /* not VPCOMPAT && not HAVE_MEMMOVE */
/*************************************************
* Convert code point to UTF-8 *
*************************************************/
/* A static buffer is used. Returns the number of bytes. */
static int
ord2utf8(uint32_t value)
{
int extra = 0;  /* Number of continuation bytes that follow the lead byte */
int pos;

/* Pick the shortest encoding whose range (upper bounds are in utf8_table1)
includes the value. NOTE(review): a value above 0x7fffffff would leave extra
equal to utf8_table1_size and index past the end of utf8_table2 below; callers
presumably never pass such a value - confirm. */

while (extra < utf8_table1_size && value > (uint32_t)utf8_table1[extra])
  extra++;

/* Fill in the continuation bytes from last to first, taking the low-order six
bits of the value for each one. */

for (pos = extra; pos > 0; pos--)
  {
  utf8_buffer[pos] = 0x80 | (value & 0x3f);
  value >>= 6;
  }

/* The remaining bits are combined with the length marker from utf8_table2 to
form the lead byte. The result is the total number of bytes written to the
static utf8_buffer. */

utf8_buffer[0] = utf8_table2[extra] | value;
return extra + 1;
}
/************************************************* /*************************************************
* Case-independent string compare * * Case-independent string compare *
*************************************************/ *************************************************/
@ -1788,6 +1836,7 @@ if (slen > 200)
slen = 200; slen = 200;
msg = "text that starts:\n\n"; msg = "text that starts:\n\n";
} }
for (i = 1; p != NULL; p = p->next, i++) for (i = 1; p != NULL; p = p->next, i++)
{ {
*mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length, *mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
@ -1823,107 +1872,245 @@ return FALSE; /* No match, no errors */
} }
/*************************************************
* Decode dollar escape sequence *
*************************************************/
/* Called from various places to decode $ escapes in output strings. The escape
sequences are as follows:
$<digits> or ${<digits>} returns a capture number. However, if callout is TRUE,
zero is never returned; '0' is substituted.
$a returns bell.
$b returns backspace.
$e returns escape.
$f returns form feed.
$n returns newline.
$r returns carriage return.
$t returns tab.
$v returns vertical tab.
$o<digits> returns the character represented by the given octal
number; up to three digits are processed.
$o{<digits>} does the same, up to 7 digits, but gives an error for mode-invalid
code points.
$x<digits> returns the character represented by the given hexadecimal
number; up to two digits are processed.
$x{<digits>} does the same, up to 6 digits, but gives an error for mode-invalid
code points.
Any other character is substituted by itself. E.g: $$ is replaced by a single
dollar.
Arguments:
begin the start of the whole string
string points to the $
callout TRUE if in a callout (inhibits error messages)
value where to return a value
last where to return pointer to the last used character
Returns: DDE_ERROR after a syntax error
DDE_CAPTURE if *value is a capture number
DDE_CHAR if *value is a character code
*/
static int
decode_dollar_escape(PCRE2_SPTR begin, PCRE2_SPTR string, BOOL callout,
uint32_t *value, PCRE2_SPTR *last)
{
uint32_t c = 0;
int base = 10;
int dcount;
int rc = DDE_CHAR;
BOOL brace = FALSE;
switch (*(++string))
{
case 0: /* Syntax error: a character must be present after $. */
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "no character after $");
*last = string;
return DDE_ERROR;
case '{':
brace = TRUE;
string++;
if (!isdigit(*string)) /* Syntax error: a decimal number required. */
{
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "decimal number expected");
rc = DDE_ERROR;
break;
}
/* Fall through */
/* The maximum capture number is 65535, so any number greater than that will
always be an unknown capture number. We just stop incrementing, in order to
avoid overflow. */
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
do
{
if (c <= 65535) c = c * 10 + (*string - '0');
string++;
}
while (*string >= '0' && *string <= '9');
string--; /* Point to last digit */
/* In a callout, capture number 0 is not available. No error can be given,
so just return the character '0'. */
if (callout && c == 0)
{
*value = '0';
}
else
{
*value = c;
rc = DDE_CAPTURE;
}
break;
/* Limit octal numbers to 3 digits without braces, or up to 7 with braces,
for valid Unicode code points. */
case 'o':
base = 8;
string++;
if (*string == '{')
{
brace = TRUE;
string++;
dcount = 7;
}
else dcount = 3;
for (; dcount > 0; dcount--)
{
if (*string < '0' || *string > '7') break;
c = c * 8 + (*string++ - '0');
}
*value = c;
string--; /* Point to last digit */
break;
/* Limit hex numbers to 2 digits without braces, or up to 6 with braces,
for valid Unicode code points. */
case 'x':
base = 16;
string++;
if (*string == '{')
{
brace = TRUE;
string++;
dcount = 6;
}
else dcount = 2;
for (; dcount > 0; dcount--)
{
if (!isxdigit(*string)) break;
if (*string >= '0' && *string <= '9')
c = c *16 + *string++ - '0';
else
c = c * 16 + (*string++ | 0x20) - 'a' + 10;
}
*value = c;
string--; /* Point to last digit */
break;
case 'a': *value = '\a'; break;
case 'b': *value = '\b'; break;
#ifndef EBCDIC
case 'e': *value = '\033'; break;
#else
case 'e': *value = '\047'; break;
#endif
case 'f': *value = '\f'; break;
case 'n': *value = STDOUT_NL_CODE; break;
case 'r': *value = '\r'; break;
case 't': *value = '\t'; break;
case 'v': *value = '\v'; break;
default: *value = *string; break;
}
if (brace)
{
c = string[1];
if (c != '}')
{
rc = DDE_ERROR;
if (!callout)
{
if ((base == 8 && c >= '0' && c <= '7') ||
(base == 16 && isxdigit(c)))
{
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
"too many %s digits\n", (int)(string - begin),
(base == 8)? "octal" : "hex");
}
else
{
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "missing closing brace");
}
}
}
else string++;
}
/* Check maximum code point values, but take note of STDOUT_NL_CODE. */
if (rc == DDE_CHAR && *value != STDOUT_NL_CODE)
{
uint32_t max = utf? 0x0010ffffu : 0xffu;
if (*value > max)
{
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
"code point greater than 0x%x is invalid\n", (int)(string - begin), max);
rc = DDE_ERROR;
}
}
*last = string;
return rc;
}
/************************************************* /*************************************************
* Check output text for errors * * Check output text for errors *
*************************************************/ *************************************************/
/* Called early, to get errors before doing anything for -O text; also called
from callouts to check before outputting.
Arguments:
string an --output text string
callout TRUE if in a callout (stops printing errors)
Returns: TRUE if OK, FALSE on error
*/
static BOOL static BOOL
syntax_check_output_text(PCRE2_SPTR string, BOOL callout) syntax_check_output_text(PCRE2_SPTR string, BOOL callout)
{ {
uint32_t value;
PCRE2_SPTR begin = string; PCRE2_SPTR begin = string;
for (; *string != 0; string++) for (; *string != 0; string++)
{ {
if (*string == '$') if (*string == '$' &&
{ decode_dollar_escape(begin, string, callout, &value, &string) == DDE_ERROR)
PCRE2_SIZE capture_id = 0;
BOOL brace = FALSE;
string++;
/* Syntax error: a character must be present after $. */
if (*string == 0)
{
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "no character after $");
return FALSE; return FALSE;
}
if (*string == '{')
{
/* Must be a decimal number in braces, e.g: {5} or {38} */
string++;
brace = TRUE;
}
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
{
do
{
/* Maximum capture id is 65535. */
if (capture_id <= 65535)
capture_id = capture_id * 10 + (*string - '0');
string++;
}
while (*string >= '0' && *string <= '9');
if (brace)
{
/* Syntax error: closing brace is missing. */
if (*string != '}')
{
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "missing closing brace");
return FALSE;
}
}
else
{
/* To negate the effect of the for. */
string--;
}
}
else if (brace)
{
/* Syntax error: a decimal number required. */
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "decimal number expected");
return FALSE;
}
else if (*string == 'o')
{
string++;
if (*string < '0' || *string > '7')
{
/* Syntax error: an octal number required. */
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "octal number expected");
return FALSE;
}
}
else if (*string == 'x')
{
string++;
if (!isxdigit((unsigned char)*string))
{
/* Syntax error: a hexdecimal number required. */
if (!callout)
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
(int)(string - begin), "hexadecimal number expected");
return FALSE;
}
}
}
} }
return TRUE; return TRUE;
} }
@ -1932,31 +2119,7 @@ for (; *string != 0; string++)
*************************************************/ *************************************************/
/* Display the output text, which is assumed to have already been syntax /* Display the output text, which is assumed to have already been syntax
checked. Output may contain escape sequences started by the dollar sign. The checked. Output may contain escape sequences started by the dollar sign.
escape sequences are substituted as follows:
$<digits> or ${<digits>} is replaced by the captured substring of the given
decimal number; zero will substitute the whole match. If the number is
greater than the number of capturing substrings, or if the capture is unset,
the replacement is empty.
$a is replaced by bell.
$b is replaced by backspace.
$e is replaced by escape.
$f is replaced by form feed.
$n is replaced by newline.
$r is replaced by carriage return.
$t is replaced by tab.
$v is replaced by vertical tab.
$o<digits> is replaced by the character represented by the given octal
number; up to three digits are processed.
$x<digits> is replaced by the character represented by the given hexadecimal
number; up to two digits are processed.
Any other character is substituted by itself. E.g: $$ is replaced by a single
dollar.
Arguments: Arguments:
string: the output text string: the output text
@ -1973,121 +2136,54 @@ static BOOL
display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject, display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject,
PCRE2_SIZE *ovector, PCRE2_SIZE capture_top) PCRE2_SIZE *ovector, PCRE2_SIZE capture_top)
{ {
uint32_t value;
BOOL printed = FALSE; BOOL printed = FALSE;
PCRE2_SPTR begin = string;
for (; *string != 0; string++) for (; *string != 0; string++)
{ {
int ch = EOF;
if (*string == '$') if (*string == '$')
{ {
PCRE2_SIZE capture_id = 0; switch(decode_dollar_escape(begin, string, callout, &value, &string))
BOOL brace = FALSE;
string++;
if (*string == '{')
{ {
/* Must be a decimal number in braces, e.g: {5} or {38} */ case DDE_CHAR:
string++; if (value == STDOUT_NL_CODE)
brace = TRUE;
}
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
{
do
{ {
/* Maximum capture id is 65535. */ fprintf(stdout, STDOUT_NL);
if (capture_id <= 65535) printed = FALSE;
capture_id = capture_id * 10 + (*string - '0'); continue;
string++;
} }
while (*string >= '0' && *string <= '9'); break; /* Will print value */
if (!brace) case DDE_CAPTURE:
{ if (value < capture_top)
/* To negate the effect of the for. */
string--;
}
if (capture_id < capture_top)
{ {
PCRE2_SIZE capturesize; PCRE2_SIZE capturesize;
capture_id *= 2; value *= 2;
capturesize = ovector[value + 1] - ovector[value];
capturesize = ovector[capture_id + 1] - ovector[capture_id];
if (capturesize > 0) if (capturesize > 0)
{ {
print_match(subject + ovector[capture_id], capturesize); print_match(subject + ovector[value], capturesize);
printed = TRUE; printed = TRUE;
} }
} }
} continue;
else if (*string == 'a') ch = '\a';
else if (*string == 'b') ch = '\b';
#ifndef EBCDIC
else if (*string == 'e') ch = '\033';
#else
else if (*string == 'e') ch = '\047';
#endif
else if (*string == 'f') ch = '\f';
else if (*string == 'r') ch = '\r';
else if (*string == 't') ch = '\t';
else if (*string == 'v') ch = '\v';
else if (*string == 'n')
{
fprintf(stdout, STDOUT_NL);
printed = FALSE;
}
else if (*string == 'o')
{
string++;
ch = *string - '0'; default: /* Should not occur */
if (string[1] >= '0' && string[1] <= '7') break;
{
string++;
ch = ch * 8 + (*string - '0');
}
if (string[1] >= '0' && string[1] <= '7')
{
string++;
ch = ch * 8 + (*string - '0');
}
} }
else if (*string == 'x') }
{
string++;
if (*string >= '0' && *string <= '9') else value = *string; /* Not a $ escape */
ch = *string - '0';
else if (utf && value <= 127) fprintf(stdout, "%c", *string); else
ch = (*string | 0x20) - 'a' + 10;
if (isxdigit((unsigned char)string[1]))
{
string++;
ch *= 16;
if (*string >= '0' && *string <= '9')
ch += *string - '0';
else
ch += (*string | 0x20) - 'a' + 10;
}
}
else
{
ch = *string;
}
}
else
{ {
ch = *string; int i;
} int n = ord2utf8(value);
if (ch != EOF) for (i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
{
fprintf(stdout, "%c", ch);
printed = TRUE;
} }
printed = TRUE;
} }
return printed; return printed;
@ -2166,7 +2262,7 @@ int result = 0;
(void)unused; /* Avoid compiler warning */ (void)unused; /* Avoid compiler warning */
/* Only callout with strings are supported. */ /* Only callouts with strings are supported. */
if (string == NULL || length == 0) return 0; if (string == NULL || length == 0) return 0;
@ -2185,83 +2281,43 @@ return 0;
#else #else
/* Checking syntax and compute the number of string fragments. Callout strings /* Checking syntax and compute the number of string fragments. Callout strings
are ignored in case of a syntax error. */ are silently ignored in the event of a syntax error. */
while (length > 0) while (length > 0)
{ {
if (*string == '|') if (*string == '|')
{ {
argsvectorlen++; argsvectorlen++;
if (argsvectorlen > 10000) return 0; /* Too many args */
/* Maximum 10000 arguments allowed. */
if (argsvectorlen > 10000) return 0;
} }
else if (*string == '$') else if (*string == '$')
{ {
PCRE2_SIZE capture_id = 0; uint32_t value;
PCRE2_SPTR begin = string;
string++; switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
length--;
/* Syntax error: a character must be present after $. */
if (length == 0) return 0;
if (*string >= '1' && *string <= '9')
{ {
do case DDE_CAPTURE:
if (value < capture_top)
{ {
/* Maximum capture id is 65535. */ value *= 2;
if (capture_id <= 65535) argslen += ovector[value + 1] - ovector[value];
capture_id = capture_id * 10 + (*string - '0');
string++;
length--;
} }
while (length > 0 && *string >= '0' && *string <= '9'); argslen--; /* Negate the effect of argslen++ below. */
break;
/* To negate the effect of string++ below. */ case DDE_CHAR:
string--; if (value == STDOUT_NL_CODE) argslen += STDOUT_NL_LEN - 1;
length++; else if (utf && value > 127) argslen += ord2utf8(value) - 1;
} break;
else if (*string == '{')
{
/* Must be a decimal number in braces, e.g: {5} or {38} */
string++;
length--;
/* Syntax error: a decimal number required. */ default: /* Should not occur */
if (length == 0) return 0; case DDE_ERROR:
if (*string < '1' || *string > '9') return 0; return 0;
do
{
/* Maximum capture id is 65535. */
if (capture_id <= 65535)
capture_id = capture_id * 10 + (*string - '0');
string++;
length--;
/* Syntax error: no more characters */
if (length == 0) return 0;
}
while (*string >= '0' && *string <= '9');
/* Syntax error: closing brace is missing. */
if (*string != '}') return 0;
} }
if (capture_id > 0) length -= (string - begin);
{
if (capture_id < capture_top)
{
capture_id *= 2;
argslen += ovector[capture_id + 1] - ovector[capture_id];
}
/* To negate the effect of argslen++ below. */
argslen--;
}
} }
string++; string++;
@ -2269,6 +2325,8 @@ while (length > 0)
argslen++; argslen++;
} }
/* Get memory for the argument vector and its strings. */
args = (char*)malloc(argslen); args = (char*)malloc(argslen);
if (args == NULL) return 0; if (args == NULL) return 0;
@ -2279,9 +2337,10 @@ if (argsvector == NULL)
return 0; return 0;
} }
/* Now reprocess the string and set up the arguments. */
argsptr = args; argsptr = args;
argsvectorptr = argsvector; argsvectorptr = argsvector;
*argsvectorptr++ = argsptr; *argsvectorptr++ = argsptr;
length = calloutptr->callout_string_length; length = calloutptr->callout_string_length;
@ -2294,69 +2353,55 @@ while (length > 0)
*argsptr++ = '\0'; *argsptr++ = '\0';
*argsvectorptr++ = argsptr; *argsvectorptr++ = argsptr;
} }
else if (*string == '$') else if (*string == '$')
{ {
string++; uint32_t value;
length--; PCRE2_SPTR begin = string;
if ((*string >= '1' && *string <= '9') || *string == '{') switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
{ {
PCRE2_SIZE capture_id = 0; case DDE_CAPTURE:
if (value < capture_top)
if (*string != '{')
{ {
do PCRE2_SIZE capturesize;
{ value *= 2;
/* Maximum capture id is 65535. */ capturesize = ovector[value + 1] - ovector[value];
if (capture_id <= 65535) memcpy(argsptr, subject + ovector[value], capturesize);
capture_id = capture_id * 10 + (*string - '0'); argsptr += capturesize;
}
break;
string++; case DDE_CHAR:
length--; if (value == STDOUT_NL_CODE)
} {
while (length > 0 && *string >= '0' && *string <= '9'); memcpy(argsptr, STDOUT_NL, STDOUT_NL_LEN);
argsptr += STDOUT_NL_LEN;
/* To negate the effect of string++ below. */ }
string--; else if (utf && value > 127)
length++; {
int n = ord2utf8(value);
memcpy(argsptr, utf8_buffer, n);
argsptr += n;
} }
else else
{ {
string++; *argsptr++ = value;
length--;
do
{
/* Maximum capture id is 65535. */
if (capture_id <= 65535)
capture_id = capture_id * 10 + (*string - '0');
string++;
length--;
}
while (*string != '}');
} }
break;
if (capture_id < capture_top) default: /* Should not occur */
{ case DDE_ERROR:
PCRE2_SIZE capturesize; return 0;
capture_id *= 2; }
capturesize = ovector[capture_id + 1] - ovector[capture_id]; length -= (string - begin);
memcpy(argsptr, subject + ovector[capture_id], capturesize);
argsptr += capturesize;
}
}
else
{
*argsptr++ = *string;
}
}
else
{
*argsptr++ = *string;
} }
else *argsptr++ = *string;
/* Advance along the string */
string++; string++;
length--; length--;
} }
@ -2479,6 +2524,7 @@ int filepos = 0;
unsigned long int linenumber = 1; unsigned long int linenumber = 1;
unsigned long int lastmatchnumber = 0; unsigned long int lastmatchnumber = 0;
unsigned long int count = 0; unsigned long int count = 0;
long int count_matched_lines = 0;
char *lastmatchrestart = main_buffer; char *lastmatchrestart = main_buffer;
char *ptr = main_buffer; char *ptr = main_buffer;
char *endptr; char *endptr;
@ -2505,7 +2551,7 @@ bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
input_line_buffered); input_line_buffered);
#ifdef SUPPORT_LIBBZ2 #ifdef SUPPORT_LIBBZ2
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE; */ if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE */
#endif #endif
endptr = main_buffer + bufflength; endptr = main_buffer + bufflength;
@ -2533,10 +2579,23 @@ while (ptr < endptr)
int mrc = 0; int mrc = 0;
unsigned int options = 0; unsigned int options = 0;
BOOL match; BOOL match;
BOOL line_matched = FALSE;
char *t = ptr; char *t = ptr;
PCRE2_SIZE length, linelength; PCRE2_SIZE length, linelength;
PCRE2_SIZE startoffset = 0; PCRE2_SIZE startoffset = 0;
/* If the -m option set a limit for the number of matched or non-matched
lines, check it here. A limit of zero means that no matching is ever done.
For stdin from a file, set the file position. */
if (count_limit >= 0 && count_matched_lines >= count_limit)
{
if (frtype == FR_PLAIN && filename == stdin_name && !is_file_tty(handle))
(void)fseek(handle, (long int)filepos, SEEK_SET);
rc = (count_limit == 0)? 1 : 0;
break;
}
/* At this point, ptr is at the start of a line. We need to find the length /* At this point, ptr is at the start of a line. We need to find the length
of the subject string to pass to pcre2_match(). In multiline mode, it is the of the subject string to pass to pcre2_match(). In multiline mode, it is the
length remainder of the data in the buffer. Otherwise, it is the length of length remainder of the data in the buffer. Otherwise, it is the length of
@ -2686,6 +2745,10 @@ while (ptr < endptr)
if (filenames == FN_NOMATCH_ONLY) return 1; if (filenames == FN_NOMATCH_ONLY) return 1;
/* Remember that this line matched (for counting matched lines) */
line_matched = TRUE;
/* If all we want is a yes/no answer, we can return immediately. */ /* If all we want is a yes/no answer, we can return immediately. */
if (quiet) return 0; if (quiet) return 0;
@ -3067,6 +3130,11 @@ while (ptr < endptr)
filepos += (int)(linelength + endlinelength); filepos += (int)(linelength + endlinelength);
linenumber++; linenumber++;
/* If there was at least one match (or a non-match, as required) in the line,
increment the count for the -m option. */
if (line_matched) count_matched_lines++;
/* If input is line buffered, and the buffer is not yet full, read another /* If input is line buffered, and the buffer is not yet full, read another
line and add it into the buffer. */ line and add it into the buffer. */
@ -4088,6 +4156,7 @@ if (only_matching_count > 1)
pcre2grep_exit(usage(2)); pcre2grep_exit(usage(2));
} }
/* Check that there is a big enough ovector for all -o settings. */ /* Check that there is a big enough ovector for all -o settings. */
for (om = only_matching; om != NULL; om = om->next) for (om = only_matching; om != NULL; om = om->next)

24
testdata/grepoutput vendored
View File

@ -956,3 +956,27 @@ RC=0
pcre2grep: Requested group 1 cannot be captured. pcre2grep: Requested group 1 cannot be captured.
pcre2grep: Use --om-capture to increase the size of the capture vector. pcre2grep: Use --om-capture to increase the size of the capture vector.
RC=2 RC=2
---------------------------- Test 129 -----------------------------
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the
lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox
RC=0
---------------------------- Test 130 -----------------------------
fox
fox
fox
fox
RC=0
---------------------------- Test 131 -----------------------------
2
RC=0
---------------------------- Test 132 -----------------------------
match 1:
a
match 2:
b
---
a
RC=0
---------------------------- Test 133 -----------------------------
=AB3CD5=
RC=0

View File

@ -29,3 +29,6 @@ RC=1
---------------------------- Test U5 ------------------------------ ---------------------------- Test U5 ------------------------------
CD Z CD Z
RC=0 RC=0
---------------------------- Test U6 -----------------------------
=ǓǤ=
RC=0

View File

@ -40,3 +40,5 @@ T
T T
T T
T T
0:T:AA
The quick brown

View File

@ -28,3 +28,5 @@ T
T T
T T
T T
0:T:AA
The quick brown