pcre2grep update: -m and $x{..}, $o{..} escapes. Also some doc updates.
This commit is contained in:
parent
3bdc76e4f3
commit
81da2b97e3
10
ChangeLog
10
ChangeLog
|
@ -76,6 +76,16 @@ the subject \xe5A. Fixes Bugzilla #2642.
|
||||||
14. Fixed a bug in character set matching when JIT is enabled and both unicode
|
14. Fixed a bug in character set matching when JIT is enabled and both unicode
|
||||||
scripts and unicode classes are present at the same time.
|
scripts and unicode classes are present at the same time.
|
||||||
|
|
||||||
|
15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
|
||||||
|
|
||||||
|
16. Refactored substitution processing in pcre2grep strings, both for the -O
|
||||||
|
option and when dealing with callouts. There is now a single function that
|
||||||
|
handles $ expansion in all cases (instead of multiple copies of almost
|
||||||
|
identical code). This means that the same escape sequences are available
|
||||||
|
everywhere, which was not previously the case. At the same time, the escape
|
||||||
|
sequences $x{...} and $o{...} have been introduced, to allow for characters
|
||||||
|
whose code points are greater than 255 in Unicode mode.
|
||||||
|
|
||||||
|
|
||||||
Version 10.35 09-May-2020
|
Version 10.35 09-May-2020
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
6
README
6
README
|
@ -892,6 +892,6 @@ The distribution should contain the files listed below.
|
||||||
) environments
|
) environments
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: Philip.Hazel
|
||||||
Email domain: cam.ac.uk
|
Email domain: gmail.com
|
||||||
Last updated: 20 March 2020
|
Last updated: 22 September 2020
|
||||||
|
|
25
RunGrepTest
25
RunGrepTest
|
@ -661,6 +661,26 @@ echo "---------------------------- Test 128 -----------------------------" >>tes
|
||||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||||
echo "RC=$?" >>testtrygrep
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <testdata/grepinput >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
# Now compare the results.
|
# Now compare the results.
|
||||||
|
|
||||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||||
|
@ -694,6 +714,10 @@ if [ $utf8 -ne 0 ] ; then
|
||||||
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
|
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
|
||||||
echo "RC=$?" >>testtrygrep
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
|
echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
|
||||||
|
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
|
||||||
|
echo "RC=$?" >>testtrygrep
|
||||||
|
|
||||||
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
||||||
if [ $? != 0 ] ; then exit 1; fi
|
if [ $? != 0 ] ; then exit 1; fi
|
||||||
|
|
||||||
|
@ -764,6 +788,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
|
||||||
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
|
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||||
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||||
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
|
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
|
||||||
|
$valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||||
|
|
||||||
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
|
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
|
||||||
$cf $srcdir/testdata/grepoutputCN testtrygrep
|
$cf $srcdir/testdata/grepoutputCN testtrygrep
|
||||||
|
|
|
@ -892,6 +892,6 @@ The distribution should contain the files listed below.
|
||||||
) environments
|
) environments
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: Philip.Hazel
|
||||||
Email domain: cam.ac.uk
|
Email domain: gmail.com
|
||||||
Last updated: 20 March 2020
|
Last updated: 22 September 2020
|
||||||
|
|
|
@ -111,8 +111,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
|
||||||
(either shown literally, or as an offset), scanning resumes immediately
|
(either shown literally, or as an offset), scanning resumes immediately
|
||||||
following the match, so that further matches on the same line can be found. If
|
following the match, so that further matches on the same line can be found. If
|
||||||
there are multiple patterns, they are all tried on the remainder of the line,
|
there are multiple patterns, they are all tried on the remainder of the line,
|
||||||
but patterns that follow the one that matched are not tried on the earlier part
|
but patterns that follow the one that matched are not tried on the earlier
|
||||||
of the line.
|
matched part of the line.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
This behaviour means that the order in which multiple patterns are specified
|
This behaviour means that the order in which multiple patterns are specified
|
||||||
|
@ -146,11 +146,10 @@ ignored.
|
||||||
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||||
is identified as a binary file, and is processed specially. (GNU grep
|
is identified as a binary file, and is processed specially. However, if the
|
||||||
identifies binary files in this manner.) However, if the newline type is
|
newline type is specified as NUL, that is, the line terminator is a binary
|
||||||
specified as NUL, that is, the line terminator is a binary zero, the test for
|
zero, the test for a binary file is not applied. See the <b>--binary-files</b>
|
||||||
a binary file is not applied. See the <b>--binary-files</b> option for a means
|
option for a means of changing the way binary files are handled.
|
||||||
of changing the way binary files are handled.
|
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
|
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -443,8 +442,8 @@ Ignore upper/lower case distinctions during comparisons.
|
||||||
<P>
|
<P>
|
||||||
<b>--include</b>=<i>pattern</i>
|
<b>--include</b>=<i>pattern</i>
|
||||||
If any <b>--include</b> patterns are specified, the only files that are
|
If any <b>--include</b> patterns are specified, the only files that are
|
||||||
processed are those that match one of the patterns (and do not match an
|
processed are those whose names match one of the patterns and do not match an
|
||||||
<b>--exclude</b> pattern). This option does not affect directories, but it
|
<b>--exclude</b> pattern. This option does not affect directories, but it
|
||||||
applies to all files, whether listed on the command line, obtained from
|
applies to all files, whether listed on the command line, obtained from
|
||||||
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
||||||
expression, and is matched against the final component of the file name, not
|
expression, and is matched against the final component of the file name, not
|
||||||
|
@ -463,8 +462,8 @@ may be given any number of times; all the files are read.
|
||||||
<P>
|
<P>
|
||||||
<b>--include-dir</b>=<i>pattern</i>
|
<b>--include-dir</b>=<i>pattern</i>
|
||||||
If any <b>--include-dir</b> patterns are specified, the only directories that
|
If any <b>--include-dir</b> patterns are specified, the only directories that
|
||||||
are processed are those that match one of the patterns (and do not match an
|
are processed are those whose names match one of the patterns and do not match
|
||||||
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
|
an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
|
||||||
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
||||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||||
the final component of the directory name, not the entire path. The <b>-F</b>,
|
the final component of the directory name, not the entire path. The <b>-F</b>,
|
||||||
|
@ -487,8 +486,9 @@ a separate line. Searching normally stops as soon as a matching line is found
|
||||||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||||
continues in order to obtain the correct count, and those files that have at
|
continues in order to obtain the correct count, and those files that have at
|
||||||
least one match are listed along with their counts. Using this option with
|
least one match are listed along with their counts. Using this option with
|
||||||
<b>-c</b> is a way of suppressing the listing of files with no matches. This
|
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||||
opeion overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options.
|
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||||
|
<b>-h</b>, or <b>-L</b> options.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--label</b>=<i>name</i>
|
<b>--label</b>=<i>name</i>
|
||||||
|
@ -501,8 +501,8 @@ short form for this option.
|
||||||
When this option is given, non-compressed input is read and processed line by
|
When this option is given, non-compressed input is read and processed line by
|
||||||
line, and the output is flushed after each write. By default, input is read in
|
line, and the output is flushed after each write. By default, input is read in
|
||||||
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
|
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
|
||||||
terminal (which is currently possible only in Unix-like environments or
|
terminal, which is currently possible only in Unix-like environments or
|
||||||
Windows). Output to terminal is normally automatically flushed by the operating
|
Windows. Output to terminal is normally automatically flushed by the operating
|
||||||
system. This option can be useful when the input or output is attached to a
|
system. This option can be useful when the input or output is attached to a
|
||||||
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
|
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
|
||||||
However, its use will affect performance, and the <b>-M</b> (multiline) option
|
However, its use will affect performance, and the <b>-M</b> (multiline) option
|
||||||
|
@ -528,6 +528,49 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||||
used. There is no short form for this option.
|
used. There is no short form for this option.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
<b>-M</b>, <b>--multiline</b>
|
||||||
|
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||||
|
library is called in "multiline" mode. This allows a matched string to extend
|
||||||
|
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||||
|
used with <b>-M</b> may usefully contain literal newline characters and internal
|
||||||
|
occurrences of ^ and $ characters. The output for a successful match may
|
||||||
|
consist of more than one line. The first line is the line in which the match
|
||||||
|
started, and the last line is the line in which the match ended. If the matched
|
||||||
|
string ends with a newline sequence, the output ends at the end of that line.
|
||||||
|
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
||||||
|
match has been handled, scanning restarts at the beginning of the line after
|
||||||
|
the one in which the match ended.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
The newline sequence that separates multiple lines must be matched as part of
|
||||||
|
the pattern. For example, to find the phrase "regular expression" in a file
|
||||||
|
where "regular" might be at the end of a line and "expression" at the start of
|
||||||
|
the next line, you could use this command:
|
||||||
|
<pre>
|
||||||
|
pcre2grep -M 'regular\s+expression' <file>
|
||||||
|
</pre>
|
||||||
|
The \s escape sequence matches any white space character, including newlines,
|
||||||
|
and is followed by + so as to match trailing white space on the first line as
|
||||||
|
well as possibly handling a two-character newline sequence.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
There is a limit to the number of lines that can be matched, imposed by the way
|
||||||
|
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||||
|
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||||
|
does not work when input is read line by line (see <b>--line-buffered</b>).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
|
||||||
|
Stop processing after finding <i>number</i> matching lines, or non-matching
|
||||||
|
lines if <b>-v</b> is also set. Any trailing context lines are output after the
|
||||||
|
final match. In multiline mode, each multiline match counts as just one line
|
||||||
|
for this purpose. If this limit is reached when reading the standard input from
|
||||||
|
a regular file, the file is left positioned just after the last matching line.
|
||||||
|
If <b>-c</b> is also set, the count that is output is never greater than
|
||||||
|
<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
|
||||||
|
<b>-q</b>, or when just checking for a match in a binary file.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
<b>--match-limit</b>=<i>number</i>
|
<b>--match-limit</b>=<i>number</i>
|
||||||
Processing some regular expression patterns may take a very long time to search
|
Processing some regular expression patterns may take a very long time to search
|
||||||
for all possible matching strings. Others may require a very large amount of
|
for all possible matching strings. Others may require a very large amount of
|
||||||
|
@ -568,38 +611,6 @@ set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||||
smaller than the starting buffer size.
|
smaller than the starting buffer size.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-M</b>, <b>--multiline</b>
|
|
||||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
|
||||||
library is called in "multiline" mode. This allows a matched string to extend
|
|
||||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
|
||||||
used with <b>-M</b> may usefully contain literal newline characters and internal
|
|
||||||
occurrences of ^ and $ characters. The output for a successful match may
|
|
||||||
consist of more than one line. The first line is the line in which the match
|
|
||||||
started, and the last line is the line in which the match ended. If the matched
|
|
||||||
string ends with a newline sequence, the output ends at the end of that line.
|
|
||||||
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
|
||||||
match has been handled, scanning restarts at the beginning of the line after
|
|
||||||
the one in which the match ended.
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
The newline sequence that separates multiple lines must be matched as part of
|
|
||||||
the pattern. For example, to find the phrase "regular expression" in a file
|
|
||||||
where "regular" might be at the end of a line and "expression" at the start of
|
|
||||||
the next line, you could use this command:
|
|
||||||
<pre>
|
|
||||||
pcre2grep -M 'regular\s+expression' <file>
|
|
||||||
</pre>
|
|
||||||
The \s escape sequence matches any white space character, including newlines,
|
|
||||||
and is followed by + so as to match trailing white space on the first line as
|
|
||||||
well as possibly handling a two-character newline sequence.
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
There is a limit to the number of lines that can be matched, imposed by the way
|
|
||||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
|
||||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
|
||||||
does not work when input is read line by line (see <b>--line-buffered</b>.)
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
||||||
Six different conventions for indicating the ends of lines in scanned files are
|
Six different conventions for indicating the ends of lines in scanned files are
|
||||||
supported. For example:
|
supported. For example:
|
||||||
|
@ -648,31 +659,41 @@ It should never be needed in normal use.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
|
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
|
||||||
When there is a match, instead of outputting the whole line that matched,
|
When there is a match, instead of outputting the line that matched, output just
|
||||||
output just the given text, followed by an operating-system standard newline.
|
the text specified in this option, followed by an operating-system standard
|
||||||
The <b>--newline</b> option has no effect on this option, which is mutually
|
newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
|
||||||
exclusive with <b>--only-matching</b>, <b>--file-offsets</b>, and
|
and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
|
||||||
<b>--line-offsets</b>. Escape sequences starting with a dollar character may be
|
this option, which is mutually exclusive with <b>--only-matching</b>,
|
||||||
used to insert the contents of the matched part of the line and/or captured
|
<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
|
||||||
substrings into the text.
|
<b>--only-matching</b>, if there is more than one match in a line, each of them
|
||||||
|
causes a line of output.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
$<digits> or ${<digits>} is replaced by the captured
|
Escape sequences starting with a dollar character may be used to insert the
|
||||||
substring of the given decimal number; zero substitutes the whole match. If
|
contents of the matched part of the line and/or captured substrings into the
|
||||||
the number is greater than the number of capturing substrings, or if the
|
text.
|
||||||
capture is unset, the replacement is empty.
|
<br>
|
||||||
|
<br>
|
||||||
|
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||||
|
decimal number; zero substitutes the whole match. If the number is greater than
|
||||||
|
the number of capturing substrings, or if the capture is unset, the replacement
|
||||||
|
is empty.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
$o<digits> is replaced by the character represented by the given octal
|
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||||
number; up to three digits are processed.
|
given octal number. In the first form, up to three octal digits are processed.
|
||||||
|
When more digits are needed in Unicode mode to specify a wide character, the
|
||||||
|
second form must be used.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||||
number; up to two digits are processed.
|
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||||
|
processed. When more digits are needed in Unicode mode to specify a wide
|
||||||
|
character, the second form must be used.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||||
|
@ -741,7 +762,8 @@ option to "recurse".
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--recursion-limit</b>=<i>number</i>
|
<b>--recursion-limit</b>=<i>number</i>
|
||||||
See <b>--match-limit</b> above.
|
This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
|
||||||
|
above for details.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-s</b>, <b>--no-messages</b>
|
<b>-s</b>, <b>--no-messages</b>
|
||||||
|
@ -765,15 +787,18 @@ total would always be zero.
|
||||||
<b>-u</b>, <b>--utf</b>
|
<b>-u</b>, <b>--utf</b>
|
||||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||||
<b>--include</b> options) and all subject lines that are scanned must be valid
|
<b>--include</b> options) and all lines that are scanned must be valid strings
|
||||||
strings of UTF-8 characters.
|
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||||
|
occurs.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-U</b>, <b>--utf-allow-invalid</b>
|
<b>-U</b>, <b>--utf-allow-invalid</b>
|
||||||
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
|
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
|
||||||
unit sequences. These can never form part of any pattern match. This facility
|
unit sequences. These can never form part of any pattern match. Patterns
|
||||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||||
For more details about matching in non-valid UTF-8 strings, see the
|
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||||
|
or other binary files. For more details about matching in non-valid UTF-8
|
||||||
|
strings, see the
|
||||||
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
|
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
|
||||||
documentation.
|
documentation.
|
||||||
</P>
|
</P>
|
||||||
|
@ -786,7 +811,9 @@ ignored.
|
||||||
<P>
|
<P>
|
||||||
<b>-v</b>, <b>--invert-match</b>
|
<b>-v</b>, <b>--invert-match</b>
|
||||||
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
||||||
the patterns are the ones that are found.
|
the patterns are the ones that are found. When this option is set, options such
|
||||||
|
as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
|
||||||
|
that are to be output, are ignored.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
||||||
|
@ -909,12 +936,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
|
||||||
only callouts with string arguments are useful.
|
only callouts with string arguments are useful.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
Echoing a specific string
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
Starting the callout string with a pipe character invokes an echoing facility
|
||||||
|
that avoids calling an external program or script. This facility is always
|
||||||
|
available, provided that callouts were not completely disabled when
|
||||||
|
<b>pcre2grep</b> was built. The rest of the callout string is processed as a
|
||||||
|
zero-terminated string, which means it should not contain any internal binary
|
||||||
|
zeros. It is written to the output, having first been passed through the same
|
||||||
|
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
|
||||||
|
above). However, $0 cannot be used to insert a matched substring because the
|
||||||
|
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||||
|
syntax errors in the string (for example, a dollar not followed by another
|
||||||
|
character) cause the callout to be ignored. No terminator is added to the
|
||||||
|
output string, so if you want a newline, you must include it explicitly using
|
||||||
|
the escape $n. For example:
|
||||||
|
<pre>
|
||||||
|
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||||
|
</pre>
|
||||||
|
Matching continues normally after the string is output. If you want to see only
|
||||||
|
the callout output but not any output from an actual match, you should end the
|
||||||
|
pattern with (*FAIL).
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
Calling external programs or scripts
|
Calling external programs or scripts
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
This facility can be independently disabled when <b>pcre2grep</b> is built. It
|
This facility can be independently disabled when <b>pcre2grep</b> is built. It
|
||||||
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
|
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
|
||||||
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
|
where <b>lib$spawn()</b> is used, and for any Unix-like environment where
|
||||||
<b>fork()</b> and <b>execv()</b> are available.
|
<b>fork()</b> and <b>execv()</b> are available.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -926,14 +977,11 @@ arguments:
|
||||||
executable_name|arg1|arg2|...
|
executable_name|arg1|arg2|...
|
||||||
</pre>
|
</pre>
|
||||||
Any substring (including the executable name) may contain escape sequences
|
Any substring (including the executable name) may contain escape sequences
|
||||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
started by a dollar character. These are the same as for the <b>--output</b>
|
||||||
captured substring of the given decimal number, which must be greater than
|
(<b>-O</b>) option documented above, except that $0 cannot insert the matched
|
||||||
zero. If the number is greater than the number of capturing substrings, or if
|
string because the match is still in progress. Instead, the character '0'
|
||||||
the capture is unset, the replacement is empty.
|
is inserted. If you need a literal dollar or pipe character in any
|
||||||
</P>
|
substring, use $$ or $| respectively. Here is an example:
|
||||||
<P>
|
|
||||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
|
||||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
|
||||||
<pre>
|
<pre>
|
||||||
echo -e "abcde\n12345" | pcre2grep \
|
echo -e "abcde\n12345" | pcre2grep \
|
||||||
'(?x)(.)(..(.))
|
'(?x)(.)(..(.))
|
||||||
|
@ -946,28 +994,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||||
Arg1: [1] [234] [4] Arg2: |1| ()
|
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||||
12345
|
12345
|
||||||
</pre>
|
</pre>
|
||||||
The parameters for the system call that is used to run the
|
The parameters for the system call that is used to run the program or script
|
||||||
program or script are zero-terminated strings. This means that binary zero
|
are zero-terminated strings. This means that binary zero characters in the
|
||||||
characters in the callout argument will cause premature termination of their
|
callout argument will cause premature termination of their substrings, and
|
||||||
substrings, and therefore should not be present. Any syntax errors in the
|
therefore should not be present. Any syntax errors in the string (for example,
|
||||||
string (for example, a dollar not followed by another character) cause the
|
a dollar not followed by another character) cause the callout to be ignored.
|
||||||
callout to be ignored. If running the program fails for any reason (including
|
If running the program fails for any reason (including the non-existence of the
|
||||||
the non-existence of the executable), a local matching failure occurs and the
|
executable), a local matching failure occurs and the matcher backtracks in the
|
||||||
matcher backtracks in the normal way.
|
normal way.
|
||||||
</P>
|
|
||||||
<br><b>
|
|
||||||
Echoing a specific string
|
|
||||||
</b><br>
|
|
||||||
<P>
|
|
||||||
This facility is always available, provided that callouts were not completely
|
|
||||||
disabled when <b>pcre2grep</b> was built. If the callout string starts with a
|
|
||||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
|
||||||
having been passed through the same escape processing as text from the --output
|
|
||||||
option. This provides a simple echoing facility that avoids calling an external
|
|
||||||
program or script. No terminator is added to the string, so if you want a
|
|
||||||
newline, you must include it explicitly. Matching continues normally after the
|
|
||||||
string is output. If you want to see only the callout output but not any output
|
|
||||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
|
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -999,7 +1033,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
|
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3).
|
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
|
||||||
|
<b>pcre2unicode</b>(3).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
|
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -1012,7 +1047,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 25 January 2020
|
Last updated: 04 October 2020
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2020 University of Cambridge.
|
Copyright © 1997-2020 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -323,7 +323,7 @@ test data, command lines that begin with # may appear. This file format, with
|
||||||
some restrictions, can also be processed by the <b>perltest.sh</b> script that
|
some restrictions, can also be processed by the <b>perltest.sh</b> script that
|
||||||
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
||||||
and Perl is the same. For a specification of <b>perltest.sh</b>, see the
|
and Perl is the same. For a specification of <b>perltest.sh</b>, see the
|
||||||
comments near its beginning.
|
comments near its beginning. See also the #perltest command below.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
|
When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
|
||||||
|
@ -420,14 +420,20 @@ patterns. Modifiers on a pattern can change these settings.
|
||||||
<pre>
|
<pre>
|
||||||
#perltest
|
#perltest
|
||||||
</pre>
|
</pre>
|
||||||
The appearance of this line causes all subsequent modifier settings to be
|
This line is used in test files that can also be processed by <b>perltest.sh</b>
|
||||||
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
to confirm that Perl gives the same results as PCRE2. Subsequent tests are
|
||||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
checked for the use of <b>pcre2test</b> features that are incompatible with the
|
||||||
lines, #pattern commands, and #subject commands that set or unset "mark", no
|
<b>perltest.sh</b> script.
|
||||||
command lines are permitted, because they and many of the modifiers are
|
</P>
|
||||||
specific to <b>pcre2test</b>, and should not be used in test files that are also
|
<P>
|
||||||
processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests
|
Patterns must use '/' as their delimiter, and only certain modifiers are
|
||||||
that are accidentally put in the wrong file.
|
supported. Comment lines, #pattern commands, and #subject commands that set or
|
||||||
|
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
|
||||||
|
#newline_default commands, which are needed in the relevant pcre2test files,
|
||||||
|
are silently ignored. All other command lines are ignored, but give a warning
|
||||||
|
message. The <b>#perltest</b> command helps detect tests that are accidentally
|
||||||
|
put in the wrong file or use the wrong delimiter. For more details of the
|
||||||
|
<b>perltest.sh</b> script see the comments it contains.
|
||||||
<pre>
|
<pre>
|
||||||
#pop [<modifiers>]
|
#pop [<modifiers>]
|
||||||
#popcopy [<modifiers>]
|
#popcopy [<modifiers>]
|
||||||
|
@ -2113,7 +2119,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 20 March 2020
|
Last updated: 14 September 2020
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2020 University of Cambridge.
|
Copyright © 1997-2020 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
232
doc/pcre2grep.1
232
doc/pcre2grep.1
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2GREP 1 "25 January 2020" "PCRE2 10.35"
|
.TH PCRE2GREP 1 "04 October 2020" "PCRE2 10.36"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -79,8 +79,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
|
||||||
(either shown literally, or as an offset), scanning resumes immediately
|
(either shown literally, or as an offset), scanning resumes immediately
|
||||||
following the match, so that further matches on the same line can be found. If
|
following the match, so that further matches on the same line can be found. If
|
||||||
there are multiple patterns, they are all tried on the remainder of the line,
|
there are multiple patterns, they are all tried on the remainder of the line,
|
||||||
but patterns that follow the one that matched are not tried on the earlier part
|
but patterns that follow the one that matched are not tried on the earlier
|
||||||
of the line.
|
matched part of the line.
|
||||||
.P
|
.P
|
||||||
This behaviour means that the order in which multiple patterns are specified
|
This behaviour means that the order in which multiple patterns are specified
|
||||||
can affect the output when one of the above options is used. This is no longer
|
can affect the output when one of the above options is used. This is no longer
|
||||||
|
@ -115,11 +115,10 @@ ignored.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||||
is identified as a binary file, and is processed specially. (GNU grep
|
is identified as a binary file, and is processed specially. However, if the
|
||||||
identifies binary files in this manner.) However, if the newline type is
|
newline type is specified as NUL, that is, the line terminator is a binary
|
||||||
specified as NUL, that is, the line terminator is a binary zero, the test for
|
zero, the test for a binary file is not applied. See the \fB--binary-files\fP
|
||||||
a binary file is not applied. See the \fB--binary-files\fP option for a means
|
option for a means of changing the way binary files are handled.
|
||||||
of changing the way binary files are handled.
|
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "BINARY ZEROS IN PATTERNS"
|
.SH "BINARY ZEROS IN PATTERNS"
|
||||||
|
@ -383,8 +382,8 @@ Ignore upper/lower case distinctions during comparisons.
|
||||||
.TP
|
.TP
|
||||||
\fB--include\fP=\fIpattern\fP
|
\fB--include\fP=\fIpattern\fP
|
||||||
If any \fB--include\fP patterns are specified, the only files that are
|
If any \fB--include\fP patterns are specified, the only files that are
|
||||||
processed are those that match one of the patterns (and do not match an
|
processed are those whose names match one of the patterns and do not match an
|
||||||
\fB--exclude\fP pattern). This option does not affect directories, but it
|
\fB--exclude\fP pattern. This option does not affect directories, but it
|
||||||
applies to all files, whether listed on the command line, obtained from
|
applies to all files, whether listed on the command line, obtained from
|
||||||
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||||
expression, and is matched against the final component of the file name, not
|
expression, and is matched against the final component of the file name, not
|
||||||
|
@ -401,8 +400,8 @@ may be given any number of times; all the files are read.
|
||||||
.TP
|
.TP
|
||||||
\fB--include-dir\fP=\fIpattern\fP
|
\fB--include-dir\fP=\fIpattern\fP
|
||||||
If any \fB--include-dir\fP patterns are specified, the only directories that
|
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||||
are processed are those that match one of the patterns (and do not match an
|
are processed are those whose names match one of the patterns and do not match
|
||||||
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
|
an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
|
||||||
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||||
the final component of the directory name, not the entire path. The \fB-F\fP,
|
the final component of the directory name, not the entire path. The \fB-F\fP,
|
||||||
|
@ -423,8 +422,9 @@ a separate line. Searching normally stops as soon as a matching line is found
|
||||||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||||
continues in order to obtain the correct count, and those files that have at
|
continues in order to obtain the correct count, and those files that have at
|
||||||
least one match are listed along with their counts. Using this option with
|
least one match are listed along with their counts. Using this option with
|
||||||
\fB-c\fP is a way of suppressing the listing of files with no matches. This
|
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||||
opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options.
|
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||||
|
\fB-h\fP, or \fB-L\fP options.
|
||||||
.TP
|
.TP
|
||||||
\fB--label\fP=\fIname\fP
|
\fB--label\fP=\fIname\fP
|
||||||
This option supplies a name to be used for the standard input when file names
|
This option supplies a name to be used for the standard input when file names
|
||||||
|
@ -435,8 +435,8 @@ short form for this option.
|
||||||
When this option is given, non-compressed input is read and processed line by
|
When this option is given, non-compressed input is read and processed line by
|
||||||
line, and the output is flushed after each write. By default, input is read in
|
line, and the output is flushed after each write. By default, input is read in
|
||||||
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
||||||
terminal (which is currently possible only in Unix-like environments or
|
terminal, which is currently possible only in Unix-like environments or
|
||||||
Windows). Output to terminal is normally automatically flushed by the operating
|
Windows. Output to a terminal is normally automatically flushed by the operating
|
||||||
system. This option can be useful when the input or output is attached to a
|
system. This option can be useful when the input or output is attached to a
|
||||||
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
||||||
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
||||||
|
@ -459,6 +459,45 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
||||||
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||||
used. There is no short form for this option.
|
used. There is no short form for this option.
|
||||||
.TP
|
.TP
|
||||||
|
\fB-M\fP, \fB--multiline\fP
|
||||||
|
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||||
|
library is called in "multiline" mode. This allows a matched string to extend
|
||||||
|
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||||
|
used with \fB-M\fP may usefully contain literal newline characters and internal
|
||||||
|
occurrences of ^ and $ characters. The output for a successful match may
|
||||||
|
consist of more than one line. The first line is the line in which the match
|
||||||
|
started, and the last line is the line in which the match ended. If the matched
|
||||||
|
string ends with a newline sequence, the output ends at the end of that line.
|
||||||
|
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
|
||||||
|
match has been handled, scanning restarts at the beginning of the line after
|
||||||
|
the one in which the match ended.
|
||||||
|
.sp
|
||||||
|
The newline sequence that separates multiple lines must be matched as part of
|
||||||
|
the pattern. For example, to find the phrase "regular expression" in a file
|
||||||
|
where "regular" might be at the end of a line and "expression" at the start of
|
||||||
|
the next line, you could use this command:
|
||||||
|
.sp
|
||||||
|
pcre2grep -M 'regular\es+expression' <file>
|
||||||
|
.sp
|
||||||
|
The \es escape sequence matches any white space character, including newlines,
|
||||||
|
and is followed by + so as to match trailing white space on the first line as
|
||||||
|
well as possibly handling a two-character newline sequence.
|
||||||
|
.sp
|
||||||
|
There is a limit to the number of lines that can be matched, imposed by the way
|
||||||
|
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||||
|
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||||
|
does not work when input is read line by line (see \fB--line-buffered\fP).
|
||||||
|
.TP
|
||||||
|
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
|
||||||
|
Stop processing after finding \fInumber\fP matching lines, or non-matching
|
||||||
|
lines if \fB-v\fP is also set. Any trailing context lines are output after the
|
||||||
|
final match. In multiline mode, each multiline match counts as just one line
|
||||||
|
for this purpose. If this limit is reached when reading the standard input from
|
||||||
|
a regular file, the file is left positioned just after the last matching line.
|
||||||
|
If \fB-c\fP is also set, the count that is output is never greater than
|
||||||
|
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
|
||||||
|
\fB-q\fP, or when just checking for a match in a binary file.
|
||||||
|
.TP
|
||||||
\fB--match-limit\fP=\fInumber\fP
|
\fB--match-limit\fP=\fInumber\fP
|
||||||
Processing some regular expression patterns may take a very long time to search
|
Processing some regular expression patterns may take a very long time to search
|
||||||
for all possible matching strings. Others may require a very large amount of
|
for all possible matching strings. Others may require a very large amount of
|
||||||
|
@ -493,35 +532,6 @@ This limits the expansion of the processing buffer, whose initial size can be
|
||||||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||||
smaller than the starting buffer size.
|
smaller than the starting buffer size.
|
||||||
.TP
|
.TP
|
||||||
\fB-M\fP, \fB--multiline\fP
|
|
||||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
|
||||||
library is called in "multiline" mode. This allows a matched string to extend
|
|
||||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
|
||||||
used with \fB-M\fP may usefully contain literal newline characters and internal
|
|
||||||
occurrences of ^ and $ characters. The output for a successful match may
|
|
||||||
consist of more than one line. The first line is the line in which the match
|
|
||||||
started, and the last line is the line in which the match ended. If the matched
|
|
||||||
string ends with a newline sequence, the output ends at the end of that line.
|
|
||||||
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
|
|
||||||
match has been handled, scanning restarts at the beginning of the line after
|
|
||||||
the one in which the match ended.
|
|
||||||
.sp
|
|
||||||
The newline sequence that separates multiple lines must be matched as part of
|
|
||||||
the pattern. For example, to find the phrase "regular expression" in a file
|
|
||||||
where "regular" might be at the end of a line and "expression" at the start of
|
|
||||||
the next line, you could use this command:
|
|
||||||
.sp
|
|
||||||
pcre2grep -M 'regular\es+expression' <file>
|
|
||||||
.sp
|
|
||||||
The \es escape sequence matches any white space character, including newlines,
|
|
||||||
and is followed by + so as to match trailing white space on the first line as
|
|
||||||
well as possibly handling a two-character newline sequence.
|
|
||||||
.sp
|
|
||||||
There is a limit to the number of lines that can be matched, imposed by the way
|
|
||||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
|
||||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
|
||||||
does not work when input is read line by line (see \fB--line-buffered\fP.)
|
|
||||||
.TP
|
|
||||||
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||||
Six different conventions for indicating the ends of lines in scanned files are
|
Six different conventions for indicating the ends of lines in scanned files are
|
||||||
supported. For example:
|
supported. For example:
|
||||||
|
@ -565,27 +575,36 @@ use of JIT at run time. It is provided for testing and working round problems.
|
||||||
It should never be needed in normal use.
|
It should never be needed in normal use.
|
||||||
.TP
|
.TP
|
||||||
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
||||||
When there is a match, instead of outputting the whole line that matched,
|
When there is a match, instead of outputting the line that matched, output just
|
||||||
output just the given text, followed by an operating-system standard newline.
|
the text specified in this option, followed by an operating-system standard
|
||||||
The \fB--newline\fP option has no effect on this option, which is mutually
|
newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
|
||||||
exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and
|
and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
|
||||||
\fB--line-offsets\fP. Escape sequences starting with a dollar character may be
|
this option, which is mutually exclusive with \fB--only-matching\fP,
|
||||||
used to insert the contents of the matched part of the line and/or captured
|
\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
|
||||||
substrings into the text.
|
\fB--only-matching\fP, if there is more than one match in a line, each of them
|
||||||
|
causes a line of output.
|
||||||
.sp
|
.sp
|
||||||
$<digits> or ${<digits>} is replaced by the captured
|
Escape sequences starting with a dollar character may be used to insert the
|
||||||
substring of the given decimal number; zero substitutes the whole match. If
|
contents of the matched part of the line and/or captured substrings into the
|
||||||
the number is greater than the number of capturing substrings, or if the
|
text.
|
||||||
capture is unset, the replacement is empty.
|
.sp
|
||||||
|
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||||
|
decimal number; zero substitutes the whole match. If the number is greater than
|
||||||
|
the number of capturing substrings, or if the capture is unset, the replacement
|
||||||
|
is empty.
|
||||||
.sp
|
.sp
|
||||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||||
.sp
|
.sp
|
||||||
$o<digits> is replaced by the character represented by the given octal
|
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||||
number; up to three digits are processed.
|
given octal number. In the first form, up to three octal digits are processed.
|
||||||
|
When more digits are needed in Unicode mode to specify a wide character, the
|
||||||
|
second form must be used.
|
||||||
.sp
|
.sp
|
||||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||||
number; up to two digits are processed.
|
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||||
|
processed. When more digits are needed in Unicode mode to specify a wide
|
||||||
|
character, the second form must be used.
|
||||||
.sp
|
.sp
|
||||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||||
a single dollar.
|
a single dollar.
|
||||||
|
@ -644,7 +663,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
||||||
option to "recurse".
|
option to "recurse".
|
||||||
.TP
|
.TP
|
||||||
\fB--recursion-limit\fP=\fInumber\fP
|
\fB--recursion-limit\fP=\fInumber\fP
|
||||||
See \fB--match-limit\fP above.
|
This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
|
||||||
|
above for details.
|
||||||
.TP
|
.TP
|
||||||
\fB-s\fP, \fB--no-messages\fP
|
\fB-s\fP, \fB--no-messages\fP
|
||||||
Suppress error messages about non-existent or unreadable files. Such files are
|
Suppress error messages about non-existent or unreadable files. Such files are
|
||||||
|
@ -665,14 +685,17 @@ total would always be zero.
|
||||||
\fB-u\fP, \fB--utf\fP
|
\fB-u\fP, \fB--utf\fP
|
||||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||||
\fB--include\fP options) and all subject lines that are scanned must be valid
|
\fB--include\fP options) and all lines that are scanned must be valid strings
|
||||||
strings of UTF-8 characters.
|
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||||
|
occurs.
|
||||||
.TP
|
.TP
|
||||||
\fB-U\fP, \fB--utf-allow-invalid\fP
|
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||||
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||||
unit sequences. These can never form part of any pattern match. This facility
|
unit sequences. These can never form part of any pattern match. Patterns
|
||||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||||
For more details about matching in non-valid UTF-8 strings, see the
|
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||||
|
or other binary files. For more details about matching in non-valid UTF-8
|
||||||
|
strings, see the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2unicode\fP(3)
|
\fBpcre2unicode\fP(3)
|
||||||
.\"
|
.\"
|
||||||
|
@ -685,7 +708,9 @@ ignored.
|
||||||
.TP
|
.TP
|
||||||
\fB-v\fP, \fB--invert-match\fP
|
\fB-v\fP, \fB--invert-match\fP
|
||||||
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||||
the patterns are the ones that are found.
|
the patterns are the ones that are found. When this option is set, options such
|
||||||
|
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
|
||||||
|
that are to be output, are ignored.
|
||||||
.TP
|
.TP
|
||||||
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||||
Force the patterns only to match "words". That is, there must be a word
|
Force the patterns only to match "words". That is, there must be a word
|
||||||
|
@ -812,12 +837,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
|
||||||
only callouts with string arguments are useful.
|
only callouts with string arguments are useful.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.SS "Echoing a specific string"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Starting the callout string with a pipe character invokes an echoing facility
|
||||||
|
that avoids calling an external program or script. This facility is always
|
||||||
|
available, provided that callouts were not completely disabled when
|
||||||
|
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
|
||||||
|
zero-terminated string, which means it should not contain any internal binary
|
||||||
|
zeros. It is written to the output, having first been passed through the same
|
||||||
|
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||||
|
above). However, $0 cannot be used to insert a matched substring because the
|
||||||
|
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||||
|
syntax errors in the string (for example, a dollar not followed by another
|
||||||
|
character) cause the callout to be ignored. No terminator is added to the
|
||||||
|
output string, so if you want a newline, you must include it explicitly using
|
||||||
|
the escape $n. For example:
|
||||||
|
.sp
|
||||||
|
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||||
|
.sp
|
||||||
|
Matching continues normally after the string is output. If you want to see only
|
||||||
|
the callout output but not any output from an actual match, you should end the
|
||||||
|
pattern with (*FAIL).
|
||||||
|
.
|
||||||
|
.
|
||||||
.SS "Calling external programs or scripts"
|
.SS "Calling external programs or scripts"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
||||||
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
||||||
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
|
where \fBlib$spawn()\fP is used, and for any Unix-like environment where
|
||||||
\fBfork()\fP and \fBexecv()\fP are available.
|
\fBfork()\fP and \fBexecv()\fP are available.
|
||||||
.P
|
.P
|
||||||
If the callout string does not start with a pipe (vertical bar) character, it
|
If the callout string does not start with a pipe (vertical bar) character, it
|
||||||
|
@ -828,13 +877,11 @@ arguments:
|
||||||
executable_name|arg1|arg2|...
|
executable_name|arg1|arg2|...
|
||||||
.sp
|
.sp
|
||||||
Any substring (including the executable name) may contain escape sequences
|
Any substring (including the executable name) may contain escape sequences
|
||||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
started by a dollar character. These are the same as for the \fB--output\fP
|
||||||
captured substring of the given decimal number, which must be greater than
|
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||||
zero. If the number is greater than the number of capturing substrings, or if
|
string because the match is still in progress. Instead, the character '0'
|
||||||
the capture is unset, the replacement is empty.
|
is inserted. If you need a literal dollar or pipe character in any
|
||||||
.P
|
substring, use $$ or $| respectively. Here is an example:
|
||||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
|
||||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
|
||||||
.sp
|
.sp
|
||||||
echo -e "abcde\en12345" | pcre2grep \e
|
echo -e "abcde\en12345" | pcre2grep \e
|
||||||
'(?x)(.)(..(.))
|
'(?x)(.)(..(.))
|
||||||
|
@ -847,28 +894,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||||
Arg1: [1] [234] [4] Arg2: |1| ()
|
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||||
12345
|
12345
|
||||||
.sp
|
.sp
|
||||||
The parameters for the system call that is used to run the
|
The parameters for the system call that is used to run the program or script
|
||||||
program or script are zero-terminated strings. This means that binary zero
|
are zero-terminated strings. This means that binary zero characters in the
|
||||||
characters in the callout argument will cause premature termination of their
|
callout argument will cause premature termination of their substrings, and
|
||||||
substrings, and therefore should not be present. Any syntax errors in the
|
therefore should not be present. Any syntax errors in the string (for example,
|
||||||
string (for example, a dollar not followed by another character) cause the
|
a dollar not followed by another character) cause the callout to be ignored.
|
||||||
callout to be ignored. If running the program fails for any reason (including
|
If running the program fails for any reason (including the non-existence of the
|
||||||
the non-existence of the executable), a local matching failure occurs and the
|
executable), a local matching failure occurs and the matcher backtracks in the
|
||||||
matcher backtracks in the normal way.
|
normal way.
|
||||||
.
|
|
||||||
.
|
|
||||||
.SS "Echoing a specific string"
|
|
||||||
.rs
|
|
||||||
.sp
|
|
||||||
This facility is always available, provided that callouts were not completely
|
|
||||||
disabled when \fBpcre2grep\fP was built. If the callout string starts with a
|
|
||||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
|
||||||
having been passed through the same escape processing as text from the --output
|
|
||||||
option. This provides a simple echoing facility that avoids calling an external
|
|
||||||
program or script. No terminator is added to the string, so if you want a
|
|
||||||
newline, you must include it explicitly. Matching continues normally after the
|
|
||||||
string is output. If you want to see only the callout output but not any output
|
|
||||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "MATCHING ERRORS"
|
.SH "MATCHING ERRORS"
|
||||||
|
@ -904,7 +937,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
||||||
.SH "SEE ALSO"
|
.SH "SEE ALSO"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
|
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
|
||||||
|
\fBpcre2unicode\fP(3).
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH AUTHOR
|
.SH AUTHOR
|
||||||
|
@ -921,6 +955,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 25 January 2020
|
Last updated: 04 October 2020
|
||||||
Copyright (c) 1997-2020 University of Cambridge.
|
Copyright (c) 1997-2020 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -80,7 +80,7 @@ DESCRIPTION
|
||||||
following the match, so that further matches on the same line can be
|
following the match, so that further matches on the same line can be
|
||||||
found. If there are multiple patterns, they are all tried on the re-
|
found. If there are multiple patterns, they are all tried on the re-
|
||||||
mainder of the line, but patterns that follow the one that matched are
|
mainder of the line, but patterns that follow the one that matched are
|
||||||
not tried on the earlier part of the line.
|
not tried on the earlier matched part of the line.
|
||||||
|
|
||||||
This behaviour means that the order in which multiple patterns are
|
This behaviour means that the order in which multiple patterns are
|
||||||
specified can affect the output when one of the above options is used.
|
specified can affect the output when one of the above options is used.
|
||||||
|
@ -115,10 +115,10 @@ BINARY FILES
|
||||||
|
|
||||||
By default, a file that contains a binary zero byte within the first
|
By default, a file that contains a binary zero byte within the first
|
||||||
1024 bytes is identified as a binary file, and is processed specially.
|
1024 bytes is identified as a binary file, and is processed specially.
|
||||||
(GNU grep identifies binary files in this manner.) However, if the new-
|
However, if the newline type is specified as NUL, that is, the line
|
||||||
line type is specified as NUL, that is, the line terminator is a binary
|
terminator is a binary zero, the test for a binary file is not applied.
|
||||||
zero, the test for a binary file is not applied. See the --binary-files
|
See the --binary-files option for a means of changing the way binary
|
||||||
option for a means of changing the way binary files are handled.
|
files are handled.
|
||||||
|
|
||||||
|
|
||||||
BINARY ZEROS IN PATTERNS
|
BINARY ZEROS IN PATTERNS
|
||||||
|
@ -413,17 +413,17 @@ OPTIONS
|
||||||
|
|
||||||
--include=pattern
|
--include=pattern
|
||||||
If any --include patterns are specified, the only files that
|
If any --include patterns are specified, the only files that
|
||||||
are processed are those that match one of the patterns (and
|
are processed are those whose names match one of the patterns
|
||||||
do not match an --exclude pattern). This option does not af-
|
and do not match an --exclude pattern. This option does not
|
||||||
fect directories, but it applies to all files, whether listed
|
affect directories, but it applies to all files, whether
|
||||||
on the command line, obtained from --file-list, or by scan-
|
listed on the command line, obtained from --file-list, or by
|
||||||
ning a directory. The pattern is a PCRE2 regular expression,
|
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||||
and is matched against the final component of the file name,
|
sion, and is matched against the final component of the file
|
||||||
not the entire path. The -F, -w, and -x options do not apply
|
name, not the entire path. The -F, -w, and -x options do not
|
||||||
to this pattern. The option may be given any number of times.
|
apply to this pattern. The option may be given any number of
|
||||||
If a file name matches both an --include and an --exclude
|
times. If a file name matches both an --include and an --ex-
|
||||||
pattern, it is excluded. There is no short form for this op-
|
clude pattern, it is excluded. There is no short form for
|
||||||
tion.
|
this option.
|
||||||
|
|
||||||
--include-from=filename
|
--include-from=filename
|
||||||
Treat each non-empty line of the file as the data for an
|
Treat each non-empty line of the file as the data for an
|
||||||
|
@ -434,8 +434,8 @@ OPTIONS
|
||||||
|
|
||||||
--include-dir=pattern
|
--include-dir=pattern
|
||||||
If any --include-dir patterns are specified, the only direc-
|
If any --include-dir patterns are specified, the only direc-
|
||||||
tories that are processed are those that match one of the
|
tories that are processed are those whose names match one of
|
||||||
patterns (and do not match an --exclude-dir pattern). This
|
the patterns and do not match an --exclude-dir pattern. This
|
||||||
applies to all directories, whether listed on the command
|
applies to all directories, whether listed on the command
|
||||||
line, obtained from --file-list, or by scanning a parent di-
|
line, obtained from --file-list, or by scanning a parent di-
|
||||||
rectory. The pattern is a PCRE2 regular expression, and is
|
rectory. The pattern is a PCRE2 regular expression, and is
|
||||||
|
@ -461,8 +461,9 @@ OPTIONS
|
||||||
matching continues in order to obtain the correct count, and
|
matching continues in order to obtain the correct count, and
|
||||||
those files that have at least one match are listed along
|
those files that have at least one match are listed along
|
||||||
with their counts. Using this option with -c is a way of sup-
|
with their counts. Using this option with -c is a way of sup-
|
||||||
pressing the listing of files with no matches. This opeion
|
pressing the listing of files with no matches that occurs
|
||||||
overrides any previous -H, -h, or -L options.
|
with -c on its own. This option overrides any previous -H,
|
||||||
|
-h, or -L options.
|
||||||
|
|
||||||
--label=name
|
--label=name
|
||||||
This option supplies a name to be used for the standard input
|
This option supplies a name to be used for the standard input
|
||||||
|
@ -470,37 +471,84 @@ OPTIONS
|
||||||
input)" is used. There is no short form for this option.
|
input)" is used. There is no short form for this option.
|
||||||
|
|
||||||
--line-buffered
|
--line-buffered
|
||||||
When this option is given, non-compressed input is read and
|
When this option is given, non-compressed input is read and
|
||||||
processed line by line, and the output is flushed after each
|
processed line by line, and the output is flushed after each
|
||||||
write. By default, input is read in large chunks, unless
|
write. By default, input is read in large chunks, unless
|
||||||
pcre2grep can determine that it is reading from a terminal
|
pcre2grep can determine that it is reading from a terminal,
|
||||||
(which is currently possible only in Unix-like environments
|
which is currently possible only in Unix-like environments or
|
||||||
or Windows). Output to terminal is normally automatically
|
Windows. Output to terminal is normally automatically flushed
|
||||||
flushed by the operating system. This option can be useful
|
by the operating system. This option can be useful when the
|
||||||
when the input or output is attached to a pipe and you do not
|
input or output is attached to a pipe and you do not want
|
||||||
want pcre2grep to buffer up large amounts of data. However,
|
pcre2grep to buffer up large amounts of data. However, its
|
||||||
its use will affect performance, and the -M (multiline) op-
|
use will affect performance, and the -M (multiline) option
|
||||||
tion ceases to work. When input is from a compressed .gz or
|
ceases to work. When input is from a compressed .gz or .bz2
|
||||||
.bz2 file, --line-buffered is ignored.
|
file, --line-buffered is ignored.
|
||||||
|
|
||||||
--line-offsets
|
--line-offsets
|
||||||
Instead of showing lines or parts of lines that match, show
|
Instead of showing lines or parts of lines that match, show
|
||||||
each match as a line number, the offset from the start of the
|
each match as a line number, the offset from the start of the
|
||||||
line, and a length. The line number is terminated by a colon
|
line, and a length. The line number is terminated by a colon
|
||||||
(as usual; see the -n option), and the offset and length are
|
(as usual; see the -n option), and the offset and length are
|
||||||
separated by a comma. In this mode, no context is shown.
|
separated by a comma. In this mode, no context is shown.
|
||||||
That is, the -A, -B, and -C options are ignored. If there is
|
That is, the -A, -B, and -C options are ignored. If there is
|
||||||
more than one match in a line, each of them is shown sepa-
|
more than one match in a line, each of them is shown sepa-
|
||||||
rately. This option is mutually exclusive with --output,
|
rately. This option is mutually exclusive with --output,
|
||||||
--file-offsets, and --only-matching.
|
--file-offsets, and --only-matching.
|
||||||
|
|
||||||
--locale=locale-name
|
--locale=locale-name
|
||||||
This option specifies a locale to be used for pattern match-
|
This option specifies a locale to be used for pattern match-
|
||||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||||
ronment variables. If no locale is specified, the PCRE2 li-
|
ronment variables. If no locale is specified, the PCRE2 li-
|
||||||
brary's default (usually the "C" locale) is used. There is no
|
brary's default (usually the "C" locale) is used. There is no
|
||||||
short form for this option.
|
short form for this option.
|
||||||
|
|
||||||
|
-M, --multiline
|
||||||
|
Allow patterns to match more than one line. When this option
|
||||||
|
is set, the PCRE2 library is called in "multiline" mode. This
|
||||||
|
allows a matched string to extend past the end of a line and
|
||||||
|
continue on one or more subsequent lines. Patterns used with
|
||||||
|
-M may usefully contain literal newline characters and inter-
|
||||||
|
nal occurrences of ^ and $ characters. The output for a suc-
|
||||||
|
cessful match may consist of more than one line. The first
|
||||||
|
line is the line in which the match started, and the last
|
||||||
|
line is the line in which the match ended. If the matched
|
||||||
|
string ends with a newline sequence, the output ends at the
|
||||||
|
end of that line. If -v is set, none of the lines in a
|
||||||
|
multi-line match are output. Once a match has been handled,
|
||||||
|
scanning restarts at the beginning of the line after the one
|
||||||
|
in which the match ended.
|
||||||
|
|
||||||
|
The newline sequence that separates multiple lines must be
|
||||||
|
matched as part of the pattern. For example, to find the
|
||||||
|
phrase "regular expression" in a file where "regular" might
|
||||||
|
be at the end of a line and "expression" at the start of the
|
||||||
|
next line, you could use this command:
|
||||||
|
|
||||||
|
pcre2grep -M 'regular\s+expression' <file>
|
||||||
|
|
||||||
|
The \s escape sequence matches any white space character, in-
|
||||||
|
cluding newlines, and is followed by + so as to match trail-
|
||||||
|
ing white space on the first line as well as possibly han-
|
||||||
|
dling a two-character newline sequence.
|
||||||
|
|
||||||
|
There is a limit to the number of lines that can be matched,
|
||||||
|
imposed by the way that pcre2grep buffers the input file as
|
||||||
|
it scans it. With a sufficiently large processing buffer,
|
||||||
|
this should not be a problem, but the -M option does not work
|
||||||
|
when input is read line by line (see --line-buffered).
|
||||||
|
|
||||||
|
-m number, --max-count=number
|
||||||
|
Stop processing after finding number matching lines, or non-
|
||||||
|
matching lines if -v is also set. Any trailing context lines
|
||||||
|
are output after the final match. In multiline mode, each
|
||||||
|
multiline match counts as just one line for this purpose. If
|
||||||
|
this limit is reached when reading the standard input from a
|
||||||
|
regular file, the file is left positioned just after the last
|
||||||
|
matching line. If -c is also set, the count that is output
|
||||||
|
is never greater than number. This option has no effect if
|
||||||
|
used with -L, -l, or -q, or when just checking for a match in
|
||||||
|
a binary file.
|
||||||
|
|
||||||
--match-limit=number
|
--match-limit=number
|
||||||
Processing some regular expression patterns may take a very
|
Processing some regular expression patterns may take a very
|
||||||
long time to search for all possible matching strings. Others
|
long time to search for all possible matching strings. Others
|
||||||
|
@ -542,41 +590,6 @@ OPTIONS
|
||||||
size is silently forced to be no smaller than the starting
|
size is silently forced to be no smaller than the starting
|
||||||
buffer size.
|
buffer size.
|
||||||
|
|
||||||
-M, --multiline
|
|
||||||
Allow patterns to match more than one line. When this option
|
|
||||||
is set, the PCRE2 library is called in "multiline" mode. This
|
|
||||||
allows a matched string to extend past the end of a line and
|
|
||||||
continue on one or more subsequent lines. Patterns used with
|
|
||||||
-M may usefully contain literal newline characters and inter-
|
|
||||||
nal occurrences of ^ and $ characters. The output for a suc-
|
|
||||||
cessful match may consist of more than one line. The first
|
|
||||||
line is the line in which the match started, and the last
|
|
||||||
line is the line in which the match ended. If the matched
|
|
||||||
string ends with a newline sequence, the output ends at the
|
|
||||||
end of that line. If -v is set, none of the lines in a
|
|
||||||
multi-line match are output. Once a match has been handled,
|
|
||||||
scanning restarts at the beginning of the line after the one
|
|
||||||
in which the match ended.
|
|
||||||
|
|
||||||
The newline sequence that separates multiple lines must be
|
|
||||||
matched as part of the pattern. For example, to find the
|
|
||||||
phrase "regular expression" in a file where "regular" might
|
|
||||||
be at the end of a line and "expression" at the start of the
|
|
||||||
next line, you could use this command:
|
|
||||||
|
|
||||||
pcre2grep -M 'regular\s+expression' <file>
|
|
||||||
|
|
||||||
The \s escape sequence matches any white space character, in-
|
|
||||||
cluding newlines, and is followed by + so as to match trail-
|
|
||||||
ing white space on the first line as well as possibly han-
|
|
||||||
dling a two-character newline sequence.
|
|
||||||
|
|
||||||
There is a limit to the number of lines that can be matched,
|
|
||||||
imposed by the way that pcre2grep buffers the input file as
|
|
||||||
it scans it. With a sufficiently large processing buffer,
|
|
||||||
this should not be a problem, but the -M option does not work
|
|
||||||
when input is read line by line (see --line-buffered.)
|
|
||||||
|
|
||||||
-N newline-type, --newline=newline-type
|
-N newline-type, --newline=newline-type
|
||||||
Six different conventions for indicating the ends of lines in
|
Six different conventions for indicating the ends of lines in
|
||||||
scanned files are supported. For example:
|
scanned files are supported. For example:
|
||||||
|
@ -625,97 +638,109 @@ OPTIONS
|
||||||
lems. It should never be needed in normal use.
|
lems. It should never be needed in normal use.
|
||||||
|
|
||||||
-O text, --output=text
|
-O text, --output=text
|
||||||
When there is a match, instead of outputting the whole line
|
When there is a match, instead of outputting the line that
|
||||||
that matched, output just the given text, followed by an op-
|
matched, output just the text specified in this option, fol-
|
||||||
erating-system standard newline. The --newline option has no
|
lowed by an operating-system standard newline. In this mode,
|
||||||
effect on this option, which is mutually exclusive with
|
no context is shown. That is, the -A, -B, and -C options are
|
||||||
--only-matching, --file-offsets, and --line-offsets. Escape
|
ignored. The --newline option has no effect on this option,
|
||||||
sequences starting with a dollar character may be used to in-
|
which is mutually exclusive with --only-matching, --file-off-
|
||||||
sert the contents of the matched part of the line and/or cap-
|
sets, and --line-offsets. However, like --only-matching, if
|
||||||
tured substrings into the text.
|
there is more than one match in a line, each of them causes a
|
||||||
|
line of output.
|
||||||
|
|
||||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
Escape sequences starting with a dollar character may be used
|
||||||
string of the given decimal number; zero substitutes the
|
to insert the contents of the matched part of the line and/or
|
||||||
|
captured substrings into the text.
|
||||||
|
|
||||||
|
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||||
|
string of the given decimal number; zero substitutes the
|
||||||
whole match. If the number is greater than the number of cap-
|
whole match. If the number is greater than the number of cap-
|
||||||
turing substrings, or if the capture is unset, the replace-
|
turing substrings, or if the capture is unset, the replace-
|
||||||
ment is empty.
|
ment is empty.
|
||||||
|
|
||||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||||
$v by vertical tab.
|
$v by vertical tab.
|
||||||
|
|
||||||
$o<digits> is replaced by the character represented by the
|
$o<digits> or $o{<digits>} is replaced by the character whose
|
||||||
given octal number; up to three digits are processed.
|
code point is the given octal number. In the first form, up
|
||||||
|
to three octal digits are processed. When more digits are
|
||||||
|
needed in Unicode mode to specify a wide character, the sec-
|
||||||
|
ond form must be used.
|
||||||
|
|
||||||
$x<digits> is replaced by the character represented by the
|
$x<digits> or $x{<digits>} is replaced by the character rep-
|
||||||
given hexadecimal number; up to two digits are processed.
|
resented by the given hexadecimal number. In the first form,
|
||||||
|
up to two hexadecimal digits are processed. When more digits
|
||||||
|
are needed in Unicode mode to specify a wide character, the
|
||||||
|
second form must be used.
|
||||||
|
|
||||||
Any other character is substituted by itself. In particular,
|
Any other character is substituted by itself. In particular,
|
||||||
$$ is replaced by a single dollar.
|
$$ is replaced by a single dollar.
|
||||||
|
|
||||||
-o, --only-matching
|
-o, --only-matching
|
||||||
Show only the part of the line that matched a pattern instead
|
Show only the part of the line that matched a pattern instead
|
||||||
of the whole line. In this mode, no context is shown. That
|
of the whole line. In this mode, no context is shown. That
|
||||||
is, the -A, -B, and -C options are ignored. If there is more
|
is, the -A, -B, and -C options are ignored. If there is more
|
||||||
than one match in a line, each of them is shown separately,
|
than one match in a line, each of them is shown separately,
|
||||||
on a separate line of output. If -o is combined with -v (in-
|
on a separate line of output. If -o is combined with -v (in-
|
||||||
vert the sense of the match to find non-matching lines), no
|
vert the sense of the match to find non-matching lines), no
|
||||||
output is generated, but the return code is set appropri-
|
output is generated, but the return code is set appropri-
|
||||||
ately. If the matched portion of the line is empty, nothing
|
ately. If the matched portion of the line is empty, nothing
|
||||||
is output unless the file name or line number are being
|
is output unless the file name or line number are being
|
||||||
printed, in which case they are shown on an otherwise empty
|
printed, in which case they are shown on an otherwise empty
|
||||||
line. This option is mutually exclusive with --output,
|
line. This option is mutually exclusive with --output,
|
||||||
--file-offsets and --line-offsets.
|
--file-offsets and --line-offsets.
|
||||||
|
|
||||||
-onumber, --only-matching=number
|
-onumber, --only-matching=number
|
||||||
Show only the part of the line that matched the capturing
|
Show only the part of the line that matched the capturing
|
||||||
parentheses of the given number. Up to 50 capturing parenthe-
|
parentheses of the given number. Up to 50 capturing parenthe-
|
||||||
ses are supported by default. This limit can be changed via
|
ses are supported by default. This limit can be changed via
|
||||||
the --om-capture option. A pattern may contain any number of
|
the --om-capture option. A pattern may contain any number of
|
||||||
capturing parentheses, but only those whose number is within
|
capturing parentheses, but only those whose number is within
|
||||||
the limit can be accessed by -o. An error occurs if the num-
|
the limit can be accessed by -o. An error occurs if the num-
|
||||||
ber specified by -o is greater than the limit.
|
ber specified by -o is greater than the limit.
|
||||||
|
|
||||||
-o0 is the same as -o without a number. Because these options
|
-o0 is the same as -o without a number. Because these options
|
||||||
can be given without an argument (see above), if an argument
|
can be given without an argument (see above), if an argument
|
||||||
is present, it must be given in the same shell item, for ex-
|
is present, it must be given in the same shell item, for ex-
|
||||||
ample, -o3 or --only-matching=2. The comments given for the
|
ample, -o3 or --only-matching=2. The comments given for the
|
||||||
non-argument case above also apply to this option. If the
|
non-argument case above also apply to this option. If the
|
||||||
specified capturing parentheses do not exist in the pattern,
|
specified capturing parentheses do not exist in the pattern,
|
||||||
or were not set in the match, nothing is output unless the
|
or were not set in the match, nothing is output unless the
|
||||||
file name or line number are being output.
|
file name or line number are being output.
|
||||||
|
|
||||||
If this option is given multiple times, multiple substrings
|
If this option is given multiple times, multiple substrings
|
||||||
are output for each match, in the order the options are
|
are output for each match, in the order the options are
|
||||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||||
the substrings matched by capturing parentheses 3 and 1 and
|
the substrings matched by capturing parentheses 3 and 1 and
|
||||||
then 3 again to be output. By default, there is no separator
|
then 3 again to be output. By default, there is no separator
|
||||||
(but see the next but one option).
|
(but see the next but one option).
|
||||||
|
|
||||||
--om-capture=number
|
--om-capture=number
|
||||||
Set the number of capturing parentheses that can be accessed
|
Set the number of capturing parentheses that can be accessed
|
||||||
by -o. The default is 50.
|
by -o. The default is 50.
|
||||||
|
|
||||||
--om-separator=text
|
--om-separator=text
|
||||||
Specify a separating string for multiple occurrences of -o.
|
Specify a separating string for multiple occurrences of -o.
|
||||||
The default is an empty string. Separating strings are never
|
The default is an empty string. Separating strings are never
|
||||||
coloured.
|
coloured.
|
||||||
|
|
||||||
-q, --quiet
|
-q, --quiet
|
||||||
Work quietly, that is, display nothing except error messages.
|
Work quietly, that is, display nothing except error messages.
|
||||||
The exit status indicates whether or not any matches were
|
The exit status indicates whether or not any matches were
|
||||||
found.
|
found.
|
||||||
|
|
||||||
-r, --recursive
|
-r, --recursive
|
||||||
If any given path is a directory, recursively scan the files
|
If any given path is a directory, recursively scan the files
|
||||||
it contains, taking note of any --include and --exclude set-
|
it contains, taking note of any --include and --exclude set-
|
||||||
tings. By default, a directory is read as a normal file; in
|
tings. By default, a directory is read as a normal file; in
|
||||||
some operating systems this gives an immediate end-of-file.
|
some operating systems this gives an immediate end-of-file.
|
||||||
This option is a shorthand for setting the -d option to "re-
|
This option is a shorthand for setting the -d option to "re-
|
||||||
curse".
|
curse".
|
||||||
|
|
||||||
--recursion-limit=number
|
--recursion-limit=number
|
||||||
See --match-limit above.
|
This is an obsolete synonym for --depth-limit. See --match-
|
||||||
|
limit above for details.
|
||||||
|
|
||||||
-s, --no-messages
|
-s, --no-messages
|
||||||
Suppress error messages about non-existent or unreadable
|
Suppress error messages about non-existent or unreadable
|
||||||
|
@ -737,26 +762,30 @@ OPTIONS
|
||||||
|
|
||||||
-u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
|
-u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
|
||||||
has been compiled with UTF-8 support. All patterns (including
|
has been compiled with UTF-8 support. All patterns (including
|
||||||
those for any --exclude and --include options) and all sub-
|
those for any --exclude and --include options) and all lines
|
||||||
ject lines that are scanned must be valid strings of UTF-8
|
that are scanned must be valid strings of UTF-8 characters.
|
||||||
characters.
|
If an invalid UTF-8 string is encountered, an error occurs.
|
||||||
|
|
||||||
-U, --utf-allow-invalid
|
-U, --utf-allow-invalid
|
||||||
As --utf, but in addition subject lines may contain invalid
|
As --utf, but in addition subject lines may contain invalid
|
||||||
UTF-8 code unit sequences. These can never form part of any
|
UTF-8 code unit sequences. These can never form part of any
|
||||||
pattern match. This facility allows valid UTF-8 strings to be
|
pattern match. Patterns themselves, however, must still be
|
||||||
sought in executable or other binary files. For more details
|
valid UTF-8 strings. This facility allows valid UTF-8 strings
|
||||||
about matching in non-valid UTF-8 strings, see the pcre2uni-
|
to be sought within arbitrary byte sequences in executable or
|
||||||
code(3) documentation.
|
other binary files. For more details about matching in non-
|
||||||
|
valid UTF-8 strings, see the pcre2unicode(3) documentation.
|
||||||
|
|
||||||
-V, --version
|
-V, --version
|
||||||
Write the version numbers of pcre2grep and the PCRE2 library
|
Write the version numbers of pcre2grep and the PCRE2 library
|
||||||
to the standard output and then exit. Anything else on the
|
to the standard output and then exit. Anything else on the
|
||||||
command line is ignored.
|
command line is ignored.
|
||||||
|
|
||||||
-v, --invert-match
|
-v, --invert-match
|
||||||
Invert the sense of the match, so that lines which do not
|
Invert the sense of the match, so that lines which do not
|
||||||
match any of the patterns are the ones that are found.
|
match any of the patterns are the ones that are found. When
|
||||||
|
this option is set, options such as --only-matching and
|
||||||
|
--output, which specify parts of a match that are to be out-
|
||||||
|
put, are ignored.
|
||||||
|
|
||||||
-w, --word-regex, --word-regexp
|
-w, --word-regex, --word-regexp
|
||||||
Force the patterns only to match "words". That is, there must
|
Force the patterns only to match "words". That is, there must
|
||||||
|
@ -878,30 +907,49 @@ USING PCRE2'S CALLOUT FACILITY
|
||||||
mentation for details). Numbered callouts are ignored by pcre2grep;
|
mentation for details). Numbered callouts are ignored by pcre2grep;
|
||||||
only callouts with string arguments are useful.
|
only callouts with string arguments are useful.
|
||||||
|
|
||||||
|
Echoing a specific string
|
||||||
|
|
||||||
|
Starting the callout string with a pipe character invokes an echoing
|
||||||
|
facility that avoids calling an external program or script. This facil-
|
||||||
|
ity is always available, provided that callouts were not completely
|
||||||
|
disabled when pcre2grep was built. The rest of the callout string is
|
||||||
|
processed as a zero-terminated string, which means it should not con-
|
||||||
|
tain any internal binary zeros. It is written to the output, having
|
||||||
|
first been passed through the same escape processing as text from the
|
||||||
|
--output (-O) option (see above). However, $0 cannot be used to insert
|
||||||
|
a matched substring because the match is still in progress. Instead,
|
||||||
|
the single character '0' is inserted. Any syntax errors in the string
|
||||||
|
(for example, a dollar not followed by another character) cause the
|
||||||
|
callout to be ignored. No terminator is added to the output string, so
|
||||||
|
if you want a newline, you must include it explicitly using the escape
|
||||||
|
$n. For example:
|
||||||
|
|
||||||
|
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||||
|
|
||||||
|
Matching continues normally after the string is output. If you want to
|
||||||
|
see only the callout output but not any output from an actual match,
|
||||||
|
you should end the pattern with (*FAIL).
|
||||||
|
|
||||||
Calling external programs or scripts
|
Calling external programs or scripts
|
||||||
|
|
||||||
This facility can be independently disabled when pcre2grep is built. It
|
This facility can be independently disabled when pcre2grep is built. It
|
||||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||||
where lib$spawn() is used, and for any other Unix-like environment
|
where lib$spawn() is used, and for any Unix-like environment where
|
||||||
where fork() and execv() are available.
|
fork() and execv() are available.
|
||||||
|
|
||||||
If the callout string does not start with a pipe (vertical bar) charac-
|
If the callout string does not start with a pipe (vertical bar) charac-
|
||||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||||
ters. The first substring must be an executable name, with the follow-
|
ters. The first substring must be an executable name, with the follow-
|
||||||
ing substrings specifying arguments:
|
ing substrings specifying arguments:
|
||||||
|
|
||||||
executable_name|arg1|arg2|...
|
executable_name|arg1|arg2|...
|
||||||
|
|
||||||
Any substring (including the executable name) may contain escape se-
|
Any substring (including the executable name) may contain escape se-
|
||||||
quences started by a dollar character: $<digits> or ${<digits>} is re-
|
quences started by a dollar character. These are the same as for the
|
||||||
placed by the captured substring of the given decimal number, which
|
--output (-O) option documented above, except that $0 cannot insert the
|
||||||
must be greater than zero. If the number is greater than the number of
|
matched string because the match is still in progress. Instead, the
|
||||||
capturing substrings, or if the capture is unset, the replacement is
|
character '0' is inserted. If you need a literal dollar or pipe charac-
|
||||||
empty.
|
ter in any substring, use $$ or $| respectively. Here is an example:
|
||||||
|
|
||||||
Any other character is substituted by itself. In particular, $$ is re-
|
|
||||||
placed by a single dollar and $| is replaced by a pipe character. Here
|
|
||||||
is an example:
|
|
||||||
|
|
||||||
echo -e "abcde\n12345" | pcre2grep \
|
echo -e "abcde\n12345" | pcre2grep \
|
||||||
'(?x)(.)(..(.))
|
'(?x)(.)(..(.))
|
||||||
|
@ -914,28 +962,15 @@ USING PCRE2'S CALLOUT FACILITY
|
||||||
Arg1: [1] [234] [4] Arg2: |1| ()
|
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||||
12345
|
12345
|
||||||
|
|
||||||
The parameters for the system call that is used to run the program or
|
The parameters for the system call that is used to run the program or
|
||||||
script are zero-terminated strings. This means that binary zero charac-
|
script are zero-terminated strings. This means that binary zero charac-
|
||||||
ters in the callout argument will cause premature termination of their
|
ters in the callout argument will cause premature termination of their
|
||||||
substrings, and therefore should not be present. Any syntax errors in
|
substrings, and therefore should not be present. Any syntax errors in
|
||||||
the string (for example, a dollar not followed by another character)
|
the string (for example, a dollar not followed by another character)
|
||||||
cause the callout to be ignored. If running the program fails for any
|
cause the callout to be ignored. If running the program fails for any
|
||||||
reason (including the non-existence of the executable), a local match-
|
reason (including the non-existence of the executable), a local match-
|
||||||
ing failure occurs and the matcher backtracks in the normal way.
|
ing failure occurs and the matcher backtracks in the normal way.
|
||||||
|
|
||||||
Echoing a specific string
|
|
||||||
|
|
||||||
This facility is always available, provided that callouts were not com-
|
|
||||||
pletely disabled when pcre2grep was built. If the callout string starts
|
|
||||||
with a pipe (vertical bar) character, the rest of the string is written
|
|
||||||
to the output, having been passed through the same escape processing as
|
|
||||||
text from the --output option. This provides a simple echoing facility
|
|
||||||
that avoids calling an external program or script. No terminator is
|
|
||||||
added to the string, so if you want a newline, you must include it ex-
|
|
||||||
plicitly. Matching continues normally after the string is output. If
|
|
||||||
you want to see only the callout output but not any output from an ac-
|
|
||||||
tual match, you should end the relevant pattern with (*FAIL).
|
|
||||||
|
|
||||||
|
|
||||||
MATCHING ERRORS
|
MATCHING ERRORS
|
||||||
|
|
||||||
|
@ -969,7 +1004,7 @@ DIAGNOSTICS
|
||||||
|
|
||||||
SEE ALSO
|
SEE ALSO
|
||||||
|
|
||||||
pcre2pattern(3), pcre2syntax(3), pcre2callout(3).
|
pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).
|
||||||
|
|
||||||
|
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
@ -981,5 +1016,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 25 January 2020
|
Last updated: 04 October 2020
|
||||||
Copyright (c) 1997-2020 University of Cambridge.
|
Copyright (c) 1997-2020 University of Cambridge.
|
||||||
|
|
1051
doc/pcre2test.txt
1051
doc/pcre2test.txt
File diff suppressed because it is too large
Load Diff
717
src/pcre2grep.c
717
src/pcre2grep.c
|
@ -164,6 +164,10 @@ enum { DEE_READ, DEE_SKIP };
|
||||||
|
|
||||||
enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
|
enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
|
||||||
|
|
||||||
|
/* Return values from decode_dollar_escape() */
|
||||||
|
|
||||||
|
enum { DDE_ERROR, DDE_CAPTURE, DDE_CHAR };
|
||||||
|
|
||||||
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
|
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
|
||||||
environments), a warning is issued if the value of fwrite() is ignored.
|
environments), a warning is issued if the value of fwrite() is ignored.
|
||||||
Unfortunately, casting to (void) does not suppress the warning. To get round
|
Unfortunately, casting to (void) does not suppress the warning. To get round
|
||||||
|
@ -179,13 +183,21 @@ handled by using STDOUT_NL as the newline string. We also use a normal double
|
||||||
quote for the example, as single quotes aren't usually available. */
|
quote for the example, as single quotes aren't usually available. */
|
||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
#define STDOUT_NL "\r\n"
|
#define STDOUT_NL "\r\n"
|
||||||
#define QUOT "\""
|
#define STDOUT_NL_LEN 2
|
||||||
|
#define QUOT "\""
|
||||||
#else
|
#else
|
||||||
#define STDOUT_NL "\n"
|
#define STDOUT_NL "\n"
|
||||||
#define QUOT "'"
|
#define STDOUT_NL_LEN 1
|
||||||
|
#define QUOT "'"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* This code is returned from decode_dollar_escape() when $n is encountered,
|
||||||
|
and used to mean "output STDOUT_NL". It is, of course, not a valid Unicode code
|
||||||
|
point. */
|
||||||
|
|
||||||
|
#define STDOUT_NL_CODE 0x7fffffffu
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
|
@ -224,8 +236,9 @@ static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
|
||||||
static int bufsize = 3*PCRE2GREP_BUFSIZE;
|
static int bufsize = 3*PCRE2GREP_BUFSIZE;
|
||||||
static int endlinetype;
|
static int endlinetype;
|
||||||
|
|
||||||
static unsigned long int total_count = 0;
|
static int count_limit = -1; /* Not long, so that it works with OP_NUMBER */
|
||||||
static unsigned long int counts_printed = 0;
|
static unsigned long int counts_printed = 0;
|
||||||
|
static unsigned long int total_count = 0;
|
||||||
|
|
||||||
#ifdef WIN32
|
#ifdef WIN32
|
||||||
static int dee_action = dee_SKIP;
|
static int dee_action = dee_SKIP;
|
||||||
|
@ -277,6 +290,9 @@ static BOOL show_total_count = FALSE;
|
||||||
static BOOL silent = FALSE;
|
static BOOL silent = FALSE;
|
||||||
static BOOL utf = FALSE;
|
static BOOL utf = FALSE;
|
||||||
|
|
||||||
|
static uint8_t utf8_buffer[8];
|
||||||
|
|
||||||
|
|
||||||
/* Structure for list of --only-matching capturing numbers. */
|
/* Structure for list of --only-matching capturing numbers. */
|
||||||
|
|
||||||
typedef struct omstr {
|
typedef struct omstr {
|
||||||
|
@ -443,6 +459,7 @@ static option_item optionlist[] = {
|
||||||
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
|
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
|
||||||
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
|
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
|
||||||
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
|
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
|
||||||
|
{ OP_NUMBER, 'm', &count_limit, "max-count=number", "stop after <number> matched lines" },
|
||||||
{ OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" },
|
{ OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" },
|
||||||
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
|
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
|
||||||
#ifdef SUPPORT_PCRE2GREP_JIT
|
#ifdef SUPPORT_PCRE2GREP_JIT
|
||||||
|
@ -482,8 +499,13 @@ of PCRE2_NEWLINE_xx in pcre2.h. */
|
||||||
static const char *newlines[] = {
|
static const char *newlines[] = {
|
||||||
"DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" };
|
"DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" };
|
||||||
|
|
||||||
/* UTF-8 tables - used only when the newline setting is "any". */
|
/* UTF-8 tables */
|
||||||
|
|
||||||
|
const int utf8_table1[] =
|
||||||
|
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
|
||||||
|
const int utf8_table1_size = sizeof(utf8_table1) / sizeof(int);
|
||||||
|
|
||||||
|
const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||||
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||||
|
|
||||||
const char utf8_table4[] = {
|
const char utf8_table4[] = {
|
||||||
|
@ -531,6 +553,32 @@ else
|
||||||
#endif /* not VPCOMPAT && not HAVE_MEMMOVE */
|
#endif /* not VPCOMPAT && not HAVE_MEMMOVE */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Convert code point to UTF-8 *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* A static buffer is used. Returns the number of bytes. */
|
||||||
|
|
||||||
|
static int
|
||||||
|
ord2utf8(uint32_t value)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
uint8_t *utf8bytes = utf8_buffer;
|
||||||
|
for (i = 0; i < utf8_table1_size; i++)
|
||||||
|
if (value <= (uint32_t)utf8_table1[i]) break;
|
||||||
|
utf8bytes += i;
|
||||||
|
for (j = i; j > 0; j--)
|
||||||
|
{
|
||||||
|
*utf8bytes-- = 0x80 | (value & 0x3f);
|
||||||
|
value >>= 6;
|
||||||
|
}
|
||||||
|
*utf8bytes = utf8_table2[i] | value;
|
||||||
|
return i + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Case-independent string compare *
|
* Case-independent string compare *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
@ -1788,6 +1836,7 @@ if (slen > 200)
|
||||||
slen = 200;
|
slen = 200;
|
||||||
msg = "text that starts:\n\n";
|
msg = "text that starts:\n\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 1; p != NULL; p = p->next, i++)
|
for (i = 1; p != NULL; p = p->next, i++)
|
||||||
{
|
{
|
||||||
*mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
|
*mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
|
||||||
|
@ -1823,107 +1872,245 @@ return FALSE; /* No match, no errors */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Decode dollar escape sequence *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* Called from various places to decode $ escapes in output strings. The escape
|
||||||
|
sequences are as follows:
|
||||||
|
|
||||||
|
$<digits> or ${<digits>} returns a capture number. However, if callout is TRUE,
|
||||||
|
zero is never returned; '0' is substituted.
|
||||||
|
|
||||||
|
$a returns bell.
|
||||||
|
$b returns backspace.
|
||||||
|
$e returns escape.
|
||||||
|
$f returns form feed.
|
||||||
|
$n returns newline.
|
||||||
|
$r returns carriage return.
|
||||||
|
$t returns tab.
|
||||||
|
$v returns vertical tab.
|
||||||
|
$o<digits> returns the character represented by the given octal
|
||||||
|
number; up to three digits are processed.
|
||||||
|
$o{<digits>} does the same, up to 7 digits, but gives an error for mode-invalid
|
||||||
|
code points.
|
||||||
|
$x<digits> returns the character represented by the given hexadecimal
|
||||||
|
number; up to two digits are processed.
|
||||||
|
$x{<digits>} does the same, up to 6 digits, but gives an error for mode-invalid
|
||||||
|
code points.
|
||||||
|
Any other character is substituted by itself. E.g: $$ is replaced by a single
|
||||||
|
dollar.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
begin the start of the whole string
|
||||||
|
string points to the $
|
||||||
|
callout TRUE if in a callout (inhibits error messages)
|
||||||
|
value where to return a value
|
||||||
|
last where to return pointer to the last used character
|
||||||
|
|
||||||
|
Returns: DDE_ERROR after a syntax error
|
||||||
|
DDE_CAPTURE if *value is a capture number
|
||||||
|
DDE_CHAR if *value is a character code
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int
|
||||||
|
decode_dollar_escape(PCRE2_SPTR begin, PCRE2_SPTR string, BOOL callout,
|
||||||
|
uint32_t *value, PCRE2_SPTR *last)
|
||||||
|
{
|
||||||
|
uint32_t c = 0;
|
||||||
|
int base = 10;
|
||||||
|
int dcount;
|
||||||
|
int rc = DDE_CHAR;
|
||||||
|
BOOL brace = FALSE;
|
||||||
|
|
||||||
|
switch (*(++string))
|
||||||
|
{
|
||||||
|
case 0: /* Syntax error: a character must be present after $. */
|
||||||
|
if (!callout)
|
||||||
|
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||||
|
(int)(string - begin), "no character after $");
|
||||||
|
*last = string;
|
||||||
|
return DDE_ERROR;
|
||||||
|
|
||||||
|
case '{':
|
||||||
|
brace = TRUE;
|
||||||
|
string++;
|
||||||
|
if (!isdigit(*string)) /* Syntax error: a decimal number required. */
|
||||||
|
{
|
||||||
|
if (!callout)
|
||||||
|
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||||
|
(int)(string - begin), "decimal number expected");
|
||||||
|
rc = DDE_ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fall through */
|
||||||
|
|
||||||
|
/* The maximum capture number is 65535, so any number greater than that will
|
||||||
|
always be an unknown capture number. We just stop incrementing, in order to
|
||||||
|
avoid overflow. */
|
||||||
|
|
||||||
|
case '0': case '1': case '2': case '3': case '4':
|
||||||
|
case '5': case '6': case '7': case '8': case '9':
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if (c <= 65535) c = c * 10 + (*string - '0');
|
||||||
|
string++;
|
||||||
|
}
|
||||||
|
while (*string >= '0' && *string <= '9');
|
||||||
|
string--; /* Point to last digit */
|
||||||
|
|
||||||
|
/* In a callout, capture number 0 is not available. No error can be given,
|
||||||
|
so just return the character '0'. */
|
||||||
|
|
||||||
|
if (callout && c == 0)
|
||||||
|
{
|
||||||
|
*value = '0';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
*value = c;
|
||||||
|
rc = DDE_CAPTURE;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Limit octal numbers to 3 digits without braces, or up to 7 with braces,
|
||||||
|
for valid Unicode code points. */
|
||||||
|
|
||||||
|
case 'o':
|
||||||
|
base = 8;
|
||||||
|
string++;
|
||||||
|
if (*string == '{')
|
||||||
|
{
|
||||||
|
brace = TRUE;
|
||||||
|
string++;
|
||||||
|
dcount = 7;
|
||||||
|
}
|
||||||
|
else dcount = 3;
|
||||||
|
for (; dcount > 0; dcount--)
|
||||||
|
{
|
||||||
|
if (*string < '0' || *string > '7') break;
|
||||||
|
c = c * 8 + (*string++ - '0');
|
||||||
|
}
|
||||||
|
*value = c;
|
||||||
|
string--; /* Point to last digit */
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Limit hex numbers to 2 digits without braces, or up to 6 with braces,
|
||||||
|
for valid Unicode code points. */
|
||||||
|
|
||||||
|
case 'x':
|
||||||
|
base = 16;
|
||||||
|
string++;
|
||||||
|
if (*string == '{')
|
||||||
|
{
|
||||||
|
brace = TRUE;
|
||||||
|
string++;
|
||||||
|
dcount = 6;
|
||||||
|
}
|
||||||
|
else dcount = 2;
|
||||||
|
for (; dcount > 0; dcount--)
|
||||||
|
{
|
||||||
|
if (!isxdigit(*string)) break;
|
||||||
|
if (*string >= '0' && *string <= '9')
|
||||||
|
c = c *16 + *string++ - '0';
|
||||||
|
else
|
||||||
|
c = c * 16 + (*string++ | 0x20) - 'a' + 10;
|
||||||
|
}
|
||||||
|
*value = c;
|
||||||
|
string--; /* Point to last digit */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'a': *value = '\a'; break;
|
||||||
|
case 'b': *value = '\b'; break;
|
||||||
|
#ifndef EBCDIC
|
||||||
|
case 'e': *value = '\033'; break;
|
||||||
|
#else
|
||||||
|
case 'e': *value = '\047'; break;
|
||||||
|
#endif
|
||||||
|
case 'f': *value = '\f'; break;
|
||||||
|
case 'n': *value = STDOUT_NL_CODE; break;
|
||||||
|
case 'r': *value = '\r'; break;
|
||||||
|
case 't': *value = '\t'; break;
|
||||||
|
case 'v': *value = '\v'; break;
|
||||||
|
|
||||||
|
default: *value = *string; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (brace)
|
||||||
|
{
|
||||||
|
c = string[1];
|
||||||
|
if (c != '}')
|
||||||
|
{
|
||||||
|
rc = DDE_ERROR;
|
||||||
|
if (!callout)
|
||||||
|
{
|
||||||
|
if ((base == 8 && c >= '0' && c <= '7') ||
|
||||||
|
(base == 16 && isxdigit(c)))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
|
||||||
|
"too many %s digits\n", (int)(string - begin),
|
||||||
|
(base == 8)? "octal" : "hex");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||||
|
(int)(string - begin), "missing closing brace");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else string++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check maximum code point values, but take note of STDOUT_NL_CODE. */
|
||||||
|
|
||||||
|
if (rc == DDE_CHAR && *value != STDOUT_NL_CODE)
|
||||||
|
{
|
||||||
|
uint32_t max = utf? 0x0010ffffu : 0xffu;
|
||||||
|
if (*value > max)
|
||||||
|
{
|
||||||
|
if (!callout)
|
||||||
|
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
|
||||||
|
"code point greater than 0x%x is invalid\n", (int)(string - begin), max);
|
||||||
|
rc = DDE_ERROR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*last = string;
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Check output text for errors *
|
* Check output text for errors *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
|
/* Called early, to get errors before doing anything for -O text; also called
|
||||||
|
from callouts to check before outputting.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
string an --output text string
|
||||||
|
callout TRUE if in a callout (stops printing errors)
|
||||||
|
|
||||||
|
Returns: TRUE if OK, FALSE on error
|
||||||
|
*/
|
||||||
|
|
||||||
static BOOL
|
static BOOL
|
||||||
syntax_check_output_text(PCRE2_SPTR string, BOOL callout)
|
syntax_check_output_text(PCRE2_SPTR string, BOOL callout)
|
||||||
{
|
{
|
||||||
|
uint32_t value;
|
||||||
PCRE2_SPTR begin = string;
|
PCRE2_SPTR begin = string;
|
||||||
|
|
||||||
for (; *string != 0; string++)
|
for (; *string != 0; string++)
|
||||||
{
|
{
|
||||||
if (*string == '$')
|
if (*string == '$' &&
|
||||||
{
|
decode_dollar_escape(begin, string, callout, &value, &string) == DDE_ERROR)
|
||||||
PCRE2_SIZE capture_id = 0;
|
|
||||||
BOOL brace = FALSE;
|
|
||||||
|
|
||||||
string++;
|
|
||||||
|
|
||||||
/* Syntax error: a character must be present after $. */
|
|
||||||
if (*string == 0)
|
|
||||||
{
|
|
||||||
if (!callout)
|
|
||||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
|
||||||
(int)(string - begin), "no character after $");
|
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
|
||||||
|
|
||||||
if (*string == '{')
|
|
||||||
{
|
|
||||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
|
||||||
string++;
|
|
||||||
|
|
||||||
brace = TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
|
|
||||||
{
|
|
||||||
do
|
|
||||||
{
|
|
||||||
/* Maximum capture id is 65535. */
|
|
||||||
if (capture_id <= 65535)
|
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
|
||||||
|
|
||||||
string++;
|
|
||||||
}
|
|
||||||
while (*string >= '0' && *string <= '9');
|
|
||||||
|
|
||||||
if (brace)
|
|
||||||
{
|
|
||||||
/* Syntax error: closing brace is missing. */
|
|
||||||
if (*string != '}')
|
|
||||||
{
|
|
||||||
if (!callout)
|
|
||||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
|
||||||
(int)(string - begin), "missing closing brace");
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* To negate the effect of the for. */
|
|
||||||
string--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (brace)
|
|
||||||
{
|
|
||||||
/* Syntax error: a decimal number required. */
|
|
||||||
if (!callout)
|
|
||||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
|
||||||
(int)(string - begin), "decimal number expected");
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
else if (*string == 'o')
|
|
||||||
{
|
|
||||||
string++;
|
|
||||||
|
|
||||||
if (*string < '0' || *string > '7')
|
|
||||||
{
|
|
||||||
/* Syntax error: an octal number required. */
|
|
||||||
if (!callout)
|
|
||||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
|
||||||
(int)(string - begin), "octal number expected");
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (*string == 'x')
|
|
||||||
{
|
|
||||||
string++;
|
|
||||||
|
|
||||||
if (!isxdigit((unsigned char)*string))
|
|
||||||
{
|
|
||||||
/* Syntax error: a hexadecimal number required. */
|
|
||||||
if (!callout)
|
|
||||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
|
||||||
(int)(string - begin), "hexadecimal number expected");
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1932,31 +2119,7 @@ for (; *string != 0; string++)
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* Display the output text, which is assumed to have already been syntax
|
/* Display the output text, which is assumed to have already been syntax
|
||||||
checked. Output may contain escape sequences started by the dollar sign. The
|
checked. Output may contain escape sequences started by the dollar sign.
|
||||||
escape sequences are substituted as follows:
|
|
||||||
|
|
||||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
|
||||||
decimal number; zero will substitute the whole match. If the number is
|
|
||||||
greater than the number of capturing substrings, or if the capture is unset,
|
|
||||||
the replacement is empty.
|
|
||||||
|
|
||||||
$a is replaced by bell.
|
|
||||||
$b is replaced by backspace.
|
|
||||||
$e is replaced by escape.
|
|
||||||
$f is replaced by form feed.
|
|
||||||
$n is replaced by newline.
|
|
||||||
$r is replaced by carriage return.
|
|
||||||
$t is replaced by tab.
|
|
||||||
$v is replaced by vertical tab.
|
|
||||||
|
|
||||||
$o<digits> is replaced by the character represented by the given octal
|
|
||||||
number; up to three digits are processed.
|
|
||||||
|
|
||||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
|
||||||
number; up to two digits are processed.
|
|
||||||
|
|
||||||
Any other character is substituted by itself. E.g: $$ is replaced by a single
|
|
||||||
dollar.
|
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string: the output text
|
string: the output text
|
||||||
|
@ -1973,121 +2136,54 @@ static BOOL
|
||||||
display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject,
|
display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject,
|
||||||
PCRE2_SIZE *ovector, PCRE2_SIZE capture_top)
|
PCRE2_SIZE *ovector, PCRE2_SIZE capture_top)
|
||||||
{
|
{
|
||||||
|
uint32_t value;
|
||||||
BOOL printed = FALSE;
|
BOOL printed = FALSE;
|
||||||
|
PCRE2_SPTR begin = string;
|
||||||
|
|
||||||
for (; *string != 0; string++)
|
for (; *string != 0; string++)
|
||||||
{
|
{
|
||||||
int ch = EOF;
|
|
||||||
if (*string == '$')
|
if (*string == '$')
|
||||||
{
|
{
|
||||||
PCRE2_SIZE capture_id = 0;
|
switch(decode_dollar_escape(begin, string, callout, &value, &string))
|
||||||
BOOL brace = FALSE;
|
|
||||||
|
|
||||||
string++;
|
|
||||||
|
|
||||||
if (*string == '{')
|
|
||||||
{
|
{
|
||||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
case DDE_CHAR:
|
||||||
string++;
|
if (value == STDOUT_NL_CODE)
|
||||||
|
|
||||||
brace = TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
|
|
||||||
{
|
|
||||||
do
|
|
||||||
{
|
{
|
||||||
/* Maximum capture id is 65535. */
|
fprintf(stdout, STDOUT_NL);
|
||||||
if (capture_id <= 65535)
|
printed = FALSE;
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
continue;
|
||||||
|
|
||||||
string++;
|
|
||||||
}
|
}
|
||||||
while (*string >= '0' && *string <= '9');
|
break; /* Will print value */
|
||||||
|
|
||||||
if (!brace)
|
case DDE_CAPTURE:
|
||||||
{
|
if (value < capture_top)
|
||||||
/* To negate the effect of the for. */
|
|
||||||
string--;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (capture_id < capture_top)
|
|
||||||
{
|
{
|
||||||
PCRE2_SIZE capturesize;
|
PCRE2_SIZE capturesize;
|
||||||
capture_id *= 2;
|
value *= 2;
|
||||||
|
capturesize = ovector[value + 1] - ovector[value];
|
||||||
capturesize = ovector[capture_id + 1] - ovector[capture_id];
|
|
||||||
if (capturesize > 0)
|
if (capturesize > 0)
|
||||||
{
|
{
|
||||||
print_match(subject + ovector[capture_id], capturesize);
|
print_match(subject + ovector[value], capturesize);
|
||||||
printed = TRUE;
|
printed = TRUE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
continue;
|
||||||
else if (*string == 'a') ch = '\a';
|
|
||||||
else if (*string == 'b') ch = '\b';
|
|
||||||
#ifndef EBCDIC
|
|
||||||
else if (*string == 'e') ch = '\033';
|
|
||||||
#else
|
|
||||||
else if (*string == 'e') ch = '\047';
|
|
||||||
#endif
|
|
||||||
else if (*string == 'f') ch = '\f';
|
|
||||||
else if (*string == 'r') ch = '\r';
|
|
||||||
else if (*string == 't') ch = '\t';
|
|
||||||
else if (*string == 'v') ch = '\v';
|
|
||||||
else if (*string == 'n')
|
|
||||||
{
|
|
||||||
fprintf(stdout, STDOUT_NL);
|
|
||||||
printed = FALSE;
|
|
||||||
}
|
|
||||||
else if (*string == 'o')
|
|
||||||
{
|
|
||||||
string++;
|
|
||||||
|
|
||||||
ch = *string - '0';
|
default: /* Should not occur */
|
||||||
if (string[1] >= '0' && string[1] <= '7')
|
break;
|
||||||
{
|
|
||||||
string++;
|
|
||||||
ch = ch * 8 + (*string - '0');
|
|
||||||
}
|
|
||||||
if (string[1] >= '0' && string[1] <= '7')
|
|
||||||
{
|
|
||||||
string++;
|
|
||||||
ch = ch * 8 + (*string - '0');
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (*string == 'x')
|
}
|
||||||
{
|
|
||||||
string++;
|
|
||||||
|
|
||||||
if (*string >= '0' && *string <= '9')
|
else value = *string; /* Not a $ escape */
|
||||||
ch = *string - '0';
|
|
||||||
else
|
if (utf && value <= 127) fprintf(stdout, "%c", *string); else
|
||||||
ch = (*string | 0x20) - 'a' + 10;
|
|
||||||
if (isxdigit((unsigned char)string[1]))
|
|
||||||
{
|
|
||||||
string++;
|
|
||||||
ch *= 16;
|
|
||||||
if (*string >= '0' && *string <= '9')
|
|
||||||
ch += *string - '0';
|
|
||||||
else
|
|
||||||
ch += (*string | 0x20) - 'a' + 10;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ch = *string;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
ch = *string;
|
int i;
|
||||||
}
|
int n = ord2utf8(value);
|
||||||
if (ch != EOF)
|
for (i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
|
||||||
{
|
|
||||||
fprintf(stdout, "%c", ch);
|
|
||||||
printed = TRUE;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printed = TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return printed;
|
return printed;
|
||||||
|
@ -2166,7 +2262,7 @@ int result = 0;
|
||||||
|
|
||||||
(void)unused; /* Avoid compiler warning */
|
(void)unused; /* Avoid compiler warning */
|
||||||
|
|
||||||
/* Only callout with strings are supported. */
|
/* Only callouts with strings are supported. */
|
||||||
|
|
||||||
if (string == NULL || length == 0) return 0;
|
if (string == NULL || length == 0) return 0;
|
||||||
|
|
||||||
|
@ -2185,83 +2281,43 @@ return 0;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
/* Check syntax and compute the number of string fragments. Callout strings
|
/* Check syntax and compute the number of string fragments. Callout strings
|
||||||
are ignored in case of a syntax error. */
|
are silently ignored in the event of a syntax error. */
|
||||||
|
|
||||||
while (length > 0)
|
while (length > 0)
|
||||||
{
|
{
|
||||||
if (*string == '|')
|
if (*string == '|')
|
||||||
{
|
{
|
||||||
argsvectorlen++;
|
argsvectorlen++;
|
||||||
|
if (argsvectorlen > 10000) return 0; /* Too many args */
|
||||||
/* Maximum 10000 arguments allowed. */
|
|
||||||
if (argsvectorlen > 10000) return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (*string == '$')
|
else if (*string == '$')
|
||||||
{
|
{
|
||||||
PCRE2_SIZE capture_id = 0;
|
uint32_t value;
|
||||||
|
PCRE2_SPTR begin = string;
|
||||||
|
|
||||||
string++;
|
switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
|
||||||
length--;
|
|
||||||
|
|
||||||
/* Syntax error: a character must be present after $. */
|
|
||||||
if (length == 0) return 0;
|
|
||||||
|
|
||||||
if (*string >= '1' && *string <= '9')
|
|
||||||
{
|
{
|
||||||
do
|
case DDE_CAPTURE:
|
||||||
|
if (value < capture_top)
|
||||||
{
|
{
|
||||||
/* Maximum capture id is 65535. */
|
value *= 2;
|
||||||
if (capture_id <= 65535)
|
argslen += ovector[value + 1] - ovector[value];
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
|
||||||
|
|
||||||
string++;
|
|
||||||
length--;
|
|
||||||
}
|
}
|
||||||
while (length > 0 && *string >= '0' && *string <= '9');
|
argslen--; /* Negate the effect of argslen++ below. */
|
||||||
|
break;
|
||||||
|
|
||||||
/* To negate the effect of string++ below. */
|
case DDE_CHAR:
|
||||||
string--;
|
if (value == STDOUT_NL_CODE) argslen += STDOUT_NL_LEN - 1;
|
||||||
length++;
|
else if (utf && value > 127) argslen += ord2utf8(value) - 1;
|
||||||
}
|
break;
|
||||||
else if (*string == '{')
|
|
||||||
{
|
|
||||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
|
||||||
string++;
|
|
||||||
length--;
|
|
||||||
|
|
||||||
/* Syntax error: a decimal number required. */
|
default: /* Should not occur */
|
||||||
if (length == 0) return 0;
|
case DDE_ERROR:
|
||||||
if (*string < '1' || *string > '9') return 0;
|
return 0;
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
/* Maximum capture id is 65535. */
|
|
||||||
if (capture_id <= 65535)
|
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
|
||||||
|
|
||||||
string++;
|
|
||||||
length--;
|
|
||||||
|
|
||||||
/* Syntax error: no more characters */
|
|
||||||
if (length == 0) return 0;
|
|
||||||
}
|
|
||||||
while (*string >= '0' && *string <= '9');
|
|
||||||
|
|
||||||
/* Syntax error: closing brace is missing. */
|
|
||||||
if (*string != '}') return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (capture_id > 0)
|
length -= (string - begin);
|
||||||
{
|
|
||||||
if (capture_id < capture_top)
|
|
||||||
{
|
|
||||||
capture_id *= 2;
|
|
||||||
argslen += ovector[capture_id + 1] - ovector[capture_id];
|
|
||||||
}
|
|
||||||
|
|
||||||
/* To negate the effect of argslen++ below. */
|
|
||||||
argslen--;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string++;
|
string++;
|
||||||
|
@ -2269,6 +2325,8 @@ while (length > 0)
|
||||||
argslen++;
|
argslen++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Get memory for the argument vector and its strings. */
|
||||||
|
|
||||||
args = (char*)malloc(argslen);
|
args = (char*)malloc(argslen);
|
||||||
if (args == NULL) return 0;
|
if (args == NULL) return 0;
|
||||||
|
|
||||||
|
@ -2279,9 +2337,10 @@ if (argsvector == NULL)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Now reprocess the string and set up the arguments. */
|
||||||
|
|
||||||
argsptr = args;
|
argsptr = args;
|
||||||
argsvectorptr = argsvector;
|
argsvectorptr = argsvector;
|
||||||
|
|
||||||
*argsvectorptr++ = argsptr;
|
*argsvectorptr++ = argsptr;
|
||||||
|
|
||||||
length = calloutptr->callout_string_length;
|
length = calloutptr->callout_string_length;
|
||||||
|
@ -2294,69 +2353,55 @@ while (length > 0)
|
||||||
*argsptr++ = '\0';
|
*argsptr++ = '\0';
|
||||||
*argsvectorptr++ = argsptr;
|
*argsvectorptr++ = argsptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (*string == '$')
|
else if (*string == '$')
|
||||||
{
|
{
|
||||||
string++;
|
uint32_t value;
|
||||||
length--;
|
PCRE2_SPTR begin = string;
|
||||||
|
|
||||||
if ((*string >= '1' && *string <= '9') || *string == '{')
|
switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
|
||||||
{
|
{
|
||||||
PCRE2_SIZE capture_id = 0;
|
case DDE_CAPTURE:
|
||||||
|
if (value < capture_top)
|
||||||
if (*string != '{')
|
|
||||||
{
|
{
|
||||||
do
|
PCRE2_SIZE capturesize;
|
||||||
{
|
value *= 2;
|
||||||
/* Maximum capture id is 65535. */
|
capturesize = ovector[value + 1] - ovector[value];
|
||||||
if (capture_id <= 65535)
|
memcpy(argsptr, subject + ovector[value], capturesize);
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
argsptr += capturesize;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
string++;
|
case DDE_CHAR:
|
||||||
length--;
|
if (value == STDOUT_NL_CODE)
|
||||||
}
|
{
|
||||||
while (length > 0 && *string >= '0' && *string <= '9');
|
memcpy(argsptr, STDOUT_NL, STDOUT_NL_LEN);
|
||||||
|
argsptr += STDOUT_NL_LEN;
|
||||||
/* To negate the effect of string++ below. */
|
}
|
||||||
string--;
|
else if (utf && value > 127)
|
||||||
length++;
|
{
|
||||||
|
int n = ord2utf8(value);
|
||||||
|
memcpy(argsptr, utf8_buffer, n);
|
||||||
|
argsptr += n;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
string++;
|
*argsptr++ = value;
|
||||||
length--;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
/* Maximum capture id is 65535. */
|
|
||||||
if (capture_id <= 65535)
|
|
||||||
capture_id = capture_id * 10 + (*string - '0');
|
|
||||||
|
|
||||||
string++;
|
|
||||||
length--;
|
|
||||||
}
|
|
||||||
while (*string != '}');
|
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
if (capture_id < capture_top)
|
default: /* Should not occur */
|
||||||
{
|
case DDE_ERROR:
|
||||||
PCRE2_SIZE capturesize;
|
return 0;
|
||||||
capture_id *= 2;
|
}
|
||||||
|
|
||||||
capturesize = ovector[capture_id + 1] - ovector[capture_id];
|
length -= (string - begin);
|
||||||
memcpy(argsptr, subject + ovector[capture_id], capturesize);
|
|
||||||
argsptr += capturesize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
*argsptr++ = *string;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
*argsptr++ = *string;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else *argsptr++ = *string;
|
||||||
|
|
||||||
|
/* Advance along the string */
|
||||||
|
|
||||||
string++;
|
string++;
|
||||||
length--;
|
length--;
|
||||||
}
|
}
|
||||||
|
@ -2479,6 +2524,7 @@ int filepos = 0;
|
||||||
unsigned long int linenumber = 1;
|
unsigned long int linenumber = 1;
|
||||||
unsigned long int lastmatchnumber = 0;
|
unsigned long int lastmatchnumber = 0;
|
||||||
unsigned long int count = 0;
|
unsigned long int count = 0;
|
||||||
|
long int count_matched_lines = 0;
|
||||||
char *lastmatchrestart = main_buffer;
|
char *lastmatchrestart = main_buffer;
|
||||||
char *ptr = main_buffer;
|
char *ptr = main_buffer;
|
||||||
char *endptr;
|
char *endptr;
|
||||||
|
@ -2505,7 +2551,7 @@ bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
|
||||||
input_line_buffered);
|
input_line_buffered);
|
||||||
|
|
||||||
#ifdef SUPPORT_LIBBZ2
|
#ifdef SUPPORT_LIBBZ2
|
||||||
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE; */
|
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
endptr = main_buffer + bufflength;
|
endptr = main_buffer + bufflength;
|
||||||
|
@ -2533,10 +2579,23 @@ while (ptr < endptr)
|
||||||
int mrc = 0;
|
int mrc = 0;
|
||||||
unsigned int options = 0;
|
unsigned int options = 0;
|
||||||
BOOL match;
|
BOOL match;
|
||||||
|
BOOL line_matched = FALSE;
|
||||||
char *t = ptr;
|
char *t = ptr;
|
||||||
PCRE2_SIZE length, linelength;
|
PCRE2_SIZE length, linelength;
|
||||||
PCRE2_SIZE startoffset = 0;
|
PCRE2_SIZE startoffset = 0;
|
||||||
|
|
||||||
|
/* If the -m option set a limit for the number of matched or non-matched
|
||||||
|
lines, check it here. A limit of zero means that no matching is ever done.
|
||||||
|
For stdin from a file, set the file position. */
|
||||||
|
|
||||||
|
if (count_limit >= 0 && count_matched_lines >= count_limit)
|
||||||
|
{
|
||||||
|
if (frtype == FR_PLAIN && filename == stdin_name && !is_file_tty(handle))
|
||||||
|
(void)fseek(handle, (long int)filepos, SEEK_SET);
|
||||||
|
rc = (count_limit == 0)? 1 : 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/* At this point, ptr is at the start of a line. We need to find the length
|
/* At this point, ptr is at the start of a line. We need to find the length
|
||||||
of the subject string to pass to pcre2_match(). In multiline mode, it is the
|
of the subject string to pass to pcre2_match(). In multiline mode, it is the
|
||||||
length remainder of the data in the buffer. Otherwise, it is the length of
|
length remainder of the data in the buffer. Otherwise, it is the length of
|
||||||
|
@ -2686,6 +2745,10 @@ while (ptr < endptr)
|
||||||
|
|
||||||
if (filenames == FN_NOMATCH_ONLY) return 1;
|
if (filenames == FN_NOMATCH_ONLY) return 1;
|
||||||
|
|
||||||
|
/* Remember that this line matched (for counting matched lines) */
|
||||||
|
|
||||||
|
line_matched = TRUE;
|
||||||
|
|
||||||
/* If all we want is a yes/no answer, we can return immediately. */
|
/* If all we want is a yes/no answer, we can return immediately. */
|
||||||
|
|
||||||
if (quiet) return 0;
|
if (quiet) return 0;
|
||||||
|
@ -3067,6 +3130,11 @@ while (ptr < endptr)
|
||||||
filepos += (int)(linelength + endlinelength);
|
filepos += (int)(linelength + endlinelength);
|
||||||
linenumber++;
|
linenumber++;
|
||||||
|
|
||||||
|
/* If there was at least one match (or a non-match, as required) in the line,
|
||||||
|
increment the count for the -m option. */
|
||||||
|
|
||||||
|
if (line_matched) count_matched_lines++;
|
||||||
|
|
||||||
/* If input is line buffered, and the buffer is not yet full, read another
|
/* If input is line buffered, and the buffer is not yet full, read another
|
||||||
line and add it into the buffer. */
|
line and add it into the buffer. */
|
||||||
|
|
||||||
|
@ -4088,6 +4156,7 @@ if (only_matching_count > 1)
|
||||||
pcre2grep_exit(usage(2));
|
pcre2grep_exit(usage(2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Check that there is a big enough ovector for all -o settings. */
|
/* Check that there is a big enough ovector for all -o settings. */
|
||||||
|
|
||||||
for (om = only_matching; om != NULL; om = om->next)
|
for (om = only_matching; om != NULL; om = om->next)
|
||||||
|
|
|
@ -956,3 +956,27 @@ RC=0
|
||||||
pcre2grep: Requested group 1 cannot be captured.
|
pcre2grep: Requested group 1 cannot be captured.
|
||||||
pcre2grep: Use --om-capture to increase the size of the capture vector.
|
pcre2grep: Use --om-capture to increase the size of the capture vector.
|
||||||
RC=2
|
RC=2
|
||||||
|
---------------------------- Test 129 -----------------------------
|
||||||
|
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the
|
||||||
|
lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox
|
||||||
|
RC=0
|
||||||
|
---------------------------- Test 130 -----------------------------
|
||||||
|
fox
|
||||||
|
fox
|
||||||
|
fox
|
||||||
|
fox
|
||||||
|
RC=0
|
||||||
|
---------------------------- Test 131 -----------------------------
|
||||||
|
2
|
||||||
|
RC=0
|
||||||
|
---------------------------- Test 132 -----------------------------
|
||||||
|
match 1:
|
||||||
|
a
|
||||||
|
match 2:
|
||||||
|
b
|
||||||
|
---
|
||||||
|
a
|
||||||
|
RC=0
|
||||||
|
---------------------------- Test 133 -----------------------------
|
||||||
|
=AB3CD5=
|
||||||
|
RC=0
|
||||||
|
|
|
@ -29,3 +29,6 @@ RC=1
|
||||||
---------------------------- Test U5 ------------------------------
|
---------------------------- Test U5 ------------------------------
|
||||||
CD Z
|
CD Z
|
||||||
RC=0
|
RC=0
|
||||||
|
---------------------------- Test U6 -----------------------------
|
||||||
|
=ǓǤ=
|
||||||
|
RC=0
|
||||||
|
|
|
@ -40,3 +40,5 @@ T
|
||||||
T
|
T
|
||||||
T
|
T
|
||||||
T
|
T
|
||||||
|
0:T:AA
|
||||||
|
The quick brown
|
||||||
|
|
|
@ -28,3 +28,5 @@ T
|
||||||
T
|
T
|
||||||
T
|
T
|
||||||
T
|
T
|
||||||
|
0:T:AA
|
||||||
|
The quick brown
|
||||||
|
|
Loading…
Reference in New Issue