pcre2grep update: -m and $x{..}, $o{..} escapes. Also some doc updates.
This commit is contained in:
parent
3bdc76e4f3
commit
81da2b97e3
10
ChangeLog
10
ChangeLog
|
@ -76,6 +76,16 @@ the subject \xe5A. Fixes Bugzilla #2642.
|
|||
14. Fixed a bug in character set matching when JIT is enabled and both unicode
|
||||
scripts and unicode classes are present at the same time.
|
||||
|
||||
15. Added GNU grep's -m (aka --max-count) option to pcre2grep.
|
||||
|
||||
16. Refactored substitution processing in pcre2grep strings, both for the -O
|
||||
option and when dealing with callouts. There is now a single function that
|
||||
handles $ expansion in all cases (instead of multiple copies of almost
|
||||
identical code). This means that the same escape sequences are available
|
||||
everywhere, which was not previously the case. At the same time, the escape
|
||||
sequences $x{...} and $o{...} have been introduced, to allow for characters
|
||||
whose code points are greater than 255 in Unicode mode.
|
||||
|
||||
|
||||
Version 10.35 09-May-2020
|
||||
---------------------------
|
||||
|
|
6
README
6
README
|
@ -892,6 +892,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 20 March 2020
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 22 September 2020
|
||||
|
|
25
RunGrepTest
25
RunGrepTest
|
@ -661,6 +661,26 @@ echo "---------------------------- Test 128 -----------------------------" >>tes
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -o1 --om-capture=0 'pattern()()()()' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 129 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m 2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 130 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -o -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 131 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -oc -m2 'fox' testdata/grepinput) >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 132 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -A3 '^match'; echo '---'; head -1) <testdata/grepinput >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test 133 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
# Now compare the results.
|
||||
|
||||
$cf $srcdir/testdata/grepoutput testtrygrep
|
||||
|
@ -694,6 +714,10 @@ if [ $utf8 -ne 0 ] ; then
|
|||
(cd $srcdir; $valgrind $vjs $pcre2grep -U -o '....' $builddir/testtemp1grep) >>testtrygrep
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
echo "---------------------------- Test U6 -----------------------------" >>testtrygrep
|
||||
(cd $srcdir; $valgrind $vjs $pcre2grep -u -m1 -O '=$x{1d3}$o{744}=' 'fox') <testdata/grepinputv >>testtrygrep 2>&1
|
||||
echo "RC=$?" >>testtrygrep
|
||||
|
||||
$cf $srcdir/testdata/grepoutput8 testtrygrep
|
||||
if [ $? != 0 ] ; then exit 1; fi
|
||||
|
||||
|
@ -764,6 +788,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
|
|||
$valgrind $vjs $pcre2grep '(T)(..(.))()()()()()()()(..)(?C"/bin/echo|Arg1: [$11] [${11}]")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$0:$1$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep '(T)(?C"|$1$n")(*F)' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
$valgrind $vjs $pcre2grep -m1 '(T)(?C"|$0:$1:$x{41}$o{101}$n")' $srcdir/testdata/grepinputv >>testtrygrep
|
||||
|
||||
if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'Non-fork callout scripts in patterns are supported'; then
|
||||
$cf $srcdir/testdata/grepoutputCN testtrygrep
|
||||
|
|
|
@ -892,6 +892,6 @@ The distribution should contain the files listed below.
|
|||
) environments
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 20 March 2020
|
||||
Email local part: Philip.Hazel
|
||||
Email domain: gmail.com
|
||||
Last updated: 22 September 2020
|
||||
|
|
|
@ -111,8 +111,8 @@ matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
</P>
|
||||
<P>
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
|
@ -146,11 +146,10 @@ ignored.
|
|||
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
||||
<P>
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as NUL, that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the <b>--binary-files</b> option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the <b>--binary-files</b>
|
||||
option for a means of changing the way binary files are handled.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">BINARY ZEROS IN PATTERNS</a><br>
|
||||
<P>
|
||||
|
@ -443,8 +442,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
<P>
|
||||
<b>--include</b>=<i>pattern</i>
|
||||
If any <b>--include</b> patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude</b> pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
<b>--exclude</b> pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -463,8 +462,8 @@ may be given any number of times; all the files are read.
|
|||
<P>
|
||||
<b>--include-dir</b>=<i>pattern</i>
|
||||
If any <b>--include-dir</b> patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an <b>--exclude-dir</b> pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The <b>-F</b>,
|
||||
|
@ -487,8 +486,9 @@ a separate line. Searching normally stops as soon as a matching line is found
|
|||
in a file. However, if the <b>-c</b> (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches. This
|
||||
opeion overrides any previous <b>-H</b>, <b>-h</b>, or <b>-L</b> options.
|
||||
<b>-c</b> is a way of suppressing the listing of files with no matches that
|
||||
occurs with <b>-c</b> on its own. This option overrides any previous <b>-H</b>,
|
||||
<b>-h</b>, or <b>-L</b> options.
|
||||
</P>
|
||||
<P>
|
||||
<b>--label</b>=<i>name</i>
|
||||
|
@ -501,8 +501,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless <b>pcre2grep</b> can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to a terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want <b>pcre2grep</b> to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the <b>-M</b> (multiline) option
|
||||
|
@ -528,6 +528,49 @@ locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
|||
used. There is no short form for this option.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>, <b>--multiline</b>
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with <b>-M</b> may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
<br>
|
||||
<br>
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
the pattern. For example, to find the phrase "regular expression" in a file
|
||||
where "regular" might be at the end of a line and "expression" at the start of
|
||||
the next line, you could use this command:
|
||||
<pre>
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
</pre>
|
||||
The \s escape sequence matches any white space character, including newlines,
|
||||
and is followed by + so as to match trailing white space on the first line as
|
||||
well as possibly handling a two-character newline sequence.
|
||||
<br>
|
||||
<br>
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||
does not work when input is read line by line (see <b>--line-buffered</b>).
|
||||
</P>
|
||||
<P>
|
||||
<b>-m</b> <i>number</i>, <b>--max-count</b>=<i>number</i>
|
||||
Stop processing after finding <i>number</i> matching lines, or non-matching
|
||||
lines if <b>-v</b> is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If <b>-c</b> is also set, the count that is output is never greater than
|
||||
<i>number</i>. This option has no effect if used with <b>-L</b>, <b>-l</b>, or
|
||||
<b>-q</b>, or when just checking for a match in a binary file.
|
||||
</P>
|
||||
<P>
|
||||
<b>--match-limit</b>=<i>number</i>
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
|
@ -568,38 +611,6 @@ set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
|||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>, <b>--multiline</b>
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with <b>-M</b> may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
<br>
|
||||
<br>
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
the pattern. For example, to find the phrase "regular expression" in a file
|
||||
where "regular" might be at the end of a line and "expression" at the start of
|
||||
the next line, you could use this command:
|
||||
<pre>
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
</pre>
|
||||
The \s escape sequence matches any white space character, including newlines,
|
||||
and is followed by + so as to match trailing white space on the first line as
|
||||
well as possibly handling a two-character newline sequence.
|
||||
<br>
|
||||
<br>
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||
does not work when input is read line by line (see <b>--line-buffered</b>.)
|
||||
</P>
|
||||
<P>
|
||||
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
|
@ -648,31 +659,41 @@ It should never be needed in normal use.
|
|||
</P>
|
||||
<P>
|
||||
<b>-O</b> <i>text</i>, <b>--output</b>=<i>text</i>
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text, followed by an operating-system standard newline.
|
||||
The <b>--newline</b> option has no effect on this option, which is mutually
|
||||
exclusive with <b>--only-matching</b>, <b>--file-offsets</b>, and
|
||||
<b>--line-offsets</b>. Escape sequences starting with a dollar character may be
|
||||
used to insert the contents of the matched part of the line and/or captured
|
||||
substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>,
|
||||
and <b>-C</b> options are ignored. The <b>--newline</b> option has no effect on
|
||||
this option, which is mutually exclusive with <b>--only-matching</b>,
|
||||
<b>--file-offsets</b>, and <b>--line-offsets</b>. However, like
|
||||
<b>--only-matching</b>, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
<br>
|
||||
<br>
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
<br>
|
||||
<br>
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
<br>
|
||||
<br>
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
<br>
|
||||
<br>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
|
@ -741,7 +762,8 @@ option to "recurse".
|
|||
</P>
|
||||
<P>
|
||||
<b>--recursion-limit</b>=<i>number</i>
|
||||
See <b>--match-limit</b> above.
|
||||
This is an obsolete synonym for <b>--depth-limit</b>. See <b>--match-limit</b>
|
||||
above for details.
|
||||
</P>
|
||||
<P>
|
||||
<b>-s</b>, <b>--no-messages</b>
|
||||
|
@ -765,15 +787,18 @@ total would always be zero.
|
|||
<b>-u</b>, <b>--utf</b>
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||
<b>--include</b> options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
<b>--include</b> options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
</P>
|
||||
<P>
|
||||
<b>-U</b>, <b>--utf-allow-invalid</b>
|
||||
As <b>--utf</b>, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. This facility
|
||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
||||
For more details about matching in non-valid UTF-8 strings, see the
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b>(3)</a>
|
||||
documentation.
|
||||
</P>
|
||||
|
@ -786,7 +811,9 @@ ignored.
|
|||
<P>
|
||||
<b>-v</b>, <b>--invert-match</b>
|
||||
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as <b>--only-matching</b> and <b>--output</b>, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
||||
|
@ -909,12 +936,36 @@ documentation for details). Numbered callouts are ignored by <b>pcre2grep</b>;
|
|||
only callouts with string arguments are useful.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
<b>pcre2grep</b> was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the <b>--output</b> (<b>-O</b>) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
<pre>
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
</pre>
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
</P>
|
||||
<br><b>
|
||||
Calling external programs or scripts
|
||||
</b><br>
|
||||
<P>
|
||||
This facility can be independently disabled when <b>pcre2grep</b> is built. It
|
||||
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
|
||||
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
|
||||
where <b>lib$spawn()</b> is used, and for any Unix-like environment where
|
||||
<b>fork()</b> and <b>execv()</b> are available.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -926,14 +977,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
</pre>
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
</P>
|
||||
<P>
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the <b>--output</b>
|
||||
(<b>-O</b>) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
<pre>
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -946,28 +994,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
</pre>
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
</P>
|
||||
<br><b>
|
||||
Echoing a specific string
|
||||
</b><br>
|
||||
<P>
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when <b>pcre2grep</b> was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) cause the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">MATCHING ERRORS</a><br>
|
||||
<P>
|
||||
|
@ -999,7 +1033,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3).
|
||||
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2callout</b>(3),
|
||||
<b>pcre2unicode</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
|
@ -1012,7 +1047,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 25 January 2020
|
||||
Last updated: 04 October 2020
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -323,7 +323,7 @@ test data, command lines that begin with # may appear. This file format, with
|
|||
some restrictions, can also be processed by the <b>perltest.sh</b> script that
|
||||
is distributed with PCRE2 as a means of checking that the behaviour of PCRE2
|
||||
and Perl is the same. For a specification of <b>perltest.sh</b>, see the
|
||||
comments near its beginning.
|
||||
comments near its beginning. See also the #perltest command below.
|
||||
</P>
|
||||
<P>
|
||||
When the input is a terminal, <b>pcre2test</b> prompts for each line of input,
|
||||
|
@ -420,14 +420,20 @@ patterns. Modifiers on a pattern can change these settings.
|
|||
<pre>
|
||||
#perltest
|
||||
</pre>
|
||||
The appearance of this line causes all subsequent modifier settings to be
|
||||
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||
lines, #pattern commands, and #subject commands that set or unset "mark", no
|
||||
command lines are permitted, because they and many of the modifiers are
|
||||
specific to <b>pcre2test</b>, and should not be used in test files that are also
|
||||
processed by <b>perltest.sh</b>. The <b>#perltest</b> command helps detect tests
|
||||
that are accidentally put in the wrong file.
|
||||
This line is used in test files that can also be processed by <b>perltest.sh</b>
|
||||
to confirm that Perl gives the same results as PCRE2. Subsequent tests are
|
||||
checked for the use of <b>pcre2test</b> features that are incompatible with the
|
||||
<b>perltest.sh</b> script.
|
||||
</P>
|
||||
<P>
|
||||
Patterns must use '/' as their delimiter, and only certain modifiers are
|
||||
supported. Comment lines, #pattern commands, and #subject commands that set or
|
||||
unset "mark" are recognized and acted on. The #perltest, #forbid_utf, and
|
||||
#newline_default commands, which are needed in the relevant pcre2test files,
|
||||
are silently ignored. All other command lines are ignored, but give a warning
|
||||
message. The <b>#perltest</b> command helps detect tests that are accidentally
|
||||
put in the wrong file or use the wrong delimiter. For more details of the
|
||||
<b>perltest.sh</b> script see the comments it contains.
|
||||
<pre>
|
||||
#pop [<modifiers>]
|
||||
#popcopy [<modifiers>]
|
||||
|
@ -2113,7 +2119,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 March 2020
|
||||
Last updated: 14 September 2020
|
||||
<br>
|
||||
Copyright © 1997-2020 University of Cambridge.
|
||||
<br>
|
||||
|
|
232
doc/pcre2grep.1
232
doc/pcre2grep.1
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2GREP 1 "25 January 2020" "PCRE2 10.35"
|
||||
.TH PCRE2GREP 1 "04 October 2020" "PCRE2 10.36"
|
||||
.SH NAME
|
||||
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -79,8 +79,8 @@ matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
|
|||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be found. If
|
||||
there are multiple patterns, they are all tried on the remainder of the line,
|
||||
but patterns that follow the one that matched are not tried on the earlier part
|
||||
of the line.
|
||||
but patterns that follow the one that matched are not tried on the earlier
|
||||
matched part of the line.
|
||||
.P
|
||||
This behaviour means that the order in which multiple patterns are specified
|
||||
can affect the output when one of the above options is used. This is no longer
|
||||
|
@ -115,11 +115,10 @@ ignored.
|
|||
.rs
|
||||
.sp
|
||||
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||
is identified as a binary file, and is processed specially. (GNU grep
|
||||
identifies binary files in this manner.) However, if the newline type is
|
||||
specified as NUL, that is, the line terminator is a binary zero, the test for
|
||||
a binary file is not applied. See the \fB--binary-files\fP option for a means
|
||||
of changing the way binary files are handled.
|
||||
is identified as a binary file, and is processed specially. However, if the
|
||||
newline type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the \fB--binary-files\fP
|
||||
option for a means of changing the way binary files are handled.
|
||||
.
|
||||
.
|
||||
.SH "BINARY ZEROS IN PATTERNS"
|
||||
|
@ -383,8 +382,8 @@ Ignore upper/lower case distinctions during comparisons.
|
|||
.TP
|
||||
\fB--include\fP=\fIpattern\fP
|
||||
If any \fB--include\fP patterns are specified, the only files that are
|
||||
processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude\fP pattern). This option does not affect directories, but it
|
||||
processed are those whose names match one of the patterns and do not match an
|
||||
\fB--exclude\fP pattern. This option does not affect directories, but it
|
||||
applies to all files, whether listed on the command line, obtained from
|
||||
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||
expression, and is matched against the final component of the file name, not
|
||||
|
@ -401,8 +400,8 @@ may be given any number of times; all the files are read.
|
|||
.TP
|
||||
\fB--include-dir\fP=\fIpattern\fP
|
||||
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||
are processed are those that match one of the patterns (and do not match an
|
||||
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
|
||||
are processed are those whose names match one of the patterns and do not match
|
||||
an \fB--exclude-dir\fP pattern. This applies to all directories, whether listed
|
||||
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||
directory. The pattern is a PCRE2 regular expression, and is matched against
|
||||
the final component of the directory name, not the entire path. The \fB-F\fP,
|
||||
|
@ -423,8 +422,9 @@ a separate line. Searching normally stops as soon as a matching line is found
|
|||
in a file. However, if the \fB-c\fP (count) option is also used, matching
|
||||
continues in order to obtain the correct count, and those files that have at
|
||||
least one match are listed along with their counts. Using this option with
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches. This
|
||||
opeion overrides any previous \fB-H\fP, \fB-h\fP, or \fB-L\fP options.
|
||||
\fB-c\fP is a way of suppressing the listing of files with no matches that
|
||||
occurs with \fB-c\fP on its own. This option overrides any previous \fB-H\fP,
|
||||
\fB-h\fP, or \fB-L\fP options.
|
||||
.TP
|
||||
\fB--label\fP=\fIname\fP
|
||||
This option supplies a name to be used for the standard input when file names
|
||||
|
@ -435,8 +435,8 @@ short form for this option.
|
|||
When this option is given, non-compressed input is read and processed line by
|
||||
line, and the output is flushed after each write. By default, input is read in
|
||||
large chunks, unless \fBpcre2grep\fP can determine that it is reading from a
|
||||
terminal (which is currently possible only in Unix-like environments or
|
||||
Windows). Output to terminal is normally automatically flushed by the operating
|
||||
terminal, which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed by the operating
|
||||
system. This option can be useful when the input or output is attached to a
|
||||
pipe and you do not want \fBpcre2grep\fP to buffer up large amounts of data.
|
||||
However, its use will affect performance, and the \fB-M\fP (multiline) option
|
||||
|
@ -459,6 +459,45 @@ the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
|||
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||
used. There is no short form for this option.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with \fB-M\fP may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
.sp
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
the pattern. For example, to find the phrase "regular expression" in a file
|
||||
where "regular" might be at the end of a line and "expression" at the start of
|
||||
the next line, you could use this command:
|
||||
.sp
|
||||
pcre2grep -M 'regular\es+expression' <file>
|
||||
.sp
|
||||
The \es escape sequence matches any white space character, including newlines,
|
||||
and is followed by + so as to match trailing white space on the first line as
|
||||
well as possibly handling a two-character newline sequence.
|
||||
.sp
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||
does not work when input is read line by line (see \fB--line-buffered\fP).
|
||||
.TP
|
||||
\fB-m\fP \fInumber\fP, \fB--max-count\fP=\fInumber\fP
|
||||
Stop processing after finding \fInumber\fP matching lines, or non-matching
|
||||
lines if \fB-v\fP is also set. Any trailing context lines are output after the
|
||||
final match. In multiline mode, each multiline match counts as just one line
|
||||
for this purpose. If this limit is reached when reading the standard input from
|
||||
a regular file, the file is left positioned just after the last matching line.
|
||||
If \fB-c\fP is also set, the count that is output is never greater than
|
||||
\fInumber\fP. This option has no effect if used with \fB-L\fP, \fB-l\fP, or
|
||||
\fB-q\fP, or when just checking for a match in a binary file.
|
||||
.TP
|
||||
\fB--match-limit\fP=\fInumber\fP
|
||||
Processing some regular expression patterns may take a very long time to search
|
||||
for all possible matching strings. Others may require a very large amount of
|
||||
|
@ -493,35 +532,6 @@ This limits the expansion of the processing buffer, whose initial size can be
|
|||
set by \fB--buffer-size\fP. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
.TP
|
||||
\fB-M\fP, \fB--multiline\fP
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with \fB-M\fP may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If \fB-v\fP is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
.sp
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
the pattern. For example, to find the phrase "regular expression" in a file
|
||||
where "regular" might be at the end of a line and "expression" at the start of
|
||||
the next line, you could use this command:
|
||||
.sp
|
||||
pcre2grep -M 'regular\es+expression' <file>
|
||||
.sp
|
||||
The \es escape sequence matches any white space character, including newlines,
|
||||
and is followed by + so as to match trailing white space on the first line as
|
||||
well as possibly handling a two-character newline sequence.
|
||||
.sp
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that \fBpcre2grep\fP buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the \fB-M\fP option
|
||||
does not work when input is read line by line (see \fB--line-buffered\fP.)
|
||||
.TP
|
||||
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||
Six different conventions for indicating the ends of lines in scanned files are
|
||||
supported. For example:
|
||||
|
@ -565,27 +575,36 @@ use of JIT at run time. It is provided for testing and working round problems.
|
|||
It should never be needed in normal use.
|
||||
.TP
|
||||
\fB-O\fP \fItext\fP, \fB--output\fP=\fItext\fP
|
||||
When there is a match, instead of outputting the whole line that matched,
|
||||
output just the given text, followed by an operating-system standard newline.
|
||||
The \fB--newline\fP option has no effect on this option, which is mutually
|
||||
exclusive with \fB--only-matching\fP, \fB--file-offsets\fP, and
|
||||
\fB--line-offsets\fP. Escape sequences starting with a dollar character may be
|
||||
used to insert the contents of the matched part of the line and/or captured
|
||||
substrings into the text.
|
||||
When there is a match, instead of outputting the line that matched, output just
|
||||
the text specified in this option, followed by an operating-system standard
|
||||
newline. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP,
|
||||
and \fB-C\fP options are ignored. The \fB--newline\fP option has no effect on
|
||||
this option, which is mutually exclusive with \fB--only-matching\fP,
|
||||
\fB--file-offsets\fP, and \fB--line-offsets\fP. However, like
|
||||
\fB--only-matching\fP, if there is more than one match in a line, each of them
|
||||
causes a line of output.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured
|
||||
substring of the given decimal number; zero substitutes the whole match. If
|
||||
the number is greater than the number of capturing substrings, or if the
|
||||
capture is unset, the replacement is empty.
|
||||
Escape sequences starting with a dollar character may be used to insert the
|
||||
contents of the matched part of the line and/or captured substrings into the
|
||||
text.
|
||||
.sp
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero substitutes the whole match. If the number is greater than
|
||||
the number of capturing substrings, or if the capture is unset, the replacement
|
||||
is empty.
|
||||
.sp
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by form feed; $n by
|
||||
newline; $r by carriage return; $t by tab; $v by vertical tab.
|
||||
.sp
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose code point is the
|
||||
given octal number. In the first form, up to three octal digits are processed.
|
||||
When more digits are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
.sp
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character represented by the
|
||||
given hexadecimal number. In the first form, up to two hexadecimal digits are
|
||||
processed. When more digits are needed in Unicode mode to specify a wide
|
||||
character, the second form must be used.
|
||||
.sp
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar.
|
||||
|
@ -644,7 +663,8 @@ immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
|||
option to "recurse".
|
||||
.TP
|
||||
\fB--recursion-limit\fP=\fInumber\fP
|
||||
See \fB--match-limit\fP above.
|
||||
This is an obsolete synonym for \fB--depth-limit\fP. See \fB--match-limit\fP
|
||||
above for details.
|
||||
.TP
|
||||
\fB-s\fP, \fB--no-messages\fP
|
||||
Suppress error messages about non-existent or unreadable files. Such files are
|
||||
|
@ -665,14 +685,17 @@ total would always be zero.
|
|||
\fB-u\fP, \fB--utf\fP
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||
\fB--include\fP options) and all subject lines that are scanned must be valid
|
||||
strings of UTF-8 characters.
|
||||
\fB--include\fP options) and all lines that are scanned must be valid strings
|
||||
of UTF-8 characters. If an invalid UTF-8 string is encountered, an error
|
||||
occurs.
|
||||
.TP
|
||||
\fB-U\fP, \fB--utf-allow-invalid\fP
|
||||
As \fB--utf\fP, but in addition subject lines may contain invalid UTF-8 code
|
||||
unit sequences. These can never form part of any pattern match. This facility
|
||||
allows valid UTF-8 strings to be sought in executable or other binary files.
|
||||
For more details about matching in non-valid UTF-8 strings, see the
|
||||
unit sequences. These can never form part of any pattern match. Patterns
|
||||
themselves, however, must still be valid UTF-8 strings. This facility allows
|
||||
valid UTF-8 strings to be sought within arbitrary byte sequences in executable
|
||||
or other binary files. For more details about matching in non-valid UTF-8
|
||||
strings, see the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP(3)
|
||||
.\"
|
||||
|
@ -685,7 +708,9 @@ ignored.
|
|||
.TP
|
||||
\fB-v\fP, \fB--invert-match\fP
|
||||
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||
the patterns are the ones that are found.
|
||||
the patterns are the ones that are found. When this option is set, options such
|
||||
as \fB--only-matching\fP and \fB--output\fP, which specify parts of a match
|
||||
that are to be output, are ignored.
|
||||
.TP
|
||||
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||
Force the patterns only to match "words". That is, there must be a word
|
||||
|
@ -812,12 +837,36 @@ documentation for details). Numbered callouts are ignored by \fBpcre2grep\fP;
|
|||
only callouts with string arguments are useful.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
Starting the callout string with a pipe character invokes an echoing facility
|
||||
that avoids calling an external program or script. This facility is always
|
||||
available, provided that callouts were not completely disabled when
|
||||
\fBpcre2grep\fP was built. The rest of the callout string is processed as a
|
||||
zero-terminated string, which means it should not contain any internal binary
|
||||
zeros. It is written to the output, having first been passed through the same
|
||||
escape processing as text from the \fB--output\fP (\fB-O\fP) option (see
|
||||
above). However, $0 cannot be used to insert a matched substring because the
|
||||
match is still in progress. Instead, the single character '0' is inserted. Any
|
||||
syntax errors in the string (for example, a dollar not followed by another
|
||||
character) cause the callout to be ignored. No terminator is added to the
|
||||
output string, so if you want a newline, you must include it explicitly using
|
||||
the escape $n. For example:
|
||||
.sp
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
.sp
|
||||
Matching continues normally after the string is output. If you want to see only
|
||||
the callout output but not any output from an actual match, you should end the
|
||||
pattern with (*FAIL).
|
||||
.
|
||||
.
|
||||
.SS "Calling external programs or scripts"
|
||||
.rs
|
||||
.sp
|
||||
This facility can be independently disabled when \fBpcre2grep\fP is built. It
|
||||
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
|
||||
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
|
||||
where \fBlib$spawn()\fP is used, and for any Unix-like environment where
|
||||
\fBfork()\fP and \fBexecv()\fP are available.
|
||||
.P
|
||||
If the callout string does not start with a pipe (vertical bar) character, it
|
||||
|
@ -828,13 +877,11 @@ arguments:
|
|||
executable_name|arg1|arg2|...
|
||||
.sp
|
||||
Any substring (including the executable name) may contain escape sequences
|
||||
started by a dollar character: $<digits> or ${<digits>} is replaced by the
|
||||
captured substring of the given decimal number, which must be greater than
|
||||
zero. If the number is greater than the number of capturing substrings, or if
|
||||
the capture is unset, the replacement is empty.
|
||||
.P
|
||||
Any other character is substituted by itself. In particular, $$ is replaced by
|
||||
a single dollar and $| is replaced by a pipe character. Here is an example:
|
||||
started by a dollar character. These are the same as for the \fB--output\fP
|
||||
(\fB-O\fP) option documented above, except that $0 cannot insert the matched
|
||||
string because the match is still in progress. Instead, the character '0'
|
||||
is inserted. If you need a literal dollar or pipe character in any
|
||||
substring, use $$ or $| respectively. Here is an example:
|
||||
.sp
|
||||
echo -e "abcde\en12345" | pcre2grep \e
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -847,28 +894,14 @@ a single dollar and $| is replaced by a pipe character. Here is an example:
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
.sp
|
||||
The parameters for the system call that is used to run the
|
||||
program or script are zero-terminated strings. This means that binary zero
|
||||
characters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in the
|
||||
string (for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. If running the program fails for any reason (including
|
||||
the non-existence of the executable), a local matching failure occurs and the
|
||||
matcher backtracks in the normal way.
|
||||
.
|
||||
.
|
||||
.SS "Echoing a specific string"
|
||||
.rs
|
||||
.sp
|
||||
This facility is always available, provided that callouts were not completely
|
||||
disabled when \fBpcre2grep\fP was built. If the callout string starts with a
|
||||
pipe (vertical bar) character, the rest of the string is written to the output,
|
||||
having been passed through the same escape processing as text from the --output
|
||||
option. This provides a simple echoing facility that avoids calling an external
|
||||
program or script. No terminator is added to the string, so if you want a
|
||||
newline, you must include it explicitly. Matching continues normally after the
|
||||
string is output. If you want to see only the callout output but not any output
|
||||
from an actual match, you should end the relevant pattern with (*FAIL).
|
||||
The parameters for the system call that is used to run the program or script
|
||||
are zero-terminated strings. This means that binary zero characters in the
|
||||
callout argument will cause premature termination of their substrings, and
|
||||
therefore should not be present. Any syntax errors in the string (for example,
|
||||
a dollar not followed by another character) cause the callout to be ignored.
|
||||
If running the program fails for any reason (including the non-existence of the
|
||||
executable), a local matching failure occurs and the matcher backtracks in the
|
||||
normal way.
|
||||
.
|
||||
.
|
||||
.SH "MATCHING ERRORS"
|
||||
|
@ -904,7 +937,8 @@ because VMS does not distinguish between exit(0) and exit(1).
|
|||
.SH "SEE ALSO"
|
||||
.rs
|
||||
.sp
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3).
|
||||
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2callout\fP(3),
|
||||
\fBpcre2unicode\fP(3).
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
|
@ -921,6 +955,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 25 January 2020
|
||||
Last updated: 04 October 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -80,7 +80,7 @@ DESCRIPTION
|
|||
following the match, so that further matches on the same line can be
|
||||
found. If there are multiple patterns, they are all tried on the re-
|
||||
mainder of the line, but patterns that follow the one that matched are
|
||||
not tried on the earlier part of the line.
|
||||
not tried on the earlier matched part of the line.
|
||||
|
||||
This behaviour means that the order in which multiple patterns are
|
||||
specified can affect the output when one of the above options is used.
|
||||
|
@ -115,10 +115,10 @@ BINARY FILES
|
|||
|
||||
By default, a file that contains a binary zero byte within the first
|
||||
1024 bytes is identified as a binary file, and is processed specially.
|
||||
(GNU grep identifies binary files in this manner.) However, if the new-
|
||||
line type is specified as NUL, that is, the line terminator is a binary
|
||||
zero, the test for a binary file is not applied. See the --binary-files
|
||||
option for a means of changing the way binary files are handled.
|
||||
However, if the newline type is specified as NUL, that is, the line
|
||||
terminator is a binary zero, the test for a binary file is not applied.
|
||||
See the --binary-files option for a means of changing the way binary
|
||||
files are handled.
|
||||
|
||||
|
||||
BINARY ZEROS IN PATTERNS
|
||||
|
@ -413,17 +413,17 @@ OPTIONS
|
|||
|
||||
--include=pattern
|
||||
If any --include patterns are specified, the only files that
|
||||
are processed are those that match one of the patterns (and
|
||||
do not match an --exclude pattern). This option does not af-
|
||||
fect directories, but it applies to all files, whether listed
|
||||
on the command line, obtained from --file-list, or by scan-
|
||||
ning a directory. The pattern is a PCRE2 regular expression,
|
||||
and is matched against the final component of the file name,
|
||||
not the entire path. The -F, -w, and -x options do not apply
|
||||
to this pattern. The option may be given any number of times.
|
||||
If a file name matches both an --include and an --exclude
|
||||
pattern, it is excluded. There is no short form for this op-
|
||||
tion.
|
||||
are processed are those whose names match one of the patterns
|
||||
and do not match an --exclude pattern. This option does not
|
||||
affect directories, but it applies to all files, whether
|
||||
listed on the command line, obtained from --file-list, or by
|
||||
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||
sion, and is matched against the final component of the file
|
||||
name, not the entire path. The -F, -w, and -x options do not
|
||||
apply to this pattern. The option may be given any number of
|
||||
times. If a file name matches both an --include and an --ex-
|
||||
clude pattern, it is excluded. There is no short form for
|
||||
this option.
|
||||
|
||||
--include-from=filename
|
||||
Treat each non-empty line of the file as the data for an
|
||||
|
@ -434,8 +434,8 @@ OPTIONS
|
|||
|
||||
--include-dir=pattern
|
||||
If any --include-dir patterns are specified, the only direc-
|
||||
tories that are processed are those that match one of the
|
||||
patterns (and do not match an --exclude-dir pattern). This
|
||||
tories that are processed are those whose names match one of
|
||||
the patterns and do not match an --exclude-dir pattern. This
|
||||
applies to all directories, whether listed on the command
|
||||
line, obtained from --file-list, or by scanning a parent di-
|
||||
rectory. The pattern is a PCRE2 regular expression, and is
|
||||
|
@ -461,8 +461,9 @@ OPTIONS
|
|||
matching continues in order to obtain the correct count, and
|
||||
those files that have at least one match are listed along
|
||||
with their counts. Using this option with -c is a way of sup-
|
||||
pressing the listing of files with no matches. This opeion
|
||||
overrides any previous -H, -h, or -L options.
|
||||
pressing the listing of files with no matches that occurs
|
||||
with -c on its own. This option overrides any previous -H,
|
||||
-h, or -L options.
|
||||
|
||||
--label=name
|
||||
This option supplies a name to be used for the standard input
|
||||
|
@ -470,37 +471,84 @@ OPTIONS
|
|||
input)" is used. There is no short form for this option.
|
||||
|
||||
--line-buffered
|
||||
When this option is given, non-compressed input is read and
|
||||
processed line by line, and the output is flushed after each
|
||||
write. By default, input is read in large chunks, unless
|
||||
pcre2grep can determine that it is reading from a terminal
|
||||
(which is currently possible only in Unix-like environments
|
||||
or Windows). Output to terminal is normally automatically
|
||||
flushed by the operating system. This option can be useful
|
||||
when the input or output is attached to a pipe and you do not
|
||||
want pcre2grep to buffer up large amounts of data. However,
|
||||
its use will affect performance, and the -M (multiline) op-
|
||||
tion ceases to work. When input is from a compressed .gz or
|
||||
.bz2 file, --line-buffered is ignored.
|
||||
When this option is given, non-compressed input is read and
|
||||
processed line by line, and the output is flushed after each
|
||||
write. By default, input is read in large chunks, unless
|
||||
pcre2grep can determine that it is reading from a terminal,
|
||||
which is currently possible only in Unix-like environments or
|
||||
Windows. Output to terminal is normally automatically flushed
|
||||
by the operating system. This option can be useful when the
|
||||
input or output is attached to a pipe and you do not want
|
||||
pcre2grep to buffer up large amounts of data. However, its
|
||||
use will affect performance, and the -M (multiline) option
|
||||
ceases to work. When input is from a compressed .gz or .bz2
|
||||
file, --line-buffered is ignored.
|
||||
|
||||
--line-offsets
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
Instead of showing lines or parts of lines that match, show
|
||||
each match as a line number, the offset from the start of the
|
||||
line, and a length. The line number is terminated by a colon
|
||||
(as usual; see the -n option), and the offset and length are
|
||||
separated by a comma. In this mode, no context is shown.
|
||||
That is, the -A, -B, and -C options are ignored. If there is
|
||||
more than one match in a line, each of them is shown sepa-
|
||||
rately. This option is mutually exclusive with --output,
|
||||
line, and a length. The line number is terminated by a colon
|
||||
(as usual; see the -n option), and the offset and length are
|
||||
separated by a comma. In this mode, no context is shown.
|
||||
That is, the -A, -B, and -C options are ignored. If there is
|
||||
more than one match in a line, each of them is shown sepa-
|
||||
rately. This option is mutually exclusive with --output,
|
||||
--file-offsets, and --only-matching.
|
||||
|
||||
--locale=locale-name
|
||||
This option specifies a locale to be used for pattern match-
|
||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||
ronment variables. If no locale is specified, the PCRE2 li-
|
||||
This option specifies a locale to be used for pattern match-
|
||||
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||
ronment variables. If no locale is specified, the PCRE2 li-
|
||||
brary's default (usually the "C" locale) is used. There is no
|
||||
short form for this option.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character, in-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
dling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered).
|
||||
|
||||
-m number, --max-count=number
|
||||
Stop processing after finding number matching lines, or non-
|
||||
matching lines if -v is also set. Any trailing context lines
|
||||
are output after the final match. In multiline mode, each
|
||||
multiline match counts as just one line for this purpose. If
|
||||
this limit is reached when reading the standard input from a
|
||||
regular file, the file is left positioned just after the last
|
||||
matching line. If -c is also set, the count that is output
|
||||
is never greater than number. This option has no effect if
|
||||
used with -L, -l, or -q, or when just checking for a match in
|
||||
a binary file.
|
||||
|
||||
--match-limit=number
|
||||
Processing some regular expression patterns may take a very
|
||||
long time to search for all possible matching strings. Others
|
||||
|
@ -542,41 +590,6 @@ OPTIONS
|
|||
size is silently forced to be no smaller than the starting
|
||||
buffer size.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character, in-
|
||||
cluding newlines, and is followed by + so as to match trail-
|
||||
ing white space on the first line as well as possibly han-
|
||||
dling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered.)
|
||||
|
||||
-N newline-type, --newline=newline-type
|
||||
Six different conventions for indicating the ends of lines in
|
||||
scanned files are supported. For example:
|
||||
|
@ -625,97 +638,109 @@ OPTIONS
|
|||
lems. It should never be needed in normal use.
|
||||
|
||||
-O text, --output=text
|
||||
When there is a match, instead of outputting the whole line
|
||||
that matched, output just the given text, followed by an op-
|
||||
erating-system standard newline. The --newline option has no
|
||||
effect on this option, which is mutually exclusive with
|
||||
--only-matching, --file-offsets, and --line-offsets. Escape
|
||||
sequences starting with a dollar character may be used to in-
|
||||
sert the contents of the matched part of the line and/or cap-
|
||||
tured substrings into the text.
|
||||
When there is a match, instead of outputting the line that
|
||||
matched, output just the text specified in this option, fol-
|
||||
lowed by an operating-system standard newline. In this mode,
|
||||
no context is shown. That is, the -A, -B, and -C options are
|
||||
ignored. The --newline option has no effect on this option,
|
||||
which is mutually exclusive with --only-matching, --file-off-
|
||||
sets, and --line-offsets. However, like --only-matching, if
|
||||
there is more than one match in a line, each of them causes a
|
||||
line of output.
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||
string of the given decimal number; zero substitutes the
|
||||
Escape sequences starting with a dollar character may be used
|
||||
to insert the contents of the matched part of the line and/or
|
||||
captured substrings into the text.
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured sub-
|
||||
string of the given decimal number; zero substitutes the
|
||||
whole match. If the number is greater than the number of cap-
|
||||
turing substrings, or if the capture is unset, the replace-
|
||||
turing substrings, or if the capture is unset, the replace-
|
||||
ment is empty.
|
||||
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$a is replaced by bell; $b by backspace; $e by escape; $f by
|
||||
form feed; $n by newline; $r by carriage return; $t by tab;
|
||||
$v by vertical tab.
|
||||
|
||||
$o<digits> is replaced by the character represented by the
|
||||
given octal number; up to three digits are processed.
|
||||
$o<digits> or $o{<digits>} is replaced by the character whose
|
||||
code point is the given octal number. In the first form, up
|
||||
to three octal digits are processed. When more digits are
|
||||
needed in Unicode mode to specify a wide character, the sec-
|
||||
ond form must be used.
|
||||
|
||||
$x<digits> is replaced by the character represented by the
|
||||
given hexadecimal number; up to two digits are processed.
|
||||
$x<digits> or $x{<digits>} is replaced by the character rep-
|
||||
resented by the given hexadecimal number. In the first form,
|
||||
up to two hexadecimal digits are processed. When more digits
|
||||
are needed in Unicode mode to specify a wide character, the
|
||||
second form must be used.
|
||||
|
||||
Any other character is substituted by itself. In particular,
|
||||
Any other character is substituted by itself. In particular,
|
||||
$$ is replaced by a single dollar.
|
||||
|
||||
-o, --only-matching
|
||||
Show only the part of the line that matched a pattern instead
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v (in-
|
||||
vert the sense of the match to find non-matching lines), no
|
||||
output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
line. This option is mutually exclusive with --output,
|
||||
--file-offsets and --line-offsets.
|
||||
|
||||
-onumber, --only-matching=number
|
||||
Show only the part of the line that matched the capturing
|
||||
Show only the part of the line that matched the capturing
|
||||
parentheses of the given number. Up to 50 capturing parenthe-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ses are supported by default. This limit can be changed via
|
||||
the --om-capture option. A pattern may contain any number of
|
||||
capturing parentheses, but only those whose number is within
|
||||
the limit can be accessed by -o. An error occurs if the num-
|
||||
ber specified by -o is greater than the limit.
|
||||
|
||||
-o0 is the same as -o without a number. Because these options
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
can be given without an argument (see above), if an argument
|
||||
is present, it must be given in the same shell item, for ex-
|
||||
ample, -o3 or --only-matching=2. The comments given for the
|
||||
non-argument case above also apply to this option. If the
|
||||
specified capturing parentheses do not exist in the pattern,
|
||||
or were not set in the match, nothing is output unless the
|
||||
file name or line number are being output.
|
||||
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
(but see the next but one option).
|
||||
|
||||
--om-capture=number
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
Set the number of capturing parentheses that can be accessed
|
||||
by -o. The default is 50.
|
||||
|
||||
--om-separator=text
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
coloured.
|
||||
|
||||
-q, --quiet
|
||||
Work quietly, that is, display nothing except error messages.
|
||||
The exit status indicates whether or not any matches were
|
||||
The exit status indicates whether or not any matches were
|
||||
found.
|
||||
|
||||
-r, --recursive
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to "re-
|
||||
curse".
|
||||
|
||||
--recursion-limit=number
|
||||
See --match-limit above.
|
||||
This is an obsolete synonym for --depth-limit. See --match-
|
||||
limit above for details.
|
||||
|
||||
-s, --no-messages
|
||||
Suppress error messages about non-existent or unreadable
|
||||
|
@ -737,26 +762,30 @@ OPTIONS
|
|||
|
||||
-u, --utf Operate in UTF-8 mode. This option is available only if PCRE2
|
||||
has been compiled with UTF-8 support. All patterns (including
|
||||
those for any --exclude and --include options) and all sub-
|
||||
ject lines that are scanned must be valid strings of UTF-8
|
||||
characters.
|
||||
those for any --exclude and --include options) and all lines
|
||||
that are scanned must be valid strings of UTF-8 characters.
|
||||
If an invalid UTF-8 string is encountered, an error occurs.
|
||||
|
||||
-U, --utf-allow-invalid
|
||||
As --utf, but in addition subject lines may contain invalid
|
||||
UTF-8 code unit sequences. These can never form part of any
|
||||
pattern match. This facility allows valid UTF-8 strings to be
|
||||
sought in executable or other binary files. For more details
|
||||
about matching in non-valid UTF-8 strings, see the pcre2uni-
|
||||
code(3) documentation.
|
||||
pattern match. Patterns themselves, however, must still be
|
||||
valid UTF-8 strings. This facility allows valid UTF-8 strings
|
||||
to be sought within arbitrary byte sequences in executable or
|
||||
other binary files. For more details about matching in non-
|
||||
valid UTF-8 strings, see the pcre2unicode(3) documentation.
|
||||
|
||||
-V, --version
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
command line is ignored.
|
||||
|
||||
-v, --invert-match
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found.
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found. When
|
||||
this option is set, options such as --only-matching and
|
||||
--output, which specify parts of a match that are to be out-
|
||||
put, are ignored.
|
||||
|
||||
-w, --word-regex, --word-regexp
|
||||
Force the patterns only to match "words". That is, there must
|
||||
|
@ -878,30 +907,49 @@ USING PCRE2'S CALLOUT FACILITY
|
|||
mentation for details). Numbered callouts are ignored by pcre2grep;
|
||||
only callouts with string arguments are useful.
|
||||
|
||||
Echoing a specific string
|
||||
|
||||
Starting the callout string with a pipe character invokes an echoing
|
||||
facility that avoids calling an external program or script. This facil-
|
||||
ity is always available, provided that callouts were not completely
|
||||
disabled when pcre2grep was built. The rest of the callout string is
|
||||
processed as a zero-terminated string, which means it should not con-
|
||||
tain any internal binary zeros. It is written to the output, having
|
||||
first been passed through the same escape processing as text from the
|
||||
--output (-O) option (see above). However, $0 cannot be used to insert
|
||||
a matched substring because the match is still in progress. Instead,
|
||||
the single character '0' is inserted. Any syntax errors in the string
|
||||
(for example, a dollar not followed by another character) cause the
|
||||
callout to be ignored. No terminator is added to the output string, so
|
||||
if you want a newline, you must include it explicitly using the escape
|
||||
$n. For example:
|
||||
|
||||
pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' <some file>
|
||||
|
||||
Matching continues normally after the string is output. If you want to
|
||||
see only the callout output but not any output from an actual match,
|
||||
you should end the pattern with (*FAIL).
|
||||
|
||||
Calling external programs or scripts
|
||||
|
||||
This facility can be independently disabled when pcre2grep is built. It
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any other Unix-like environment
|
||||
where fork() and execv() are available.
|
||||
is supported for Windows, where a call to _spawnvp() is used, for VMS,
|
||||
where lib$spawn() is used, and for any Unix-like environment where
|
||||
fork() and execv() are available.
|
||||
|
||||
If the callout string does not start with a pipe (vertical bar) charac-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ter, it is parsed into a list of substrings separated by pipe charac-
|
||||
ters. The first substring must be an executable name, with the follow-
|
||||
ing substrings specifying arguments:
|
||||
|
||||
executable_name|arg1|arg2|...
|
||||
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character: $<digits> or ${<digits>} is re-
|
||||
placed by the captured substring of the given decimal number, which
|
||||
must be greater than zero. If the number is greater than the number of
|
||||
capturing substrings, or if the capture is unset, the replacement is
|
||||
empty.
|
||||
|
||||
Any other character is substituted by itself. In particular, $$ is re-
|
||||
placed by a single dollar and $| is replaced by a pipe character. Here
|
||||
is an example:
|
||||
Any substring (including the executable name) may contain escape se-
|
||||
quences started by a dollar character. These are the same as for the
|
||||
--output (-O) option documented above, except that $0 cannot insert the
|
||||
matched string because the match is still in progress. Instead, the
|
||||
character '0' is inserted. If you need a literal dollar or pipe charac-
|
||||
ter in any substring, use $$ or $| respectively. Here is an example:
|
||||
|
||||
echo -e "abcde\n12345" | pcre2grep \
|
||||
'(?x)(.)(..(.))
|
||||
|
@ -914,28 +962,15 @@ USING PCRE2'S CALLOUT FACILITY
|
|||
Arg1: [1] [234] [4] Arg2: |1| ()
|
||||
12345
|
||||
|
||||
The parameters for the system call that is used to run the program or
|
||||
The parameters for the system call that is used to run the program or
|
||||
script are zero-terminated strings. This means that binary zero charac-
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
cause the callout to be ignored. If running the program fails for any
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
ters in the callout argument will cause premature termination of their
|
||||
substrings, and therefore should not be present. Any syntax errors in
|
||||
the string (for example, a dollar not followed by another character)
|
||||
cause the callout to be ignored. If running the program fails for any
|
||||
reason (including the non-existence of the executable), a local match-
|
||||
ing failure occurs and the matcher backtracks in the normal way.
|
||||
|
||||
Echoing a specific string
|
||||
|
||||
This facility is always available, provided that callouts were not com-
|
||||
pletely disabled when pcre2grep was built. If the callout string starts
|
||||
with a pipe (vertical bar) character, the rest of the string is written
|
||||
to the output, having been passed through the same escape processing as
|
||||
text from the --output option. This provides a simple echoing facility
|
||||
that avoids calling an external program or script. No terminator is
|
||||
added to the string, so if you want a newline, you must include it ex-
|
||||
plicitly. Matching continues normally after the string is output. If
|
||||
you want to see only the callout output but not any output from an ac-
|
||||
tual match, you should end the relevant pattern with (*FAIL).
|
||||
|
||||
|
||||
MATCHING ERRORS
|
||||
|
||||
|
@ -969,7 +1004,7 @@ DIAGNOSTICS
|
|||
|
||||
SEE ALSO
|
||||
|
||||
pcre2pattern(3), pcre2syntax(3), pcre2callout(3).
|
||||
pcre2pattern(3), pcre2syntax(3), pcre2callout(3), pcre2unicode(3).
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
@ -981,5 +1016,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 25 January 2020
|
||||
Last updated: 04 October 2020
|
||||
Copyright (c) 1997-2020 University of Cambridge.
|
||||
|
|
1051
doc/pcre2test.txt
1051
doc/pcre2test.txt
File diff suppressed because it is too large
Load Diff
717
src/pcre2grep.c
717
src/pcre2grep.c
|
@ -164,6 +164,10 @@ enum { DEE_READ, DEE_SKIP };
|
|||
|
||||
enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
|
||||
|
||||
/* Return values from decode_dollar_escape() */
|
||||
|
||||
enum { DDE_ERROR, DDE_CAPTURE, DDE_CHAR };
|
||||
|
||||
/* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
|
||||
environments), a warning is issued if the value of fwrite() is ignored.
|
||||
Unfortunately, casting to (void) does not suppress the warning. To get round
|
||||
|
@ -179,13 +183,21 @@ handled by using STDOUT_NL as the newline string. We also use a normal double
|
|||
quote for the example, as single quotes aren't usually available. */
|
||||
|
||||
#ifdef WIN32
|
||||
#define STDOUT_NL "\r\n"
|
||||
#define QUOT "\""
|
||||
#define STDOUT_NL "\r\n"
|
||||
#define STDOUT_NL_LEN 2
|
||||
#define QUOT "\""
|
||||
#else
|
||||
#define STDOUT_NL "\n"
|
||||
#define QUOT "'"
|
||||
#define STDOUT_NL "\n"
|
||||
#define STDOUT_NL_LEN 1
|
||||
#define QUOT "'"
|
||||
#endif
|
||||
|
||||
/* This code is returned from decode_dollar_escape() when $n is encountered,
|
||||
and used to mean "output STDOUT_NL". It is, of course, not a valid Unicode code
|
||||
point. */
|
||||
|
||||
#define STDOUT_NL_CODE 0x7fffffffu
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
|
@ -224,8 +236,9 @@ static int max_bufthird = PCRE2GREP_MAX_BUFSIZE;
|
|||
static int bufsize = 3*PCRE2GREP_BUFSIZE;
|
||||
static int endlinetype;
|
||||
|
||||
static unsigned long int total_count = 0;
|
||||
static int count_limit = -1; /* Not long, so that it works with OP_NUMBER */
|
||||
static unsigned long int counts_printed = 0;
|
||||
static unsigned long int total_count = 0;
|
||||
|
||||
#ifdef WIN32
|
||||
static int dee_action = dee_SKIP;
|
||||
|
@ -277,6 +290,9 @@ static BOOL show_total_count = FALSE;
|
|||
static BOOL silent = FALSE;
|
||||
static BOOL utf = FALSE;
|
||||
|
||||
static uint8_t utf8_buffer[8];
|
||||
|
||||
|
||||
/* Structure for list of --only-matching capturing numbers. */
|
||||
|
||||
typedef struct omstr {
|
||||
|
@ -443,6 +459,7 @@ static option_item optionlist[] = {
|
|||
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
|
||||
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
|
||||
{ OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
|
||||
{ OP_NUMBER, 'm', &count_limit, "max-count=number", "stop after <number> matched lines" },
|
||||
{ OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" },
|
||||
{ OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
|
||||
#ifdef SUPPORT_PCRE2GREP_JIT
|
||||
|
@ -482,8 +499,13 @@ of PCRE2_NEWLINE_xx in pcre2.h. */
|
|||
static const char *newlines[] = {
|
||||
"DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" };
|
||||
|
||||
/* UTF-8 tables - used only when the newline setting is "any". */
|
||||
/* UTF-8 tables */
|
||||
|
||||
const int utf8_table1[] =
|
||||
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
|
||||
const int utf8_table1_size = sizeof(utf8_table1) / sizeof(int);
|
||||
|
||||
const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
||||
const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
||||
|
||||
const char utf8_table4[] = {
|
||||
|
@ -531,6 +553,32 @@ else
|
|||
#endif /* not VPCOMPAT && not HAVE_MEMMOVE */
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Convert code point to UTF-8 *
|
||||
*************************************************/
|
||||
|
||||
/* A static buffer is used. Returns the number of bytes. */
|
||||
|
||||
static int
|
||||
ord2utf8(uint32_t value)
|
||||
{
|
||||
int i, j;
|
||||
uint8_t *utf8bytes = utf8_buffer;
|
||||
for (i = 0; i < utf8_table1_size; i++)
|
||||
if (value <= (uint32_t)utf8_table1[i]) break;
|
||||
utf8bytes += i;
|
||||
for (j = i; j > 0; j--)
|
||||
{
|
||||
*utf8bytes-- = 0x80 | (value & 0x3f);
|
||||
value >>= 6;
|
||||
}
|
||||
*utf8bytes = utf8_table2[i] | value;
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Case-independent string compare *
|
||||
*************************************************/
|
||||
|
@ -1788,6 +1836,7 @@ if (slen > 200)
|
|||
slen = 200;
|
||||
msg = "text that starts:\n\n";
|
||||
}
|
||||
|
||||
for (i = 1; p != NULL; p = p->next, i++)
|
||||
{
|
||||
*mrc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, (int)length,
|
||||
|
@ -1823,107 +1872,245 @@ return FALSE; /* No match, no errors */
|
|||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Decode dollar escape sequence *
|
||||
*************************************************/
|
||||
|
||||
/* Called from various places to decode $ escapes in output strings. The escape
|
||||
sequences are as follows:
|
||||
|
||||
$<digits> or ${<digits>} returns a capture number. However, if callout is TRUE,
|
||||
zero is never returned; '0' is substituted.
|
||||
|
||||
$a returns bell.
|
||||
$b returns backspace.
|
||||
$e returns escape.
|
||||
$f returns form feed.
|
||||
$n returns newline.
|
||||
$r returns carriage return.
|
||||
$t returns tab.
|
||||
$v returns vertical tab.
|
||||
$o<digits> returns the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
$o{<digits>} does the same, up to 7 digits, but gives an error for mode-invalid
|
||||
code points.
|
||||
$x<digits> returns the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
$x{<digits>} does the same, up to 6 digits, but gives an error for mode-invalid
|
||||
code points.
|
||||
Any other character is substituted by itself. E.g: $$ is replaced by a single
|
||||
dollar.
|
||||
|
||||
Arguments:
|
||||
begin the start of the whole string
|
||||
string points to the $
|
||||
callout TRUE if in a callout (inhibits error messages)
|
||||
value where to return a value
|
||||
last where to return pointer to the last used character
|
||||
|
||||
Returns: DDE_ERROR after a syntax error
|
||||
DDE_CAPTURE if *value is a capture number
|
||||
DDE_CHAR if *value is a character code
|
||||
*/
|
||||
|
||||
static int
|
||||
decode_dollar_escape(PCRE2_SPTR begin, PCRE2_SPTR string, BOOL callout,
|
||||
uint32_t *value, PCRE2_SPTR *last)
|
||||
{
|
||||
uint32_t c = 0;
|
||||
int base = 10;
|
||||
int dcount;
|
||||
int rc = DDE_CHAR;
|
||||
BOOL brace = FALSE;
|
||||
|
||||
switch (*(++string))
|
||||
{
|
||||
case 0: /* Syntax error: a character must be present after $. */
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "no character after $");
|
||||
*last = string;
|
||||
return DDE_ERROR;
|
||||
|
||||
case '{':
|
||||
brace = TRUE;
|
||||
string++;
|
||||
if (!isdigit(*string)) /* Syntax error: a decimal number required. */
|
||||
{
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "decimal number expected");
|
||||
rc = DDE_ERROR;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Fall through */
|
||||
|
||||
/* The maximum capture number is 65535, so any number greater than that will
|
||||
always be an unknown capture number. We just stop incrementing, in order to
|
||||
avoid overflow. */
|
||||
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
do
|
||||
{
|
||||
if (c <= 65535) c = c * 10 + (*string - '0');
|
||||
string++;
|
||||
}
|
||||
while (*string >= '0' && *string <= '9');
|
||||
string--; /* Point to last digit */
|
||||
|
||||
/* In a callout, capture number 0 is not available. No error can be given,
|
||||
so just return the character '0'. */
|
||||
|
||||
if (callout && c == 0)
|
||||
{
|
||||
*value = '0';
|
||||
}
|
||||
else
|
||||
{
|
||||
*value = c;
|
||||
rc = DDE_CAPTURE;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Limit octal numbers to 3 digits without braces, or up to 7 with braces,
|
||||
for valid Unicode code points. */
|
||||
|
||||
case 'o':
|
||||
base = 8;
|
||||
string++;
|
||||
if (*string == '{')
|
||||
{
|
||||
brace = TRUE;
|
||||
string++;
|
||||
dcount = 7;
|
||||
}
|
||||
else dcount = 3;
|
||||
for (; dcount > 0; dcount--)
|
||||
{
|
||||
if (*string < '0' || *string > '7') break;
|
||||
c = c * 8 + (*string++ - '0');
|
||||
}
|
||||
*value = c;
|
||||
string--; /* Point to last digit */
|
||||
break;
|
||||
|
||||
/* Limit hex numbers to 2 digits without braces, or up to 6 with braces,
|
||||
for valid Unicode code points. */
|
||||
|
||||
case 'x':
|
||||
base = 16;
|
||||
string++;
|
||||
if (*string == '{')
|
||||
{
|
||||
brace = TRUE;
|
||||
string++;
|
||||
dcount = 6;
|
||||
}
|
||||
else dcount = 2;
|
||||
for (; dcount > 0; dcount--)
|
||||
{
|
||||
if (!isxdigit(*string)) break;
|
||||
if (*string >= '0' && *string <= '9')
|
||||
c = c *16 + *string++ - '0';
|
||||
else
|
||||
c = c * 16 + (*string++ | 0x20) - 'a' + 10;
|
||||
}
|
||||
*value = c;
|
||||
string--; /* Point to last digit */
|
||||
break;
|
||||
|
||||
case 'a': *value = '\a'; break;
|
||||
case 'b': *value = '\b'; break;
|
||||
#ifndef EBCDIC
|
||||
case 'e': *value = '\033'; break;
|
||||
#else
|
||||
case 'e': *value = '\047'; break;
|
||||
#endif
|
||||
case 'f': *value = '\f'; break;
|
||||
case 'n': *value = STDOUT_NL_CODE; break;
|
||||
case 'r': *value = '\r'; break;
|
||||
case 't': *value = '\t'; break;
|
||||
case 'v': *value = '\v'; break;
|
||||
|
||||
default: *value = *string; break;
|
||||
}
|
||||
|
||||
if (brace)
|
||||
{
|
||||
c = string[1];
|
||||
if (c != '}')
|
||||
{
|
||||
rc = DDE_ERROR;
|
||||
if (!callout)
|
||||
{
|
||||
if ((base == 8 && c >= '0' && c <= '7') ||
|
||||
(base == 16 && isxdigit(c)))
|
||||
{
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
|
||||
"too many %s digits\n", (int)(string - begin),
|
||||
(base == 8)? "octal" : "hex");
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "missing closing brace");
|
||||
}
|
||||
}
|
||||
}
|
||||
else string++;
|
||||
}
|
||||
|
||||
/* Check maximum code point values, but take note of STDOUT_NL_CODE. */
|
||||
|
||||
if (rc == DDE_CHAR && *value != STDOUT_NL_CODE)
|
||||
{
|
||||
uint32_t max = utf? 0x0010ffffu : 0xffu;
|
||||
if (*value > max)
|
||||
{
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
|
||||
"code point greater than 0x%x is invalid\n", (int)(string - begin), max);
|
||||
rc = DDE_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
*last = string;
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check output text for errors *
|
||||
*************************************************/
|
||||
|
||||
/* Called early, to get errors before doing anything for -O text; also called
|
||||
from callouts to check before outputting.
|
||||
|
||||
Arguments:
|
||||
string an --output text string
|
||||
callout TRUE if in a callout (stops printing errors)
|
||||
|
||||
Returns: TRUE if OK, FALSE on error
|
||||
*/
|
||||
|
||||
static BOOL
|
||||
syntax_check_output_text(PCRE2_SPTR string, BOOL callout)
|
||||
{
|
||||
uint32_t value;
|
||||
PCRE2_SPTR begin = string;
|
||||
|
||||
for (; *string != 0; string++)
|
||||
{
|
||||
if (*string == '$')
|
||||
{
|
||||
PCRE2_SIZE capture_id = 0;
|
||||
BOOL brace = FALSE;
|
||||
|
||||
string++;
|
||||
|
||||
/* Syntax error: a character must be present after $. */
|
||||
if (*string == 0)
|
||||
{
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "no character after $");
|
||||
if (*string == '$' &&
|
||||
decode_dollar_escape(begin, string, callout, &value, &string) == DDE_ERROR)
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (*string == '{')
|
||||
{
|
||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
||||
string++;
|
||||
|
||||
brace = TRUE;
|
||||
}
|
||||
|
||||
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
|
||||
{
|
||||
do
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
|
||||
string++;
|
||||
}
|
||||
while (*string >= '0' && *string <= '9');
|
||||
|
||||
if (brace)
|
||||
{
|
||||
/* Syntax error: closing brace is missing. */
|
||||
if (*string != '}')
|
||||
{
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "missing closing brace");
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* To negate the effect of the for. */
|
||||
string--;
|
||||
}
|
||||
}
|
||||
else if (brace)
|
||||
{
|
||||
/* Syntax error: a decimal number required. */
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "decimal number expected");
|
||||
return FALSE;
|
||||
}
|
||||
else if (*string == 'o')
|
||||
{
|
||||
string++;
|
||||
|
||||
if (*string < '0' || *string > '7')
|
||||
{
|
||||
/* Syntax error: an octal number required. */
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "octal number expected");
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
else if (*string == 'x')
|
||||
{
|
||||
string++;
|
||||
|
||||
if (!isxdigit((unsigned char)*string))
|
||||
{
|
||||
/* Syntax error: a hexadecimal number required. */
|
||||
if (!callout)
|
||||
fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
|
||||
(int)(string - begin), "hexadecimal number expected");
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1932,31 +2119,7 @@ for (; *string != 0; string++)
|
|||
*************************************************/
|
||||
|
||||
/* Display the output text, which is assumed to have already been syntax
|
||||
checked. Output may contain escape sequences started by the dollar sign. The
|
||||
escape sequences are substituted as follows:
|
||||
|
||||
$<digits> or ${<digits>} is replaced by the captured substring of the given
|
||||
decimal number; zero will substitute the whole match. If the number is
|
||||
greater than the number of capturing substrings, or if the capture is unset,
|
||||
the replacement is empty.
|
||||
|
||||
$a is replaced by bell.
|
||||
$b is replaced by backspace.
|
||||
$e is replaced by escape.
|
||||
$f is replaced by form feed.
|
||||
$n is replaced by newline.
|
||||
$r is replaced by carriage return.
|
||||
$t is replaced by tab.
|
||||
$v is replaced by vertical tab.
|
||||
|
||||
$o<digits> is replaced by the character represented by the given octal
|
||||
number; up to three digits are processed.
|
||||
|
||||
$x<digits> is replaced by the character represented by the given hexadecimal
|
||||
number; up to two digits are processed.
|
||||
|
||||
Any other character is substituted by itself. E.g: $$ is replaced by a single
|
||||
dollar.
|
||||
checked. Output may contain escape sequences started by the dollar sign.
|
||||
|
||||
Arguments:
|
||||
string: the output text
|
||||
|
@ -1973,121 +2136,54 @@ static BOOL
|
|||
display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject,
|
||||
PCRE2_SIZE *ovector, PCRE2_SIZE capture_top)
|
||||
{
|
||||
uint32_t value;
|
||||
BOOL printed = FALSE;
|
||||
PCRE2_SPTR begin = string;
|
||||
|
||||
for (; *string != 0; string++)
|
||||
{
|
||||
int ch = EOF;
|
||||
if (*string == '$')
|
||||
{
|
||||
PCRE2_SIZE capture_id = 0;
|
||||
BOOL brace = FALSE;
|
||||
|
||||
string++;
|
||||
|
||||
if (*string == '{')
|
||||
switch(decode_dollar_escape(begin, string, callout, &value, &string))
|
||||
{
|
||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
||||
string++;
|
||||
|
||||
brace = TRUE;
|
||||
}
|
||||
|
||||
if ((*string >= '1' && *string <= '9') || (!callout && *string == '0'))
|
||||
{
|
||||
do
|
||||
case DDE_CHAR:
|
||||
if (value == STDOUT_NL_CODE)
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
|
||||
string++;
|
||||
fprintf(stdout, STDOUT_NL);
|
||||
printed = FALSE;
|
||||
continue;
|
||||
}
|
||||
while (*string >= '0' && *string <= '9');
|
||||
break; /* Will print value */
|
||||
|
||||
if (!brace)
|
||||
{
|
||||
/* To negate the effect of the for. */
|
||||
string--;
|
||||
}
|
||||
|
||||
if (capture_id < capture_top)
|
||||
case DDE_CAPTURE:
|
||||
if (value < capture_top)
|
||||
{
|
||||
PCRE2_SIZE capturesize;
|
||||
capture_id *= 2;
|
||||
|
||||
capturesize = ovector[capture_id + 1] - ovector[capture_id];
|
||||
value *= 2;
|
||||
capturesize = ovector[value + 1] - ovector[value];
|
||||
if (capturesize > 0)
|
||||
{
|
||||
print_match(subject + ovector[capture_id], capturesize);
|
||||
print_match(subject + ovector[value], capturesize);
|
||||
printed = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (*string == 'a') ch = '\a';
|
||||
else if (*string == 'b') ch = '\b';
|
||||
#ifndef EBCDIC
|
||||
else if (*string == 'e') ch = '\033';
|
||||
#else
|
||||
else if (*string == 'e') ch = '\047';
|
||||
#endif
|
||||
else if (*string == 'f') ch = '\f';
|
||||
else if (*string == 'r') ch = '\r';
|
||||
else if (*string == 't') ch = '\t';
|
||||
else if (*string == 'v') ch = '\v';
|
||||
else if (*string == 'n')
|
||||
{
|
||||
fprintf(stdout, STDOUT_NL);
|
||||
printed = FALSE;
|
||||
}
|
||||
else if (*string == 'o')
|
||||
{
|
||||
string++;
|
||||
continue;
|
||||
|
||||
ch = *string - '0';
|
||||
if (string[1] >= '0' && string[1] <= '7')
|
||||
{
|
||||
string++;
|
||||
ch = ch * 8 + (*string - '0');
|
||||
}
|
||||
if (string[1] >= '0' && string[1] <= '7')
|
||||
{
|
||||
string++;
|
||||
ch = ch * 8 + (*string - '0');
|
||||
}
|
||||
default: /* Should not occur */
|
||||
break;
|
||||
}
|
||||
else if (*string == 'x')
|
||||
{
|
||||
string++;
|
||||
}
|
||||
|
||||
if (*string >= '0' && *string <= '9')
|
||||
ch = *string - '0';
|
||||
else
|
||||
ch = (*string | 0x20) - 'a' + 10;
|
||||
if (isxdigit((unsigned char)string[1]))
|
||||
{
|
||||
string++;
|
||||
ch *= 16;
|
||||
if (*string >= '0' && *string <= '9')
|
||||
ch += *string - '0';
|
||||
else
|
||||
ch += (*string | 0x20) - 'a' + 10;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = *string;
|
||||
}
|
||||
}
|
||||
else
|
||||
else value = *string; /* Not a $ escape */
|
||||
|
||||
if (utf && value <= 127) fprintf(stdout, "%c", *string); else
|
||||
{
|
||||
ch = *string;
|
||||
}
|
||||
if (ch != EOF)
|
||||
{
|
||||
fprintf(stdout, "%c", ch);
|
||||
printed = TRUE;
|
||||
int i;
|
||||
int n = ord2utf8(value);
|
||||
for (i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
|
||||
}
|
||||
|
||||
printed = TRUE;
|
||||
}
|
||||
|
||||
return printed;
|
||||
|
@ -2166,7 +2262,7 @@ int result = 0;
|
|||
|
||||
(void)unused; /* Avoid compiler warning */
|
||||
|
||||
/* Only callout with strings are supported. */
|
||||
/* Only callouts with strings are supported. */
|
||||
|
||||
if (string == NULL || length == 0) return 0;
|
||||
|
||||
|
@ -2185,83 +2281,43 @@ return 0;
|
|||
#else
|
||||
|
||||
/* Check the syntax and compute the number of string fragments. Callout strings
|
||||
are ignored in case of a syntax error. */
|
||||
are silently ignored in the event of a syntax error. */
|
||||
|
||||
while (length > 0)
|
||||
{
|
||||
if (*string == '|')
|
||||
{
|
||||
argsvectorlen++;
|
||||
|
||||
/* Maximum 10000 arguments allowed. */
|
||||
if (argsvectorlen > 10000) return 0;
|
||||
if (argsvectorlen > 10000) return 0; /* Too many args */
|
||||
}
|
||||
|
||||
else if (*string == '$')
|
||||
{
|
||||
PCRE2_SIZE capture_id = 0;
|
||||
uint32_t value;
|
||||
PCRE2_SPTR begin = string;
|
||||
|
||||
string++;
|
||||
length--;
|
||||
|
||||
/* Syntax error: a character must be present after $. */
|
||||
if (length == 0) return 0;
|
||||
|
||||
if (*string >= '1' && *string <= '9')
|
||||
switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
|
||||
{
|
||||
do
|
||||
case DDE_CAPTURE:
|
||||
if (value < capture_top)
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
|
||||
string++;
|
||||
length--;
|
||||
value *= 2;
|
||||
argslen += ovector[value + 1] - ovector[value];
|
||||
}
|
||||
while (length > 0 && *string >= '0' && *string <= '9');
|
||||
argslen--; /* Negate the effect of argslen++ below. */
|
||||
break;
|
||||
|
||||
/* To negate the effect of string++ below. */
|
||||
string--;
|
||||
length++;
|
||||
}
|
||||
else if (*string == '{')
|
||||
{
|
||||
/* Must be a decimal number in braces, e.g: {5} or {38} */
|
||||
string++;
|
||||
length--;
|
||||
case DDE_CHAR:
|
||||
if (value == STDOUT_NL_CODE) argslen += STDOUT_NL_LEN - 1;
|
||||
else if (utf && value > 127) argslen += ord2utf8(value) - 1;
|
||||
break;
|
||||
|
||||
/* Syntax error: a decimal number required. */
|
||||
if (length == 0) return 0;
|
||||
if (*string < '1' || *string > '9') return 0;
|
||||
|
||||
do
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
|
||||
string++;
|
||||
length--;
|
||||
|
||||
/* Syntax error: no more characters */
|
||||
if (length == 0) return 0;
|
||||
}
|
||||
while (*string >= '0' && *string <= '9');
|
||||
|
||||
/* Syntax error: closing brace is missing. */
|
||||
if (*string != '}') return 0;
|
||||
default: /* Should not occur */
|
||||
case DDE_ERROR:
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (capture_id > 0)
|
||||
{
|
||||
if (capture_id < capture_top)
|
||||
{
|
||||
capture_id *= 2;
|
||||
argslen += ovector[capture_id + 1] - ovector[capture_id];
|
||||
}
|
||||
|
||||
/* To negate the effect of argslen++ below. */
|
||||
argslen--;
|
||||
}
|
||||
length -= (string - begin);
|
||||
}
|
||||
|
||||
string++;
|
||||
|
@ -2269,6 +2325,8 @@ while (length > 0)
|
|||
argslen++;
|
||||
}
|
||||
|
||||
/* Get memory for the argument vector and its strings. */
|
||||
|
||||
args = (char*)malloc(argslen);
|
||||
if (args == NULL) return 0;
|
||||
|
||||
|
@ -2279,9 +2337,10 @@ if (argsvector == NULL)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Now reprocess the string and set up the arguments. */
|
||||
|
||||
argsptr = args;
|
||||
argsvectorptr = argsvector;
|
||||
|
||||
*argsvectorptr++ = argsptr;
|
||||
|
||||
length = calloutptr->callout_string_length;
|
||||
|
@ -2294,69 +2353,55 @@ while (length > 0)
|
|||
*argsptr++ = '\0';
|
||||
*argsvectorptr++ = argsptr;
|
||||
}
|
||||
|
||||
else if (*string == '$')
|
||||
{
|
||||
string++;
|
||||
length--;
|
||||
uint32_t value;
|
||||
PCRE2_SPTR begin = string;
|
||||
|
||||
if ((*string >= '1' && *string <= '9') || *string == '{')
|
||||
switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
|
||||
{
|
||||
PCRE2_SIZE capture_id = 0;
|
||||
|
||||
if (*string != '{')
|
||||
case DDE_CAPTURE:
|
||||
if (value < capture_top)
|
||||
{
|
||||
do
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
PCRE2_SIZE capturesize;
|
||||
value *= 2;
|
||||
capturesize = ovector[value + 1] - ovector[value];
|
||||
memcpy(argsptr, subject + ovector[value], capturesize);
|
||||
argsptr += capturesize;
|
||||
}
|
||||
break;
|
||||
|
||||
string++;
|
||||
length--;
|
||||
}
|
||||
while (length > 0 && *string >= '0' && *string <= '9');
|
||||
|
||||
/* To negate the effect of string++ below. */
|
||||
string--;
|
||||
length++;
|
||||
case DDE_CHAR:
|
||||
if (value == STDOUT_NL_CODE)
|
||||
{
|
||||
memcpy(argsptr, STDOUT_NL, STDOUT_NL_LEN);
|
||||
argsptr += STDOUT_NL_LEN;
|
||||
}
|
||||
else if (utf && value > 127)
|
||||
{
|
||||
int n = ord2utf8(value);
|
||||
memcpy(argsptr, utf8_buffer, n);
|
||||
argsptr += n;
|
||||
}
|
||||
else
|
||||
{
|
||||
string++;
|
||||
length--;
|
||||
|
||||
do
|
||||
{
|
||||
/* Maximum capture id is 65535. */
|
||||
if (capture_id <= 65535)
|
||||
capture_id = capture_id * 10 + (*string - '0');
|
||||
|
||||
string++;
|
||||
length--;
|
||||
}
|
||||
while (*string != '}');
|
||||
*argsptr++ = value;
|
||||
}
|
||||
break;
|
||||
|
||||
if (capture_id < capture_top)
|
||||
{
|
||||
PCRE2_SIZE capturesize;
|
||||
capture_id *= 2;
|
||||
default: /* Should not occur */
|
||||
case DDE_ERROR:
|
||||
return 0;
|
||||
}
|
||||
|
||||
capturesize = ovector[capture_id + 1] - ovector[capture_id];
|
||||
memcpy(argsptr, subject + ovector[capture_id], capturesize);
|
||||
argsptr += capturesize;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
*argsptr++ = *string;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
*argsptr++ = *string;
|
||||
length -= (string - begin);
|
||||
}
|
||||
|
||||
else *argsptr++ = *string;
|
||||
|
||||
/* Advance along the string */
|
||||
|
||||
string++;
|
||||
length--;
|
||||
}
|
||||
|
@ -2479,6 +2524,7 @@ int filepos = 0;
|
|||
unsigned long int linenumber = 1;
|
||||
unsigned long int lastmatchnumber = 0;
|
||||
unsigned long int count = 0;
|
||||
long int count_matched_lines = 0;
|
||||
char *lastmatchrestart = main_buffer;
|
||||
char *ptr = main_buffer;
|
||||
char *endptr;
|
||||
|
@ -2505,7 +2551,7 @@ bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
|
|||
input_line_buffered);
|
||||
|
||||
#ifdef SUPPORT_LIBBZ2
|
||||
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE; */
|
||||
if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 2; /* Gotcha: bufflength is PCRE2_SIZE */
|
||||
#endif
|
||||
|
||||
endptr = main_buffer + bufflength;
|
||||
|
@ -2533,10 +2579,23 @@ while (ptr < endptr)
|
|||
int mrc = 0;
|
||||
unsigned int options = 0;
|
||||
BOOL match;
|
||||
BOOL line_matched = FALSE;
|
||||
char *t = ptr;
|
||||
PCRE2_SIZE length, linelength;
|
||||
PCRE2_SIZE startoffset = 0;
|
||||
|
||||
/* If the -m option set a limit for the number of matched or non-matched
|
||||
lines, check it here. A limit of zero means that no matching is ever done.
|
||||
For stdin from a file, set the file position. */
|
||||
|
||||
if (count_limit >= 0 && count_matched_lines >= count_limit)
|
||||
{
|
||||
if (frtype == FR_PLAIN && filename == stdin_name && !is_file_tty(handle))
|
||||
(void)fseek(handle, (long int)filepos, SEEK_SET);
|
||||
rc = (count_limit == 0)? 1 : 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* At this point, ptr is at the start of a line. We need to find the length
|
||||
of the subject string to pass to pcre2_match(). In multiline mode, it is the
|
||||
length of the remainder of the data in the buffer. Otherwise, it is the length of
|
||||
|
@ -2686,6 +2745,10 @@ while (ptr < endptr)
|
|||
|
||||
if (filenames == FN_NOMATCH_ONLY) return 1;
|
||||
|
||||
/* Remember that this line matched (for counting matched lines) */
|
||||
|
||||
line_matched = TRUE;
|
||||
|
||||
/* If all we want is a yes/no answer, we can return immediately. */
|
||||
|
||||
if (quiet) return 0;
|
||||
|
@ -3067,6 +3130,11 @@ while (ptr < endptr)
|
|||
filepos += (int)(linelength + endlinelength);
|
||||
linenumber++;
|
||||
|
||||
/* If there was at least one match (or a non-match, as required) in the line,
|
||||
increment the count for the -m option. */
|
||||
|
||||
if (line_matched) count_matched_lines++;
|
||||
|
||||
/* If input is line buffered, and the buffer is not yet full, read another
|
||||
line and add it into the buffer. */
|
||||
|
||||
|
@ -4088,6 +4156,7 @@ if (only_matching_count > 1)
|
|||
pcre2grep_exit(usage(2));
|
||||
}
|
||||
|
||||
|
||||
/* Check that there is a big enough ovector for all -o settings. */
|
||||
|
||||
for (om = only_matching; om != NULL; om = om->next)
|
||||
|
|
|
@ -956,3 +956,27 @@ RC=0
|
|||
pcre2grep: Requested group 1 cannot be captured.
|
||||
pcre2grep: Use --om-capture to increase the size of the capture vector.
|
||||
RC=2
|
||||
---------------------------- Test 129 -----------------------------
|
||||
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the
|
||||
lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox
|
||||
RC=0
|
||||
---------------------------- Test 130 -----------------------------
|
||||
fox
|
||||
fox
|
||||
fox
|
||||
fox
|
||||
RC=0
|
||||
---------------------------- Test 131 -----------------------------
|
||||
2
|
||||
RC=0
|
||||
---------------------------- Test 132 -----------------------------
|
||||
match 1:
|
||||
a
|
||||
match 2:
|
||||
b
|
||||
---
|
||||
a
|
||||
RC=0
|
||||
---------------------------- Test 133 -----------------------------
|
||||
=AB3CD5=
|
||||
RC=0
|
||||
|
|
|
@ -29,3 +29,6 @@ RC=1
|
|||
---------------------------- Test U5 ------------------------------
|
||||
CD Z
|
||||
RC=0
|
||||
---------------------------- Test U6 -----------------------------
|
||||
=ǓǤ=
|
||||
RC=0
|
||||
|
|
|
@ -40,3 +40,5 @@ T
|
|||
T
|
||||
T
|
||||
T
|
||||
0:T:AA
|
||||
The quick brown
|
||||
|
|
|
@ -28,3 +28,5 @@ T
|
|||
T
|
||||
T
|
||||
T
|
||||
0:T:AA
|
||||
The quick brown
|
||||
|
|
Loading…
Reference in New Issue