diff --git a/ChangeLog b/ChangeLog index 23d35de..5610bab 100644 --- a/ChangeLog +++ b/ChangeLog @@ -49,6 +49,8 @@ tests. tests run by 'make check', but can be run manually. The current output is from a 64-bit system. +13. Implemented -Z aka --null in pcre2grep. + Version 10.40 15-April-2022 --------------------------- diff --git a/RunGrepTest b/RunGrepTest index 443ed76..5b52770 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -68,6 +68,22 @@ diff -b /dev/null /dev/null 2>/dev/null && cf="diff -b" diff -u /dev/null /dev/null 2>/dev/null && cf="diff -u" diff -ub /dev/null /dev/null 2>/dev/null && cf="diff -ub" +# Some tests involve NUL characters. It seems impossible to handle them easily +# in many operating systems. An earlier version of this script used sed to +# translate NUL into the string ZERO, but this didn't work on Solaris (aka +# SunOS), where the version of sed explicitly doesn't like them, and also MacOS +# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine, +# even when using GNU sed. A user suggested using tr instead, which +# necessitates translating to a single character. However, on (some versions +# of?) Solaris, the normal "tr" cannot handle binary zeros, but if +# /usr/xpg4/bin/tr is available, it can do so, so test for that. + +if [ -x /usr/xpg4/bin/tr ] ; then + tr=/usr/xpg4/bin/tr +else + tr=tr +fi + # If this test is being run from "make check", $srcdir will be set. If not, set # it to the current or parent directory, whichever one contains the test data. # Subsequently, we run most of the pcre2grep tests in the source directory so @@ -685,6 +701,16 @@ echo "---------------------------- Test 134 -----------------------------" >>tes (cd $srcdir; $valgrind $vjs $pcre2grep -m1 -O '=$x{41}$x423$o{103}$o1045=' 'fox') <$srcdir/testdata/grepinputv >>testtrygrep 2>&1 echo "RC=$?" 
>>testtrygrep +echo "---------------------------- Test 135 -----------------------------" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -lZ 'word' ./testdata/grepinputv ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -A 1 -B 1 -HZ 'word' ./testdata/grepinputv) | $tr '\000' '@' >>testtrygrep +echo "RC=$?" >>testtrygrep +(cd $srcdir; $valgrind $vjs $pcre2grep -MHZn 'start[\s]+end' testdata/grepinputM) >>testtrygrep +echo "RC=$?" >>testtrygrep + # Now compare the results. $cf $srcdir/testdata/grepoutput testtrygrep @@ -759,22 +785,6 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep > printf '%c--------------------------- Test N6 ------------------------------\r\n' - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep -# This next test involves NUL characters. It seems impossible to handle them -# easily in many operating systems. An earlier version of this script used sed -# to translate NUL into the string ZERO, but this didn't work on Solaris (aka -# SunOS), where the version of sed explicitly doesn't like them, and also MacOS -# (Darwin), OpenBSD, FreeBSD, NetBSD, and some Linux distributions like Alpine, -# even when using GNU sed. A user suggested using tr instead, which -# necessitates translating to a single character (@). However, on (some -# versions of?) Solaris, the normal "tr" cannot handle binary zeros, but if -# /usr/xpg4/bin/tr is available, it can do so, so test for that. 
- -if [ -x /usr/xpg4/bin/tr ] ; then - tr=/usr/xpg4/bin/tr -else - tr=tr -fi - printf '%c--------------------------- Test N7 ------------------------------\r\n' - >>testtrygrep printf 'abc\0def' >testNinputgrep $valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | $tr '\000' '@' >>testtrygrep diff --git a/doc/html/NON-AUTOTOOLS-BUILD.txt b/doc/html/NON-AUTOTOOLS-BUILD.txt index 4fc7a9c..04c1041 100644 --- a/doc/html/NON-AUTOTOOLS-BUILD.txt +++ b/doc/html/NON-AUTOTOOLS-BUILD.txt @@ -121,6 +121,7 @@ environment, for example. pcre2_substring.c pcre2_tables.c pcre2_ucd.c + pcre2_ucptables.c pcre2_valid_utf.c pcre2_xclass.c @@ -373,7 +374,7 @@ Otherwise: 1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe have been created. -2. Edit RunTest.bat to indentify the full or relative location of +2. Edit RunTest.bat to identify the full or relative location of the pcre2 source (wherein which the testdata folder resides), e.g.: set srcdir=C:\pcre2\pcre2-10.00 diff --git a/doc/html/README.txt b/doc/html/README.txt index 187da4c..68a29a0 100644 --- a/doc/html/README.txt +++ b/doc/html/README.txt @@ -17,7 +17,7 @@ pcre2-dev+subscribe@googlegroups.com. You can access the archives and also subscribe or manage your subscription here: -https://groups.google.com/pcre2-dev +https://groups.google.com/g/pcre2-dev Please read the NEWS file if you are upgrading from a previous release. The contents of this README file are: @@ -375,7 +375,8 @@ library. They are also documented in the pcre2build man page. necessary to specify something like LIBS="-lncurses" as well. This is because, to quote the readline INSTALL, "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing - applications which link with readline the to choose an appropriate library." + applications which link with readline the option to choose an appropriate + library." 
If you get error messages about missing functions tgetstr, tgetent, tputs, tgetflag, or tgoto, this is the problem, and linking with the ncurses library should fix it. @@ -400,10 +401,10 @@ library. They are also documented in the pcre2build man page. Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing function and - outputs information about it is doing. The input strings are specified by - arguments: if an argument starts with "=" the rest of it is a literal input - string. Otherwise, it is assumed to be a file name, and the contents of the - file are the test string. + outputs information about what it is doing. The input strings are specified + by arguments: if an argument starts with "=" the rest of it is a literal + input string. Otherwise, it is assumed to be a file name, and the contents + of the file are the test string. . Releases before 10.30 could be compiled with --disable-stack-for-recursion, which caused pcre2_match() to use individual blocks on the heap for @@ -695,7 +696,7 @@ Test 14 contains some special UTF and UCP tests that give different output for different code unit widths. Test 15 contains a number of tests that must not be run with JIT. They check, -among other non-JIT things, the match-limiting features of the intepretive +among other non-JIT things, the match-limiting features of the interpretive matcher. Test 16 is run only when JIT support is not available. It checks that an diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index ec2e2b2..2602032 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -1017,7 +1017,7 @@ has its own memory control arrangements (see the documentation for more details). If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is returned. 
The default limit can be set when PCRE2 is built; if it is not, the default is set very large and is essentially -"unlimited". +unlimited.

A value for the heap limit may also be supplied by an item at the start of a @@ -1030,19 +1030,17 @@ less than the limit set by the caller of pcre2_match() or, if no such limit is set, less than the default.

-The pcre2_match() function starts out using a 20KiB vector on the system -stack for recording backtracking points. The more nested backtracking points -there are (that is, the deeper the search tree), the more memory is needed. -Heap memory is used only if the initial vector is too small. If the heap limit -is set to a value less than 21 (in particular, zero) no heap memory will be -used. In this case, only patterns that do not have a lot of nested backtracking -can be successfully processed. +The pcre2_match() function always needs some heap memory, so setting a +value of zero guarantees a "heap limit exceeded" error. Details of how +pcre2_match() uses the heap are given in the +pcre2perform +documentation.

-Similarly, for pcre2_dfa_match(), a vector on the system stack is used -when processing pattern recursions, lookarounds, or atomic groups, and only if -this is not big enough is heap memory used. In this case, too, setting a value -of zero disables the use of the heap. +For pcre2_dfa_match(), a vector on the system stack is used when +processing pattern recursions, lookarounds, or atomic groups, and only if this +is not big enough is heap memory used. In this case, setting a value of zero +disables the use of the heap.

int pcre2_set_match_limit(pcre2_match_context *mcontext, @@ -1089,10 +1087,10 @@ less than the limit set by the caller of pcre2_match() or

This parameter limits the depth of nested backtracking in pcre2_match(). -Each time a nested backtracking point is passed, a new memory "frame" is used +Each time a nested backtracking point is passed, a new memory frame is used to remember the state of matching at that point. Thus, this parameter indirectly limits the amount of memory that is used in a match. However, -because the size of each memory "frame" depends on the number of capturing +because the size of each memory frame depends on the number of capturing parentheses, the actual memory limit varies from pattern to pattern. This limit was more useful in versions before 10.30, where function recursion was used for backtracking. @@ -3148,11 +3146,11 @@ The backtracking match limit was reached.

   PCRE2_ERROR_NOMEMORY
 
-If a pattern contains many nested backtracking points, heap memory is used to -remember them. This error is given when the memory allocation function (default -or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given -if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is -also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. +Heap memory is used to remember backtracking points. This error is given when +the memory allocation function (default or custom) fails. Note that a different +error, PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds +the heap limit. PCRE2_ERROR_NOMEMORY is also returned if +PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
   PCRE2_ERROR_NULL
 
@@ -4020,9 +4018,9 @@ Cambridge, England.


REVISION

-Last updated: 14 December 2021 +Last updated: 27 July 2022
-Copyright © 1997-2021 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page. diff --git a/doc/html/pcre2build.html b/doc/html/pcre2build.html index 0d12155..07472d1 100644 --- a/doc/html/pcre2build.html +++ b/doc/html/pcre2build.html @@ -284,12 +284,11 @@ to the configure command. This setting also applies to the counting is done differently).

-The pcre2_match() function starts out using a 20KiB vector on the system -stack to record backtracking points. The more nested backtracking points there -are (that is, the deeper the search tree), the more memory is needed. If the -initial vector is not large enough, heap memory is used, up to a certain limit, -which is specified in kibibytes (units of 1024 bytes). The limit can be changed -at run time, as described in the +The pcre2_match() function uses heap memory to record backtracking +points. The more nested backtracking points there are (that is, the deeper the +search tree), the more memory is needed. There is an upper limit, specified in +kibibytes (units of 1024 bytes). This limit can be changed at run time, as +described in the pcre2api documentation. The default limit (in effect unlimited) is 20 million. You can change this by a setting such as @@ -609,16 +608,16 @@ give a warning.

Philip Hazel
-University Computing Service +Retired from University Computing Service
Cambridge, England.


REVISION

-Last updated: 08 December 2021 +Last updated: 27 July 2022
-Copyright © 1997-2021 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page. diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html index b3252d3..640d38d 100644 --- a/doc/html/pcre2grep.html +++ b/doc/html/pcre2grep.html @@ -71,13 +71,15 @@ For example:

   pcre2grep some-pattern file1 - file3
 
-Input files are searched line by line. By default, each line that matches a +By default, input files are searched line by line. Each line that matches a pattern is copied to the standard output, and if there is more than one file, the file name is output at the start of each line, followed by a colon. -However, there are options that can change how pcre2grep behaves. In -particular, the -M option makes it possible to search for strings that -span line boundaries. What defines a line boundary is controlled by the --N (--newline) option. +However, there are options that can change how pcre2grep behaves. For +example, the -M option makes it possible to search for strings that span +line boundaries. What defines a line boundary is controlled by the -N +(--newline) option. The -h and -H options control whether or +not file names are shown, and the -Z option changes the file name +terminator to a zero byte.

The amount of memory used for buffering files that are being scanned is @@ -178,9 +180,11 @@ Output up to number lines of context after each matching line. Fewer lines are output if the next match or the end of the file is reached, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used instead of a colon for the -context lines. A line containing "--" is output between each group of lines, -unless they are in fact contiguous in the input file. The value of number -is expected to be relatively small. When -c is used, -A is ignored. +context lines (the -Z option can be used to change the file name +terminator to a zero byte). A line containing "--" is output between each group +of lines, unless they are in fact contiguous in the input file. The value of +number is expected to be relatively small. When -c is used, +-A is ignored.

-a, --text @@ -199,9 +203,10 @@ Output up to number lines of context before each matching line. Fewer lines are output if the previous match or the start of the file is within number lines, or if the processing buffer size has been set too small. If file names and/or line numbers are being output, a hyphen separator is used -instead of a colon for the context lines. A line containing "--" is output -between each group of lines, unless they are in fact contiguous in the input -file. The value of number is expected to be relatively small. When +instead of a colon for the context lines (the -Z option can be used to +change the file name terminator to a zero byte). A line containing "--" is +output between each group of lines, unless they are in fact contiguous in the +input file. The value of number is expected to be relatively small. When -c is used, -B is ignored.

@@ -411,20 +416,22 @@ shown separately. This option is mutually exclusive with --output,

-H, --with-filename Force the inclusion of the file name at the start of output lines when -searching a single file. By default, the file name is not shown in this case. -For matching lines, the file name is followed by a colon; for context lines, a -hyphen separator is used. If a line number is also being output, it follows the -file name. When the -M option causes a pattern to match more than one -line, only the first is preceded by the file name. This option overrides any -previous -h, -l, or -L options. +searching a single file. The file name is not normally shown in this case. +By default, for matching lines, the file name is followed by a colon; for +context lines, a hyphen separator is used. The -Z option can be used to +change the terminator to a zero byte. If a line number is also being output, +it follows the file name. When the -M option causes a pattern to match +more than one line, only the first is preceded by the file name. This option +overrides any previous -h, -l, or -L options.

-h, --no-filename -Suppress the output file names when searching multiple files. By default, -file names are shown when multiple files are searched. For matching lines, the -file name is followed by a colon; for context lines, a hyphen separator is used. -If a line number is also being output, it follows the file name. This option -overrides any previous -H, -L, or -l options. +Suppress the output file names when searching multiple files. File names are +normally shown when multiple files are searched. By default, for matching +lines, the file name is followed by a colon; for context lines, a hyphen +separator is used. The -Z option can be used to change the terminator to +a zero byte. If a line number is also being output, it follows the file name. +This option overrides any previous -H, -L, or -l options.

--heap-limit=number @@ -481,18 +488,20 @@ given any number of times. If a directory matches both --include-dir and -L, --files-without-match Instead of outputting lines from the files, just output the names of the files that do not contain any lines that would have been output. Each file name is -output once, on a separate line. This option overrides any previous -H, --h, or -l options. +output once, on a separate line by default, but if the -Z option is set, +they are separated by zero bytes instead of newlines. This option overrides any +previous -H, -h, or -l options.

-l, --files-with-matches Instead of outputting lines from the files, just output the names of the files containing lines that would have been output. Each file name is output once, on -a separate line. Searching normally stops as soon as a matching line is found -in a file. However, if the -c (count) option is also used, matching -continues in order to obtain the correct count, and those files that have at -least one match are listed along with their counts. Using this option with --c is a way of suppressing the listing of files with no matches that +a separate line, but if the -Z option is set, they are separated by zero +bytes instead of newlines. Searching normally stops as soon as a matching line +is found in a file. However, if the -c (count) option is also used, +matching continues in order to obtain the correct count, and those files that +have at least one match are listed along with their counts. Using this option +with -c is a way of suppressing the listing of files with no matches that occurs with -c on its own. This option overrides any previous -H, -h, or -L options.

@@ -592,10 +601,7 @@ value set by --match-limit is reached, an error occurs.

The --heap-limit option specifies, as a number of kibibytes (units of -1024 bytes), the amount of heap memory that may be used for matching. Heap -memory is needed only if matching the pattern requires a significant number of -nested backtracking points to be remembered. This parameter can be set to zero -to forbid the use of heap memory altogether. +1024 bytes), the maximum amount of heap memory that may be used for matching.

The --depth-limit option limits the depth of nested backtracking points, @@ -839,6 +845,13 @@ pattern and ")$" at the end. This option applies only to the patterns that are matched against the contents of files; it does not apply to patterns specified by any of the --include or --exclude options.

+

+-Z, --null +Terminate file names in the regular output with a zero byte (the NUL +character) instead of what would normally appear. This is useful when file +names contain unusual characters such as colons, hyphens, or even newlines. The +option does not apply to file names in error messages. +


ENVIRONMENT VARIABLES

The environment variables LC_ALL and LC_CTYPE are examined, in that @@ -1053,9 +1066,9 @@ Cambridge, England.


REVISION

-Last updated: 31 August 2021 +Last updated: 30 July 2022
-Copyright © 1997-2021 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page. diff --git a/doc/html/pcre2limits.html b/doc/html/pcre2limits.html index c8bc01b..43a15d3 100644 --- a/doc/html/pcre2limits.html +++ b/doc/html/pcre2limits.html @@ -71,13 +71,18 @@ is 255 code units for the 8-bit library and 65535 code units for the 16-bit and The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold.

+

+The maximum amount of heap memory used for matching is controlled by the heap +limit, which can be set in a pattern or in a match context. The default is a +very large number, effectively unlimited. +


AUTHOR

Philip Hazel
-University Computing Service +Retired from University Computing Service
Cambridge, England.
@@ -86,9 +91,9 @@ Cambridge, England. REVISION

-Last updated: 02 February 2019 +Last updated: 26 July 2022
-Copyright © 1997-2019 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page. diff --git a/doc/html/pcre2perform.html b/doc/html/pcre2perform.html index 80d716c..ab7dfc8 100644 --- a/doc/html/pcre2perform.html +++ b/doc/html/pcre2perform.html @@ -83,12 +83,31 @@ From release 10.30, the interpretive (non-JIT) version of pcre2_match() uses very little system stack at run time. In earlier releases recursive function calls could use a great deal of stack, and this could cause problems, but this usage has been eliminated. Backtracking positions are now explicitly -remembered in memory frames controlled by the code. An initial 20KiB vector of -frames is allocated on the system stack (enough for about 100 frames for small -patterns), but if this is insufficient, heap memory is used. The amount of heap -memory can be limited; if the limit is set to zero, only the initial stack -vector is used. Rewriting patterns to be time-efficient, as described below, -may also reduce the memory requirements. +remembered in memory frames controlled by the code. +

+

+The size of each frame depends on the size of pointer variables and the number +of capturing parenthesized groups in the pattern being matched. On a 64-bit +system the frame size for a pattern with no captures is 128 bytes. For each +capturing group the size increases by 16 bytes. +

+

+Until release 10.41, an initial 20KiB frames vector was allocated on the system +stack, but this still caused some issues for multi-thread applications where +each thread has a very small stack. From release 10.41 backtracking memory +frames are always held in heap memory. An initial heap allocation is obtained +the first time any match data block is passed to pcre2_match(). This is +remembered with the match data block and re-used if that block is used for +another match. It is freed when the match data block itself is freed. +

+

+The size of the initial block is the larger of 20KiB or ten times the pattern's +frame size, unless the heap limit is less than this, in which case the heap +limit is used. If the initial block proves to be too small during matching, it +is replaced by a larger block, subject to the heap limit. The heap limit is +checked only when a new block is to be allocated. Reducing the heap limit +between calls to pcre2_match() with the same match data block does not +affect the saved block.

In contrast to pcre2_match(), pcre2_dfa_match() does use recursive @@ -245,16 +264,16 @@ pattern to match. This is done by repeatedly matching with different limits.

Philip Hazel
-University Computing Service +Retired from University Computing Service
Cambridge, England.


REVISION

-Last updated: 03 February 2019 +Last updated: 27 July 2022
-Copyright © 1997-2019 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.

Return to the PCRE2 index page. diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html index 373e5df..650ff0a 100644 --- a/doc/html/pcre2test.html +++ b/doc/html/pcre2test.html @@ -1241,7 +1241,8 @@ pattern, but can be overridden by modifiers on the subject. copy=<number or name> copy captured substring depth_limit=<n> set a depth limit dfa use pcre2_dfa_match() - find_limits find match and depth limits + find_limits find heap, match and depth limits + find_limits_noheap find match and depth limits get=<number or name> extract captured substring getall extract all captured substrings /g global global matching @@ -1564,7 +1565,7 @@ Setting heap, match, and depth limits

The heap_limit, match_limit, and depth_limit modifiers set the appropriate limits in the match context. These values are ignored when the -find_limits modifier is specified. +find_limits or find_limits_noheap modifier is specified.


Finding minimum limits @@ -1574,8 +1575,12 @@ If the find_limits modifier is present on a subject line, pcre2testpcre2_set_heap_limit(), pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds -the minimum values for each parameter that allows the match to complete without -error. If JIT is being used, only the match limit is relevant. +the smallest value for each parameter that allows the match to complete without +a "limit exceeded" error. The match itself may succeed or fail. An alternative +modifier, find_limits_noheap, omits the heap limit. This is used in the +standard tests, because the minimum heap limit varies between systems. If JIT +is being used, only the match limit is relevant, and the other two are +automatically omitted.

When using this modifier, the pattern should not contain any limit settings @@ -1603,9 +1608,7 @@ overall amount of computing resource that is used.

For both kinds of matching, the heap_limit number, which is in kibibytes -(units of 1024 bytes), limits the amount of heap memory used for matching. A -value of zero disables the use of any heap memory; many simple pattern matches -can be done without using the heap, so zero is not an unreasonable setting. +(units of 1024 bytes), limits the amount of heap memory used for matching.


Showing MARK names @@ -1623,12 +1626,10 @@ Showing memory usage

The memory modifier causes pcre2test to log the sizes of all heap memory allocation and freeing calls that occur during a call to -pcre2_match() or pcre2_dfa_match(). These occur only when a match -requires a bigger vector than the default for remembering backtracking points -(pcre2_match()) or for internal workspace (pcre2_dfa_match()). In -many cases there will be no heap memory used and therefore no additional -output. No heap memory is allocated during matching with JIT, so in that case -the memory modifier never has any effect. For this modifier to work, the +pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory +is used only when a match requires more internal workspace than the default +allocation on the stack, so in many cases there will be no output. No heap +memory is allocated during matching with JIT. For this modifier to work, the null_context modifier must not be set on both the pattern and the subject, though it can be set on one or the other.

@@ -1690,7 +1691,8 @@ Normally, pcre2test passes a context block to pcre2_match(), If the null_context modifier is set, however, NULL is passed. This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the -find_limits or substitute_callout modifiers. +find_limits, find_limits_noheap, or substitute_callout +modifiers.

Similarly, for testing purposes, if the null_subject or @@ -2141,7 +2143,7 @@ Cambridge, England.


REVISION

-Last updated: 12 January 2022 +Last updated: 27 July 2022
Copyright © 1997-2022 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt index c70fb9b..762c326 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -1028,7 +1028,7 @@ PCRE2 CONTEXTS pcre2jit documentation for more details). If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default limit can be set when PCRE2 is built; if it is not, the default is set - very large and is essentially "unlimited". + very large and is essentially unlimited. A value for the heap limit may also be supplied by an item at the start of a pattern of the form @@ -1039,19 +1039,15 @@ PCRE2 CONTEXTS less ddd is less than the limit set by the caller of pcre2_match() or, if no such limit is set, less than the default. - The pcre2_match() function starts out using a 20KiB vector on the sys- - tem stack for recording backtracking points. The more nested backtrack- - ing points there are (that is, the deeper the search tree), the more - memory is needed. Heap memory is used only if the initial vector is - too small. If the heap limit is set to a value less than 21 (in partic- - ular, zero) no heap memory will be used. In this case, only patterns - that do not have a lot of nested backtracking can be successfully pro- - cessed. + The pcre2_match() function always needs some heap memory, so setting a + value of zero guarantees a "heap limit exceeded" error. Details of how + pcre2_match() uses the heap are given in the pcre2perform documenta- + tion. - Similarly, for pcre2_dfa_match(), a vector on the system stack is used - when processing pattern recursions, lookarounds, or atomic groups, and - only if this is not big enough is heap memory used. In this case, too, - setting a value of zero disables the use of the heap. + For pcre2_dfa_match(), a vector on the system stack is used when pro- + cessing pattern recursions, lookarounds, or atomic groups, and only if + this is not big enough is heap memory used. In this case, setting a + value of zero disables the use of the heap. 
int pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t value); @@ -1093,12 +1089,12 @@ PCRE2 CONTEXTS This parameter limits the depth of nested backtracking in pcre2_match(). Each time a nested backtracking point is passed, a new - memory "frame" is used to remember the state of matching at that point. + memory frame is used to remember the state of matching at that point. Thus, this parameter indirectly limits the amount of memory that is - used in a match. However, because the size of each memory "frame" de- - pends on the number of capturing parentheses, the actual memory limit - varies from pattern to pattern. This limit was more useful in versions - before 10.30, where function recursion was used for backtracking. + used in a match. However, because the size of each memory frame depends + on the number of capturing parentheses, the actual memory limit varies + from pattern to pattern. This limit was more useful in versions before + 10.30, where function recursion was used for backtracking. The depth limit is not relevant, and is ignored, when matching is done using JIT compiled code. However, it is supported by pcre2_dfa_match(), @@ -3051,12 +3047,12 @@ ERROR RETURNS FROM pcre2_match() PCRE2_ERROR_NOMEMORY - If a pattern contains many nested backtracking points, heap memory is - used to remember them. This error is given when the memory allocation - function (default or custom) fails. Note that a different error, - PCRE2_ERROR_HEAPLIMIT, is given if the amount of memory needed exceeds - the heap limit. PCRE2_ERROR_NOMEMORY is also returned if - PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. + Heap memory is used to remember backtracking points. This error is + given when the memory allocation function (default or custom) fails. + Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given if the + amount of memory needed exceeds the heap limit.
PCRE2_ERROR_NOMEMORY is + also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory alloca- + tion fails. PCRE2_ERROR_NULL @@ -3860,8 +3856,8 @@ AUTHOR REVISION - Last updated: 14 December 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 27 July 2022 + Copyright (c) 1997-2022 University of Cambridge. ------------------------------------------------------------------------------ @@ -4118,41 +4114,40 @@ LIMITING PCRE2 RESOURCE USAGE pcre2_dfa_match() matching function, and to JIT matching (though the counting is done differently). - The pcre2_match() function starts out using a 20KiB vector on the sys- - tem stack to record backtracking points. The more nested backtracking - points there are (that is, the deeper the search tree), the more memory - is needed. If the initial vector is not large enough, heap memory is - used, up to a certain limit, which is specified in kibibytes (units of - 1024 bytes). The limit can be changed at run time, as described in the - pcre2api documentation. The default limit (in effect unlimited) is 20 - million. You can change this by a setting such as + The pcre2_match() function uses heap memory to record backtracking + points. The more nested backtracking points there are (that is, the + deeper the search tree), the more memory is needed. There is an upper + limit, specified in kibibytes (units of 1024 bytes). This limit can be + changed at run time, as described in the pcre2api documentation. The + default limit (in effect unlimited) is 20 million. You can change this + by a setting such as --with-heap-limit=500 - which limits the amount of heap to 500 KiB. This limit applies only to + which limits the amount of heap to 500 KiB. This limit applies only to interpretive matching in pcre2_match() and pcre2_dfa_match(), which may - also use the heap for internal workspace when processing complicated - patterns. 
This limit does not apply when JIT (which has its own memory + also use the heap for internal workspace when processing complicated + patterns. This limit does not apply when JIT (which has its own memory arrangements) is used. - You can also explicitly limit the depth of nested backtracking in the + You can also explicitly limit the depth of nested backtracking in the pcre2_match() interpreter. This limit defaults to the value that is set - for --with-match-limit. You can set a lower default limit by adding, + for --with-match-limit. You can set a lower default limit by adding, for example, --with-match-limit-depth=10000 - to the configure command. This value can be overridden at run time. - This depth limit indirectly limits the amount of heap memory that is - used, but because the size of each backtracking "frame" depends on the - number of capturing parentheses in a pattern, the amount of heap that - is used before the limit is reached varies from pattern to pattern. + to the configure command. This value can be overridden at run time. + This depth limit indirectly limits the amount of heap memory that is + used, but because the size of each backtracking "frame" depends on the + number of capturing parentheses in a pattern, the amount of heap that + is used before the limit is reached varies from pattern to pattern. This limit was more useful in versions before 10.30, where function re- cursion was used for backtracking. As well as applying to pcre2_match(), the depth limit also controls the - depth of recursive function calls in pcre2_dfa_match(). These are used - for lookaround assertions, atomic groups, and recursion within pat- + depth of recursive function calls in pcre2_dfa_match(). These are used + for lookaround assertions, atomic groups, and recursion within pat- terns. The limit does not apply to JIT matching. 
@@ -4160,67 +4155,67 @@ CREATING CHARACTER TABLES AT BUILD TIME PCRE2 uses fixed tables for processing characters whose code points are less than 256. By default, PCRE2 is built with a set of tables that are - distributed in the file src/pcre2_chartables.c.dist. These tables are + distributed in the file src/pcre2_chartables.c.dist. These tables are for ASCII codes only. If you add --enable-rebuild-chartables - to the configure command, the distributed tables are no longer used. + to the configure command, the distributed tables are no longer used. Instead, a program called pcre2_dftables is compiled and run. This out- puts the source for new set of tables, created in the default locale of - your C run-time system. This method of replacing the tables does not + your C run-time system. This method of replacing the tables does not work if you are cross compiling, because pcre2_dftables needs to be run on the local host and therefore not compiled with the cross compiler. If you need to create alternative tables when cross compiling, you will - have to do so "by hand". There may also be other reasons for creating - tables manually. To cause pcre2_dftables to be built on the local + have to do so "by hand". There may also be other reasons for creating + tables manually. To cause pcre2_dftables to be built on the local host, run a normal compiling command, and then run the program with the output file as its argument, for example: cc src/pcre2_dftables.c -o pcre2_dftables ./pcre2_dftables src/pcre2_chartables.c - This builds the tables in the default locale of the local host. If you + This builds the tables in the default locale of the local host. If you want to specify a locale, you must use the -L option: LC_ALL=fr_FR ./pcre2_dftables -L src/pcre2_chartables.c You can also specify -b (with or without -L). This causes the tables to - be written in binary instead of as source code. 
A set of binary tables - can be loaded into memory by an application and passed to pcre2_com- + be written in binary instead of as source code. A set of binary tables + can be loaded into memory by an application and passed to pcre2_com- pile() in the same way as tables created by calling pcre2_maketables(). - The tables are just a string of bytes, independent of hardware charac- - teristics such as endianness. This means they can be bundled with an - application that runs in different environments, to ensure consistent + The tables are just a string of bytes, independent of hardware charac- + teristics such as endianness. This means they can be bundled with an + application that runs in different environments, to ensure consistent behaviour. USING EBCDIC CODE - PCRE2 assumes by default that it will run in an environment where the - character code is ASCII or Unicode, which is a superset of ASCII. This + PCRE2 assumes by default that it will run in an environment where the + character code is ASCII or Unicode, which is a superset of ASCII. This is the case for most computer operating systems. PCRE2 can, however, be compiled to run in an 8-bit EBCDIC environment by adding --enable-ebcdic --disable-unicode to the configure command. This setting implies --enable-rebuild-charta- - bles. You should only use it if you know that you are in an EBCDIC en- + bles. You should only use it if you know that you are in an EBCDIC en- vironment (for example, an IBM mainframe operating system). - It is not possible to support both EBCDIC and UTF-8 codes in the same - version of the library. Consequently, --enable-unicode and --enable- + It is not possible to support both EBCDIC and UTF-8 codes in the same + version of the library. Consequently, --enable-unicode and --enable- ebcdic are mutually exclusive. The EBCDIC character that corresponds to an ASCII LF is assumed to have - the value 0x15 by default. However, in some EBCDIC environments, 0x25 + the value 0x15 by default. 
However, in some EBCDIC environments, 0x25 is used. In such an environment you should use --enable-ebcdic-nl25 as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR - has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and + has the same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is not chosen as LF is made to correspond to the Unicode NEL char- acter (which, in Unicode, is 0x85). @@ -4232,47 +4227,47 @@ USING EBCDIC CODE PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS By default pcre2grep supports the use of callouts with string arguments - within the patterns it is matching. There are two kinds: one that gen- + within the patterns it is matching. There are two kinds: one that gen- erates output using local code, and another that calls an external pro- - gram or script. If --disable-pcre2grep-callout-fork is added to the - configure command, only the first kind of callout is supported; if - --disable-pcre2grep-callout is used, all callouts are completely ig- - nored. For more details of pcre2grep callouts, see the pcre2grep docu- + gram or script. If --disable-pcre2grep-callout-fork is added to the + configure command, only the first kind of callout is supported; if + --disable-pcre2grep-callout is used, all callouts are completely ig- + nored. For more details of pcre2grep callouts, see the pcre2grep docu- mentation. PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT - By default, pcre2grep reads all files as plain text. You can build it - so that it recognizes files whose names end in .gz or .bz2, and reads + By default, pcre2grep reads all files as plain text. You can build it + so that it recognizes files whose names end in .gz or .bz2, and reads them with libz or libbz2, respectively, by adding one or both of --enable-pcre2grep-libz --enable-pcre2grep-libbz2 to the configure command. These options naturally require that the rel- - evant libraries are installed on your system. 
Configuration will fail + evant libraries are installed on your system. Configuration will fail if they are not. PCRE2GREP BUFFER SIZE - pcre2grep uses an internal buffer to hold a "window" on the file it is + pcre2grep uses an internal buffer to hold a "window" on the file it is scanning, in order to be able to output "before" and "after" lines when it finds a match. The default starting size of the buffer is 20KiB. The - buffer itself is three times this size, but because of the way it is + buffer itself is three times this size, but because of the way it is used for holding "before" lines, the longest line that is guaranteed to be processable is the notional buffer size. If a longer line is encoun- - tered, pcre2grep automatically expands the buffer, up to a specified - maximum size, whose default is 1MiB or the starting size, whichever is - the larger. You can change the default parameter values by adding, for + tered, pcre2grep automatically expands the buffer, up to a specified + maximum size, whose default is 1MiB or the starting size, whichever is + the larger. You can change the default parameter values by adding, for example, --with-pcre2grep-bufsize=51200 --with-pcre2grep-max-bufsize=2097152 - to the configure command. The caller of pcre2grep can override these - values by using --buffer-size and --max-buffer-size on the command + to the configure command. The caller of pcre2grep can override these + values by using --buffer-size and --max-buffer-size on the command line. @@ -4283,26 +4278,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT --enable-pcre2test-libreadline --enable-pcre2test-libedit - to the configure command, pcre2test is linked with the libreadline or- - libedit library, respectively, and when its input is from a terminal, - it reads it using the readline() function. This provides line-editing - and history facilities. 
Note that libreadline is GPL-licensed, so if - you distribute a binary of pcre2test linked in this way, there may be + to the configure command, pcre2test is linked with the libreadline or- + libedit library, respectively, and when its input is from a terminal, + it reads it using the readline() function. This provides line-editing + and history facilities. Note that libreadline is GPL-licensed, so if + you distribute a binary of pcre2test linked in this way, there may be licensing issues. These can be avoided by linking instead with libedit, which has a BSD licence. - Setting --enable-pcre2test-libreadline causes the -lreadline option to - be added to the pcre2test build. In many operating environments with a - sytem-installed readline library this is sufficient. However, in some + Setting --enable-pcre2test-libreadline causes the -lreadline option to + be added to the pcre2test build. In many operating environments with a + system-installed readline library this is sufficient. However, in some environments (e.g. if an unmodified distribution version of readline is - in use), some extra configuration may be necessary. The INSTALL file + in use), some extra configuration may be necessary. The INSTALL file for libreadline says this: "Readline uses the termcap functions, but does not link with the termcap or curses library itself, allowing applications which link with readline the to choose an appropriate library." - If your environment has not been set up so that an appropriate library + If your environment has not been set up so that an appropriate library is automatically included, you may need to add something like LIBS="-ncurses" @@ -4316,7 +4311,7 @@ INCLUDING DEBUGGING CODE --enable-debug - to the configure command, additional debugging code is included in the + to the configure command, additional debugging code is included in the build. This feature is intended for use by the PCRE2 maintainers.
@@ -4326,14 +4321,14 @@ DEBUGGING WITH VALGRIND SUPPORT --enable-valgrind - to the configure command, PCRE2 will use valgrind annotations to mark - certain memory regions as unaddressable. This allows it to detect in- + to the configure command, PCRE2 will use valgrind annotations to mark + certain memory regions as unaddressable. This allows it to detect in- valid memory accesses, and is mostly useful for debugging PCRE2 itself. CODE COVERAGE REPORTING - If your C compiler is gcc, you can build a version of PCRE2 that can + If your C compiler is gcc, you can build a version of PCRE2 that can generate a code coverage report for its test suite. To enable this, you must install lcov version 1.6 or above. Then specify @@ -4342,20 +4337,20 @@ CODE COVERAGE REPORTING to the configure command and build PCRE2 in the usual way. Note that using ccache (a caching C compiler) is incompatible with code - coverage reporting. If you have configured ccache to run automatically + coverage reporting. If you have configured ccache to run automatically on your system, you must set the environment variable CCACHE_DISABLE=1 before running make to build PCRE2, so that ccache is not used. - When --enable-coverage is used, the following addition targets are + When --enable-coverage is used, the following additional targets are added to the Makefile: make coverage - This creates a fresh coverage report for the PCRE2 test suite. It is - equivalent to running "make coverage-reset", "make coverage-baseline", + This creates a fresh coverage report for the PCRE2 test suite. It is + equivalent to running "make coverage-reset", "make coverage-baseline", "make check", and then "make coverage-report". make coverage-reset @@ -4372,73 +4367,73 @@ CODE COVERAGE REPORTING make coverage-clean-report - This removes the generated coverage report without cleaning the cover- + This removes the generated coverage report without cleaning the cover- age data itself.
make coverage-clean-data - This removes the captured coverage data without removing the coverage + This removes the captured coverage data without removing the coverage files created at compile time (*.gcno). make coverage-clean - This cleans all coverage data including the generated coverage report. - For more information about code coverage, see the gcov and lcov docu- + This cleans all coverage data including the generated coverage report. + For more information about code coverage, see the gcov and lcov docu- mentation. DISABLING THE Z AND T FORMATTING MODIFIERS - The C99 standard defines formatting modifiers z and t for size_t and - ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers + The C99 standard defines formatting modifiers z and t for size_t and + ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in environments other than old versions of Microsoft Visual Studio when - __STDC_VERSION__ is defined and has a value greater than or equal to - 199901L (indicating support for C99). However, there is at least one + __STDC_VERSION__ is defined and has a value greater than or equal to + 199901L (indicating support for C99). However, there is at least one environment that claims to be C99 but does not support these modifiers. If --disable-percent-zt is specified, no use is made of the z or t modifiers. Instead of %td or - %zu, a suitable format is used depending in the size of long for the + %zu, a suitable format is used depending on the size of long for the platform. SUPPORT FOR FUZZERS - There is a special option for use by people who want to run fuzzing + There is a special option for use by people who want to run fuzzing tests on PCRE2: --enable-fuzz-support At present this applies only to the 8-bit library. If set, it causes an - extra library called libpcre2-fuzzsupport.a to be built, but not in- - stalled.
This contains a single function called LLVMFuzzerTestOneIn- - put() whose arguments are a pointer to a string and the length of the - string. When called, this function tries to compile the string as a - pattern, and if that succeeds, to match it. This is done both with no - options and with some random options bits that are generated from the + extra library called libpcre2-fuzzsupport.a to be built, but not in- + stalled. This contains a single function called LLVMFuzzerTestOneIn- + put() whose arguments are a pointer to a string and the length of the + string. When called, this function tries to compile the string as a + pattern, and if that succeeds, to match it. This is done both with no + options and with some random options bits that are generated from the string. - Setting --enable-fuzz-support also causes a binary called pcre2fuz- - zcheck to be created. This is normally run under valgrind or used when + Setting --enable-fuzz-support also causes a binary called pcre2fuz- + zcheck to be created. This is normally run under valgrind or used when PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing - function and outputs information about what it is doing. The input - strings are specified by arguments: if an argument starts with "=" the - rest of it is a literal input string. Otherwise, it is assumed to be a + function and outputs information about what it is doing. The input + strings are specified by arguments: if an argument starts with "=" the + rest of it is a literal input string. Otherwise, it is assumed to be a file name, and the contents of the file are the test string. OBSOLETE OPTION - In versions of PCRE2 prior to 10.30, there were two ways of handling - backtracking in the pcre2_match() function. The default was to use the + In versions of PCRE2 prior to 10.30, there were two ways of handling + backtracking in the pcre2_match() function. 
The default was to use the system stack, but if --disable-stack-for-recursion - was set, memory on the heap was used. From release 10.30 onwards this - has changed (the stack is no longer used) and this option now does + was set, memory on the heap was used. From release 10.30 onwards this + has changed (the stack is no longer used) and this option now does nothing except give a warning. @@ -4450,14 +4445,14 @@ SEE ALSO AUTHOR Philip Hazel - University Computing Service + Retired from University Computing Service Cambridge, England. REVISION - Last updated: 08 December 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 27 July 2022 + Copyright (c) 1997-2022 University of Cambridge. ------------------------------------------------------------------------------ @@ -5596,18 +5591,22 @@ SIZE AND OTHER LIMITATIONS The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold. + The maximum amount of heap memory used for matching is controlled by + the heap limit, which can be set in a pattern or in a match context. + The default is a very large number, effectively unlimited. + AUTHOR Philip Hazel - University Computing Service + Retired from University Computing Service Cambridge, England. REVISION - Last updated: 02 February 2019 - Copyright (c) 1997-2019 University of Cambridge. + Last updated: 26 July 2022 + Copyright (c) 1997-2022 University of Cambridge. ------------------------------------------------------------------------------ @@ -9773,152 +9772,169 @@ STACK AND HEAP USAGE AT RUN TIME sive function calls could use a great deal of stack, and this could cause problems, but this usage has been eliminated. Backtracking posi- tions are now explicitly remembered in memory frames controlled by the - code. An initial 20KiB vector of frames is allocated on the system - stack (enough for about 100 frames for small patterns), but if this is - insufficient, heap memory is used. 
The amount of heap memory can be - limited; if the limit is set to zero, only the initial stack vector is - used. Rewriting patterns to be time-efficient, as described below, may - also reduce the memory requirements. + code. - In contrast to pcre2_match(), pcre2_dfa_match() does use recursive - function calls, but only for processing atomic groups, lookaround as- + The size of each frame depends on the size of pointer variables and the + number of capturing parenthesized groups in the pattern being matched. + On a 64-bit system the frame size for a pattern with no captures is 128 + bytes. For each capturing group the size increases by 16 bytes. + + Until release 10.41, an initial 20KiB frames vector was allocated on + the system stack, but this still caused some issues for multi-thread + applications where each thread has a very small stack. From release + 10.41 backtracking memory frames are always held in heap memory. An + initial heap allocation is obtained the first time any match data block + is passed to pcre2_match(). This is remembered with the match data + block and re-used if that block is used for another match. It is freed + when the match data block itself is freed. + + The size of the initial block is the larger of 20KiB or ten times the + pattern's frame size, unless the heap limit is less than this, in which + case the heap limit is used. If the initial block proves to be too + small during matching, it is replaced by a larger block, subject to the + heap limit. The heap limit is checked only when a new block is to be + allocated. Reducing the heap limit between calls to pcre2_match() with + the same match data block does not affect the saved block. + + In contrast to pcre2_match(), pcre2_dfa_match() does use recursive + function calls, but only for processing atomic groups, lookaround as- sertions, and recursion within the pattern. 
The original version of the - code used to allocate quite large internal workspace vectors on the - stack, which caused some problems for some patterns in environments - with small stacks. From release 10.32 the code for pcre2_dfa_match() - has been re-factored to use heap memory when necessary for internal - workspace when recursing, though recursive function calls are still + code used to allocate quite large internal workspace vectors on the + stack, which caused some problems for some patterns in environments + with small stacks. From release 10.32 the code for pcre2_dfa_match() + has been re-factored to use heap memory when necessary for internal + workspace when recursing, though recursive function calls are still used. - The "match depth" parameter can be used to limit the depth of function - recursion, and the "match heap" parameter to limit heap memory in + The "match depth" parameter can be used to limit the depth of function + recursion, and the "match heap" parameter to limit heap memory in pcre2_dfa_match(). PROCESSING TIME - Certain items in regular expression patterns are processed more effi- + Certain items in regular expression patterns are processed more effi- ciently than others. It is more efficient to use a character class like - [aeiou] than a set of single-character alternatives such as - (a|e|i|o|u). In general, the simplest construction that provides the + [aeiou] than a set of single-character alternatives such as + (a|e|i|o|u). In general, the simplest construction that provides the required behaviour is usually the most efficient. Jeffrey Friedl's book - contains a lot of useful general discussion about optimizing regular + contains a lot of useful general discussion about optimizing regular expressions for efficient performance. This document contains a few ob- servations about PCRE2. 
- Using Unicode character properties (the \p, \P, and \X escapes) is - slow, because PCRE2 has to use a multi-stage table lookup whenever it - needs a character's property. If you can find an alternative pattern + Using Unicode character properties (the \p, \P, and \X escapes) is + slow, because PCRE2 has to use a multi-stage table lookup whenever it + needs a character's property. If you can find an alternative pattern that does not use character properties, it will probably be faster. - By default, the escape sequences \b, \d, \s, and \w, and the POSIX - character classes such as [:alpha:] do not use Unicode properties, + By default, the escape sequences \b, \d, \s, and \w, and the POSIX + character classes such as [:alpha:] do not use Unicode properties, partly for backwards compatibility, and partly for performance reasons. - However, you can set the PCRE2_UCP option or start the pattern with - (*UCP) if you want Unicode character properties to be used. This can - double the matching time for items such as \d, when matched with - pcre2_match(); the performance loss is less with a DFA matching func- + However, you can set the PCRE2_UCP option or start the pattern with + (*UCP) if you want Unicode character properties to be used. This can + double the matching time for items such as \d, when matched with + pcre2_match(); the performance loss is less with a DFA matching func- tion, and in both cases there is not much difference for \b. - When a pattern begins with .* not in atomic parentheses, nor in paren- - theses that are the subject of a backreference, and the PCRE2_DOTALL - option is set, the pattern is implicitly anchored by PCRE2, since it - can match only at the start of a subject string. 
If the pattern has + When a pattern begins with .* not in atomic parentheses, nor in paren- + theses that are the subject of a backreference, and the PCRE2_DOTALL + option is set, the pattern is implicitly anchored by PCRE2, since it + can match only at the start of a subject string. If the pattern has multiple top-level branches, they must all be anchorable. The optimiza- - tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au- + tion can be disabled by the PCRE2_NO_DOTSTAR_ANCHOR option, and is au- tomatically disabled if the pattern contains (*PRUNE) or (*SKIP). - If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be- - cause the dot metacharacter does not then match a newline, and if the - subject string contains newlines, the pattern may match from the char- + If PCRE2_DOTALL is not set, PCRE2 cannot make this optimization, be- + cause the dot metacharacter does not then match a newline, and if the + subject string contains newlines, the pattern may match from the char- acter immediately following one of them instead of from the very start. For example, the pattern .*second - matches the subject "first\nand second" (where \n stands for a newline - character), with the match starting at the seventh character. In order - to do this, PCRE2 has to retry the match starting after every newline + matches the subject "first\nand second" (where \n stands for a newline + character), with the match starting at the seventh character. In order + to do this, PCRE2 has to retry the match starting after every newline in the subject. - If you are using such a pattern with subject strings that do not con- - tain newlines, the best performance is obtained by setting - PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex- - plicit anchoring. 
That saves PCRE2 from having to scan along the sub- + If you are using such a pattern with subject strings that do not con- + tain newlines, the best performance is obtained by setting + PCRE2_DOTALL, or starting the pattern with ^.* or ^.*? to indicate ex- + plicit anchoring. That saves PCRE2 from having to scan along the sub- ject looking for a newline to restart at. - Beware of patterns that contain nested indefinite repeats. These can - take a long time to run when applied to a string that does not match. + Beware of patterns that contain nested indefinite repeats. These can + take a long time to run when applied to a string that does not match. Consider the pattern fragment ^(a+)* - This can match "aaaa" in 16 different ways, and this number increases - very rapidly as the string gets longer. (The * repeat can match 0, 1, - 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + - repeats can match different numbers of times.) When the remainder of - the pattern is such that the entire match is going to fail, PCRE2 has - in principle to try every possible variation, and this can take an ex- + This can match "aaaa" in 16 different ways, and this number increases + very rapidly as the string gets longer. (The * repeat can match 0, 1, + 2, 3, or 4 times, and for each of those cases other than 0 or 4, the + + repeats can match different numbers of times.) When the remainder of + the pattern is such that the entire match is going to fail, PCRE2 has + in principle to try every possible variation, and this can take an ex- tremely long time, even for relatively short strings. An optimization catches some of the more simple cases such as (a+)*b - where a literal character follows. Before embarking on the standard - matching procedure, PCRE2 checks that there is a "b" later in the sub- - ject string, and if there is not, it fails the match immediately. 
How- - ever, when there is no following literal this optimization cannot be + where a literal character follows. Before embarking on the standard + matching procedure, PCRE2 checks that there is a "b" later in the sub- + ject string, and if there is not, it fails the match immediately. How- + ever, when there is no following literal this optimization cannot be used. You can see the difference by comparing the behaviour of (a+)*\d - with the pattern above. The former gives a failure almost instantly - when applied to a whole line of "a" characters, whereas the latter + with the pattern above. The former gives a failure almost instantly + when applied to a whole line of "a" characters, whereas the latter takes an appreciable time with strings longer than about 20 characters. In many cases, the solution to this kind of performance issue is to use - an atomic group or a possessive quantifier. This can often reduce mem- + an atomic group or a possessive quantifier. This can often reduce mem- ory requirements as well. As another example, consider this pattern: ([^<]|<(?!inet))+ - It matches from wherever it starts until it encounters " - matches" when a match succeeds. If the word is "text", which - is equivalent to the -a or --text option, binary files are - processed in the same way as any other file. In this case, - when a match succeeds, the output may be binary garbage, - which can have nasty effects if sent to a terminal. If the - word is "without-match", which is equivalent to the -I op- + Specify how binary files are to be processed. If the word is + "binary" (the default), pattern matching is performed on bi- + nary files, but the only output is "Binary file + matches" when a match succeeds. If the word is "text", which + is equivalent to the -a or --text option, binary files are + processed in the same way as any other file. In this case, + when a match succeeds, the output may be binary garbage, + which can have nasty effects if sent to a terminal. 
If the + word is "without-match", which is equivalent to the -I op- tion, binary files are not processed at all; they are assumed - not to be of interest and are skipped without causing any + not to be of interest and are skipped without causing any output or affecting the return code. --buffer-size=number - Set the parameter that controls how much memory is obtained + Set the parameter that controls how much memory is obtained at the start of processing for buffering files that are being scanned. See also --max-buffer-size below. -C number, --context=number - Output number lines of context both before and after each - matching line. This is equivalent to setting both -A and -B + Output number lines of context both before and after each + matching line. This is equivalent to setting both -A and -B to the same value. -c, --count - Do not output lines from the files that are being scanned; - instead output the number of lines that would have been + Do not output lines from the files that are being scanned; + instead output the number of lines that would have been shown, either because they matched, or, if -v is set, because - they failed to match. By default, this count is exactly the - same as the number of lines that would have been output, but - if the -M (multiline) option is used (without -v), there may - be more suppressed lines than the count (that is, the number + they failed to match. By default, this count is exactly the + same as the number of lines that would have been output, but + if the -M (multiline) option is used (without -v), there may + be more suppressed lines than the count (that is, the number of matches). - If no lines are selected, the number zero is output. If sev- - eral files are are being scanned, a count is output for each - of them and the -t option can be used to cause a total to be - output at the end. 
However, if the --files-with-matches op- - tion is also used, only those files whose counts are greater + If no lines are selected, the number zero is output. If sev- + eral files are being scanned, a count is output for each + of them and the -t option can be used to cause a total to be + output at the end. However, if the --files-with-matches op- + tion is also used, only those files whose counts are greater than zero are listed. When -c is used, the -A, -B, and -C op- tions are ignored. --colour, --color If this option is given without any data, it is equivalent to - "--colour=auto". If data is required, it must be given in + "--colour=auto". If data is required, it must be given in the same shell item, separated by an equals sign. --colour=value, --color=value This option specifies under what circumstances the parts of a line that matched a pattern should be coloured in the output. - By default, the output is not coloured. The value (which is - optional, see above) may be "never", "always", or "auto". In - the latter case, colouring happens only if the standard out- - put is connected to a terminal. More resources are used when + By default, the output is not coloured. The value (which is + optional, see above) may be "never", "always", or "auto". In + the latter case, colouring happens only if the standard out- + put is connected to a terminal. More resources are used when colouring is enabled, because pcre2grep has to search for all - possible matches in a line, not just one, in order to colour + possible matches in a line, not just one, in order to colour them all. - The colour that is used can be specified by setting one of - the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, + The colour that is used can be specified by setting one of + the environment variables PCRE2GREP_COLOUR, PCRE2GREP_COLOR, PCREGREP_COLOUR, or PCREGREP_COLOR, which are checked in that order.
If none of these are set, pcre2grep looks for - GREP_COLORS or GREP_COLOR (in that order). The value of the - variable should be a string of two numbers, separated by a - semicolon, except in the case of GREP_COLORS, which must + GREP_COLORS or GREP_COLOR (in that order). The value of the + variable should be a string of two numbers, separated by a + semicolon, except in the case of GREP_COLORS, which must start with "ms=" or "mt=" followed by two semicolon-separated - colours, terminated by the end of the string or by a colon. - If GREP_COLORS does not start with "ms=" or "mt=" it is ig- + colours, terminated by the end of the string or by a colon. + If GREP_COLORS does not start with "ms=" or "mt=" it is ig- nored, and GREP_COLOR is checked. - If the string obtained from one of the above variables con- + If the string obtained from one of the above variables con- tains any characters other than semicolon or digits, the set- ting is ignored and the default colour is used. The string is copied directly into the control string for setting colour on - a terminal, so it is your responsibility to ensure that the - values make sense. If no relevant environment variable is + a terminal, so it is your responsibility to ensure that the + values make sense. If no relevant environment variable is set, the default is "1;31", which gives red. -D action, --devices=action - If an input path is not a regular file or a directory, "ac- - tion" specifies how it is to be processed. Valid values are + If an input path is not a regular file or a directory, "ac- + tion" specifies how it is to be processed. Valid values are "read" (the default) or "skip" (silently skip the path). -d action, --directories=action If an input path is a directory, "action" specifies how it is - to be processed. 
Valid values are "read" (the default in - non-Windows environments, for compatibility with GNU grep), - "recurse" (equivalent to the -r option), or "skip" (silently - skip the path, the default in Windows environments). In the - "read" case, directories are read as if they were ordinary - files. In some operating systems the effect of reading a di- - rectory like this is an immediate end-of-file; in others it + to be processed. Valid values are "read" (the default in + non-Windows environments, for compatibility with GNU grep), + "recurse" (equivalent to the -r option), or "skip" (silently + skip the path, the default in Windows environments). In the + "read" case, directories are read as if they were ordinary + files. In some operating systems the effect of reading a di- + rectory like this is an immediate end-of-file; in others it may provoke an error. --depth-limit=number @@ -276,133 +281,136 @@ OPTIONS -e pattern, --regex=pattern, --regexp=pattern Specify a pattern to be matched. This option can be used mul- tiple times in order to specify several patterns. It can also - be used as a way of specifying a single pattern that starts - with a hyphen. When -e is used, no argument pattern is taken - from the command line; all arguments are treated as file - names. There is no limit to the number of patterns. They are - applied to each line in the order in which they are defined + be used as a way of specifying a single pattern that starts + with a hyphen. When -e is used, no argument pattern is taken + from the command line; all arguments are treated as file + names. There is no limit to the number of patterns. They are + applied to each line in the order in which they are defined until one matches. - If -f is used with -e, the command line patterns are matched + If -f is used with -e, the command line patterns are matched first, followed by the patterns from the file(s), independent - of the order in which these options are specified. 
Note that - multiple use of -e is not the same as a single pattern with + of the order in which these options are specified. Note that + multiple use of -e is not the same as a single pattern with alternatives. For example, X|Y finds the first character in a - line that is X or Y, whereas if the two patterns are given + line that is X or Y, whereas if the two patterns are given separately, with X first, pcre2grep finds X if it is present, even if it follows Y in the line. It finds Y only if there is - no X in the line. This matters only if you are using -o or + no X in the line. This matters only if you are using -o or --colo(u)r to show the part(s) of the line that matched. --exclude=pattern Files (but not directories) whose names match the pattern are - skipped without being processed. This applies to all files, - whether listed on the command line, obtained from --file- + skipped without being processed. This applies to all files, + whether listed on the command line, obtained from --file- list, or by scanning a directory. The pattern is a PCRE2 reg- - ular expression, and is matched against the final component + ular expression, and is matched against the final component of the file name, not the entire path. The -F, -w, and -x op- - tions do not apply to this pattern. The option may be given + tions do not apply to this pattern. The option may be given any number of times in order to specify multiple patterns. If - a file name matches both an --include and an --exclude pat- + a file name matches both an --include and an --exclude pat- tern, it is excluded. There is no short form for this option. --exclude-from=filename - Treat each non-empty line of the file as the data for an + Treat each non-empty line of the file as the data for an --exclude option. What constitutes a newline when reading the - file is the operating system's default. The --newline option - has no effect on this option. This option may be given more + file is the operating system's default. 
The --newline option + has no effect on this option. This option may be given more than once in order to specify a number of files to read. --exclude-dir=pattern Directories whose names match the pattern are skipped without - being processed, whatever the setting of the --recursive op- - tion. This applies to all directories, whether listed on the - command line, obtained from --file-list, or by scanning a - parent directory. The pattern is a PCRE2 regular expression, - and is matched against the final component of the directory - name, not the entire path. The -F, -w, and -x options do not - apply to this pattern. The option may be given any number of - times in order to specify more than one pattern. If a direc- - tory matches both --include-dir and --exclude-dir, it is ex- + being processed, whatever the setting of the --recursive op- + tion. This applies to all directories, whether listed on the + command line, obtained from --file-list, or by scanning a + parent directory. The pattern is a PCRE2 regular expression, + and is matched against the final component of the directory + name, not the entire path. The -F, -w, and -x options do not + apply to this pattern. The option may be given any number of + times in order to specify more than one pattern. If a direc- + tory matches both --include-dir and --exclude-dir, it is ex- cluded. There is no short form for this option. -F, --fixed-strings - Interpret each data-matching pattern as a list of fixed - strings, separated by newlines, instead of as a regular ex- + Interpret each data-matching pattern as a list of fixed + strings, separated by newlines, instead of as a regular ex- pression. What constitutes a newline for this purpose is con- trolled by the --newline option. The -w (match as a word) and - -x (match whole line) options can be used with -F. They ap- - ply to each of the fixed strings. A line is selected if any + -x (match whole line) options can be used with -F. 
They ap- + ply to each of the fixed strings. A line is selected if any of the fixed strings are found in it (subject to -w or -x, if - present). This option applies only to the patterns that are - matched against the contents of files; it does not apply to - patterns specified by any of the --include or --exclude op- + present). This option applies only to the patterns that are + matched against the contents of files; it does not apply to + patterns specified by any of the --include or --exclude op- tions. -f filename, --file=filename - Read patterns from the file, one per line, and match them - against each line of input. As is the case with patterns on - the command line, no delimiters should be used. What consti- - tutes a newline when reading the file is the operating sys- - tem's default interpretation of \n. The --newline option has - no effect on this option. Trailing white space is removed - from each line, and blank lines are ignored. An empty file - contains no patterns and therefore matches nothing. Patterns - read from a file in this way may contain binary zeros, which - are treated as ordinary data characters. See also the com- - ments about multiple patterns versus a single pattern with + Read patterns from the file, one per line, and match them + against each line of input. As is the case with patterns on + the command line, no delimiters should be used. What consti- + tutes a newline when reading the file is the operating sys- + tem's default interpretation of \n. The --newline option has + no effect on this option. Trailing white space is removed + from each line, and blank lines are ignored. An empty file + contains no patterns and therefore matches nothing. Patterns + read from a file in this way may contain binary zeros, which + are treated as ordinary data characters. See also the com- + ments about multiple patterns versus a single pattern with alternatives in the description of -e above. 
- If this option is given more than once, all the specified - files are read. A data line is output if any of the patterns - match it. A file name can be given as "-" to refer to the - standard input. When -f is used, patterns specified on the - command line using -e may also be present; they are tested - before the file's patterns. However, no other pattern is + If this option is given more than once, all the specified + files are read. A data line is output if any of the patterns + match it. A file name can be given as "-" to refer to the + standard input. When -f is used, patterns specified on the + command line using -e may also be present; they are tested + before the file's patterns. However, no other pattern is taken from the command line; all arguments are treated as the names of paths to be searched. --file-list=filename - Read a list of files and/or directories that are to be + Read a list of files and/or directories that are to be scanned from the given file, one per line. What constitutes a - newline when reading the file is the operating system's de- - fault. Trailing white space is removed from each line, and + newline when reading the file is the operating system's de- + fault. Trailing white space is removed from each line, and blank lines are ignored. These paths are processed before any - that are listed on the command line. The file name can be - given as "-" to refer to the standard input. If --file and - --file-list are both specified as "-", patterns are read - first. This is useful only when the standard input is a ter- - minal, from which further lines (the list of files) can be + that are listed on the command line. The file name can be + given as "-" to refer to the standard input. If --file and + --file-list are both specified as "-", patterns are read + first. This is useful only when the standard input is a ter- + minal, from which further lines (the list of files) can be read after an end-of-file indication. 
If this option is given more than once, all the specified files are read. --file-offsets - Instead of showing lines or parts of lines that match, show - each match as an offset from the start of the file and a - length, separated by a comma. In this mode, no context is - shown. That is, the -A, -B, and -C options are ignored. If + Instead of showing lines or parts of lines that match, show + each match as an offset from the start of the file and a + length, separated by a comma. In this mode, no context is + shown. That is, the -A, -B, and -C options are ignored. If there is more than one match in a line, each of them is shown - separately. This option is mutually exclusive with --output, + separately. This option is mutually exclusive with --output, --line-offsets, and --only-matching. -H, --with-filename - Force the inclusion of the file name at the start of output - lines when searching a single file. By default, the file name - is not shown in this case. For matching lines, the file name - is followed by a colon; for context lines, a hyphen separator - is used. If a line number is also being output, it follows - the file name. When the -M option causes a pattern to match - more than one line, only the first is preceded by the file - name. This option overrides any previous -h, -l, or -L op- - tions. + Force the inclusion of the file name at the start of output + lines when searching a single file. The file name is not nor- + mally shown in this case. By default, for matching lines, + the file name is followed by a colon; for context lines, a + hyphen separator is used. The -Z option can be used to change + the terminator to a zero byte. If a line number is also being + output, it follows the file name. When the -M option causes a + pattern to match more than one line, only the first is pre- + ceded by the file name. This option overrides any previous + -h, -l, or -L options. -h, --no-filename Suppress the output file names when searching multiple files. 
- By default, file names are shown when multiple files are - searched. For matching lines, the file name is followed by a - colon; for context lines, a hyphen separator is used. If a - line number is also being output, it follows the file name. - This option overrides any previous -H, -L, or -l options. + File names are normally shown when multiple files are + searched. By default, for matching lines, the file name is + followed by a colon; for context lines, a hyphen separator is + used. The -Z option can be used to change the terminator to a + zero byte. If a line number is also being output, it follows + the file name. This option overrides any previous -H, -L, or + -l options. --heap-limit=number See --match-limit below. @@ -455,21 +463,23 @@ OPTIONS Instead of outputting lines from the files, just output the names of the files that do not contain any lines that would have been output. Each file name is output once, on a sepa- - rate line. This option overrides any previous -H, -h, or -l - options. + rate line by default, but if the -Z option is set, they are + separated by zero bytes instead of newlines. This option + overrides any previous -H, -h, or -l options. -l, --files-with-matches - Instead of outputting lines from the files, just output the + Instead of outputting lines from the files, just output the names of the files containing lines that would have been out- - put. Each file name is output once, on a separate line. - Searching normally stops as soon as a matching line is found - in a file. However, if the -c (count) option is also used, - matching continues in order to obtain the correct count, and - those files that have at least one match are listed along - with their counts. Using this option with -c is a way of sup- - pressing the listing of files with no matches that occurs - with -c on its own. This option overrides any previous -H, - -h, or -L options. + put. 
Each file name is output once, on a separate line, but + if the -Z option is set, they are separated by zero bytes in- + stead of newlines. Searching normally stops as soon as a + matching line is found in a file. However, if the -c (count) + option is also used, matching continues in order to obtain + the correct count, and those files that have at least one + match are listed along with their counts. Using this option + with -c is a way of suppressing the listing of files with no + matches that occurs with -c on its own. This option overrides + any previous -H, -h, or -L options. --label=name This option supplies a name to be used for the standard input @@ -571,29 +581,26 @@ OPTIONS an error occurs. The --heap-limit option specifies, as a number of kibibytes - (units of 1024 bytes), the amount of heap memory that may be - used for matching. Heap memory is needed only if matching the - pattern requires a significant number of nested backtracking - points to be remembered. This parameter can be set to zero to - forbid the use of heap memory altogether. + (units of 1024 bytes), the maximum amount of heap memory that + may be used for matching. - The --depth-limit option limits the depth of nested back- + The --depth-limit option limits the depth of nested back- tracking points, which indirectly limits the amount of memory that is used. The amount of memory needed for each backtrack- - ing point depends on the number of capturing parentheses in + ing point depends on the number of capturing parentheses in the pattern, so the amount of memory that is used before this - limit acts varies from pattern to pattern. This limit is of + limit acts varies from pattern to pattern. This limit is of use only if it is set smaller than --match-limit. - There are no short forms for these options. The default lim- - its can be set when the PCRE2 library is compiled; if they - are not specified, the defaults are very large and so effec- + There are no short forms for these options. 
The default lim- + its can be set when the PCRE2 library is compiled; if they + are not specified, the defaults are very large and so effec- tively unlimited. --max-buffer-size=number - This limits the expansion of the processing buffer, whose - initial size can be set by --buffer-size. The maximum buffer - size is silently forced to be no smaller than the starting + This limits the expansion of the processing buffer, whose + initial size can be set by --buffer-size. The maximum buffer + size is silently forced to be no smaller than the starting buffer size. -N newline-type, --newline=newline-type @@ -602,55 +609,55 @@ OPTIONS pcre2grep -N CRLF 'some pattern' - The newline type may be specified in upper, lower, or mixed - case. If the newline type is NUL, lines are separated by bi- - nary zero characters. The other types are the single-charac- - ter sequences CR (carriage return) and LF (linefeed), the - two-character sequence CRLF, an "anycrlf" type, which recog- - nizes any of the preceding three types, and an "any" type, - for which any Unicode line ending sequence is assumed to end - a line. The Unicode sequences are the three just mentioned, - plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL - (next line, U+0085), LS (line separator, U+2028), and PS + The newline type may be specified in upper, lower, or mixed + case. If the newline type is NUL, lines are separated by bi- + nary zero characters. The other types are the single-charac- + ter sequences CR (carriage return) and LF (linefeed), the + two-character sequence CRLF, an "anycrlf" type, which recog- + nizes any of the preceding three types, and an "any" type, + for which any Unicode line ending sequence is assumed to end + a line. The Unicode sequences are the three just mentioned, + plus VT (vertical tab, U+000B), FF (form feed, U+000C), NEL + (next line, U+0085), LS (line separator, U+2028), and PS (paragraph separator, U+2029). 
- When the PCRE2 library is built, a default line-ending se- - quence is specified. This is normally the standard sequence - for the operating system. Unless otherwise specified by this + When the PCRE2 library is built, a default line-ending se- + quence is specified. This is normally the standard sequence + for the operating system. Unless otherwise specified by this option, pcre2grep uses the library's default. - This option makes it possible to use pcre2grep to scan files + This option makes it possible to use pcre2grep to scan files that have come from other environments without having to mod- - ify their line endings. If the data that is being scanned - does not agree with the convention set by this option, - pcre2grep may behave in strange ways. Note that this option - does not apply to files specified by the -f, --exclude-from, - or --include-from options, which are expected to use the op- + ify their line endings. If the data that is being scanned + does not agree with the convention set by this option, + pcre2grep may behave in strange ways. Note that this option + does not apply to files specified by the -f, --exclude-from, + or --include-from options, which are expected to use the op- erating system's standard newline sequence. -n, --line-number Precede each output line by its line number in the file, fol- - lowed by a colon for matching lines or a hyphen for context + lowed by a colon for matching lines or a hyphen for context lines. If the file name is also being output, it precedes the - line number. When the -M option causes a pattern to match - more than one line, only the first is preceded by its line + line number. When the -M option causes a pattern to match + more than one line, only the first is preceded by its line number. This option is forced if --line-offsets is used. 
- --no-jit If the PCRE2 library is built with support for just-in-time + --no-jit If the PCRE2 library is built with support for just-in-time compiling (which speeds up matching), pcre2grep automatically makes use of this, unless it was explicitly disabled at build - time. This option can be used to disable the use of JIT at - run time. It is provided for testing and working round prob- + time. This option can be used to disable the use of JIT at + run time. It is provided for testing and working round prob- lems. It should never be needed in normal use. -O text, --output=text - When there is a match, instead of outputting the line that - matched, output just the text specified in this option, fol- - lowed by an operating-system standard newline. In this mode, - no context is shown. That is, the -A, -B, and -C options are - ignored. The --newline option has no effect on this option, + When there is a match, instead of outputting the line that + matched, output just the text specified in this option, fol- + lowed by an operating-system standard newline. In this mode, + no context is shown. That is, the -A, -B, and -C options are + ignored. The --newline option has no effect on this option, which is mutually exclusive with --only-matching, --file-off- - sets, and --line-offsets. However, like --only-matching, if + sets, and --line-offsets. However, like --only-matching, if there is more than one match in a line, each of them causes a line of output. @@ -658,160 +665,167 @@ OPTIONS to insert the contents of the matched part of the line and/or captured substrings into the text. - $<digits> or ${<digits>} is replaced by the captured sub- - string of the given decimal number; zero substitutes the + $<digits> or ${<digits>} is replaced by the captured sub- + string of the given decimal number; zero substitutes the whole match. 
If the number is greater than the number of cap- - turing substrings, or if the capture is unset, the replace- + turing substrings, or if the capture is unset, the replace- ment is empty. - $a is replaced by bell; $b by backspace; $e by escape; $f by - form feed; $n by newline; $r by carriage return; $t by tab; + $a is replaced by bell; $b by backspace; $e by escape; $f by + form feed; $n by newline; $r by carriage return; $t by tab; $v by vertical tab. $o<digits> or $o{<digits>} is replaced by the character whose - code point is the given octal number. In the first form, up - to three octal digits are processed. When more digits are - needed in Unicode mode to specify a wide character, the sec- + code point is the given octal number. In the first form, up + to three octal digits are processed. When more digits are + needed in Unicode mode to specify a wide character, the sec- ond form must be used. - $x<digits> or $x{<digits>} is replaced by the character rep- - resented by the given hexadecimal number. In the first form, - up to two hexadecimal digits are processed. When more digits - are needed in Unicode mode to specify a wide character, the + $x<digits> or $x{<digits>} is replaced by the character rep- + resented by the given hexadecimal number. In the first form, + up to two hexadecimal digits are processed. When more digits + are needed in Unicode mode to specify a wide character, the second form must be used. - Any other character is substituted by itself. In particular, + Any other character is substituted by itself. In particular, $$ is replaced by a single dollar. -o, --only-matching Show only the part of the line that matched a pattern instead - of the whole line. In this mode, no context is shown. That - is, the -A, -B, and -C options are ignored. If there is more - than one match in a line, each of them is shown separately, - on a separate line of output. 
If -o is combined with -v (in- - vert the sense of the match to find non-matching lines), no - output is generated, but the return code is set appropri- - ately. If the matched portion of the line is empty, nothing - is output unless the file name or line number are being - printed, in which case they are shown on an otherwise empty + of the whole line. In this mode, no context is shown. That + is, the -A, -B, and -C options are ignored. If there is more + than one match in a line, each of them is shown separately, + on a separate line of output. If -o is combined with -v (in- + vert the sense of the match to find non-matching lines), no + output is generated, but the return code is set appropri- + ately. If the matched portion of the line is empty, nothing + is output unless the file name or line number are being + printed, in which case they are shown on an otherwise empty line. This option is mutually exclusive with --output, --file-offsets and --line-offsets. -onumber, --only-matching=number - Show only the part of the line that matched the capturing + Show only the part of the line that matched the capturing parentheses of the given number. Up to 50 capturing parenthe- - ses are supported by default. This limit can be changed via - the --om-capture option. A pattern may contain any number of - capturing parentheses, but only those whose number is within - the limit can be accessed by -o. An error occurs if the num- + ses are supported by default. This limit can be changed via + the --om-capture option. A pattern may contain any number of + capturing parentheses, but only those whose number is within + the limit can be accessed by -o. An error occurs if the num- ber specified by -o is greater than the limit. -o0 is the same as -o without a number. Because these options - can be given without an argument (see above), if an argument - is present, it must be given in the same shell item, for ex- - ample, -o3 or --only-matching=2. 
The comments given for the - non-argument case above also apply to this option. If the - specified capturing parentheses do not exist in the pattern, - or were not set in the match, nothing is output unless the + can be given without an argument (see above), if an argument + is present, it must be given in the same shell item, for ex- + ample, -o3 or --only-matching=2. The comments given for the + non-argument case above also apply to this option. If the + specified capturing parentheses do not exist in the pattern, + or were not set in the match, nothing is output unless the file name or line number are being output. - If this option is given multiple times, multiple substrings - are output for each match, in the order the options are - given, and all on one line. For example, -o3 -o1 -o3 causes - the substrings matched by capturing parentheses 3 and 1 and - then 3 again to be output. By default, there is no separator + If this option is given multiple times, multiple substrings + are output for each match, in the order the options are + given, and all on one line. For example, -o3 -o1 -o3 causes + the substrings matched by capturing parentheses 3 and 1 and + then 3 again to be output. By default, there is no separator (but see the next but one option). --om-capture=number - Set the number of capturing parentheses that can be accessed + Set the number of capturing parentheses that can be accessed by -o. The default is 50. --om-separator=text - Specify a separating string for multiple occurrences of -o. - The default is an empty string. Separating strings are never + Specify a separating string for multiple occurrences of -o. + The default is an empty string. Separating strings are never coloured. -q, --quiet Work quietly, that is, display nothing except error messages. - The exit status indicates whether or not any matches were + The exit status indicates whether or not any matches were found. 
-r, --recursive - If any given path is a directory, recursively scan the files - it contains, taking note of any --include and --exclude set- - tings. By default, a directory is read as a normal file; in - some operating systems this gives an immediate end-of-file. - This option is a shorthand for setting the -d option to "re- + If any given path is a directory, recursively scan the files + it contains, taking note of any --include and --exclude set- + tings. By default, a directory is read as a normal file; in + some operating systems this gives an immediate end-of-file. + This option is a shorthand for setting the -d option to "re- curse". --recursion-limit=number - This is an obsolete synonym for --depth-limit. See --match- + This is an obsolete synonym for --depth-limit. See --match- limit above for details. -s, --no-messages - Suppress error messages about non-existent or unreadable - files. Such files are quietly skipped. However, the return + Suppress error messages about non-existent or unreadable + files. Such files are quietly skipped. However, the return code is still 2, even if matches were found in other files. -t, --total-count - This option is useful when scanning more than one file. If - used on its own, -t suppresses all output except for a grand - total number of matching lines (or non-matching lines if -v + This option is useful when scanning more than one file. If + used on its own, -t suppresses all output except for a grand + total number of matching lines (or non-matching lines if -v is used) in all the files. If -t is used with -c, a grand to- - tal is output except when the previous output is just one - line. In other words, it is not output when just one file's - count is listed. If file names are being output, the grand - total is preceded by "TOTAL:". Otherwise, it appears as just - another number. 
The -t option is ignored when used with -L - (list files without matches), because the grand total would + tal is output except when the previous output is just one + line. In other words, it is not output when just one file's + count is listed. If file names are being output, the grand + total is preceded by "TOTAL:". Otherwise, it appears as just + another number. The -t option is ignored when used with -L + (list files without matches), because the grand total would always be zero. -u, --utf Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled with UTF-8 support. All patterns (including - those for any --exclude and --include options) and all lines - that are scanned must be valid strings of UTF-8 characters. + those for any --exclude and --include options) and all lines + that are scanned must be valid strings of UTF-8 characters. If an invalid UTF-8 string is encountered, an error occurs. -U, --utf-allow-invalid - As --utf, but in addition subject lines may contain invalid - UTF-8 code unit sequences. These can never form part of any - pattern match. Patterns themselves, however, must still be + As --utf, but in addition subject lines may contain invalid + UTF-8 code unit sequences. These can never form part of any + pattern match. Patterns themselves, however, must still be valid UTF-8 strings. This facility allows valid UTF-8 strings to be sought within arbitrary byte sequences in executable or - other binary files. For more details about matching in non- + other binary files. For more details about matching in non- valid UTF-8 strings, see the pcre2unicode(3) documentation. -V, --version - Write the version numbers of pcre2grep and the PCRE2 library - to the standard output and then exit. Anything else on the + Write the version numbers of pcre2grep and the PCRE2 library + to the standard output and then exit. Anything else on the command line is ignored. 
-v, --invert-match - Invert the sense of the match, so that lines which do not - match any of the patterns are the ones that are found. When - this option is set, options such as --only-matching and - --output, which specify parts of a match that are to be out- + Invert the sense of the match, so that lines which do not + match any of the patterns are the ones that are found. When + this option is set, options such as --only-matching and + --output, which specify parts of a match that are to be out- put, are ignored. -w, --word-regex, --word-regexp Force the patterns only to match "words". That is, there must - be a word boundary at the start and end of each matched - string. This is equivalent to having "\b(?:" at the start of - each pattern, and ")\b" at the end. This option applies only - to the patterns that are matched against the contents of - files; it does not apply to patterns specified by any of the + be a word boundary at the start and end of each matched + string. This is equivalent to having "\b(?:" at the start of + each pattern, and ")\b" at the end. This option applies only + to the patterns that are matched against the contents of + files; it does not apply to patterns specified by any of the --include or --exclude options. -x, --line-regex, --line-regexp - Force the patterns to start matching only at the beginnings - of lines, and in addition, require them to match entire + Force the patterns to start matching only at the beginnings + of lines, and in addition, require them to match entire lines. In multiline mode the match may be more than one line. This is equivalent to having "^(?:" at the start of each pat- - tern and ")$" at the end. This option applies only to the - patterns that are matched against the contents of files; it - does not apply to patterns specified by any of the --include + tern and ")$" at the end. 
This option applies only to the + patterns that are matched against the contents of files; it + does not apply to patterns specified by any of the --include or --exclude options. + -Z, --null + Terminate file names in the regular output with a zero byte + (the NUL character) instead of what would normally appear. + This is useful when file names contain unusual characters + such as colons, hyphens, or even newlines. The option does + not apply to file names in error messages. + ENVIRONMENT VARIABLES @@ -823,137 +837,137 @@ ENVIRONMENT VARIABLES NEWLINES - The -N (--newline) option allows pcre2grep to scan files with newline - conventions that differ from the default. This option affects only the - way scanned files are processed. It does not affect the interpretation - of files specified by the -f, --file-list, --exclude-from, or --in- + The -N (--newline) option allows pcre2grep to scan files with newline + conventions that differ from the default. This option affects only the + way scanned files are processed. It does not affect the interpretation + of files specified by the -f, --file-list, --exclude-from, or --in- clude-from options. - Any parts of the scanned input files that are written to the standard - output are copied with whatever newline sequences they have in the in- - put. However, if the final line of a file is output, and it does not - end with a newline sequence, a newline sequence is added. If the new- - line setting is CR, LF, CRLF or NUL, that line ending is output; for + Any parts of the scanned input files that are written to the standard + output are copied with whatever newline sequences they have in the in- + put. However, if the final line of a file is output, and it does not + end with a newline sequence, a newline sequence is added. If the new- + line setting is CR, LF, CRLF or NUL, that line ending is output; for the other settings (ANYCRLF or ANY) a single NL is used.
- The newline setting does not affect the way in which pcre2grep writes - newlines in informational messages to the standard output and error - streams. Under Windows, the standard output is set to be binary, so - that "\r\n" at the ends of output lines that are copied from the input - is not converted to "\r\r\n" by the C I/O library. This means that any - messages written to the standard output must end with "\r\n". For all - other operating systems, and for all messages to the standard error + The newline setting does not affect the way in which pcre2grep writes + newlines in informational messages to the standard output and error + streams. Under Windows, the standard output is set to be binary, so + that "\r\n" at the ends of output lines that are copied from the input + is not converted to "\r\r\n" by the C I/O library. This means that any + messages written to the standard output must end with "\r\n". For all + other operating systems, and for all messages to the standard error stream, "\n" is used. OPTIONS COMPATIBILITY Many of the short and long forms of pcre2grep's options are the same as - in the GNU grep program. Any long option of the form --xxx-regexp (GNU + in the GNU grep program. Any long option of the form --xxx-regexp (GNU terminology) is also available as --xxx-regex (PCRE2 terminology). How- - ever, the --depth-limit, --file-list, --file-offsets, --heap-limit, - --include-dir, --line-offsets, --locale, --match-limit, -M, --multi- - line, -N, --newline, --om-separator, --output, -u, --utf, -U, and + ever, the --depth-limit, --file-list, --file-offsets, --heap-limit, + --include-dir, --line-offsets, --locale, --match-limit, -M, --multi- + line, -N, --newline, --om-separator, --output, -u, --utf, -U, and --utf-allow-invalid options are specific to pcre2grep, as is the use of the --only-matching option with a capturing parentheses number. - Although most of the common options work the same way, a few are dif- - ferent in pcre2grep. 
For example, the --include option's argument is a - glob for GNU grep, but a regular expression for pcre2grep. If both the - -c and -l options are given, GNU grep lists only file names, without + Although most of the common options work the same way, a few are dif- + ferent in pcre2grep. For example, the --include option's argument is a + glob for GNU grep, but a regular expression for pcre2grep. If both the + -c and -l options are given, GNU grep lists only file names, without counts, but pcre2grep gives the counts as well. OPTIONS WITH DATA There are four different ways in which an option with data can be spec- - ified. If a short form option is used, the data may follow immedi- + ified. If a short form option is used, the data may follow immedi- ately, or (with one exception) in the next command line item. For exam- ple: -f/some/file -f /some/file - The exception is the -o option, which may appear with or without data. - Because of this, if data is present, it must follow immediately in the + The exception is the -o option, which may appear with or without data. + Because of this, if data is present, it must follow immediately in the same item, for example -o3. - If a long form option is used, the data may appear in the same command - line item, separated by an equals character, or (with two exceptions) + If a long form option is used, the data may appear in the same command + line item, separated by an equals character, or (with two exceptions) it may appear in the next command line item. 
For example: --file=/some/file --file /some/file - Note, however, that if you want to supply a file name beginning with ~ - as data in a shell command, and have the shell expand ~ to a home di- - rectory, you must separate the file name from the option, because the + Note, however, that if you want to supply a file name beginning with ~ + as data in a shell command, and have the shell expand ~ to a home di- + rectory, you must separate the file name from the option, because the shell does not treat ~ specially unless it is at the start of an item. - The exceptions to the above are the --colour (or --color) and --only- - matching options, for which the data is optional. If one of these op- - tions does have data, it must be given in the first form, using an + The exceptions to the above are the --colour (or --color) and --only- + matching options, for which the data is optional. If one of these op- + tions does have data, it must be given in the first form, using an equals character. Otherwise pcre2grep will assume that it has no data. USING PCRE2'S CALLOUT FACILITY - pcre2grep has, by default, support for calling external programs or - scripts or echoing specific strings during matching by making use of - PCRE2's callout facility. However, this support can be completely or - partially disabled when pcre2grep is built. You can find out whether - your binary has support for callouts by running it with the --help op- - tion. If callout support is completely disabled, all callouts in pat- + pcre2grep has, by default, support for calling external programs or + scripts or echoing specific strings during matching by making use of + PCRE2's callout facility. However, this support can be completely or + partially disabled when pcre2grep is built. You can find out whether + your binary has support for callouts by running it with the --help op- + tion. If callout support is completely disabled, all callouts in pat- terns are ignored by pcre2grep. 
If the facility is partially disabled, - calling external programs is not supported, and callouts that request + calling external programs is not supported, and callouts that request it are ignored. - A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- - ment is either a number or a quoted string (see the pcre2callout docu- - mentation for details). Numbered callouts are ignored by pcre2grep; + A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu- + ment is either a number or a quoted string (see the pcre2callout docu- + mentation for details). Numbered callouts are ignored by pcre2grep; only callouts with string arguments are useful. Echoing a specific string - Starting the callout string with a pipe character invokes an echoing + Starting the callout string with a pipe character invokes an echoing facility that avoids calling an external program or script. This facil- - ity is always available, provided that callouts were not completely - disabled when pcre2grep was built. The rest of the callout string is - processed as a zero-terminated string, which means it should not con- - tain any internal binary zeros. It is written to the output, having - first been passed through the same escape processing as text from the - --output (-O) option (see above). However, $0 cannot be used to insert - a matched substring because the match is still in progress. Instead, - the single character '0' is inserted. Any syntax errors in the string - (for example, a dollar not followed by another character) causes the - callout to be ignored. No terminator is added to the output string, so - if you want a newline, you must include it explicitly using the escape + ity is always available, provided that callouts were not completely + disabled when pcre2grep was built. The rest of the callout string is + processed as a zero-terminated string, which means it should not con- + tain any internal binary zeros.
It is written to the output, having + first been passed through the same escape processing as text from the + --output (-O) option (see above). However, $0 cannot be used to insert + a matched substring because the match is still in progress. Instead, + the single character '0' is inserted. Any syntax errors in the string + (for example, a dollar not followed by another character) causes the + callout to be ignored. No terminator is added to the output string, so + if you want a newline, you must include it explicitly using the escape $n. For example: pcre2grep '(.)(..(.))(?C"|[$1] [$2] [$3]$n")' - Matching continues normally after the string is output. If you want to - see only the callout output but not any output from an actual match, + Matching continues normally after the string is output. If you want to + see only the callout output but not any output from an actual match, you should end the pattern with (*FAIL). Calling external programs or scripts This facility can be independently disabled when pcre2grep is built. It - is supported for Windows, where a call to _spawnvp() is used, for VMS, - where lib$spawn() is used, and for any Unix-like environment where + is supported for Windows, where a call to _spawnvp() is used, for VMS, + where lib$spawn() is used, and for any Unix-like environment where fork() and execv() are available. If the callout string does not start with a pipe (vertical bar) charac- - ter, it is parsed into a list of substrings separated by pipe charac- - ters. The first substring must be an executable name, with the follow- + ter, it is parsed into a list of substrings separated by pipe charac- + ters. The first substring must be an executable name, with the follow- ing substrings specifying arguments: executable_name|arg1|arg2|... - Any substring (including the executable name) may contain escape se- - quences started by a dollar character. 
These are the same as for the + Any substring (including the executable name) may contain escape se- + quences started by a dollar character. These are the same as for the --output (-O) option documented above, except that $0 cannot insert the - matched string because the match is still in progress. Instead, the + matched string because the match is still in progress. Instead, the character '0' is inserted. If you need a literal dollar or pipe charac- ter in any substring, use $$ or $| respectively. Here is an example: @@ -968,43 +982,43 @@ USING PCRE2'S CALLOUT FACILITY Arg1: [1] [234] [4] Arg2: |1| () 12345 - The parameters for the system call that is used to run the program or + The parameters for the system call that is used to run the program or script are zero-terminated strings. This means that binary zero charac- - ters in the callout argument will cause premature termination of their - substrings, and therefore should not be present. Any syntax errors in - the string (for example, a dollar not followed by another character) + ters in the callout argument will cause premature termination of their + substrings, and therefore should not be present. Any syntax errors in + the string (for example, a dollar not followed by another character) causes the callout to be ignored. If running the program fails for any - reason (including the non-existence of the executable), a local match- + reason (including the non-existence of the executable), a local match- ing failure occurs and the matcher backtracks in the normal way. MATCHING ERRORS - It is possible to supply a regular expression that takes a very long - time to fail to match certain lines. Such patterns normally involve - nested indefinite repeats, for example: (a+)*\d when matched against a - line of a's with no final digit. The PCRE2 matching function has a re- - source limit that causes it to abort in these circumstances. 
If this - happens, pcre2grep outputs an error message and the line that caused - the problem to the standard error stream. If there are more than 20 + It is possible to supply a regular expression that takes a very long + time to fail to match certain lines. Such patterns normally involve + nested indefinite repeats, for example: (a+)*\d when matched against a + line of a's with no final digit. The PCRE2 matching function has a re- + source limit that causes it to abort in these circumstances. If this + happens, pcre2grep outputs an error message and the line that caused + the problem to the standard error stream. If there are more than 20 such errors, pcre2grep gives up. - The --match-limit option of pcre2grep can be used to set the overall - resource limit. There are also other limits that affect the amount of - memory used during matching; see the discussion of --heap-limit and + The --match-limit option of pcre2grep can be used to set the overall + resource limit. There are also other limits that affect the amount of + memory used during matching; see the discussion of --heap-limit and --depth-limit above. DIAGNOSTICS Exit status is 0 if any matches were found, 1 if no matches were found, - and 2 for syntax errors, overlong lines, non-existent or inaccessible - files (even if matches were found in other files) or too many matching + and 2 for syntax errors, overlong lines, non-existent or inaccessible + files (even if matches were found in other files) or too many matching errors. Using the -s option to suppress error messages about inaccessi- ble files does not affect the return code. - When run under VMS, the return code is placed in the symbol - PCRE2GREP_RC because VMS does not distinguish between exit(0) and + When run under VMS, the return code is placed in the symbol + PCRE2GREP_RC because VMS does not distinguish between exit(0) and exit(1). 
@@ -1022,5 +1036,5 @@ AUTHOR REVISION - Last updated: 31 August 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 30 July 2022 + Copyright (c) 1997-2022 University of Cambridge. diff --git a/doc/pcre2test.txt b/doc/pcre2test.txt index ed7dd20..86a4b5a 100644 --- a/doc/pcre2test.txt +++ b/doc/pcre2test.txt @@ -1111,7 +1111,8 @@ SUBJECT MODIFIERS copy= copy captured substring depth_limit= set a depth limit dfa use pcre2_dfa_match() - find_limits find match and depth limits + find_limits find heap, match and depth limits + find_limits_noheap find match and depth limits get= extract captured substring getall extract all captured substrings /g global global matching @@ -1411,7 +1412,7 @@ SUBJECT MODIFIERS The heap_limit, match_limit, and depth_limit modifiers set the appro- priate limits in the match context. These values are ignored when the - find_limits modifier is specified. + find_limits or find_limits_noheap modifier is specified. Finding minimum limits @@ -1419,8 +1420,12 @@ SUBJECT MODIFIERS calls the relevant matching function several times, setting different values in the match context via pcre2_set_heap_limit(), pcre2_set_match_limit(), or pcre2_set_depth_limit() until it finds the - minimum values for each parameter that allows the match to complete - without error. If JIT is being used, only the match limit is relevant. + smallest value for each parameter that allows the match to complete + without a "limit exceeded" error. The match itself may succeed or fail. + An alternative modifier, find_limits_noheap, omits the heap limit. This + is used in the standard tests, because the minimum heap limit varies + between systems. If JIT is being used, only the match limit is rele- + vant, and the other two are automatically omitted. When using this modifier, the pattern should not contain any limit set- tings such as (*LIMIT_MATCH=...) within it. 
If such a setting is @@ -1446,9 +1451,7 @@ SUBJECT MODIFIERS For both kinds of matching, the heap_limit number, which is in kibibytes (units of 1024 bytes), limits the amount of heap memory used - for matching. A value of zero disables the use of any heap memory; many - simple pattern matches can be done without using the heap, so zero is - not an unreasonable setting. + for matching. Showing MARK names @@ -1463,13 +1466,11 @@ SUBJECT MODIFIERS The memory modifier causes pcre2test to log the sizes of all heap mem- ory allocation and freeing calls that occur during a call to - pcre2_match() or pcre2_dfa_match(). These occur only when a match re- - quires a bigger vector than the default for remembering backtracking - points (pcre2_match()) or for internal workspace (pcre2_dfa_match()). - In many cases there will be no heap memory used and therefore no addi- - tional output. No heap memory is allocated during matching with JIT, so - in that case the memory modifier never has any effect. For this modi- - fier to work, the null_context modifier must not be set on both the + pcre2_match() or pcre2_dfa_match(). In the latter case, heap memory is + used only when a match requires more internal workspace than the de- + fault allocation on the stack, so in many cases there will be no out- + put. No heap memory is allocated during matching with JIT. For this + modifier to work, the null_context modifier must not be set on both the pattern and the subject, though it can be set on one or the other. Setting a starting offset @@ -1518,45 +1519,46 @@ SUBJECT MODIFIERS null_context modifier is set, however, NULL is passed. This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used - with the find_limits or substitute_callout modifiers. + with the find_limits, find_limits_noheap, or substitute_callout modi- + fiers.
- Similarly, for testing purposes, if the null_subject or null_replace- - ment modifier is set, the subject or replacement string pointers are + Similarly, for testing purposes, if the null_subject or null_replace- + ment modifier is set, the subject or replacement string pointers are passed as NULL, respectively, to the relevant functions. THE ALTERNATIVE MATCHING FUNCTION - By default, pcre2test uses the standard PCRE2 matching function, + By default, pcre2test uses the standard PCRE2 matching function, pcre2_match() to match each subject line. PCRE2 also supports an alter- - native matching function, pcre2_dfa_match(), which operates in a dif- - ferent way, and has some restrictions. The differences between the two + native matching function, pcre2_dfa_match(), which operates in a dif- + ferent way, and has some restrictions. The differences between the two functions are described in the pcre2matching documentation. - If the dfa modifier is set, the alternative matching function is used. - This function finds all possible matches at a given point in the sub- - ject. If, however, the dfa_shortest modifier is set, processing stops - after the first match is found. This is always the shortest possible + If the dfa modifier is set, the alternative matching function is used. + This function finds all possible matches at a given point in the sub- + ject. If, however, the dfa_shortest modifier is set, processing stops + after the first match is found. This is always the shortest possible match. DEFAULT OUTPUT FROM pcre2test - This section describes the output when the normal matching function, + This section describes the output when the normal matching function, pcre2_match(), is being used. 
- When a match succeeds, pcre2test outputs the list of captured sub- - strings, starting with number 0 for the string that matched the whole + When a match succeeds, pcre2test outputs the list of captured sub- + strings, starting with number 0 for the string that matched the whole pattern. Otherwise, it outputs "No match" when the return is PCRE2_ER- - ROR_NOMATCH, or "Partial match:" followed by the partially matching - substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is - the entire substring that was inspected during the partial match; it - may include characters before the actual match start if a lookbehind + ROR_NOMATCH, or "Partial match:" followed by the partially matching + substring when the return is PCRE2_ERROR_PARTIAL. (Note that this is + the entire substring that was inspected during the partial match; it + may include characters before the actual match start if a lookbehind assertion, \K, \b, or \B was involved.) For any other return, pcre2test outputs the PCRE2 negative error number - and a short descriptive phrase. If the error is a failed UTF string - check, the code unit offset of the start of the failing character is + and a short descriptive phrase. If the error is a failed UTF string + check, the code unit offset of the start of the failing character is also output. Here is an example of an interactive pcre2test run. $ pcre2test @@ -1572,8 +1574,8 @@ DEFAULT OUTPUT FROM pcre2test Unset capturing substrings that are not followed by one that is set are not shown by pcre2test unless the allcaptures modifier is specified. In the following example, there are two capturing substrings, but when the - first data line is matched, the second, unset substring is not shown. - An "internal" unset substring is shown as "", as for the second + first data line is matched, the second, unset substring is not shown. + An "internal" unset substring is shown as "", as for the second data line. 
re> /(a)|(b)/ @@ -1585,11 +1587,11 @@ DEFAULT OUTPUT FROM pcre2test 1: <unset> 2: b - If the strings contain any non-printing characters, they are output as - \xhh escapes if the value is less than 256 and UTF mode is not set. + If the strings contain any non-printing characters, they are output as + \xhh escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the defi- - nition of non-printing characters. If the aftertext modifier is set, - the output for substring 0 is followed by the the rest of the subject + nition of non-printing characters. If the aftertext modifier is set, + the output for substring 0 is followed by the rest of the subject string, identified by "0+" like this: re> /cat/aftertext @@ -1609,8 +1611,8 @@ DEFAULT OUTPUT FROM pcre2test 0: ipp 1: pp - "No match" is output only if the first match attempt fails. Here is an - example of a failure message (the offset 4 that is specified by the + "No match" is output only if the first match attempt fails. Here is an + example of a failure message (the offset 4 that is specified by the offset modifier is past the end of the subject string): re> /xyz/ @@ -1618,7 +1620,7 @@ DEFAULT OUTPUT FROM pcre2test Error -24 (bad offset value) Note that whereas patterns can be continued over several lines (a plain - ">" prompt is used for continuations), subject lines may not. However + ">" prompt is used for continuations), subject lines may not. However newlines can be included in a subject by means of the \n escape (or \r, \r\n, etc., depending on the newline sequence setting). @@ -1626,7 +1628,7 @@ DEFAULT OUTPUT FROM pcre2test OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION When the alternative matching function, pcre2_dfa_match(), is used, the - output consists of a list of all the matches that start at the first + output consists of a list of all the matches that start at the first point in the subject where there is at least one match.
For example: re> /(tang|tangerine|tan)/ @@ -1635,11 +1637,11 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION 1: tang 2: tan - Using the normal matching function on this data finds only "tang". The - longest matching string is always given first (and numbered zero). Af- - ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol- + Using the normal matching function on this data finds only "tang". The + longest matching string is always given first (and numbered zero). Af- + ter a PCRE2_ERROR_PARTIAL return, the output is "Partial match:", fol- lowed by the partially matching substring. Note that this is the entire - substring that was inspected during the partial match; it may include + substring that was inspected during the partial match; it may include characters before the actual match start if a lookbehind assertion, \b, or \B was involved. (\K is not supported for DFA matching.) @@ -1655,16 +1657,16 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION 1: tan 0: tan - The alternative matching function does not support substring capture, - so the modifiers that are concerned with captured substrings are not + The alternative matching function does not support substring capture, + so the modifiers that are concerned with captured substrings are not relevant. RESTARTING AFTER A PARTIAL MATCH - When the alternative matching function has given the PCRE2_ERROR_PAR- + When the alternative matching function has given the PCRE2_ERROR_PAR- TIAL return, indicating that the subject partially matched the pattern, - you can restart the match with additional subject data by means of the + you can restart the match with additional subject data by means of the dfa_restart modifier. 
For example: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ @@ -1673,37 +1675,37 @@ RESTARTING AFTER A PARTIAL MATCH data> n05\=dfa,dfa_restart 0: n05 - For further information about partial matching, see the pcre2partial + For further information about partial matching, see the pcre2partial documentation. CALLOUTS If the pattern contains any callout requests, pcre2test's callout func- - tion is called during matching unless callout_none is specified. This + tion is called during matching unless callout_none is specified. This works with both matching functions, and with JIT, though there are some - differences in behaviour. The output for callouts with numerical argu- + differences in behaviour. The output for callouts with numerical argu- ments and those with string arguments is slightly different. Callouts with numerical arguments By default, the callout function displays the callout number, the start - and current positions in the subject text at the callout time, and the + and current positions in the subject text at the callout time, and the next pattern item to be tested. For example: --->pqrabcdef 0 ^ ^ \d - This output indicates that callout number 0 occurred for a match at- - tempt starting at the fourth character of the subject string, when the - pointer was at the seventh character, and when the next pattern item - was \d. Just one circumflex is output if the start and current posi- + This output indicates that callout number 0 occurred for a match at- + tempt starting at the fourth character of the subject string, when the + pointer was at the seventh character, and when the next pattern item + was \d. Just one circumflex is output if the start and current posi- tions are the same, or if the current position precedes the start posi- tion, which can happen if the callout is in a lookbehind assertion. Callouts numbered 255 are assumed to be automatic callouts, inserted as a result of the auto_callout pattern modifier. 
In this case, instead of - showing the callout number, the offset in the pattern, preceded by a + showing the callout number, the offset in the pattern, preceded by a plus, is output. For example: re> /\d?[A-E]\*/auto_callout @@ -1730,17 +1732,17 @@ CALLOUTS +12 ^ ^ 0: abc - The mark changes between matching "a" and "b", but stays the same for - the rest of the match, so nothing more is output. If, as a result of - backtracking, the mark reverts to being unset, the text "<unset>" is + The mark changes between matching "a" and "b", but stays the same for + the rest of the match, so nothing more is output. If, as a result of + backtracking, the mark reverts to being unset, the text "<unset>" is output. Callouts with string arguments The output for a callout with a string argument is similar, except that - instead of outputting a callout number before the position indicators, - the callout string and its offset in the pattern string are output be- - fore the reflection of the subject string, and the subject string is + instead of outputting a callout number before the position indicators, + the callout string and its offset in the pattern string are output be- + fore the reflection of the subject string, and the subject string is reflected for each callout. For example: re> /^ab(?C'first')cd(?C"second")ef/ @@ -1756,26 +1758,26 @@ CALLOUTS Callout modifiers - The callout function in pcre2test returns zero (carry on matching) by - default, but you can use a callout_fail modifier in a subject line to + The callout function in pcre2test returns zero (carry on matching) by + default, but you can use a callout_fail modifier in a subject line to change this and other parameters of the callout (see below). If the callout_capture modifier is set, the current captured groups are output when a callout occurs.
This is useful only for non-DFA matching, - as pcre2_dfa_match() does not support capturing, so no captures are + as pcre2_dfa_match() does not support capturing, so no captures are ever shown. The normal callout output, showing the callout number or pattern offset - (as described above) is suppressed if the callout_no_where modifier is + (as described above) is suppressed if the callout_no_where modifier is set. - When using the interpretive matching function pcre2_match() without - JIT, setting the callout_extra modifier causes additional output from - pcre2test's callout function to be generated. For the first callout in - a match attempt at a new starting position in the subject, "New match - attempt" is output. If there has been a backtrack since the last call- + When using the interpretive matching function pcre2_match() without + JIT, setting the callout_extra modifier causes additional output from + pcre2test's callout function to be generated. For the first callout in + a match attempt at a new starting position in the subject, "New match + attempt" is output. If there has been a backtrack since the last call- out (or start of matching if this is the first callout), "Backtrack" is - output, followed by "No other matching paths" if the backtrack ended + output, followed by "No other matching paths" if the backtrack ended the previous match attempt. For example: re> /(a+)b/auto_callout,no_start_optimize,no_auto_possess @@ -1812,86 +1814,86 @@ CALLOUTS +1 ^ a+ No match - Notice that various optimizations must be turned off if you want all - possible matching paths to be scanned. If no_start_optimize is not - used, there is an immediate "no match", without any callouts, because - the starting optimization fails to find "b" in the subject, which it - knows must be present for any match. 
If no_auto_possess is not used, - the "a+" item is turned into "a++", which reduces the number of back- + Notice that various optimizations must be turned off if you want all + possible matching paths to be scanned. If no_start_optimize is not + used, there is an immediate "no match", without any callouts, because + the starting optimization fails to find "b" in the subject, which it + knows must be present for any match. If no_auto_possess is not used, + the "a+" item is turned into "a++", which reduces the number of back- tracks. - The callout_extra modifier has no effect if used with the DFA matching + The callout_extra modifier has no effect if used with the DFA matching function, or with JIT. Return values from callouts - The default return from the callout function is zero, which allows + The default return from the callout function is zero, which allows matching to continue. The callout_fail modifier can be given one or two numbers. If there is only one number, 1 is returned instead of 0 (caus- ing matching to backtrack) when a callout of that number is reached. If - two numbers (<n>:<m>) are given, 1 is returned when callout <n> is - reached and there have been at least <m> callouts. The callout_error + two numbers (<n>:<m>) are given, 1 is returned when callout <n> is + reached and there have been at least <m> callouts. The callout_error modifier is similar, except that PCRE2_ERROR_CALLOUT is returned, caus- - ing the entire matching process to be aborted. If both these modifiers - are set for the same callout number, callout_error takes precedence. - Note that callouts with string arguments are always given the number + ing the entire matching process to be aborted. If both these modifiers + are set for the same callout number, callout_error takes precedence. + Note that callouts with string arguments are always given the number zero. - The callout_data modifier can be given an unsigned or a negative num- - ber.
This is set as the "user data" that is passed to the matching - function, and passed back when the callout function is invoked. Any - value other than zero is used as a return from pcre2test's callout + The callout_data modifier can be given an unsigned or a negative num- + ber. This is set as the "user data" that is passed to the matching + function, and passed back when the callout function is invoked. Any + value other than zero is used as a return from pcre2test's callout function. Inserting callouts can be helpful when using pcre2test to check compli- - cated regular expressions. For further information about callouts, see + cated regular expressions. For further information about callouts, see the pcre2callout documentation. NON-PRINTING CHARACTERS When pcre2test is outputting text in the compiled version of a pattern, - bytes other than 32-126 are always treated as non-printing characters + bytes other than 32-126 are always treated as non-printing characters and are therefore shown as hex escapes. - When pcre2test is outputting text that is a matched part of a subject - string, it behaves in the same way, unless a different locale has been - set for the pattern (using the locale modifier). In this case, the is- + When pcre2test is outputting text that is a matched part of a subject + string, it behaves in the same way, unless a different locale has been + set for the pattern (using the locale modifier). In this case, the is- print() function is used to distinguish printing and non-printing char- acters. SAVING AND RESTORING COMPILED PATTERNS - It is possible to save compiled patterns on disc or elsewhere, and + It is possible to save compiled patterns on disc or elsewhere, and reload them later, subject to a number of restrictions. JIT data cannot - be saved. The host on which the patterns are reloaded must be running + be saved. 
The host on which the patterns are reloaded must be running the same version of PCRE2, with the same code unit width, and must also - have the same endianness, pointer width and PCRE2_SIZE type. Before - compiled patterns can be saved they must be serialized, that is, con- - verted to a stream of bytes. A single byte stream may contain any num- - ber of compiled patterns, but they must all use the same character ta- - bles. A single copy of the tables is included in the byte stream (its + have the same endianness, pointer width and PCRE2_SIZE type. Before + compiled patterns can be saved they must be serialized, that is, con- + verted to a stream of bytes. A single byte stream may contain any num- + ber of compiled patterns, but they must all use the same character ta- + bles. A single copy of the tables is included in the byte stream (its size is 1088 bytes). - The functions whose names begin with pcre2_serialize_ are used for se- - rializing and de-serializing. They are described in the pcre2serialize - documentation. In this section we describe the features of pcre2test + The functions whose names begin with pcre2_serialize_ are used for se- + rializing and de-serializing. They are described in the pcre2serialize + documentation. In this section we describe the features of pcre2test that can be used to test these functions. - Note that "serialization" in PCRE2 does not convert compiled patterns - to an abstract format like Java or .NET. It just makes a reloadable + Note that "serialization" in PCRE2 does not convert compiled patterns + to an abstract format like Java or .NET. It just makes a reloadable byte code stream. Hence the restrictions on reloading mentioned above. 
- In pcre2test, when a pattern with push modifier is successfully com- - piled, it is pushed onto a stack of compiled patterns, and pcre2test - expects the next line to contain a new pattern (or command) instead of + In pcre2test, when a pattern with push modifier is successfully com- + piled, it is pushed onto a stack of compiled patterns, and pcre2test + expects the next line to contain a new pattern (or command) instead of a subject line. By contrast, the pushcopy modifier causes a copy of the - compiled pattern to be stacked, leaving the original available for im- - mediate matching. By using push and/or pushcopy, a number of patterns - can be compiled and retained. These modifiers are incompatible with + compiled pattern to be stacked, leaving the original available for im- + mediate matching. By using push and/or pushcopy, a number of patterns + can be compiled and retained. These modifiers are incompatible with posix, and control modifiers that act at match time are ignored (with a - message) for the stacked patterns. The jitverify modifier applies only + message) for the stacked patterns. The jitverify modifier applies only at compile time. The command @@ -1899,21 +1901,21 @@ SAVING AND RESTORING COMPILED PATTERNS #save causes all the stacked patterns to be serialized and the result written - to the named file. Afterwards, all the stacked patterns are freed. The + to the named file. Afterwards, all the stacked patterns are freed. The command #load - reads the data in the file, and then arranges for it to be de-serial- - ized, with the resulting compiled patterns added to the pattern stack. - The pattern on the top of the stack can be retrieved by the #pop com- - mand, which must be followed by lines of subjects that are to be - matched with the pattern, terminated as usual by an empty line or end - of file. This command may be followed by a modifier list containing - only control modifiers that act after a pattern has been compiled. 
In - particular, hex, posix, posix_nosub, push, and pushcopy are not al- - lowed, nor are any option-setting modifiers. The JIT modifiers are, - however permitted. Here is an example that saves and reloads two pat- + reads the data in the file, and then arranges for it to be de-serial- + ized, with the resulting compiled patterns added to the pattern stack. + The pattern on the top of the stack can be retrieved by the #pop com- + mand, which must be followed by lines of subjects that are to be + matched with the pattern, terminated as usual by an empty line or end + of file. This command may be followed by a modifier list containing + only control modifiers that act after a pattern has been compiled. In + particular, hex, posix, posix_nosub, push, and pushcopy are not al- + lowed, nor are any option-setting modifiers. The JIT modifiers are, + however, permitted. Here is an example that saves and reloads two pat- terns. /abc/push @@ -1926,10 +1928,10 @@ SAVING AND RESTORING COMPILED PATTERNS #pop jit,bincode abc - If jitverify is used with #pop, it does not automatically imply jit, + If jitverify is used with #pop, it does not automatically imply jit, which is different behaviour from when it is used on a pattern. - The #popcopy command is analagous to the pushcopy modifier in that it + The #popcopy command is analogous to the pushcopy modifier in that it makes current a copy of the topmost stack pattern, leaving the original still on the stack. @@ -1949,5 +1951,5 @@ AUTHOR REVISION - Last updated: 12 January 2022 + Last updated: 27 July 2022 Copyright (c) 1997-2022 University of Cambridge. diff --git a/src/pcre2grep.c b/src/pcre2grep.c index 9ff070e..2443428 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -205,9 +205,6 @@ point. */ * Global variables * *************************************************/ -/* Jeffrey Friedl has some debugging requirements that are not part of the -regular code.
*/ - static const char *colour_string = "1;31"; static const char *colour_option = NULL; static const char *dee_option = NULL; @@ -220,6 +217,10 @@ static const char *output_text = NULL; static char *main_buffer = NULL; +static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */ +static int printname_colon = ':'; /* Changed to 0 for -Z */ +static int printname_hyphen = '-'; /* Changed to 0 for -Z */ + static int after_context = 0; static int before_context = 0; static int binary_files = BIN_BINARY; @@ -483,6 +484,7 @@ static option_item optionlist[] = { { OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" }, { OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" }, { OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" }, + { OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" }, { OP_NODATA, 0, NULL, NULL, NULL } }; @@ -1773,7 +1775,7 @@ if (after_context > 0 && lastmatchnumber > 0) { char *pp = end_of_line(lastmatchrestart, endptr, &ellength); if (ellength == 0 && pp == main_buffer + bufsize) break; - if (printname != NULL) fprintf(stdout, "%s-", printname); + if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen); if (number) fprintf(stdout, "%lu-", lastmatchnumber++); FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); lastmatchrestart = pp; @@ -2439,10 +2441,10 @@ if (pid == 0) } else if (pid > 0) { - (void)fflush(stdout); + (void)fflush(stdout); (void)waitpid(pid, &result, 0); - (void)fflush(stdout); - } + (void)fflush(stdout); + } #endif /* End Windows/VMS/other handling */ free(args); @@ -2730,7 +2732,9 @@ while (ptr < endptr) else if (filenames == FN_MATCH_ONLY) { - fprintf(stdout, "%s" STDOUT_NL, printname); + fprintf(stdout, "%s", printname); + if (printname_nl == NULL) fprintf(stdout, "%c", 0); + else fprintf(stdout, "%s", printname_nl); return 0; } @@ -2749,7 +2753,8 @@ while (ptr < endptr) { PCRE2_SIZE 
oldstartoffset; - if (printname != NULL) fprintf(stdout, "%s:", printname); + if (printname != NULL) fprintf(stdout, "%s%c", printname, + printname_colon); if (number) fprintf(stdout, "%lu:", linenumber); /* Handle --line-offsets */ @@ -2871,7 +2876,8 @@ while (ptr < endptr) while (lastmatchrestart < p) { char *pp = lastmatchrestart; - if (printname != NULL) fprintf(stdout, "%s-", printname); + if (printname != NULL) fprintf(stdout, "%s%c", printname, + printname_hyphen); if (number) fprintf(stdout, "%lu-", lastmatchnumber++); pp = end_of_line(pp, endptr, &ellength); FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout); @@ -2912,7 +2918,8 @@ while (ptr < endptr) { int ellength; char *pp = p; - if (printname != NULL) fprintf(stdout, "%s-", printname); + if (printname != NULL) fprintf(stdout, "%s%c", printname, + printname_hyphen); if (number) fprintf(stdout, "%lu-", linenumber - linecount--); pp = end_of_line(pp, endptr, &ellength); FWRITE_IGNORE(p, 1, pp - p, stdout); @@ -2926,7 +2933,8 @@ while (ptr < endptr) if (after_context > 0 || before_context > 0) endhyphenpending = TRUE; - if (printname != NULL) fprintf(stdout, "%s:", printname); + if (printname != NULL) fprintf(stdout, "%s%c", printname, + printname_colon); if (number) fprintf(stdout, "%lu:", linenumber); /* In multiline mode, or if colouring, we have to split the line(s) up @@ -3131,7 +3139,9 @@ were none. If we found a match, we won't have got this far. 
*/ if (filenames == FN_NOMATCH_ONLY) { - fprintf(stdout, "%s" STDOUT_NL, printname); + fprintf(stdout, "%s", printname); + if (printname_nl == NULL) fprintf(stdout, "%c", 0); + else fprintf(stdout, "%s", printname_nl); return 0; } @@ -3142,7 +3152,7 @@ if (count_only && !quiet) if (count > 0 || !omit_zero_count) { if (printname != NULL && filenames != FN_NONE) - fprintf(stdout, "%s:", printname); + fprintf(stdout, "%s%c", printname, printname_colon); fprintf(stdout, "%lu" STDOUT_NL, count); counts_printed++; } @@ -3528,8 +3538,6 @@ switch(letter) case 'u': options |= PCRE2_UTF; utf = TRUE; break; case 'U': options |= PCRE2_UTF|PCRE2_MATCH_INVALID_UTF; utf = TRUE; break; case 'v': invert = TRUE; break; - case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break; - case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break; case 'V': { @@ -3540,6 +3548,10 @@ switch(letter) pcre2grep_exit(0); break; + case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break; + case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break; + case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break; + default: fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter); pcre2grep_exit(usage(2)); @@ -4259,8 +4271,6 @@ if (DEE_option != NULL) (void)pcre2_set_compile_extra_options(compile_context, extra_options); -/* Check the values for Jeffrey Friedl's debugging options. */ - /* If use_jit is set, check whether JIT is available. If not, do not try to use JIT. */ diff --git a/testdata/grepoutput b/testdata/grepoutput index 66af4cf..41a90b0 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -991,3 +991,22 @@ RC=0 ---------------------------- Test 134 ----------------------------- =AB3CD5= RC=0 +---------------------------- Test 135 ----------------------------- +./testdata/grepinputv@The word is cat in this line +RC=0 +./testdata/grepinputv@./testdata/grepinputv@RC=0 +./testdata/grepinputv@This line contains \E and (regex) *meta* [characters]. 
+./testdata/grepinputv@The word is cat in this line +./testdata/grepinputv@The caterpillar sat on the mat +RC=0 +testdata/grepinputM3:start end in between start +end and following +testdata/grepinputM7:start end in between start +end and following start +end other stuff +testdata/grepinputM11:start end in between start + +end +testdata/grepinputM16:start end in between start +end +RC=0