Add explicit heap limiting options to pcre2_match(), with associated features

for listing, configuring, etc.
This commit is contained in:
Philip.Hazel 2017-04-11 11:47:25 +00:00
parent f0126dc7ae
commit 14989bd454
47 changed files with 2322 additions and 1778 deletions

View File

@ -78,6 +78,7 @@
# fix by David Gaussmann
# 2016-10-07 PH added PCREGREP_MAX_BUFSIZE
# 2017-03-11 PH turned HEAP_MATCH_RECURSE into a NO-OP for 10.30
# 2017-04-08 PH added HEAP_LIMIT
PROJECT(PCRE2 C)
@ -143,6 +144,9 @@ SET(PCRE2_LINK_SIZE "2" CACHE STRING
SET(PCRE2_PARENS_NEST_LIMIT "250" CACHE STRING
"Default nested parentheses limit. See PARENS_NEST_LIMIT in config.h.in for details.")
SET(PCRE2_HEAP_LIMIT "20000000" CACHE STRING
"Default limit on heap memory (kilobytes). See HEAP_LIMIT in config.h.in for details.")
SET(PCRE2_MATCH_LIMIT "10000000" CACHE STRING
"Default limit on internal looping. See MATCH_LIMIT in config.h.in for details.")
@ -765,6 +769,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Rebuild char tables ............. : ${PCRE2_REBUILD_CHARTABLES}")
MESSAGE(STATUS " Internal link size .............. : ${PCRE2_LINK_SIZE}")
MESSAGE(STATUS " Parentheses nest limit .......... : ${PCRE2_PARENS_NEST_LIMIT}")
MESSAGE(STATUS " Heap limit ...................... : ${PCRE2_HEAP_LIMIT}")
MESSAGE(STATUS " Match limit ..................... : ${PCRE2_MATCH_LIMIT}")
MESSAGE(STATUS " Match depth limit ............... : ${PCRE2_MATCH_LIMIT_DEPTH}")
MESSAGE(STATUS " Build shared libs ............... : ${BUILD_SHARED_LIBS}")

View File

@ -121,6 +121,11 @@ single-branch conditions with a false condition (e.g. DEFINE) at the start of a
branch. For example, /(?(DEFINE)...)^A/ and /(...){0}^B/ are now flagged as
anchored.
22. Added an explicit limit on the amount of heap used by pcre2_match(), set by
pcre2_set_heap_limit() or (*LIMIT_HEAP=xxx). Upgraded pcre2test to show the
heap limit along with other pattern information, and to find the minimum when
the find_limits modifier is set.
Version 10.23 14-February-2017
------------------------------

View File

@ -69,6 +69,7 @@ dist_html_DATA = \
doc/html/pcre2_set_character_tables.html \
doc/html/pcre2_set_compile_recursion_guard.html \
doc/html/pcre2_set_depth_limit.html \
doc/html/pcre2_set_heap_limit.html \
doc/html/pcre2_set_match_limit.html \
doc/html/pcre2_set_max_pattern_length.html \
doc/html/pcre2_set_offset_limit.html \
@ -152,6 +153,7 @@ dist_man_MANS = \
doc/pcre2_set_character_tables.3 \
doc/pcre2_set_compile_recursion_guard.3 \
doc/pcre2_set_depth_limit.3 \
doc/pcre2_set_heap_limit.3 \
doc/pcre2_set_match_limit.3 \
doc/pcre2_set_max_pattern_length.3 \
doc/pcre2_set_offset_limit.3 \

23
README
View File

@ -223,10 +223,10 @@ library. They are also documented in the pcre2build man page.
--with-parens-nest-limit=500
. PCRE2 has a counter that can be set to limit the amount of resources it uses
when matching a pattern. If the limit is exceeded during a match, the match
fails. The default is ten million. You can change the default by setting, for
example,
. PCRE2 has a counter that can be set to limit the amount of computing resource
it uses when matching a pattern with the Perl-compatible matching function.
If the limit is exceeded during a match, the match fails. The default is ten
million. You can change the default by setting, for example,
--with-match-limit=500000
@ -235,14 +235,23 @@ library. They are also documented in the pcre2build man page.
pcre2api man page (search for pcre2_set_match_limit).
. There is a separate counter that limits the depth of nested backtracking
during a matching process, which in turn limits the amount of memory that is
used. This also has a default of ten million, which is essentially
during a matching process, which indirectly limits the amount of heap memory
that is used. This also has a default of ten million, which is essentially
"unlimited". You can change the default by setting, for example,
--with-match-limit-depth=5000
There is more discussion in the pcre2api man page (search for
pcre2_set_depth_limit).
. You can also set an explicit limit on the amount of heap memory used by
the pcre2_match() interpreter:
--with-heap-limit=500
The units are kilobytes. This limit does not apply when the JIT optimization
(which has its own memory control features) is used. There is more discussion
on the pcre2api man page (search for pcre2_set_heap_limit).
. In the 8-bit library, the default maximum compiled pattern size is around
64K bytes. You can increase this by adding --with-link-size=3 to the
@ -865,4 +874,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 17 March 2017
Last updated: 11 April 2017

View File

@ -489,7 +489,7 @@ for bmode in "$test8" "$test16" "$test32"; do
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
if [ $? = 0 ] ; then
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -63,-62,-2,-1,0,100,188,189,190,191 >>testtry
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -64,-62,-2,-1,0,100,188,189,190,191 >>testtry
checkresult $? 2 "$opt"
fi
done

View File

@ -36,6 +36,7 @@
#cmakedefine NEVER_BACKSLASH_C 1
#define LINK_SIZE @PCRE2_LINK_SIZE@
#define HEAP_LIMIT @PCRE2_HEAP_LIMIT@
#define MATCH_LIMIT @PCRE2_MATCH_LIMIT@
#define MATCH_LIMIT_DEPTH @PCRE2_MATCH_LIMIT_DEPTH@
#define NEWLINE_DEFAULT @NEWLINE_DEFAULT@

View File

@ -263,6 +263,12 @@ AC_ARG_WITH(parens-nest-limit,
[nested parentheses limit (default=250)]),
, with_parens_nest_limit=250)
# Handle --with-heap-limit
AC_ARG_WITH(heap-limit,
AS_HELP_STRING([--with-heap-limit=N],
[default limit on heap memory (kilobytes, default=20000000)]),
, with_heap_limit=20000000)
# Handle --with-match-limit=N
AC_ARG_WITH(match-limit,
AS_HELP_STRING([--with-match-limit=N],
@ -285,7 +291,7 @@ AC_ARG_WITH(match-limit-depth,
AC_ARG_WITH(match-limit-recursion,,
, with_match_limit_recursion=UNSET)
# Handle --enable-valgrind
AC_ARG_ENABLE(valgrind,
AS_HELP_STRING([--enable-valgrind],
@ -680,12 +686,12 @@ AC_DEFINE_UNQUOTED([PARENS_NEST_LIMIT], [$with_parens_nest_limit], [
stack that is used while compiling a pattern.])
AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [
The value of MATCH_LIMIT determines the default number of times the internal
match() function can record a backtrack position during a single matching
attempt. There is a runtime interface for setting a different limit. The
limit exists in order to catch runaway regular expressions that take for ever
to determine that they do not match. The default is set very large so that it
does not accidentally catch legitimate cases.])
The value of MATCH_LIMIT determines the default number of times the
pcre2_match() function can record a backtrack position during a single
matching attempt. There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take for
ever to determine that they do not match. The default is set very large so
that it does not accidentally catch legitimate cases.])
# --with-match-limit-recursion is an obsolete synonym for --with-match-limit-depth
@ -694,7 +700,7 @@ cat <<EOF
WARNING: --with-match-limit-recursion is an obsolete option. Please use
--with-match-limit-depth in future. If both are set, --with-match-limit-depth
will be used.
will be used. See also --with-heap-limit.
EOF
if test "$with_match_limit_depth" = "MATCH_LIMIT"; then
@ -711,6 +717,10 @@ AC_DEFINE_UNQUOTED([MATCH_LIMIT_DEPTH], [$with_match_limit_depth], [
be less than the value of MATCH_LIMIT. The default is to use the same value
as MATCH_LIMIT. There is a runtime method for setting a different limit.])
AC_DEFINE_UNQUOTED([HEAP_LIMIT], [$with_heap_limit], [
This limits the amount of memory that pcre2_match() may use while matching
a pattern. The value is in kilobytes.])
AC_DEFINE([MAX_NAME_SIZE], [32], [
This limit is parameterized just in case anybody ever wants to
change it. Care must be taken if it is increased, because it guards
@ -971,6 +981,7 @@ $PACKAGE-$VERSION configuration summary:
Rebuild char tables ................ : ${enable_rebuild_chartables}
Internal link size ................. : ${with_link_size}
Nested parentheses limit ........... : ${with_parens_nest_limit}
Heap limit ......................... : ${with_heap_limit} kilobytes
Match limit ........................ : ${with_match_limit}
Match depth limit .................. : ${with_match_limit_depth}
Build shared libs .................. : ${enable_shared}

View File

@ -223,10 +223,10 @@ library. They are also documented in the pcre2build man page.
--with-parens-nest-limit=500
. PCRE2 has a counter that can be set to limit the amount of resources it uses
when matching a pattern. If the limit is exceeded during a match, the match
fails. The default is ten million. You can change the default by setting, for
example,
. PCRE2 has a counter that can be set to limit the amount of computing resource
it uses when matching a pattern with the Perl-compatible matching function.
If the limit is exceeded during a match, the match fails. The default is ten
million. You can change the default by setting, for example,
--with-match-limit=500000
@ -235,14 +235,23 @@ library. They are also documented in the pcre2build man page.
pcre2api man page (search for pcre2_set_match_limit).
. There is a separate counter that limits the depth of nested backtracking
during a matching process, which in turn limits the amount of memory that is
used. This also has a default of ten million, which is essentially
during a matching process, which indirectly limits the amount of heap memory
that is used. This also has a default of ten million, which is essentially
"unlimited". You can change the default by setting, for example,
--with-match-limit-depth=5000
There is more discussion in the pcre2api man page (search for
pcre2_set_depth_limit).
. You can also set an explicit limit on the amount of heap memory used by
the pcre2_match() interpreter:
--with-heap-limit=500
The units are kilobytes. This limit does not apply when the JIT optimization
(which has its own memory control features) is used. There is more discussion
on the pcre2api man page (search for pcre2_set_heap_limit).
. In the 8-bit library, the default maximum compiled pattern size is around
64K bytes. You can increase this by adding --with-link-size=3 to the
@ -865,4 +874,4 @@ The distribution should contain the files listed below.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
Last updated: 17 March 2017
Last updated: 11 April 2017

View File

@ -213,6 +213,9 @@ in the library.
<tr><td><a href="pcre2_set_depth_limit.html">pcre2_set_depth_limit</a></td>
<td>&nbsp;&nbsp;Set the match backtracking depth limit</td></tr>
<tr><td><a href="pcre2_set_heap_limit.html">pcre2_set_heap_limit</a></td>
<td>&nbsp;&nbsp;Set the match backtracking heap limit</td></tr>
<tr><td><a href="pcre2_set_match_limit.html">pcre2_set_match_limit</a></td>
<td>&nbsp;&nbsp;Set the match limit</td></tr>

View File

@ -45,6 +45,7 @@ point to a uint32_t integer variable. The available codes are:
PCRE2_CONFIG_BSR Indicates what \R matches by default:
PCRE2_BSR_UNICODE
PCRE2_BSR_ANYCRLF
PCRE2_CONFIG_HEAPLIMIT Default heap memory limit
PCRE2_CONFIG_DEPTHLIMIT Default backtracking depth limit
PCRE2_CONFIG_JIT Availability of just-in-time compiler support (1=yes 0=no)
PCRE2_CONFIG_JITTARGET Information (a string) about the target architecture for the JIT compiler

View File

@ -44,6 +44,7 @@ A match context is needed only if you want to:
<pre>
Set up a callout function
Set a matching offset limit
Change the heap memory limit
Change the backtracking match limit
Change the backtracking depth limit
Set custom memory management specifically for the match

View File

@ -51,6 +51,7 @@ request are as follows:
PCRE2_INFO_FRAMESIZE Size of backtracking frame
PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \C
PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist in the pattern
PCRE2_INFO_HEAPLIMIT Heap memory limit if set, otherwise PCRE2_ERROR_UNSET
PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0
PCRE2_INFO_LASTCODETYPE Type of must-be-present information

View File

@ -182,6 +182,10 @@ document for an overview of all the PCRE2 documentation.
<b> PCRE2_SIZE <i>value</i>);</b>
<br>
<br>
<b>int pcre2_set_heap_limit(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
@ -793,6 +797,7 @@ A match context is required if you want to:
<pre>
Set up a callout function
Set an offset limit for matching an unanchored pattern
Change the limit on the amount of heap used when matching
Change the backtracking match limit
Change the backtracking depth limit
Set custom memory management specifically for the match
@ -851,14 +856,47 @@ subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
start within the first line of the subject. If this is set with an offset
limit, a match must occur in the first line and also within the offset limit.
In other words, whichever limit comes first is used.
<b>int pcre2_set_heap_limit(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
The <i>heap_limit</i> parameter specifies, in units of kilobytes, the maximum
amount of heap memory that <b>pcre2_match()</b> may use to hold backtracking
information when running an interpretive match. This limit does not apply to
matching with the JIT optimization, which has its own memory control
arrangements (see the
<a href="pcre2jit.html"><b>pcre2jit</b></a>
documentation for more details), nor does it apply to <b>pcre2_dfa_match()</b>.
If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is
returned. The default limit is set when PCRE2 is built; the default default is
very large and is essentially "unlimited".
</P>
<P>
A value for the heap limit may also be supplied by an item at the start of a
pattern of the form
<pre>
(*LIMIT_HEAP=ddd)
</pre>
where ddd is a decimal number. However, such a setting is ignored unless ddd is
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
limit is set, less than the default.
</P>
<P>
The <b>pcre2_match()</b> function starts out using a 20K vector on the system
stack for recording backtracking points. The more nested backtracking points
there are (that is, the deeper the search tree), the more memory is needed.
Heap memory is used only if the initial vector is too small. If the heap limit
is set to a value less than 21 (in particular, zero) no heap memory will be
used. In this case, only patterns that do not have a lot of nested backtracking
can be successfully processed.
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
The <i>match_limit</i> parameter provides a means of preventing PCRE2 from using
up too many resources when processing patterns that are not going to match, but
which have a very large number of possibilities in their search trees. The
classic example is a pattern that uses nested unlimited repeats.
up too many computing resources when processing patterns that are not going to
match, but which have a very large number of possibilities in their search
trees. The classic example is a pattern that uses nested unlimited repeats.
</P>
<P>
There is an internal counter in <b>pcre2_match()</b> that is incremented each
@ -895,16 +933,20 @@ limit is set, less than the default.
This parameter limits the depth of nested backtracking in <b>pcre2_match()</b>.
Each time a nested backtracking point is passed, a new memory "frame" is used
to remember the state of matching at that point. Thus, this parameter
indirectly limits the amount of memory that is used in a match.
indirectly limits the amount of memory that is used in a match. However,
because the size of each memory "frame" depends on the number of capturing
parentheses, the actual memory limit varies from pattern to pattern. This limit
was more useful in versions before 10.30, where function recursion was used for
backtracking.
</P>
<P>
This limit is not relevant, and is ignored, when matching is done using JIT
compiled code. However, it is supported by <b>pcre2_dfa_match()</b>, which uses
it to limit the depth of internal recursive function calls that implement
lookaround assertions and pattern recursions. This is, therefore, an indirect
limit on the amount of system stack that is used. A recursive pattern such as
/(.)(?1)/, when matched to a very long string using <b>pcre2_dfa_match()</b>,
can use a great deal of stack.
The depth limit is not relevant, and is ignored, when matching is done using
JIT compiled code. However, it is supported by <b>pcre2_dfa_match()</b>, which
uses it to limit the depth of internal recursive function calls that implement
atomic groups, lookaround assertions, and pattern recursions. This is,
therefore, an indirect limit on the amount of system stack that is used. A
recursive pattern such as /(.)(?1)/, when matched to a very long string using
<b>pcre2_dfa_match()</b>, can use a great deal of stack.
</P>
<P>
The default value for the depth limit can be set when PCRE2 is built; the
@ -958,6 +1000,12 @@ The output is a uint32_t integer that gives the default limit for the depth of
nested backtracking in <b>pcre2_match()</b> or the depth of nested recursions
and lookarounds in <b>pcre2_dfa_match()</b>. Further details are given with
<b>pcre2_set_depth_limit()</b> above.
<pre>
PCRE2_CONFIG_HEAPLIMIT
</pre>
The output is a uint32_t integer that gives, in kilobytes, the default limit
for the amount of heap memory used by <b>pcre2_match()</b>. Further details are
given with <b>pcre2_set_heap_limit()</b> above.
<pre>
PCRE2_CONFIG_JIT
</pre>
@ -1786,6 +1834,13 @@ Return 1 if the pattern contains any explicit matches for CR or LF characters,
otherwise 0. The third argument should point to a <b>uint32_t</b> variable. An
explicit match is either a literal CR or LF character, or \r or \n or one of
the equivalent hexadecimal or octal escape sequences.
<pre>
PCRE2_INFO_HEAPLIMIT
</pre>
If the pattern set a heap memory limit by including an item of the form
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument
should point to an unsigned 32-bit integer. If no such value has been set, the
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
<pre>
PCRE2_INFO_JCHANGED
</pre>
@ -2554,7 +2609,8 @@ The backtracking match limit was reached.
</pre>
If a pattern contains many nested backtracking points, heap memory is used to
remember them. This error is given when the memory allocation function (default
or custom) fails.
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
if the amount of memory needed exceeds the heap limit.
<pre>
PCRE2_ERROR_NULL
</pre>
@ -3271,7 +3327,7 @@ Cambridge, England.
</P>
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
<P>
Last updated: 04 April 2017
Last updated: 11 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -265,17 +265,41 @@ to the <b>configure</b> command. This setting has no effect on the
(though the counting is done differently).
</P>
<P>
In some environments it is desirable to limit the depth of nested backtracking
in order to restrict the maximum amount of heap memory that is used. A second
limit controls this; it defaults to the value that is set for
--with-match-limit. You can set a lower default limit by adding, for example,
The <b>pcre2_match()</b> function starts out using a 20K vector on the system
stack to record backtracking points. The more nested backtracking points there
are (that is, the deeper the search tree), the more memory is needed. If the
initial vector is not large enough, heap memory is used, up to a certain limit,
which is specified in kilobytes. The limit can be changed at run time, as
described in the
<a href="pcre2api.html"><b>pcre2api</b></a>
documentation. The default limit (in effect unlimited) is 20 million. You can
change this by a setting such as
<pre>
--with-heap-limit=500
</pre>
which limits the amount of heap to 500 kilobytes. This limit applies only to
interpretive matching in <b>pcre2_match()</b>. It does not apply when JIT (which has
its own memory arrangements) is used, nor does it apply to
<b>pcre2_dfa_match()</b>.
</P>
<P>
You can also explicitly limit the depth of nested backtracking in the
<b>pcre2_match()</b> interpreter. This limit defaults to the value that is set
for --with-match-limit. You can set a lower default limit by adding, for
example,
<pre>
--with-match-limit-depth=10000
</pre>
to the <b>configure</b> command. This value can also be overridden at run time.
As well as applying to <b>pcre2_match()</b>, this limit also controls the depth
of recursive function calls in <b>pcre2_dfa_match()</b>. These are used for
lookaround assertions, atomic groups, and recursion within patterns.
to the <b>configure</b> command. This value can be overridden at run time. This
depth limit indirectly limits the amount of heap memory that is used, but
because the size of each backtracking "frame" depends on the number of
capturing parentheses in a pattern, the amount of heap that is used before the
limit is reached varies from pattern to pattern. This limit was more useful in
versions before 10.30, where function recursion was used for backtracking.
However, as well as applying to <b>pcre2_match()</b>, this limit also controls
the depth of recursive function calls in <b>pcre2_dfa_match()</b>. These are
used for lookaround assertions, atomic groups, and recursion within patterns.
The limit does not apply to JIT matching.
</P>
<br><a name="SEC12" href="#TOC1">CREATING CHARACTER TABLES AT BUILD TIME</a><br>
<P>
@ -530,7 +554,7 @@ Cambridge, England.
</P>
<br><a name="SEC25" href="#TOC1">REVISION</a><br>
<P>
Last updated: 31 March 2017
Last updated: 10 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -404,6 +404,10 @@ file name is followed by a colon; for context lines, a hyphen separator is used.
If a line number is also being output, it follows the file name.
</P>
<P>
<b>--heap-limit</b>=<i>number</i>
See <b>--match-limit</b> below.
</P>
<P>
<b>--help</b>
Output a help message, giving brief details of the command options and file
type support, and then exit. Anything else on the command line is
@ -505,7 +509,7 @@ used. There is no short form for this option.
<b>--match-limit</b>=<i>number</i>
Processing some regular expression patterns may take a very long time to search
for all possible matching strings. Others may require a very large amount of
memory. There are two options that set resource limits for matching.
memory. There are three options that set resource limits for matching.
<br>
<br>
The <b>--match-limit</b> option provides a means of limiting computing resource
@ -516,13 +520,24 @@ counter that is incremented each time around its main processing loop. If the
value set by <b>--match-limit</b> is reached, an error occurs.
<br>
<br>
The <b>--heap-limit</b> option specifies, as a number of kilobytes, the amount
of heap memory that may be used for matching. Heap memory is needed only if
matching the pattern requires a significant number of nested backtracking
points to be remembered. This parameter can be set to zero to forbid the use of
heap memory altogether.
<br>
<br>
The <b>--depth-limit</b> option limits the depth of nested backtracking points,
which in turn limits the amount of memory that is used. This limit is of use
only if it is set smaller than <b>--match-limit</b>.
which indirectly limits the amount of memory that is used. The amount of memory
needed for each backtracking point depends on the number of capturing
parentheses in the pattern, so the amount of memory that is used before this
limit acts varies from pattern to pattern. This limit is of use only if it is
set smaller than <b>--match-limit</b>.
<br>
<br>
There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million.
when the PCRE2 library is compiled, with the default defaults being very large
and so effectively unlimited.
</P>
<P>
<b>--max-buffer-size</b>=<i>number</i>
@ -764,11 +779,12 @@ Many of the short and long forms of <b>pcre2grep</b>'s options are the same
as in the GNU <b>grep</b> program. Any long option of the form
<b>--xxx-regexp</b> (GNU terminology) is also available as <b>--xxx-regex</b>
(PCRE2 terminology). However, the <b>--depth-limit</b>, <b>--file-list</b>,
<b>--file-offsets</b>, <b>--include-dir</b>, <b>--line-offsets</b>,
<b>--locale</b>, <b>--match-limit</b>, <b>-M</b>, <b>--multiline</b>, <b>-N</b>,
<b>--newline</b>, <b>--om-separator</b>, <b>--output</b>, <b>-u</b>, and
<b>--utf-8</b> options are specific to <b>pcre2grep</b>, as is the use of the
<b>--only-matching</b> option with a capturing parentheses number.
<b>--file-offsets</b>, <b>--heap-limit</b>, <b>--include-dir</b>,
<b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>, <b>-M</b>,
<b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
<b>--output</b>, <b>-u</b>, and <b>--utf-8</b> options are specific to
<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
capturing parentheses number.
</P>
<P>
Although most of the common options work the same way, a few are different in
@ -891,9 +907,9 @@ there are more than 20 such errors, <b>pcre2grep</b> gives up.
</P>
<P>
The <b>--match-limit</b> option of <b>pcre2grep</b> can be used to set the
overall resource limit; there is a second option called <b>--depth-limit</b>
that sets a limit on the amount of memory that is used (see the discussion of
these options above).
overall resource limit. There are also other limits that affect the amount of
memory used during matching; see the discussion of <b>--heap-limit</b> and
<b>--depth-limit</b> above.
</P>
<br><a name="SEC12" href="#TOC1">DIAGNOSTICS</a><br>
<P>
@ -918,7 +934,7 @@ Cambridge, England.
</P>
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
<P>
Last updated: 06 April 2017
Last updated: 11 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -170,14 +170,15 @@ the application to apply the JIT optimization by calling
<b>pcre2_jit_compile()</b> is ignored.
</P>
<br><b>
Setting match and backtracking depth limits
Setting match resource limits
</b><br>
<P>
The <b>pcre2_match()</b> function contains a counter that is incremented every time it
goes round its main loop. The caller of <b>pcre2_match()</b> can set a limit on
this counter, which therefore limits the amount of computing resource used for
a match. The maximum depth of nested backtracking can also be limited, and this
restricts the amount of heap memory that is used.
a match. The maximum depth of nested backtracking can also be limited; this
indirectly restricts the amount of heap memory that is used, but there is also
an explicit memory limit that can be set.
</P>
<P>
These facilities are provided to catch runaway matches that are provoked by
@ -186,6 +187,7 @@ unlimited repeats applied to a long string that does not match). When one of
these limits is reached, <b>pcre2_match()</b> gives an error return. The limits
can also be set by items at the start of the pattern of the form
<pre>
(*LIMIT_HEAP=d)
(*LIMIT_MATCH=d)
(*LIMIT_DEPTH=d)
</pre>
@ -200,11 +202,13 @@ Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is
still recognized for backwards compatibility.
</P>
<P>
The match limit is used (but in a different way) when JIT is being used, but it
is not relevant, and is ignored, when matching with <b>pcre2_dfa_match()</b>.
However, the depth limit is relevant for DFA matching, which uses function
recursion for recursions within the pattern. In this case, the depth limit
controls the amount of system stack that is used.
The heap limit applies only when the <b>pcre2_match()</b> interpreter is used
for matching. It does not apply to JIT or DFA matching. The match limit is used
(but in a different way) when JIT is being used, but it is not relevant, and is
ignored, when matching with <b>pcre2_dfa_match()</b>. The depth limit is ignored
by JIT but is relevant for DFA matching, which uses function recursion for
recursions within the pattern. In this case, the depth limit controls the
amount of system stack that is used.
<a name="newlines"></a></P>
<br><b>
Newline conventions
@ -3434,7 +3438,7 @@ Cambridge, England.
</P>
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
<P>
Last updated: 03 April 2017
Last updated: 11 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -83,11 +83,12 @@ From release 10.30, the interpretive (non-JIT) version of <b>pcre2_match()</b>
uses very little system stack at run time. In earlier releases recursive
function calls could use a great deal of stack, and this could cause problems,
but this usage has been eliminated. Backtracking positions are now explicitly
remembered in memory frames controlled by the code. An initial 10K vector of
frames is allocated on the system stack (enough for about 50 frames for small
patterns), but if this is insufficient, heap memory is used. Rewriting patterns
to be time-efficient, as described below, may also reduce the memory
requirements.
remembered in memory frames controlled by the code. An initial 20K vector of
frames is allocated on the system stack (enough for about 100 frames for small
patterns), but if this is insufficient, heap memory is used. The amount of heap
memory can be limited; if the limit is set to zero, only the initial stack
vector is used. Rewriting patterns to be time-efficient, as described below,
may also reduce the memory requirements.
</P>
<P>
In contrast to <b>pcre2_match()</b>, <b>pcre2_dfa_match()</b> does use recursive
@ -243,7 +244,7 @@ Cambridge, England.
</P>
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
<P>
Last updated: 31 March 2017
Last updated: 08 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -235,6 +235,12 @@ Behave as if each pattern line has the <b>jit</b> modifier; after successful
compilation, each pattern is passed to the just-in-time compiler, if available.
</P>
<P>
<b>-jitverify</b>
Behave as if each pattern line has the <b>jitverify</b> modifier; after
successful compilation, each pattern is passed to the just-in-time compiler, if
available, and the use of JIT is verified.
</P>
<P>
<b>-pattern</b> <i>modifier-list</i>
Behave as if each pattern line contains the given modifiers.
</P>
@ -1088,6 +1094,7 @@ pattern.
get=&#60;number or name&#62; extract captured substring
getall extract all captured substrings
/g global global matching
heap_limit=&#60;n&#62; set a limit on heap memory
jitstack=&#60;n&#62; set size of JIT stack
mark show mark values
match_limit=&#60;n&#62; set a match limit
@ -1330,11 +1337,11 @@ stack that is larger than the default 32K is necessary only for very
complicated patterns.
</P>
<br><b>
Setting match and depth limits
Setting heap, match, and depth limits
</b><br>
<P>
The <b>match_limit</b> and <b>depth_limit</b> modifiers set the appropriate
limits in the match context. These values are ignored when the
The <b>heap_limit</b>, <b>match_limit</b>, and <b>depth_limit</b> modifiers set
the appropriate limits in the match context. These values are ignored when the
<b>find_limits</b> modifier is specified.
</P>
<br><b>
@ -1343,8 +1350,8 @@ Finding minimum limits
<P>
If the <b>find_limits</b> modifier is present on a subject line, <b>pcre2test</b>
calls the relevant matching function several times, setting different values in
the match context via <b>pcre2_set_match_limit()</b> or
<b>pcre2_set_depth_limit()</b> until it finds the minimum values for each
the match context via <b>pcre2_set_heap_limit()</b>, <b>pcre2_set_match_limit()</b>,
or <b>pcre2_set_depth_limit()</b> until it finds the minimum values for each
parameter that allows the match to complete without error.
</P>
<P>
@ -1360,9 +1367,9 @@ increasing length of subject string.
</P>
<P>
For non-DFA matching, the minimum <i>depth_limit</i> number is a measure of how
much memory for recording backtracking points is needed to complete the match
attempt. In the case of DFA matching, <i>depth_limit</i> controls the depth of
recursive calls of the internal function that is used for handling pattern
much nested backtracking happens (that is, how deeply the pattern's tree is
searched). In the case of DFA matching, <i>depth_limit</i> controls the depth of
recursive calls of the internal function that is used for handling pattern
recursion, lookaround assertions, and atomic groups.
</P>
<br><b>
@ -1800,7 +1807,7 @@ Cambridge, England.
</P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P>
Last updated: 04 April 2017
Last updated: 11 April 2017
<br>
Copyright &copy; 1997-2017 University of Cambridge.
<br>

View File

@ -213,6 +213,9 @@ in the library.
<tr><td><a href="pcre2_set_depth_limit.html">pcre2_set_depth_limit</a></td>
<td>&nbsp;&nbsp;Set the match backtracking depth limit</td></tr>
<tr><td><a href="pcre2_set_heap_limit.html">pcre2_set_heap_limit</a></td>
<td>&nbsp;&nbsp;Set the match backtracking heap limit</td></tr>
<tr><td><a href="pcre2_set_match_limit.html">pcre2_set_match_limit</a></td>
<td>&nbsp;&nbsp;Set the match limit</td></tr>

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
.TH PCRE2_CONFIG 3 "24 March 2017" "PCRE2 10.30"
.TH PCRE2_CONFIG 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH SYNOPSIS
@ -31,6 +31,7 @@ point to a uint32_t integer variable. The available codes are:
PCRE2_CONFIG_BSR Indicates what \eR matches by default:
PCRE2_BSR_UNICODE
PCRE2_BSR_ANYCRLF
PCRE2_CONFIG_HEAPLIMIT Default heap memory limit
PCRE2_CONFIG_DEPTHLIMIT Default backtracking depth limit
.\" JOIN
PCRE2_CONFIG_JIT Availability of just-in-time compiler

View File

@ -1,4 +1,4 @@
.TH PCRE2_MATCH 3 "04 April 2017" "PCRE2 10.30"
.TH PCRE2_MATCH 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH SYNOPSIS
@ -32,6 +32,7 @@ A match context is needed only if you want to:
.sp
Set up a callout function
Set a matching offset limit
Change the heap memory limit
Change the backtracking match limit
Change the backtracking depth limit
Set custom memory management specifically for the match

View File

@ -1,4 +1,4 @@
.TH PCRE2_PATTERN_INFO 3 "25 March 2017" "PCRE2 10.30"
.TH PCRE2_PATTERN_INFO 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH SYNOPSIS
@ -43,6 +43,9 @@ request are as follows:
.\" JOIN
PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches
exist in the pattern
.\" JOIN
PCRE2_INFO_HEAPLIMIT Heap memory limit if set,
otherwise PCRE2_ERROR_UNSET
PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0
PCRE2_INFO_LASTCODETYPE Type of must-be-present information

View File

@ -0,0 +1,28 @@
.TH PCRE2_SET_HEAP_LIMIT 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH SYNOPSIS
.rs
.sp
.B #include <pcre2.h>
.PP
.nf
.B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP,
.B " uint32_t \fIvalue\fP);"
.fi
.
.SH DESCRIPTION
.rs
.sp
This function sets the backtracking heap limit field in a match context. The
result is always zero.
.P
There is a complete description of the PCRE2 native API in the
.\" HREF
\fBpcre2api\fP
.\"
page and a description of the POSIX API in the
.\" HREF
\fBpcre2posix\fP
.\"
page.

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "04 April 2017" "PCRE2 10.30"
.TH PCRE2API 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@ -123,6 +123,9 @@ document for an overview of all the PCRE2 documentation.
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
.B " PCRE2_SIZE \fIvalue\fP);"
.sp
.B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP,
.B " uint32_t \fIvalue\fP);"
.sp
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
.B " uint32_t \fIvalue\fP);"
.sp
@ -753,6 +756,7 @@ A match context is required if you want to:
.sp
Set up a callout function
Set an offset limit for matching an unanchored pattern
Change the limit on the amount of heap used when matching
Change the backtracking match limit
Change the backtracking depth limit
Set custom memory management specifically for the match
@ -816,14 +820,49 @@ limit, a match must occur in the first line and also within the offset limit.
In other words, whichever limit comes first is used.
.sp
.nf
.B int pcre2_set_heap_limit(pcre2_match_context *\fImcontext\fP,
.B " uint32_t \fIvalue\fP);"
.fi
.sp
The \fIheap_limit\fP parameter specifies, in units of kilobytes, the maximum
amount of heap memory that \fBpcre2_match()\fP may use to hold backtracking
information when running an interpretive match. This limit does not apply to
matching with the JIT optimization, which has its own memory control
arrangements (see the
.\" HREF
\fBpcre2jit\fP
.\"
documentation for more details), nor does it apply to \fBpcre2_dfa_match()\fP.
If the limit is reached, the negative error code PCRE2_ERROR_HEAPLIMIT is
returned. The default limit is set when PCRE2 is built; the default default is
very large and is essentially "unlimited".
.P
A value for the heap limit may also be supplied by an item at the start of a
pattern of the form
.sp
(*LIMIT_HEAP=ddd)
.sp
where ddd is a decimal number. However, such a setting is ignored unless ddd is
less than the limit set by the caller of \fBpcre2_match()\fP or, if no such
limit is set, less than the default.
.P
The \fBpcre2_match()\fP function starts out using a 20K vector on the system
stack for recording backtracking points. The more nested backtracking points
there are (that is, the deeper the search tree), the more memory is needed.
Heap memory is used only if the initial vector is too small. If the heap limit
is set to a value less than 21 (in particular, zero) no heap memory will be
used. In this case, only patterns that do not have a lot of nested backtracking
can be successfully processed.
.sp
.nf
.B int pcre2_set_match_limit(pcre2_match_context *\fImcontext\fP,
.B " uint32_t \fIvalue\fP);"
.fi
.sp
The \fImatch_limit\fP parameter provides a means of preventing PCRE2 from using
up too many resources when processing patterns that are not going to match, but
which have a very large number of possibilities in their search trees. The
classic example is a pattern that uses nested unlimited repeats.
up too many computing resources when processing patterns that are not going to
match, but which have a very large number of possibilities in their search
trees. The classic example is a pattern that uses nested unlimited repeats.
.P
There is an internal counter in \fBpcre2_match()\fP that is incremented each
time round its main matching loop. If this value reaches the match limit,
@ -859,15 +898,19 @@ limit is set, less than the default.
This parameter limits the depth of nested backtracking in \fBpcre2_match()\fP.
Each time a nested backtracking point is passed, a new memory "frame" is used
to remember the state of matching at that point. Thus, this parameter
indirectly limits the amount of memory that is used in a match.
indirectly limits the amount of memory that is used in a match. However,
because the size of each memory "frame" depends on the number of capturing
parentheses, the actual memory limit varies from pattern to pattern. This limit
was more useful in versions before 10.30, where function recursion was used for
backtracking.
.P
This limit is not relevant, and is ignored, when matching is done using JIT
compiled code. However, it is supported by \fBpcre2_dfa_match()\fP, which uses
it to limit the depth of internal recursive function calls that implement
lookaround assertions and pattern recursions. This is, therefore, an indirect
limit on the amount of system stack that is used. A recursive pattern such as
/(.)(?1)/, when matched to a very long string using \fBpcre2_dfa_match()\fP,
can use a great deal of stack.
The depth limit is not relevant, and is ignored, when matching is done using
JIT compiled code. However, it is supported by \fBpcre2_dfa_match()\fP, which
uses it to limit the depth of internal recursive function calls that implement
atomic groups, lookaround assertions, and pattern recursions. This is,
therefore, an indirect limit on the amount of system stack that is used. A
recursive pattern such as /(.)(?1)/, when matched to a very long string using
\fBpcre2_dfa_match()\fP, can use a great deal of stack.
.P
The default value for the depth limit can be set when PCRE2 is built; the
default default is the same value as the default for the match limit. If the
@ -921,6 +964,12 @@ The output is a uint32_t integer that gives the default limit for the depth of
nested backtracking in \fBpcre2_match()\fP or the depth of nested recursions
and lookarounds in \fBpcre2_dfa_match()\fP. Further details are given with
\fBpcre2_set_depth_limit()\fP above.
.sp
PCRE2_CONFIG_HEAPLIMIT
.sp
The output is a uint32_t integer that gives, in kilobytes, the default limit
for the amount of heap memory used by \fBpcre2_match()\fP. Further details are
given with \fBpcre2_set_heap_limit()\fP above.
.sp
PCRE2_CONFIG_JIT
.sp
@ -1784,6 +1833,13 @@ Return 1 if the pattern contains any explicit matches for CR or LF characters,
otherwise 0. The third argument should point to an \fBuint32_t\fP variable. An
explicit match is either a literal CR or LF character, or \er or \en or one of
the equivalent hexadecimal or octal escape sequences.
.sp
PCRE2_INFO_HEAPLIMIT
.sp
If the pattern set a heap memory limit by including an item of the form
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument
should point to an unsigned 32-bit integer. If no such value has been set, the
call to \fBpcre2_pattern_info()\fP returns the error PCRE2_ERROR_UNSET.
.sp
PCRE2_INFO_JCHANGED
.sp
@ -2603,7 +2659,8 @@ The backtracking match limit was reached.
.sp
If a pattern contains many nested backtracking points, heap memory is used to
remember them. This error is given when the memory allocation function (default
or custom) fails.
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
if the amount of memory needed exceeds the heap limit.
.sp
PCRE2_ERROR_NULL
.sp
@ -3322,6 +3379,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 04 April 2017
Last updated: 11 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2BUILD 3 "31 March 2017" "PCRE2 10.30"
.TH PCRE2BUILD 3 "10 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.
@ -260,17 +260,42 @@ to the \fBconfigure\fP command. This setting has no effect on the
\fBpcre2_dfa_match()\fP matching function, but it does also limit JIT matching
(though the counting is done differently).
.P
In some environments it is desirable to limit the depth of nested backtracking
in order to restrict the maximum amount of heap memory that is used. A second
limit controls this; it defaults to the value that is set for
--with-match-limit. You can set a lower default limit by adding, for example,
The \fBpcre2_match()\fP function starts out using a 20K vector on the system
stack to record backtracking points. The more nested backtracking points there
are (that is, the deeper the search tree), the more memory is needed. If the
initial vector is not large enough, heap memory is used, up to a certain limit,
which is specified in kilobytes. The limit can be changed at run time, as
described in the
.\" HREF
\fBpcre2api\fP
.\"
documentation. The default limit (in effect unlimited) is 20 million. You can
change this by a setting such as
.sp
--with-heap-limit=500
.sp
which limits the amount of heap to 500 kilobytes. This limit applies only to
interpretive matching in \fBpcre2_match()\fP. It does not apply when JIT (which has
its own memory arrangements) is used, nor does it apply to
\fBpcre2_dfa_match()\fP.
.P
You can also explicitly limit the depth of nested backtracking in the
\fBpcre2_match()\fP interpreter. This limit defaults to the value that is set
for --with-match-limit. You can set a lower default limit by adding, for
example,
.sp
--with-match-limit-depth=10000
.sp
to the \fBconfigure\fP command. This value can also be overridden at run time.
As well as applying to \fBpcre2_match()\fP, this limit also controls the depth
of recursive function calls in \fBpcre2_dfa_match()\fP. These are used for
lookaround assertions, atomic groups, and recursion within patterns.
to the \fBconfigure\fP command. This value can be overridden at run time. This
depth limit indirectly limits the amount of heap memory that is used, but
because the size of each backtracking "frame" depends on the number of
capturing parentheses in a pattern, the amount of heap that is used before the
limit is reached varies from pattern to pattern. This limit was more useful in
versions before 10.30, where function recursion was used for backtracking.
However, as well as applying to \fBpcre2_match()\fP, this limit also controls
the depth of recursive function calls in \fBpcre2_dfa_match()\fP. These are
used for lookaround assertions, atomic groups, and recursion within patterns.
The limit does not apply to JIT matching.
.
.
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
@ -547,6 +572,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 31 March 2017
Last updated: 10 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2GREP 1 "06 April 2017" "PCRE2 10.30"
.TH PCRE2GREP 1 "11 April 2017" "PCRE2 10.30"
.SH NAME
pcre2grep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
@ -347,6 +347,9 @@ file names are shown when multiple files are searched. For matching lines, the
file name is followed by a colon; for context lines, a hyphen separator is used.
If a line number is also being output, it follows the file name.
.TP
\fB--heap-limit\fP=\fInumber\fP
See \fB--match-limit\fP below.
.TP
\fB--help\fP
Output a help message, giving brief details of the command options and file
type support, and then exit. Anything else on the command line is
@ -436,7 +439,7 @@ used. There is no short form for this option.
\fB--match-limit\fP=\fInumber\fP
Processing some regular expression patterns may take a very long time to search
for all possible matching strings. Others may require a very large amount of
memory. There are two options that set resource limits for matching.
memory. There are three options that set resource limits for matching.
.sp
The \fB--match-limit\fP option provides a means of limiting computing resource
usage when processing patterns that are not going to match, but which have a
@ -445,12 +448,22 @@ is a pattern that uses nested unlimited repeats. Internally, PCRE2 has a
counter that is incremented each time around its main processing loop. If the
value set by \fB--match-limit\fP is reached, an error occurs.
.sp
The \fB--heap-limit\fP option specifies, as a number of kilobytes, the amount
of heap memory that may be used for matching. Heap memory is needed only if
matching the pattern requires a significant number of nested backtracking
points to be remembered. This parameter can be set to zero to forbid the use of
heap memory altogether.
.sp
The \fB--depth-limit\fP option limits the depth of nested backtracking points,
which in turn limits the amount of memory that is used. This limit is of use
only if it is set smaller than \fB--match-limit\fP.
which indirectly limits the amount of memory that is used. The amount of memory
needed for each backtracking point depends on the number of capturing
parentheses in the pattern, so the amount of memory that is used before this
limit acts varies from pattern to pattern. This limit is of use only if it is
set smaller than \fB--match-limit\fP.
.sp
There are no short forms for these options. The default settings are specified
when the PCRE2 library is compiled, with the default default being 10 million.
when the PCRE2 library is compiled, with the default defaults being very large
and so effectively unlimited.
.TP
\fB--max-buffer-size=\fInumber\fP
This limits the expansion of the processing buffer, whose initial size can be
@ -670,11 +683,12 @@ Many of the short and long forms of \fBpcre2grep\fP's options are the same
as in the GNU \fBgrep\fP program. Any long option of the form
\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
(PCRE2 terminology). However, the \fB--depth-limit\fP, \fB--file-list\fP,
\fB--file-offsets\fP, \fB--include-dir\fP, \fB--line-offsets\fP,
\fB--locale\fP, \fB--match-limit\fP, \fB-M\fP, \fB--multiline\fP, \fB-N\fP,
\fB--newline\fP, \fB--om-separator\fP, \fB--output\fP, \fB-u\fP, and
\fB--utf-8\fP options are specific to \fBpcre2grep\fP, as is the use of the
\fB--only-matching\fP option with a capturing parentheses number.
\fB--file-offsets\fP, \fB--heap-limit\fP, \fB--include-dir\fP,
\fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP, \fB-M\fP,
\fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
\fB--output\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
capturing parentheses number.
.P
Although most of the common options work the same way, a few are different in
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
@ -799,9 +813,9 @@ message and the line that caused the problem to the standard error stream. If
there are more than 20 such errors, \fBpcre2grep\fP gives up.
.P
The \fB--match-limit\fP option of \fBpcre2grep\fP can be used to set the
overall resource limit; there is a second option called \fB--depth-limit\fP
that sets a limit on the amount of memory that is used (see the discussion of
these options above).
overall resource limit. There are also other limits that affect the amount of
memory used during matching; see the discussion of \fB--heap-limit\fP and
\fB--depth-limit\fP above.
.
.
.SH DIAGNOSTICS
@ -834,6 +848,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 06 April 2017
Last updated: 11 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -383,6 +383,9 @@ OPTIONS
colon; for context lines, a hyphen separator is used. If a
line number is also being output, it follows the file name.
--heap-limit=number
See --match-limit below.
--help Output a help message, giving brief details of the command
options and file type support, and then exit. Anything else
on the command line is ignored.
@ -482,7 +485,7 @@ OPTIONS
--match-limit=number
Processing some regular expression patterns may take a very
long time to search for all possible matching strings. Others
may require a very large amount of memory. There are two
may require a very large amount of memory. There are three
options that set resource limits for matching.
The --match-limit option provides a means of limiting comput-
@ -494,237 +497,248 @@ OPTIONS
processing loop. If the value set by --match-limit is
reached, an error occurs.
The --depth-limit option limits the depth of nested back-
tracking points, which in turn limits the amount of memory
that is used. This limit is of use only if it is set smaller
than --match-limit.
The --heap-limit option specifies, as a number of kilobytes,
the amount of heap memory that may be used for matching. Heap
memory is needed only if matching the pattern requires a sig-
nificant number of nested backtracking points to be remem-
bered. This parameter can be set to zero to forbid the use of
heap memory altogether.
The --depth-limit option limits the depth of nested back-
tracking points, which indirectly limits the amount of memory
that is used. The amount of memory needed for each backtrack-
ing point depends on the number of capturing parentheses in
the pattern, so the amount of memory that is used before this
limit acts varies from pattern to pattern. This limit is of
use only if it is set smaller than --match-limit.
There are no short forms for these options. The default set-
tings are specified when the PCRE2 library is compiled, with
the default default being 10 million.
the default defaults being very large and so effectively
unlimited.
--max-buffer-size=number
This limits the expansion of the processing buffer, whose
initial size can be set by --buffer-size. The maximum buffer
size is silently forced to be no smaller than the starting
This limits the expansion of the processing buffer, whose
initial size can be set by --buffer-size. The maximum buffer
size is silently forced to be no smaller than the starting
buffer size.
-M, --multiline
Allow patterns to match more than one line. When this option
Allow patterns to match more than one line. When this option
is set, the PCRE2 library is called in "multiline" mode. This
allows a matched string to extend past the end of a line and
continue on one or more subsequent lines. Patterns used with
allows a matched string to extend past the end of a line and
continue on one or more subsequent lines. Patterns used with
-M may usefully contain literal newline characters and inter-
nal occurrences of ^ and $ characters. The output for a suc-
cessful match may consist of more than one line. The first
line is the line in which the match started, and the last
line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the
end of that line. If -v is set, none of the lines in a
multi-line match are output. Once a match has been handled,
scanning restarts at the beginning of the line after the one
nal occurrences of ^ and $ characters. The output for a suc-
cessful match may consist of more than one line. The first
line is the line in which the match started, and the last
line is the line in which the match ended. If the matched
string ends with a newline sequence, the output ends at the
end of that line. If -v is set, none of the lines in a
multi-line match are output. Once a match has been handled,
scanning restarts at the beginning of the line after the one
in which the match ended.
The newline sequence that separates multiple lines must be
matched as part of the pattern. For example, to find the
phrase "regular expression" in a file where "regular" might
be at the end of a line and "expression" at the start of the
The newline sequence that separates multiple lines must be
matched as part of the pattern. For example, to find the
phrase "regular expression" in a file where "regular" might
be at the end of a line and "expression" at the start of the
next line, you could use this command:
pcre2grep -M 'regular\s+expression' <file>
The \s escape sequence matches any white space character,
including newlines, and is followed by + so as to match
trailing white space on the first line as well as possibly
The \s escape sequence matches any white space character,
including newlines, and is followed by + so as to match
trailing white space on the first line as well as possibly
handling a two-character newline sequence.
There is a limit to the number of lines that can be matched,
imposed by the way that pcre2grep buffers the input file as
it scans it. With a sufficiently large processing buffer,
There is a limit to the number of lines that can be matched,
imposed by the way that pcre2grep buffers the input file as
it scans it. With a sufficiently large processing buffer,
this should not be a problem, but the -M option does not work
when input is read line by line (see --line-buffered.)
-N newline-type, --newline=newline-type
The PCRE2 library supports five different conventions for
indicating the ends of lines. They are the single-character
sequences CR (carriage return) and LF (linefeed), the two-
character sequence CRLF, an "anycrlf" convention, which rec-
ognizes any of the preceding three types, and an "any" con-
The PCRE2 library supports five different conventions for
indicating the ends of lines. They are the single-character
sequences CR (carriage return) and LF (linefeed), the two-
character sequence CRLF, an "anycrlf" convention, which rec-
ognizes any of the preceding three types, and an "any" con-
vention, in which any Unicode line ending sequence is assumed
to end a line. The Unicode sequences are the three just men-
tioned, plus VT (vertical tab, U+000B), FF (form feed,
U+000C), NEL (next line, U+0085), LS (line separator,
to end a line. The Unicode sequences are the three just men-
tioned, plus VT (vertical tab, U+000B), FF (form feed,
U+000C), NEL (next line, U+0085), LS (line separator,
U+2028), and PS (paragraph separator, U+2029).
When the PCRE2 library is built, a default line-ending
sequence is specified. This is normally the standard
When the PCRE2 library is built, a default line-ending
sequence is specified. This is normally the standard
sequence for the operating system. Unless otherwise specified
by this option, pcre2grep uses the library's default. The
by this option, pcre2grep uses the library's default. The
possible values for this option are CR, LF, CRLF, ANYCRLF, or
ANY. This makes it possible to use pcre2grep to scan files
ANY. This makes it possible to use pcre2grep to scan files
that have come from other environments without having to mod-
ify their line endings. If the data that is being scanned
does not agree with the convention set by this option,
pcre2grep may behave in strange ways. Note that this option
does not apply to files specified by the -f, --exclude-from,
or --include-from options, which are expected to use the
ify their line endings. If the data that is being scanned
does not agree with the convention set by this option,
pcre2grep may behave in strange ways. Note that this option
does not apply to files specified by the -f, --exclude-from,
or --include-from options, which are expected to use the
operating system's standard newline sequence.
-n, --line-number
Precede each output line by its line number in the file, fol-
lowed by a colon for matching lines or a hyphen for context
lowed by a colon for matching lines or a hyphen for context
lines. If the file name is also being output, it precedes the
line number. When the -M option causes a pattern to match
more than one line, only the first is preceded by its line
line number. When the -M option causes a pattern to match
more than one line, only the first is preceded by its line
number. This option is forced if --line-offsets is used.
--no-jit If the PCRE2 library is built with support for just-in-time
--no-jit If the PCRE2 library is built with support for just-in-time
compiling (which speeds up matching), pcre2grep automatically
makes use of this, unless it was explicitly disabled at build
time. This option can be used to disable the use of JIT at
run time. It is provided for testing and working round prob-
time. This option can be used to disable the use of JIT at
run time. It is provided for testing and working round prob-
lems. It should never be needed in normal use.
-O text, --output=text
When there is a match, instead of outputting the whole line
that matched, output just the given text. This option is
mutually exclusive with --only-matching, --file-offsets, and
When there is a match, instead of outputting the whole line
that matched, output just the given text. This option is
mutually exclusive with --only-matching, --file-offsets, and
--line-offsets. Escape sequences starting with a dollar char-
acter may be used to insert the contents of the matched part
acter may be used to insert the contents of the matched part
of the line and/or captured substrings into the text.
$<digits> or ${<digits>} is replaced by the captured sub-
string of the given decimal number; zero substitutes the
$<digits> or ${<digits>} is replaced by the captured sub-
string of the given decimal number; zero substitutes the
whole match. If the number is greater than the number of cap-
turing substrings, or if the capture is unset, the replace-
turing substrings, or if the capture is unset, the replace-
ment is empty.
$a is replaced by bell; $b by backspace; $e by escape; $f by
form feed; $n by newline; $r by carriage return; $t by tab;
$a is replaced by bell; $b by backspace; $e by escape; $f by
form feed; $n by newline; $r by carriage return; $t by tab;
$v by vertical tab.
$o<digits> is replaced by the character represented by the
$o<digits> is replaced by the character represented by the
given octal number; up to three digits are processed.
$x<digits> is replaced by the character represented by the
$x<digits> is replaced by the character represented by the
given hexadecimal number; up to two digits are processed.
Any other character is substituted by itself. In particular,
Any other character is substituted by itself. In particular,
$$ is replaced by a single dollar.
-o, --only-matching
Show only the part of the line that matched a pattern instead
of the whole line. In this mode, no context is shown. That
is, the -A, -B, and -C options are ignored. If there is more
than one match in a line, each of them is shown separately,
on a separate line of output. If -o is combined with -v
(invert the sense of the match to find non-matching lines),
no output is generated, but the return code is set appropri-
ately. If the matched portion of the line is empty, nothing
is output unless the file name or line number are being
printed, in which case they are shown on an otherwise empty
of the whole line. In this mode, no context is shown. That
is, the -A, -B, and -C options are ignored. If there is more
than one match in a line, each of them is shown separately,
on a separate line of output. If -o is combined with -v
(invert the sense of the match to find non-matching lines),
no output is generated, but the return code is set appropri-
ately. If the matched portion of the line is empty, nothing
is output unless the file name or line number are being
printed, in which case they are shown on an otherwise empty
line. This option is mutually exclusive with --output,
--file-offsets and --line-offsets.
-onumber, --only-matching=number
Show only the part of the line that matched the capturing
Show only the part of the line that matched the capturing
parentheses of the given number. Up to 32 capturing parenthe-
ses are supported, and -o0 is equivalent to -o without a num-
ber. Because these options can be given without an argument
(see above), if an argument is present, it must be given in
the same shell item, for example, -o3 or --only-matching=2.
ber. Because these options can be given without an argument
(see above), if an argument is present, it must be given in
the same shell item, for example, -o3 or --only-matching=2.
The comments given for the non-argument case above also apply
to this option. If the specified capturing parentheses do not
exist in the pattern, or were not set in the match, nothing
is output unless the file name or line number are being out-
exist in the pattern, or were not set in the match, nothing
is output unless the file name or line number are being out-
put.
If this option is given multiple times, multiple substrings
are output for each match, in the order the options are
given, and all on one line. For example, -o3 -o1 -o3 causes
the substrings matched by capturing parentheses 3 and 1 and
then 3 again to be output. By default, there is no separator
If this option is given multiple times, multiple substrings
are output for each match, in the order the options are
given, and all on one line. For example, -o3 -o1 -o3 causes
the substrings matched by capturing parentheses 3 and 1 and
then 3 again to be output. By default, there is no separator
(but see the next option).
--om-separator=text
Specify a separating string for multiple occurrences of -o.
The default is an empty string. Separating strings are never
Specify a separating string for multiple occurrences of -o.
The default is an empty string. Separating strings are never
coloured.
-q, --quiet
Work quietly, that is, display nothing except error messages.
The exit status indicates whether or not any matches were
The exit status indicates whether or not any matches were
found.
-r, --recursive
If any given path is a directory, recursively scan the files
it contains, taking note of any --include and --exclude set-
tings. By default, a directory is read as a normal file; in
some operating systems this gives an immediate end-of-file.
This option is a shorthand for setting the -d option to
If any given path is a directory, recursively scan the files
it contains, taking note of any --include and --exclude set-
tings. By default, a directory is read as a normal file; in
some operating systems this gives an immediate end-of-file.
This option is a shorthand for setting the -d option to
"recurse".
--recursion-limit=number
See --match-limit above.
-s, --no-messages
Suppress error messages about non-existent or unreadable
files. Such files are quietly skipped. However, the return
Suppress error messages about non-existent or unreadable
files. Such files are quietly skipped. However, the return
code is still 2, even if matches were found in other files.
-t, --total-count
This option is useful when scanning more than one file. If
used on its own, -t suppresses all output except for a grand
total number of matching lines (or non-matching lines if -v
is used) in all the files. If -t is used with -c, a grand
total is output except when the previous output is just one
line. In other words, it is not output when just one file's
count is listed. If file names are being output, the grand
total is preceded by "TOTAL:". Otherwise, it appears as just
another number. The -t option is ignored when used with -L
(list files without matches), because the grand total would
This option is useful when scanning more than one file. If
used on its own, -t suppresses all output except for a grand
total number of matching lines (or non-matching lines if -v
is used) in all the files. If -t is used with -c, a grand
total is output except when the previous output is just one
line. In other words, it is not output when just one file's
count is listed. If file names are being output, the grand
total is preceded by "TOTAL:". Otherwise, it appears as just
another number. The -t option is ignored when used with -L
(list files without matches), because the grand total would
always be zero.
-u, --utf-8
Operate in UTF-8 mode. This option is available only if PCRE2
has been compiled with UTF-8 support. All patterns (including
those for any --exclude and --include options) and all sub-
ject lines that are scanned must be valid strings of UTF-8
those for any --exclude and --include options) and all sub-
ject lines that are scanned must be valid strings of UTF-8
characters.
-V, --version
Write the version numbers of pcre2grep and the PCRE2 library
to the standard output and then exit. Anything else on the
Write the version numbers of pcre2grep and the PCRE2 library
to the standard output and then exit. Anything else on the
command line is ignored.
-v, --invert-match
Invert the sense of the match, so that lines which do not
Invert the sense of the match, so that lines which do not
match any of the patterns are the ones that are found.
-w, --word-regex, --word-regexp
Force the patterns to match only whole words. This is equiva-
lent to having \b at the start and end of the pattern. This
option applies only to the patterns that are matched against
the contents of files; it does not apply to patterns speci-
lent to having \b at the start and end of the pattern. This
option applies only to the patterns that are matched against
the contents of files; it does not apply to patterns speci-
fied by any of the --include or --exclude options.
-x, --line-regex, --line-regexp
Force the patterns to be anchored (each must start matching
at the beginning of a line) and in addition, require them to
match entire lines. In multiline mode the match may be more
Force the patterns to be anchored (each must start matching
at the beginning of a line) and in addition, require them to
match entire lines. In multiline mode the match may be more
than one line. This is equivalent to having \A and \Z charac-
ters at the start and end of each alternative top-level
ters at the start and end of each alternative top-level
branch in every pattern. This option applies only to the pat-
terns that are matched against the contents of files; it does
not apply to patterns specified by any of the --include or
not apply to patterns specified by any of the --include or
--exclude options.
ENVIRONMENT VARIABLES
The environment variables LC_ALL and LC_CTYPE are examined, in that
order, for a locale. The first one that is set is used. This can be
overridden by the --locale option. If no locale is set, the PCRE2
The environment variables LC_ALL and LC_CTYPE are examined, in that
order, for a locale. The first one that is set is used. This can be
overridden by the --locale option. If no locale is set, the PCRE2
library's default (usually the "C" locale) is used.
@ -732,99 +746,99 @@ NEWLINES
The -N (--newline) option allows pcre2grep to scan files with different
newline conventions from the default. Any parts of the input files that
are written to the standard output are copied identically, with what-
ever newline sequences they have in the input. However, the setting of
this option does not affect the interpretation of files specified by
are written to the standard output are copied identically, with what-
ever newline sequences they have in the input. However, the setting of
this option does not affect the interpretation of files specified by
the -f, --exclude-from, or --include-from options, which are assumed to
use the operating system's standard newline sequence, nor does it
affect the way in which pcre2grep writes informational messages to the
use the operating system's standard newline sequence, nor does it
affect the way in which pcre2grep writes informational messages to the
standard error and output streams. For these it uses the string "\n" to
indicate newlines, relying on the C I/O library to convert this to an
indicate newlines, relying on the C I/O library to convert this to an
appropriate sequence.
OPTIONS COMPATIBILITY
Many of the short and long forms of pcre2grep's options are the same as
in the GNU grep program. Any long option of the form --xxx-regexp (GNU
in the GNU grep program. Any long option of the form --xxx-regexp (GNU
terminology) is also available as --xxx-regex (PCRE2 terminology). How-
ever, the --depth-limit, --file-list, --file-offsets, --include-dir,
--line-offsets, --locale, --match-limit, -M, --multiline, -N, --new-
line, --om-separator, --output, -u, and --utf-8 options are specific to
pcre2grep, as is the use of the --only-matching option with a capturing
parentheses number.
ever, the --depth-limit, --file-list, --file-offsets, --heap-limit,
--include-dir, --line-offsets, --locale, --match-limit, -M, --multi-
line, -N, --newline, --om-separator, --output, -u, and --utf-8 options
are specific to pcre2grep, as is the use of the --only-matching option
with a capturing parentheses number.
Although most of the common options work the same way, a few are dif-
ferent in pcre2grep. For example, the --include option's argument is a
glob for GNU grep, but a regular expression for pcre2grep. If both the
-c and -l options are given, GNU grep lists only file names, without
Although most of the common options work the same way, a few are dif-
ferent in pcre2grep. For example, the --include option's argument is a
glob for GNU grep, but a regular expression for pcre2grep. If both the
-c and -l options are given, GNU grep lists only file names, without
counts, but pcre2grep gives the counts as well.
OPTIONS WITH DATA
There are four different ways in which an option with data can be spec-
ified. If a short form option is used, the data may follow immedi-
ified. If a short form option is used, the data may follow immedi-
ately, or (with one exception) in the next command line item. For exam-
ple:
-f/some/file
-f /some/file
The exception is the -o option, which may appear with or without data.
Because of this, if data is present, it must follow immediately in the
The exception is the -o option, which may appear with or without data.
Because of this, if data is present, it must follow immediately in the
same item, for example -o3.
If a long form option is used, the data may appear in the same command
line item, separated by an equals character, or (with two exceptions)
If a long form option is used, the data may appear in the same command
line item, separated by an equals character, or (with two exceptions)
it may appear in the next command line item. For example:
--file=/some/file
--file /some/file
Note, however, that if you want to supply a file name beginning with ~
as data in a shell command, and have the shell expand ~ to a home
Note, however, that if you want to supply a file name beginning with ~
as data in a shell command, and have the shell expand ~ to a home
directory, you must separate the file name from the option, because the
shell does not treat ~ specially unless it is at the start of an item.
The exceptions to the above are the --colour (or --color) and --only-
matching options, for which the data is optional. If one of these
options does have data, it must be given in the first form, using an
The exceptions to the above are the --colour (or --color) and --only-
matching options, for which the data is optional. If one of these
options does have data, it must be given in the first form, using an
equals character. Otherwise pcre2grep will assume that it has no data.
USING PCRE2'S CALLOUT FACILITY
pcre2grep has, by default, support for calling external programs or
scripts or echoing specific strings during matching by making use of
PCRE2's callout facility. However, this support can be disabled when
pcre2grep is built. You can find out whether your binary has support
for callouts by running it with the --help option. If the support is
pcre2grep has, by default, support for calling external programs or
scripts or echoing specific strings during matching by making use of
PCRE2's callout facility. However, this support can be disabled when
pcre2grep is built. You can find out whether your binary has support
for callouts by running it with the --help option. If the support is
not enabled, all callouts in patterns are ignored by pcre2grep.
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
ment is either a number or a quoted string (see the pcre2callout docu-
mentation for details). Numbered callouts are ignored by pcre2grep;
A callout in a PCRE2 pattern is of the form (?C<arg>) where the argu-
ment is either a number or a quoted string (see the pcre2callout docu-
mentation for details). Numbered callouts are ignored by pcre2grep;
only callouts with string arguments are useful.
Calling external programs or scripts
If the callout string does not start with a pipe (vertical bar) charac-
ter, it is parsed into a list of substrings separated by pipe charac-
ters. The first substring must be an executable name, with the follow-
ter, it is parsed into a list of substrings separated by pipe charac-
ters. The first substring must be an executable name, with the follow-
ing substrings specifying arguments:
executable_name|arg1|arg2|...
Any substring (including the executable name) may contain escape
sequences started by a dollar character: $<digits> or ${<digits>} is
replaced by the captured substring of the given decimal number, which
must be greater than zero. If the number is greater than the number of
capturing substrings, or if the capture is unset, the replacement is
Any substring (including the executable name) may contain escape
sequences started by a dollar character: $<digits> or ${<digits>} is
replaced by the captured substring of the given decimal number, which
must be greater than zero. If the number is greater than the number of
capturing substrings, or if the capture is unset, the replacement is
empty.
Any other character is substituted by itself. In particular, $$ is
replaced by a single dollar and $| is replaced by a pipe character.
Any other character is substituted by itself. In particular, $$ is
replaced by a single dollar and $| is replaced by a pipe character.
Here is an example:
echo -e "abcde\n12345" | pcre2grep \
@ -840,49 +854,49 @@ USING PCRE2'S CALLOUT FACILITY
The parameters for the execv() system call that is used to run the pro-
gram or script are zero-terminated strings. This means that binary zero
characters in the callout argument will cause premature termination of
their substrings, and therefore should not be present. Any syntax
errors in the string (for example, a dollar not followed by another
character) cause the callout to be ignored. If running the program
characters in the callout argument will cause premature termination of
their substrings, and therefore should not be present. Any syntax
errors in the string (for example, a dollar not followed by another
character) cause the callout to be ignored. If running the program
fails for any reason (including the non-existence of the executable), a
local matching failure occurs and the matcher backtracks in the normal
local matching failure occurs and the matcher backtracks in the normal
way.
Echoing a specific string
If the callout string starts with a pipe (vertical bar) character, the
If the callout string starts with a pipe (vertical bar) character, the
rest of the string is written to the output, having been passed through
the same escape processing as text from the --output option. This pro-
the same escape processing as text from the --output option. This pro-
vides a simple echoing facility that avoids calling an external program
or script. No terminator is added to the string, so if you want a new-
line, you must include it explicitly. Matching continues normally
after the string is output. If you want to see only the callout output
but not any output from an actual match, you should end the relevant
or script. No terminator is added to the string, so if you want a new-
line, you must include it explicitly. Matching continues normally
after the string is output. If you want to see only the callout output
but not any output from an actual match, you should end the relevant
pattern with (*FAIL).
MATCHING ERRORS
It is possible to supply a regular expression that takes a very long
time to fail to match certain lines. Such patterns normally involve
nested indefinite repeats, for example: (a+)*\d when matched against a
line of a's with no final digit. The PCRE2 matching function has a
resource limit that causes it to abort in these circumstances. If this
happens, pcre2grep outputs an error message and the line that caused
the problem to the standard error stream. If there are more than 20
It is possible to supply a regular expression that takes a very long
time to fail to match certain lines. Such patterns normally involve
nested indefinite repeats, for example: (a+)*\d when matched against a
line of a's with no final digit. The PCRE2 matching function has a
resource limit that causes it to abort in these circumstances. If this
happens, pcre2grep outputs an error message and the line that caused
the problem to the standard error stream. If there are more than 20
such errors, pcre2grep gives up.
The --match-limit option of pcre2grep can be used to set the overall
resource limit; there is a second option called --depth-limit that sets
a limit on the amount of memory that is used (see the discussion of
these options above).
The --match-limit option of pcre2grep can be used to set the overall
resource limit. There are also other limits that affect the amount of
memory used during matching; see the discussion of --heap-limit and
--depth-limit above.
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found,
and 2 for syntax errors, overlong lines, non-existent or inaccessible
files (even if matches were found in other files) or too many matching
and 2 for syntax errors, overlong lines, non-existent or inaccessible
files (even if matches were found in other files) or too many matching
errors. Using the -s option to suppress error messages about inaccessi-
ble files does not affect the return code.
@ -901,5 +915,5 @@ AUTHOR
REVISION
Last updated: 06 April 2017
Last updated: 11 April 2017
Copyright (c) 1997-2017 University of Cambridge.

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "03 April 2017" "PCRE2 10.30"
.TH PCRE2PATTERN 3 "11 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -138,14 +138,15 @@ the application to apply the JIT optimization by calling
\fBpcre2_jit_compile()\fP is ignored.
.
.
.SS "Setting match and backtracking depth limits"
.SS "Setting match resource limits"
.rs
.sp
The \fBpcre2_match()\fP function contains a counter that is incremented every time it
goes round its main loop. The caller of \fBpcre2_match()\fP can set a limit on
this counter, which therefore limits the amount of computing resource used for
a match. The maximum depth of nested backtracking can also be limited, and this
restricts the amount of heap memory that is used.
a match. The maximum depth of nested backtracking can also be limited; this
indirectly restricts the amount of heap memory that is used, but there is also
an explicit memory limit that can be set.
.P
These facilities are provided to catch runaway matches that are provoked by
patterns with huge matching trees (a typical example is a pattern with nested
@ -153,6 +154,7 @@ unlimited repeats applied to a long string that does not match). When one of
these limits is reached, \fBpcre2_match()\fP gives an error return. The limits
can also be set by items at the start of the pattern of the form
.sp
(*LIMIT_HEAP=d)
(*LIMIT_MATCH=d)
(*LIMIT_DEPTH=d)
.sp
@ -165,11 +167,13 @@ setting of one of these limits, the lower value is used.
Prior to release 10.30, LIMIT_DEPTH was called LIMIT_RECURSION. This name is
still recognized for backwards compatibility.
.P
The match limit is used (but in a different way) when JIT is being used, but it
is not relevant, and is ignored, when matching with \fBpcre2_dfa_match()\fP.
However, the depth limit is relevant for DFA matching, which uses function
recursion for recursions within the pattern. In this case, the depth limit
controls the amount of system stack that is used.
The heap limit applies only when the \fBpcre2_match()\fP interpreter is used
for matching. It does not apply to JIT or DFA matching. The match limit is used
(but in a different way) when JIT is being used, but it is not relevant, and is
ignored, when matching with \fBpcre2_dfa_match()\fP. The depth limit is ignored
by JIT but is relevant for DFA matching, which uses function recursion for
recursions within the pattern. In this case, the depth limit controls the
amount of system stack that is used.
.
.
.\" HTML <a name="newlines"></a>
@ -3465,6 +3469,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 03 April 2017
Last updated: 11 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2PERFORM 3 "31 March 2017" "PCRE2 10.30"
.TH PCRE2PERFORM 3 "08 April 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 PERFORMANCE"
@ -69,11 +69,12 @@ From release 10.30, the interpretive (non-JIT) version of \fBpcre2_match()\fP
uses very little system stack at run time. In earlier releases recursive
function calls could use a great deal of stack, and this could cause problems,
but this usage has been eliminated. Backtracking positions are now explicitly
remembered in memory frames controlled by the code. An initial 10K vector of
frames is allocated on the system stack (enough for about 50 frames for small
patterns), but if this is insufficient, heap memory is used. Rewriting patterns
to be time-efficient, as described below, may also reduce the memory
requirements.
remembered in memory frames controlled by the code. An initial 20K vector of
frames is allocated on the system stack (enough for about 100 frames for small
patterns), but if this is insufficient, heap memory is used. The amount of heap
memory can be limited; if the limit is set to zero, only the initial stack
vector is used. Rewriting patterns to be time-efficient, as described below,
may also reduce the memory requirements.
.P
In contrast to \fBpcre2_match()\fP, \fBpcre2_dfa_match()\fP does use recursive
function calls, but only for processing atomic groups, lookaround assertions,
@ -231,6 +232,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 31 March 2017
Last updated: 08 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "08 April 2017" "PCRE 10.30"
.TH PCRE2TEST 1 "11 April 2017" "PCRE2 10.30"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@ -1063,6 +1063,7 @@ pattern.
get=<number or name> extract captured substring
getall extract all captured substrings
/g global global matching
heap_limit=<n> set a limit on heap memory
jitstack=<n> set size of JIT stack
mark show mark values
match_limit=<n> set a match limit
@ -1293,11 +1294,11 @@ stack that is larger than the default 32K is necessary only for very
complicated patterns.
.
.
.SS "Setting match and depth limits"
.SS "Setting heap, match, and depth limits"
.rs
.sp
The \fBmatch_limit\fP and \fBdepth_limit\fP modifiers set the appropriate
limits in the match context. These values are ignored when the
The \fBheap_limit\fP, \fBmatch_limit\fP, and \fBdepth_limit\fP modifiers set
the appropriate limits in the match context. These values are ignored when the
\fBfind_limits\fP modifier is specified.
.
.
@ -1306,8 +1307,8 @@ limits in the match context. These values are ignored when the
.sp
If the \fBfind_limits\fP modifier is present on a subject line, \fBpcre2test\fP
calls the relevant matching function several times, setting different values in
the match context via \fBpcre2_set_match_limit()\fP or
\fBpcre2_set_depth_limit()\fP until it finds the minimum values for each
the match context via \fBpcre2_set_heap_limit()\fP, \fBpcre2_set_match_limit()\fP,
or \fBpcre2_set_depth_limit()\fP until it finds the minimum values for each
parameter that allows the match to complete without error.
.P
If JIT is being used, only the match limit is relevant. If DFA matching is
@ -1320,9 +1321,9 @@ numbers of matching possibilities, it can become large very quickly with
increasing length of subject string.
.P
For non-DFA matching, the minimum \fIdepth_limit\fP number is a measure of how
much memory for recording backtracking points is needed to complete the match
attempt. In the case of DFA matching, \fIdepth_limit\fP controls the depth of
recursive calls of the internal function that is used for handling pattern
much nested backtracking happens (that is, how deeply the pattern's tree is
searched). In the case of DFA matching, \fIdepth_limit\fP controls the depth of
recursive calls of the internal function that is used for handling pattern
recursion, lookaround assertions, and atomic groups.
.
.
@ -1782,6 +1783,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 08 April 2017
Last updated: 11 April 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

File diff suppressed because it is too large Load Diff

View File

@ -132,6 +132,10 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to 1 if you have the <zlib.h> header file. */
#undef HAVE_ZLIB_H
/* This limits the amount of memory that pcre2_match() may use while matching
a pattern. The value is in kilobytes. */
#undef HEAP_LIMIT
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
@ -143,7 +147,7 @@ sure both macros are undefined; an emulation function will then be used. */
#undef LT_OBJDIR
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can record a backtrack position during a single
pcre2_match() function can record a backtrack position during a single
matching attempt. There is a runtime interface for setting a different
limit. The limit exists in order to catch runaway regular expressions that
take for ever to determine that they do not match. The default is set very

View File

@ -268,6 +268,7 @@ numbers must not be changed. */
#define PCRE2_ERROR_BADSUBSPATTERN (-60)
#define PCRE2_ERROR_TOOMANYREPLACE (-61)
#define PCRE2_ERROR_BADSERIALIZEDDATA (-62)
#define PCRE2_ERROR_HEAPLIMIT (-63)
/* Request types for pcre2_pattern_info() */
@ -297,6 +298,7 @@ numbers must not be changed. */
#define PCRE2_INFO_SIZE 22
#define PCRE2_INFO_HASBACKSLASHC 23
#define PCRE2_INFO_FRAMESIZE 24
#define PCRE2_INFO_HEAPLIMIT 25
/* Request types for pcre2_config(). */
@ -313,6 +315,7 @@ numbers must not be changed. */
#define PCRE2_CONFIG_UNICODE 9
#define PCRE2_CONFIG_UNICODE_VERSION 10
#define PCRE2_CONFIG_VERSION 11
#define PCRE2_CONFIG_HEAPLIMIT 12
/* Types for code units in patterns and subject strings. */
@ -452,6 +455,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
int (*)(pcre2_callout_block *, void *), void *); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_match_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@ -676,6 +681,7 @@ pcre2_compile are called by application code. */
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_)
#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_)
#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_)
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)

View File

@ -268,6 +268,7 @@ numbers must not be changed. */
#define PCRE2_ERROR_BADSUBSPATTERN (-60)
#define PCRE2_ERROR_TOOMANYREPLACE (-61)
#define PCRE2_ERROR_BADSERIALIZEDDATA (-62)
#define PCRE2_ERROR_HEAPLIMIT (-63)
/* Request types for pcre2_pattern_info() */
@ -297,6 +298,7 @@ numbers must not be changed. */
#define PCRE2_INFO_SIZE 22
#define PCRE2_INFO_HASBACKSLASHC 23
#define PCRE2_INFO_FRAMESIZE 24
#define PCRE2_INFO_HEAPLIMIT 25
/* Request types for pcre2_config(). */
@ -313,6 +315,7 @@ numbers must not be changed. */
#define PCRE2_CONFIG_UNICODE 9
#define PCRE2_CONFIG_UNICODE_VERSION 10
#define PCRE2_CONFIG_VERSION 11
#define PCRE2_CONFIG_HEAPLIMIT 12
/* Types for code units in patterns and subject strings. */
@ -452,6 +455,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
int (*)(pcre2_callout_block *, void *), void *); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_heap_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_match_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@ -676,6 +681,7 @@ pcre2_compile are called by application code. */
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_)
#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_)
#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_)
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)

View File

@ -727,6 +727,7 @@ enum { PSO_OPT, /* Value is an option bit */
PSO_FLG, /* Value is a flag bit */
PSO_NL, /* Value is a newline type */
PSO_BSR, /* Value is a \R type */
PSO_LIMH, /* Read integer value for heap limit */
PSO_LIMM, /* Read integer value for match limit */
PSO_LIMD }; /* Read integer value for depth limit */
@ -749,6 +750,7 @@ static pso pso_list[] = {
{ (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
{ (uint8_t *)STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
{ (uint8_t *)STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
{ (uint8_t *)STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
{ (uint8_t *)STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
{ (uint8_t *)STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
{ (uint8_t *)STRING_LIMIT_RECURSION_EQ, 16, PSO_LIMD, 0 },
@ -8853,6 +8855,7 @@ uint32_t firstcu, reqcu; /* Value of first/req code unit */
uint32_t setflags = 0; /* NL and BSR set flags */
uint32_t skipatstart; /* When checking (*UTF) etc */
uint32_t limit_heap = UINT32_MAX;
uint32_t limit_match = UINT32_MAX; /* Unset match limits */
uint32_t limit_depth = UINT32_MAX;
@ -9026,6 +9029,7 @@ while (patlen - skipatstart >= 2 &&
case PSO_LIMM:
case PSO_LIMD:
case PSO_LIMH:
c = 0;
pp = skipatstart;
if (!IS_DIGIT(ptr[pp]))
@ -9045,7 +9049,8 @@ while (patlen - skipatstart >= 2 &&
ptr += pp;
goto HAD_EARLY_ERROR;
}
if (p->type == PSO_LIMM) limit_match = c;
if (p->type == PSO_LIMH) limit_heap = c;
else if (p->type == PSO_LIMM) limit_match = c;
else limit_depth = c;
skipatstart += pp - skipatstart;
break;
@ -9288,6 +9293,7 @@ re->magic_number = MAGIC_NUMBER;
re->compile_options = options;
re->overall_options = cb.external_options;
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
re->limit_heap = limit_heap;
re->limit_match = limit_match;
re->limit_depth = limit_depth;
re->first_codeunit = 0;

View File

@ -84,6 +84,7 @@ if (where == NULL) /* Requests a length */
return PCRE2_ERROR_BADOPTION;
case PCRE2_CONFIG_BSR:
case PCRE2_CONFIG_HEAPLIMIT:
case PCRE2_CONFIG_JIT:
case PCRE2_CONFIG_LINKSIZE:
case PCRE2_CONFIG_MATCHLIMIT:
@ -116,6 +117,10 @@ switch (what)
#endif
break;
case PCRE2_CONFIG_HEAPLIMIT:
*((uint32_t *)where) = HEAP_LIMIT;
break;
case PCRE2_CONFIG_JIT:
#ifdef SUPPORT_JIT
*((uint32_t *)where) = 1;

View File

@ -168,6 +168,7 @@ const pcre2_match_context PRIV(default_match_context) = {
NULL,
NULL,
PCRE2_UNSET, /* Offset limit */
HEAP_LIMIT,
MATCH_LIMIT,
MATCH_LIMIT_DEPTH };
@ -346,6 +347,13 @@ mcontext->callout_data = callout_data;
return 0;
}
/* Set the heap limit in a match context.

Arguments:
  mcontext   the match context to update
  limit      the new heap limit; stored as given, with no range checking
             (NOTE(review): presumably in kilobytes, like the HEAP_LIMIT
             default - confirm against the API documentation)

Returns:     always 0
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit)
{
mcontext->heap_limit = limit;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t limit)
{

View File

@ -256,6 +256,7 @@ static const unsigned char match_error_texts[] =
"match with end before start is not supported\0"
"too many replacements (more than INT_MAX)\0"
"bad serialized data\0"
"heap limit exceeded\0"
;

View File

@ -240,6 +240,16 @@ not rely on this. */
#define COMPILE_ERROR_BASE 100
/* The initial frames vector for remembering backtracking points in
pcre2_match() is allocated on the system stack, of this size (bytes). The size
must be a multiple of sizeof(PCRE2_SPTR) in all environments, so making it a
multiple of 8 is best. Typical frame sizes are a few hundred bytes (it depends
on the number of capturing parentheses) so 20K handles quite a few frames. A
larger vector on the heap is obtained for patterns that need more frames. The
maximum size of this can be limited. */
#define START_FRAMES_SIZE 20480
/* Define the default BSR convention. */
#ifdef BSR_ANYCRLF
@ -922,6 +932,7 @@ a positive value. */
#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
#define STRING_NOTEMPTY_RIGHTPAR "NOTEMPTY)"
#define STRING_NOTEMPTY_ATSTART_RIGHTPAR "NOTEMPTY_ATSTART)"
#define STRING_LIMIT_HEAP_EQ "LIMIT_HEAP="
#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH="
#define STRING_LIMIT_DEPTH_EQ "LIMIT_DEPTH="
#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
@ -1196,6 +1207,7 @@ only. */
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
#define STRING_NOTEMPTY_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_RIGHT_PARENTHESIS
#define STRING_NOTEMPTY_ATSTART_RIGHTPAR STR_N STR_O STR_T STR_E STR_M STR_P STR_T STR_Y STR_UNDERSCORE STR_A STR_T STR_S STR_T STR_A STR_R STR_T STR_RIGHT_PARENTHESIS
#define STRING_LIMIT_HEAP_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_H STR_E STR_A STR_P STR_EQUALS_SIGN
#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN
#define STRING_LIMIT_DEPTH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_D STR_E STR_P STR_T STR_H STR_EQUALS_SIGN
#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN

View File

@ -585,6 +585,7 @@ typedef struct pcre2_real_match_context {
int (*callout)(pcre2_callout_block *, void *);
void *callout_data;
PCRE2_SIZE offset_limit;
uint32_t heap_limit;
uint32_t match_limit;
uint32_t depth_limit;
} pcre2_real_match_context;
@ -614,6 +615,7 @@ typedef struct pcre2_real_code {
uint32_t compile_options; /* Options passed to pcre2_compile() */
uint32_t overall_options; /* Options after processing the pattern */
uint32_t flags; /* Various state flags */
uint32_t limit_heap; /* Limit set in the pattern */
uint32_t limit_match; /* Limit set in the pattern */
uint32_t limit_depth; /* Limit set in the pattern */
uint32_t first_codeunit; /* Starting code unit */
@ -808,9 +810,10 @@ typedef struct match_block {
heapframe *match_frames; /* Points to vector of frames */
heapframe *match_frames_top; /* Points after the end of the vector */
heapframe *stack_frames; /* The original vector on the stack */
uint32_t match_call_count; /* Number of times a new frame is created */
PCRE2_SIZE heap_limit; /* As it says */
uint32_t match_limit; /* As it says */
uint32_t match_limit_depth; /* As it says */
uint32_t match_call_count; /* Number of times a new frame is created */
BOOL hitend; /* Hit the end of the subject at some point */
BOOL hasthen; /* Pattern contains (*THEN) */
const uint8_t *lcc; /* Points to lower casing table */

View File

@ -64,15 +64,6 @@ information, and fields within it. */
#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
/* The initial frames vector for remembering backtracking points is allocated
on the system stack, of this size (bytes). The size must be a multiple of
sizeof(PCRE2_SPTR) in all environments, so making it a multiple of 8 is best.
Typical frame sizes are a few hundred bytes (it depends on the number of
capturing parentheses) so 10K handles quite a few frames. A larger vector on
the heap is obtained for patterns that need more frames. */
#define START_FRAMES_SIZE 10240
/* Masks for identifying the public options that are permitted at match time. */
#define PUBLIC_MATCH_OPTIONS \
@ -618,14 +609,22 @@ backtracking point. */
MATCH_RECURSE:
/* Set up a new backtracking frame. If the vector is full, get a new one
on the heap, doubling the size. */
on the heap, doubling the size, but constrained by the heap limit. */
N = (heapframe *)((char *)F + frame_size);
if (N >= mb->match_frames_top)
{
PCRE2_SIZE newsize = mb->frame_vector_size * 2;
heapframe *new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
heapframe *new;
/* The heap limit is in kilobytes. If doubling the vector would exceed it, fall
back to the largest whole number of frames that fits within the limit. When the
current vector is already that size or larger (which happens if the limit is
smaller than the existing vector), no growth is possible, so give up. Testing
only for equality here would let newsize drop below the current size, and the
memcpy() of frame_vector_size bytes that follows would then overrun the new,
smaller block. */
if ((newsize / 1024) > mb->heap_limit)
{
PCRE2_SIZE maxsize = ((mb->heap_limit * 1024)/frame_size) * frame_size;
if (mb->frame_vector_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
newsize = maxsize;
}
new = mb->memctl.malloc(newsize, mb->memctl.memory_data);
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(new, mb->match_frames, mb->frame_vector_size);
@ -802,13 +801,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
Fstart_match == mb->start_subject + mb->start_offset)))
RRETURN(MATCH_NOMATCH);
/* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
/* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
the end of the subject. */
if (Feptr < mb->end_subject &&
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
RRETURN(MATCH_NOMATCH);
RRETURN(MATCH_NOMATCH);
/* We have a successful match of the whole pattern. Record the result and
then do a direct return from the function. If there is space in the offset
vector, set any pairs that follow the highest-numbered captured string but
@ -6093,13 +6092,13 @@ set up later. */
utf = (re->overall_options & PCRE2_UTF) != 0;
mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
time. */
if (mb->partial != 0 &&
/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
time. */
if (mb->partial != 0 &&
((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
return PCRE2_ERROR_BADOPTION;
return PCRE2_ERROR_BADOPTION;
/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
we must also check that a starting offset does not point into the middle of a
@ -6266,9 +6265,22 @@ correct when calling match() more than once for non-anchored patterns. */
frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE));
/* Limits set in the pattern override the match context only if they are
smaller. */
mb->heap_limit = (mcontext->heap_limit < re->limit_heap)?
mcontext->heap_limit : re->limit_heap;
mb->match_limit = (mcontext->match_limit < re->limit_match)?
mcontext->match_limit : re->limit_match;
mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
mcontext->depth_limit : re->limit_depth;
/* If a pattern has very many capturing parentheses, the frame size may be very
large. Ensure that there are at least 10 available frames by getting an initial
vector on the heap if necessary. */
vector on the heap if necessary, except when the heap limit prevents this. Get
fewer if possible. (The heap limit is in kilobytes.) */
if (frame_size <= START_FRAMES_SIZE/10)
{
@ -6278,6 +6290,11 @@ if (frame_size <= START_FRAMES_SIZE/10)
else
{
mb->frame_vector_size = frame_size * 10;
if ((mb->frame_vector_size / 1024) > mb->heap_limit)
{
if (frame_size > mb->heap_limit * 1024) return PCRE2_ERROR_HEAPLIMIT;
mb->frame_vector_size = ((mb->heap_limit * 1024)/frame_size) * frame_size;
}
mb->match_frames = mb->memctl.malloc(mb->frame_vector_size,
mb->memctl.memory_data);
if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY;
@ -6292,14 +6309,6 @@ to avoid uninitialized memory read errors when it is copied to a new frame. */
memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff,
re->top_bracket * 2 * sizeof(PCRE2_SIZE));
/* Limits set in the pattern override the match context only if they are
smaller. */
mb->match_limit = (mcontext->match_limit < re->limit_match)?
mcontext->match_limit : re->limit_match;
mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
mcontext->depth_limit : re->limit_depth;
/* Pointers to the individual character tables */
mb->lcc = re->tables + lcc_offset;

View File

@ -80,6 +80,7 @@ if (where == NULL) /* Requests field length */
case PCRE2_INFO_FIRSTCODEUNIT:
case PCRE2_INFO_HASBACKSLASHC:
case PCRE2_INFO_HASCRORLF:
case PCRE2_INFO_HEAPLIMIT:
case PCRE2_INFO_JCHANGED:
case PCRE2_INFO_LASTCODETYPE:
case PCRE2_INFO_LASTCODEUNIT:
@ -171,6 +172,11 @@ switch(what)
*((uint32_t *)where) = (re->flags & PCRE2_HASCRORLF) != 0;
break;
case PCRE2_INFO_HEAPLIMIT:
*((uint32_t *)where) = re->limit_heap;
if (re->limit_heap == UINT32_MAX) return PCRE2_ERROR_UNSET;
break;
case PCRE2_INFO_JCHANGED:
*((uint32_t *)where) = (re->flags & PCRE2_JCHANGED) != 0;
break;

View File

@ -212,6 +212,7 @@ static const uint8_t *character_tables = NULL;
static uint32_t pcre2_options = 0;
static uint32_t process_options = 0;
static PCRE2_SIZE heap_limit = PCRE2_UNSET;
static uint32_t match_limit = 0;
static uint32_t depth_limit = 0;
@ -330,7 +331,7 @@ static const char *incexname[4] = { "--include", "--exclude",
/* Structure for options and list of them */
enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_U32NUMBER,
enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_U32NUMBER, OP_SIZE,
OP_OP_NUMBER, OP_OP_NUMBERS, OP_PATLIST, OP_FILELIST, OP_BINFILES };
typedef struct option_item {
@ -356,16 +357,17 @@ used to identify them. */
#define N_LOFFSETS (-10)
#define N_FOFFSETS (-11)
#define N_LBUFFER (-12)
#define N_M_LIMIT (-13)
#define N_M_LIMIT_DEP (-14)
#define N_BUFSIZE (-15)
#define N_NOJIT (-16)
#define N_FILE_LIST (-17)
#define N_BINARY_FILES (-18)
#define N_EXCLUDE_FROM (-19)
#define N_INCLUDE_FROM (-20)
#define N_OM_SEPARATOR (-21)
#define N_MAX_BUFSIZE (-22)
#define N_H_LIMIT (-13)
#define N_M_LIMIT (-14)
#define N_M_LIMIT_DEP (-15)
#define N_BUFSIZE (-16)
#define N_NOJIT (-17)
#define N_FILE_LIST (-18)
#define N_BINARY_FILES (-19)
#define N_EXCLUDE_FROM (-20)
#define N_INCLUDE_FROM (-21)
#define N_OM_SEPARATOR (-22)
#define N_MAX_BUFSIZE (-23)
static option_item optionlist[] = {
{ OP_NODATA, N_NULL, NULL, "", "terminate options" },
@ -397,6 +399,7 @@ static option_item optionlist[] = {
{ OP_NODATA, N_LBUFFER, NULL, "line-buffered", "use line buffering" },
{ OP_NODATA, N_LOFFSETS, NULL, "line-offsets", "output line numbers and offsets, not text" },
{ OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" },
{ OP_SIZE, N_H_LIMIT, &heap_limit, "heap-limit=number", "set PCRE2 heap limit option (kilobytes)" },
{ OP_U32NUMBER, N_M_LIMIT, &match_limit, "match-limit=number", "set PCRE2 match limit option" },
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
{ OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
@ -525,9 +528,9 @@ pcre2grep_exit(int rc)
{
if (resource_error)
{
fprintf(stderr, "pcre2grep: Error %d, %d or %d means that a resource limit "
"was exceeded.\n", PCRE2_ERROR_JIT_STACKLIMIT, PCRE2_ERROR_MATCHLIMIT,
PCRE2_ERROR_DEPTHLIMIT);
fprintf(stderr, "pcre2grep: Error %d, %d, %d or %d means that a resource "
"limit was exceeded.\n", PCRE2_ERROR_JIT_STACKLIMIT, PCRE2_ERROR_MATCHLIMIT,
PCRE2_ERROR_DEPTHLIMIT, PCRE2_ERROR_HEAPLIMIT);
fprintf(stderr, "pcre2grep: Check your regex for nested unlimited loops.\n");
}
exit(rc);
@ -1647,7 +1650,7 @@ for (i = 1; p != NULL; p = p->next, i++)
FWRITE(matchptr, 1, slen, stderr); /* In case binary zero included */
fprintf(stderr, "\n\n");
if (*mrc == PCRE2_ERROR_MATCHLIMIT || *mrc == PCRE2_ERROR_DEPTHLIMIT ||
*mrc == PCRE2_ERROR_JIT_STACKLIMIT)
*mrc == PCRE2_ERROR_HEAPLIMIT || *mrc == PCRE2_ERROR_JIT_STACKLIMIT)
resource_error = TRUE;
if (error_count++ > 20)
{
@ -3796,7 +3799,7 @@ for (i = 1; i < argc; i++)
/* Otherwise, deal with a single string or numeric data value. */
else if (op->type != OP_NUMBER && op->type != OP_U32NUMBER &&
op->type != OP_OP_NUMBER)
op->type != OP_OP_NUMBER && op->type != OP_SIZE)
{
*((char **)op->dataptr) = option_data;
}
@ -3804,6 +3807,7 @@ for (i = 1; i < argc; i++)
{
unsigned long int n = decode_number(option_data, op, longop);
if (op->type == OP_U32NUMBER) *((uint32_t *)op->dataptr) = n;
else if (op->type == OP_SIZE) *((PCRE2_SIZE *)op->dataptr) = n;
else *((int *)op->dataptr) = n;
}
}
@ -3839,6 +3843,7 @@ if (output_text != NULL &&
/* Put limits into the match data block. */
if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
if (match_limit > 0) pcre2_set_match_limit(match_context, match_limit);
if (depth_limit > 0) pcre2_set_depth_limit(match_context, depth_limit);

View File

@ -588,6 +588,7 @@ static modstruct modlist[] = {
{ "get", MOD_DAT, MOD_NN, DO(get_numbers), DO(get_names) },
{ "getall", MOD_DAT, MOD_CTL, CTL_GETALL, DO(control) },
{ "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) },
{ "heap_limit", MOD_CTM, MOD_INT, 0, MO(heap_limit) },
{ "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) },
{ "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) },
{ "jit", MOD_PAT, MOD_IND, 7, PO(jit) },
@ -1207,6 +1208,14 @@ are supported. */
else \
pcre2_set_depth_limit_32(G(a,32),b)
/* Set the heap limit on a match context by calling the pcre2_set_heap_limit
function for the code unit width selected by test_mode: 8-bit, 16-bit, or
otherwise 32-bit. The expansion is an unbraced if/else chain with no trailing
semicolon, so it must be used as a complete statement. */
#define PCRE2_SET_HEAP_LIMIT(a,b) \
if (test_mode == PCRE8_MODE) \
pcre2_set_heap_limit_8(G(a,8),b); \
else if (test_mode == PCRE16_MODE) \
pcre2_set_heap_limit_16(G(a,16),b); \
else \
pcre2_set_heap_limit_32(G(a,32),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) \
if (test_mode == PCRE8_MODE) \
pcre2_set_match_limit_8(G(a,8),b); \
@ -1643,6 +1652,12 @@ the three different cases. */
else \
G(pcre2_set_depth_limit_,BITTWO)(G(a,BITTWO),b)
/* Two-width form: set the heap limit on a match context using the BITONE
variant of pcre2_set_heap_limit when test_mode matches the BITONE mode, and the
BITTWO variant otherwise. The expansion is an unbraced if/else with no trailing
semicolon, so it must be used as a complete statement. */
#define PCRE2_SET_HEAP_LIMIT(a,b) \
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
G(pcre2_set_heap_limit_,BITONE)(G(a,BITONE),b); \
else \
G(pcre2_set_heap_limit_,BITTWO)(G(a,BITTWO),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) \
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
G(pcre2_set_match_limit_,BITONE)(G(a,BITONE),b); \
@ -1856,6 +1871,7 @@ the three different cases. */
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
pcre2_set_compile_recursion_guard_8(G(a,8),b,c)
#define PCRE2_SET_DEPTH_LIMIT(a,b) pcre2_set_depth_limit_8(G(a,8),b)
#define PCRE2_SET_HEAP_LIMIT(a,b) pcre2_set_heap_limit_8(G(a,8),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b)
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_8(G(a,8),b)
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_8(G(a,8),b)
@ -1952,6 +1968,7 @@ the three different cases. */
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
pcre2_set_compile_recursion_guard_16(G(a,16),b,c)
#define PCRE2_SET_DEPTH_LIMIT(a,b) pcre2_set_depth_limit_16(G(a,16),b)
#define PCRE2_SET_HEAP_LIMIT(a,b) pcre2_set_heap_limit_16(G(a,16),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b)
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_16(G(a,16),b)
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_16(G(a,16),b)
@ -2048,6 +2065,7 @@ the three different cases. */
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
#define PCRE2_SET_DEPTH_LIMIT(a,b) pcre2_set_depth_limit_32(G(a,32),b)
#define PCRE2_SET_HEAP_LIMIT(a,b) pcre2_set_heap_limit_32(G(a,32),b)
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b)
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_32(G(a,32),b)
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_32(G(a,32),b)
@ -4040,14 +4058,28 @@ if ((pat_patctl.control & CTL_INFO) != 0)
{
void *nametable;
uint8_t *start_bits;
BOOL match_limit_set, depth_limit_set;
BOOL heap_limit_set, match_limit_set, depth_limit_set;
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
hasbackslashc, hascrorlf, jchanged, last_ctype, last_cunit, match_empty,
match_limit, minlength, nameentrysize, namecount, newline_convention,
depth_limit;
depth_limit, heap_limit, match_limit, minlength, nameentrysize, namecount,
newline_convention;
/* These info requests may return PCRE2_ERROR_UNSET. */
switch(pattern_info(PCRE2_INFO_HEAPLIMIT, &heap_limit, TRUE))
{
case 0:
heap_limit_set = TRUE;
break;
case PCRE2_ERROR_UNSET:
heap_limit_set = FALSE;
break;
default:
return PR_ABEND;
}
switch(pattern_info(PCRE2_INFO_MATCHLIMIT, &match_limit, TRUE))
{
case 0:
@ -4106,6 +4138,9 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (maxlookbehind > 0)
fprintf(outfile, "Max lookbehind = %d\n", maxlookbehind);
if (heap_limit_set)
fprintf(outfile, "Heap limit = %u\n", heap_limit);
if (match_limit_set)
fprintf(outfile, "Match limit = %u\n", match_limit);
@ -5353,10 +5388,15 @@ uint32_t max = UINT32_MAX;
PCRE2_SET_MATCH_LIMIT(dat_context, max);
PCRE2_SET_DEPTH_LIMIT(dat_context, max);
PCRE2_SET_HEAP_LIMIT(dat_context, max);
for (;;)
{
if (errnumber == PCRE2_ERROR_MATCHLIMIT)
if (errnumber == PCRE2_ERROR_HEAPLIMIT)
{
PCRE2_SET_HEAP_LIMIT(dat_context, mid);
}
else if (errnumber == PCRE2_ERROR_MATCHLIMIT)
{
PCRE2_SET_MATCH_LIMIT(dat_context, mid);
}
@ -5393,13 +5433,23 @@ for (;;)
capcount == PCRE2_ERROR_NOMATCH ||
capcount == PCRE2_ERROR_PARTIAL)
{
/* If we've not hit the error with a heap limit less than the size of the
initial stack frame vector, the heap is not being used, so the minimum
limit is zero; there's no need to go on. The other limits are always
greater than zero. */
if (errnumber == PCRE2_ERROR_HEAPLIMIT && mid < START_FRAMES_SIZE/1024)
{
fprintf(outfile, "Minimum %s limit = 0\n", msg);
break;
}
if (mid == min + 1)
{
fprintf(outfile, "Minimum %s limit = %d\n", msg, mid);
break;
}
max = mid;
mid = (min + mid)/2;
}
max = mid;
mid = (min + max)/2;
}
else break; /* Some other error */
}
@ -6662,20 +6712,32 @@ else for (gmatched = 0;; gmatched++)
(double)CLOCKS_PER_SEC);
}
/* Find the match and depth limits if requested. The match limit is not
relevant for DFA matching and the depth limit is not relevant for JIT. */
/* Find the heap, match and depth limits if requested. The match and heap
limits are not relevant for DFA matching and the depth limit is not relevant
for JIT. */
if ((dat_datctl.control & CTL_FINDLIMITS) != 0)
{
if ((dat_datctl.control & CTL_DFA) == 0)
{
if (FLD(compiled_code, executable_jit) == NULL ||
(dat_datctl.options & PCRE2_NO_JIT) != 0)
{
capcount = check_match_limit(pp, arg_ulen, PCRE2_ERROR_HEAPLIMIT,
"heap");
}
capcount = check_match_limit(pp, arg_ulen, PCRE2_ERROR_MATCHLIMIT,
"match");
}
else capcount = 0;
if (FLD(compiled_code, executable_jit) == NULL ||
(dat_datctl.options & PCRE2_NO_JIT) != 0 ||
(dat_datctl.control & CTL_DFA) != 0)
{
capcount = check_match_limit(pp, arg_ulen, PCRE2_ERROR_DEPTHLIMIT,
"depth");
}
}
/* Otherwise just run a single match, setting up a callout if required (the
@ -7402,6 +7464,8 @@ printf(" \\C is supported\n");
printf(" Internal link size = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
printf(" Parentheses nest limit = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_HEAPLIMIT, &optval);
printf(" Default heap limit = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_MATCHLIMIT, &optval);
printf(" Default match limit = %d\n", optval);
(void)PCRE2_CONFIG(PCRE2_CONFIG_DEPTHLIMIT, &optval);

13
testdata/testoutput15 vendored
View File

@ -12,11 +12,13 @@ Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzbbbbbb\=find_limits
Minimum heap limit = 0
Minimum match limit = 7
Minimum depth limit = 7
0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazz
1: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaz\=find_limits
Minimum heap limit = 0
Minimum match limit = 20481
Minimum depth limit = 30
No match
@ -26,6 +28,7 @@ Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/* this is a C style comment */\=find_limits
Minimum heap limit = 0
Minimum match limit = 64
Minimum depth limit = 7
0: /* this is a C style comment */
@ -33,21 +36,25 @@ Minimum depth limit = 7
/^(?>a)++/
aa\=find_limits
Minimum heap limit = 0
Minimum match limit = 5
Minimum depth limit = 3
0: aa
aaaaaaaaa\=find_limits
Minimum heap limit = 0
Minimum match limit = 12
Minimum depth limit = 3
0: aaaaaaaaa
/(a)(?1)++/
aa\=find_limits
Minimum heap limit = 0
Minimum match limit = 7
Minimum depth limit = 5
0: aa
1: a
aaaaaaaaa\=find_limits
Minimum heap limit = 0
Minimum match limit = 21
Minimum depth limit = 5
0: aaaaaaaaa
@ -55,30 +62,35 @@ Minimum depth limit = 5
/a(?:.)*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum heap limit = 0
Minimum match limit = 24
Minimum depth limit = 3
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum heap limit = 0
Minimum match limit = 66
Minimum depth limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/a(?:.(*THEN:ABC))*?a/ims
abbbbbbbbbbbbbbbbbbbbba\=find_limits
Minimum heap limit = 0
Minimum match limit = 66
Minimum depth limit = 45
0: abbbbbbbbbbbbbbbbbbbbba
/^(?>a+)(?>b+)(?>c+)(?>d+)(?>e+)/
aabbccddee\=find_limits
Minimum heap limit = 0
Minimum match limit = 7
Minimum depth limit = 7
0: aabbccddee
/^(?>(a+))(?>(b+))(?>(c+))(?>(d+))(?>(e+))/
aabbccddee\=find_limits
Minimum heap limit = 0
Minimum match limit = 12
Minimum depth limit = 12
0: aabbccddee
@ -90,6 +102,7 @@ Minimum depth limit = 12
/^(?>(a+))(?>b+)(?>(c+))(?>d+)(?>(e+))/
aabbccddee\=find_limits
Minimum heap limit = 0
Minimum match limit = 10
Minimum depth limit = 10
0: aabbccddee

View File

@ -15609,7 +15609,7 @@ Last code unit = 'c'
Subject length lower bound = 4
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -64: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
Error -2: partial match
Error -1: no match