Add user data to recursion guard; get ready for RC1 (again)
This commit is contained in:
parent
d1f5dd5bf2
commit
9fcdf2cc6f
|
@ -1,7 +1,7 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
|
||||
Version 10.00 24-November-2014
|
||||
Version 10.00 28-November-2014
|
||||
------------------------------
|
||||
|
||||
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||
|
|
2
NEWS
2
NEWS
|
@ -1,7 +1,7 @@
|
|||
News about PCRE2 releases
|
||||
-------------------------
|
||||
|
||||
Version 10.00 24-November-2014
|
||||
Version 10.00 28-November-2014
|
||||
------------------------------
|
||||
|
||||
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||
|
|
|
@ -11,7 +11,7 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
|
|||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [00])
|
||||
m4_define(pcre2_prerelease, [-RC1])
|
||||
m4_define(pcre2_date, [2014-11-24])
|
||||
m4_define(pcre2_date, [2014-11-28])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
|
|
@ -39,14 +39,12 @@ code units; for other types of data it is in bytes.
|
|||
<P>
|
||||
If <b>where</b> is not NULL, for PCRE2_CONFIG_JITTARGET,
|
||||
PCRE2_CONFIG_UNICODE_VERSION, and PCRE2_CONFIG_VERSION it must point to a
|
||||
buffer that is large enough to hold the string. For PCRE2_CONFIG_MATCHLIMIT,
|
||||
PCRE2_CONFIG_PARENSLIMIT, and PCRE2_CONFIG_RECURSIONLIMIT it must point to an
|
||||
unsigned long int variable, and for all other codes to an int variable. The
|
||||
available codes are:
|
||||
buffer that is large enough to hold the string. For all other codes it must
|
||||
point to a uint32_t integer variable. The available codes are:
|
||||
<pre>
|
||||
PCRE2_CONFIG_BSR Indicates what \R matches by default:
|
||||
0 all Unicode line endings
|
||||
1 CR, LF, or CRLF only
|
||||
PCRE2_BSR_UNICODE
|
||||
PCRE2_BSR_ANYCRLF
|
||||
PCRE2_CONFIG_JIT Availability of just-in-time compiler
|
||||
support (1=yes 0=no)
|
||||
PCRE2_CONFIG_JITTARGET Information about the target archi-
|
||||
|
@ -54,11 +52,11 @@ available codes are:
|
|||
PCRE2_CONFIG_LINKSIZE Configured internal link size (2, 3, 4)
|
||||
PCRE2_CONFIG_MATCHLIMIT Default internal resource limit
|
||||
PCRE2_CONFIG_NEWLINE Code for the default newline sequence:
|
||||
1 for CR
|
||||
2 for LF
|
||||
3 for CRLF
|
||||
4 for ANY
|
||||
5 for ANYCRLF
|
||||
PCRE2_NEWLINE_CR
|
||||
PCRE2_NEWLINE_LF
|
||||
PCRE2_NEWLINE_CRLF
|
||||
PCRE2_NEWLINE_ANY
|
||||
PCRE2_NEWLINE_ANYCRLF
|
||||
PCRE2_CONFIG_PARENSLIMIT Default parentheses nesting limit
|
||||
PCRE2_CONFIG_RECURSIONLIMIT Internal recursion depth limit
|
||||
PCRE2_CONFIG_STACKRECURSE Recursion implementation (1=stack
|
||||
|
|
|
@ -78,7 +78,7 @@ the requested information, in bytes. The following information is available:
|
|||
The <i>where</i> argument must point to an unsigned 32-bit integer (uint32_t
|
||||
variable), except for the following <i>what</i> values:
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTBITMAP const uint8_t
|
||||
PCRE2_INFO_FIRSTBITMAP const uint8_t *
|
||||
PCRE2_INFO_JITSIZE size_t
|
||||
PCRE2_INFO_NAMETABLE PCRE2_SPTR
|
||||
PCRE2_INFO_SIZE size_t
|
||||
|
|
|
@ -20,7 +20,7 @@ SYNOPSIS
|
|||
</P>
|
||||
<P>
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t));</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
@ -28,11 +28,12 @@ DESCRIPTION
|
|||
<P>
|
||||
This function defines, within a compile context, a function that is called
|
||||
whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a
|
||||
pattern. The argument to the function gives the current depth of parenthesis
|
||||
nesting. The function should return zero if all is well, or non-zero to force
|
||||
an error. This feature is provided so that applications can check the available
|
||||
system stack space, in order to avoid running out. The result of this function
|
||||
is always zero.
|
||||
pattern. The first argument to the function gives the current depth of
|
||||
parenthesis nesting, and the second is user data that is supplied when the
|
||||
function is set up. The guard function should return zero if all is well, or
|
||||
non-zero to force an error. This feature is provided so that applications can
|
||||
check the available system stack space, in order to avoid running out. The
|
||||
result of <b>pcre2_set_compile_recursion_guard()</b> is always zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -24,31 +24,32 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC9" href="#SEC9">PCRE2 NATIVE API AUXILIARY FUNCTIONS</a>
|
||||
<li><a name="TOC10" href="#SEC10">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a>
|
||||
<li><a name="TOC11" href="#SEC11">PCRE2 API OVERVIEW</a>
|
||||
<li><a name="TOC12" href="#SEC12">NEWLINES</a>
|
||||
<li><a name="TOC13" href="#SEC13">MULTITHREADING</a>
|
||||
<li><a name="TOC14" href="#SEC14">PCRE2 CONTEXTS</a>
|
||||
<li><a name="TOC15" href="#SEC15">CHECKING BUILD-TIME OPTIONS</a>
|
||||
<li><a name="TOC16" href="#SEC16">COMPILING A PATTERN</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMPILATION ERROR CODES</a>
|
||||
<li><a name="TOC18" href="#SEC18">JUST-IN-TIME (JIT) COMPILATION</a>
|
||||
<li><a name="TOC19" href="#SEC19">LOCALE SUPPORT</a>
|
||||
<li><a name="TOC20" href="#SEC20">INFORMATION ABOUT A COMPILED PATTERN</a>
|
||||
<li><a name="TOC21" href="#SEC21">THE MATCH DATA BLOCK</a>
|
||||
<li><a name="TOC22" href="#SEC22">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
||||
<li><a name="TOC23" href="#SEC23">NEWLINE HANDLING WHEN MATCHING</a>
|
||||
<li><a name="TOC24" href="#SEC24">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC25" href="#SEC25">OTHER INFORMATION ABOUT A MATCH</a>
|
||||
<li><a name="TOC26" href="#SEC26">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
||||
<li><a name="TOC27" href="#SEC27">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
||||
<li><a name="TOC28" href="#SEC28">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC29" href="#SEC29">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
||||
<li><a name="TOC30" href="#SEC30">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
||||
<li><a name="TOC31" href="#SEC31">DUPLICATE SUBPATTERN NAMES</a>
|
||||
<li><a name="TOC32" href="#SEC32">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
||||
<li><a name="TOC33" href="#SEC33">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||
<li><a name="TOC34" href="#SEC34">SEE ALSO</a>
|
||||
<li><a name="TOC35" href="#SEC35">AUTHOR</a>
|
||||
<li><a name="TOC36" href="#SEC36">REVISION</a>
|
||||
<li><a name="TOC12" href="#SEC12">STRING LENGTHS AND OFFSETS</a>
|
||||
<li><a name="TOC13" href="#SEC13">NEWLINES</a>
|
||||
<li><a name="TOC14" href="#SEC14">MULTITHREADING</a>
|
||||
<li><a name="TOC15" href="#SEC15">PCRE2 CONTEXTS</a>
|
||||
<li><a name="TOC16" href="#SEC16">CHECKING BUILD-TIME OPTIONS</a>
|
||||
<li><a name="TOC17" href="#SEC17">COMPILING A PATTERN</a>
|
||||
<li><a name="TOC18" href="#SEC18">COMPILATION ERROR CODES</a>
|
||||
<li><a name="TOC19" href="#SEC19">JUST-IN-TIME (JIT) COMPILATION</a>
|
||||
<li><a name="TOC20" href="#SEC20">LOCALE SUPPORT</a>
|
||||
<li><a name="TOC21" href="#SEC21">INFORMATION ABOUT A COMPILED PATTERN</a>
|
||||
<li><a name="TOC22" href="#SEC22">THE MATCH DATA BLOCK</a>
|
||||
<li><a name="TOC23" href="#SEC23">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
||||
<li><a name="TOC24" href="#SEC24">NEWLINE HANDLING WHEN MATCHING</a>
|
||||
<li><a name="TOC25" href="#SEC25">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC26" href="#SEC26">OTHER INFORMATION ABOUT A MATCH</a>
|
||||
<li><a name="TOC27" href="#SEC27">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
||||
<li><a name="TOC28" href="#SEC28">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
||||
<li><a name="TOC29" href="#SEC29">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC30" href="#SEC30">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
||||
<li><a name="TOC31" href="#SEC31">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
||||
<li><a name="TOC32" href="#SEC32">DUPLICATE SUBPATTERN NAMES</a>
|
||||
<li><a name="TOC33" href="#SEC33">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
||||
<li><a name="TOC34" href="#SEC34">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||
<li><a name="TOC35" href="#SEC35">SEE ALSO</a>
|
||||
<li><a name="TOC36" href="#SEC36">AUTHOR</a>
|
||||
<li><a name="TOC37" href="#SEC37">REVISION</a>
|
||||
</ul>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
|
@ -148,7 +149,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t));</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS</a><br>
|
||||
<P>
|
||||
|
@ -164,7 +165,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -424,8 +425,18 @@ matched.
|
|||
Finally, there are functions for finding out information about a compiled
|
||||
pattern (<b>pcre2_pattern_info()</b>) and about the configuration with which
|
||||
PCRE2 was built (<b>pcre2_config()</b>).
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">STRING LENGTHS AND OFFSETS</a><br>
|
||||
<P>
|
||||
The PCRE2 API uses string lengths and offsets into strings of code units in
|
||||
several places. These values are always of type PCRE2_SIZE, which is an
|
||||
unsigned integer type, currently always defined as <i>size_t</i>. The largest
|
||||
value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
|
||||
as a special indicator for zero-terminated strings and unset offsets.
|
||||
Therefore, the longest string that can be handled is one less than this
|
||||
maximum.
|
||||
<a name="newlines"></a></P>
|
||||
<br><a name="SEC12" href="#TOC1">NEWLINES</a><br>
|
||||
<br><a name="SEC13" href="#TOC1">NEWLINES</a><br>
|
||||
<P>
|
||||
PCRE2 supports five different conventions for indicating line breaks in
|
||||
strings: a single CR (carriage return) character, a single LF (linefeed)
|
||||
|
@ -460,7 +471,7 @@ The choice of newline convention does not affect the interpretation of
|
|||
the \n or \r escape sequences, nor does it affect what \R matches; this has
|
||||
its own separate convention.
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">MULTITHREADING</a><br>
|
||||
<br><a name="SEC14" href="#TOC1">MULTITHREADING</a><br>
|
||||
<P>
|
||||
In a multithreaded application it is important to keep thread-specific data
|
||||
separate from data that can be shared between threads. The PCRE2 library code
|
||||
|
@ -505,7 +516,7 @@ storing the results of a match. This includes details of what was matched, as
|
|||
well as additional information such as the name of a (*MARK) setting. Each
|
||||
thread must provide its own version of this memory.
|
||||
</P>
|
||||
<br><a name="SEC14" href="#TOC1">PCRE2 CONTEXTS</a><br>
|
||||
<br><a name="SEC15" href="#TOC1">PCRE2 CONTEXTS</a><br>
|
||||
<P>
|
||||
Some PCRE2 functions have a lot of parameters, many of which are used only by
|
||||
specialist applications, for example, those that use custom memory management
|
||||
|
@ -636,7 +647,7 @@ This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
|
|||
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
||||
using up too much system stack when being compiled.
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t));</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
There is at least one application that runs PCRE2 in threads with very limited
|
||||
|
@ -644,8 +655,14 @@ system stack, where running out of stack is to be avoided at all costs. The
|
|||
parenthesis limit above cannot take account of how much stack is actually
|
||||
available. For a finer control, you can supply a function that is called
|
||||
whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a
|
||||
pattern. The argument to the function gives the current depth of nesting. The
|
||||
function should return zero if all is well, or non-zero to force an error.
|
||||
pattern. This function can check the actual stack size (or anything else that
|
||||
it wants to, of course).
|
||||
</P>
|
||||
<P>
|
||||
The first argument to the guard function gives the current depth of
|
||||
nesting, and the second is user data that is set up by the last argument of
|
||||
<b>pcre2_set_compile_recursion_guard()</b>. The guard function should return
|
||||
zero if all is well, or non-zero to force an error.
|
||||
<a name="matchcontext"></a></P>
|
||||
<br><b>
|
||||
The match context
|
||||
|
@ -679,7 +696,7 @@ A match context is created with default values for its parameters. These can
|
|||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -780,7 +797,7 @@ exit so that they can be re-used when possible during the match. In the absence
|
|||
of these functions, the normal custom memory management functions are used, if
|
||||
supplied, otherwise the system functions.
|
||||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
|
||||
<br><a name="SEC16" href="#TOC1">CHECKING BUILD-TIME OPTIONS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
</P>
|
||||
|
@ -807,15 +824,15 @@ available:
|
|||
<pre>
|
||||
PCRE2_CONFIG_BSR
|
||||
</pre>
|
||||
The output is an integer whose value indicates what character sequences the \R
|
||||
escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R
|
||||
matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means
|
||||
that \R matches only CR, LF, or CRLF. The default can be overridden when a
|
||||
pattern is compiled.
|
||||
The output is a uint32_t integer whose value indicates what character
|
||||
sequences the \R escape sequence matches by default. A value of
|
||||
PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a
|
||||
value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The
|
||||
default can be overridden when a pattern is compiled.
|
||||
<pre>
|
||||
PCRE2_CONFIG_JIT
|
||||
</pre>
|
||||
The output is an integer that is set to one if support for just-in-time
|
||||
The output is a uint32_t integer that is set to one if support for just-in-time
|
||||
compiling is available; otherwise it is set to zero.
|
||||
<pre>
|
||||
PCRE2_CONFIG_JITTARGET
|
||||
|
@ -831,12 +848,13 @@ for the terminating zero.
|
|||
<pre>
|
||||
PCRE2_CONFIG_LINKSIZE
|
||||
</pre>
|
||||
The output is an integer that contains the number of bytes used for internal
|
||||
linkage in compiled regular expressions. When PCRE2 is configured, the value
|
||||
can be set to 2, 3, or 4, with the default being 2. This is the value that is
|
||||
returned by <b>pcre2_config()</b>. However, when the 16-bit library is compiled,
|
||||
a value of 3 is rounded up to 4, and when the 32-bit library is compiled,
|
||||
internal linkages always use 4 bytes, so the configured value is not relevant.
|
||||
The output is a uint32_t integer that contains the number of bytes used for
|
||||
internal linkage in compiled regular expressions. When PCRE2 is configured, the
|
||||
value can be set to 2, 3, or 4, with the default being 2. This is the value
|
||||
that is returned by <b>pcre2_config()</b>. However, when the 16-bit library is
|
||||
compiled, a value of 3 is rounded up to 4, and when the 32-bit library is
|
||||
compiled, internal linkages always use 4 bytes, so the configured value is not
|
||||
relevant.
|
||||
</P>
|
||||
<P>
|
||||
The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all
|
||||
|
@ -846,14 +864,14 @@ be compiled by those two libraries, but at the expense of slower matching.
|
|||
<pre>
|
||||
PCRE2_CONFIG_MATCHLIMIT
|
||||
</pre>
|
||||
The output is an unsigned long integer that gives the default limit for the
|
||||
number of internal matching function calls in a <b>pcre2_match()</b> execution.
|
||||
Further details are given with <b>pcre2_match()</b> below.
|
||||
The output is a uint32_t integer that gives the default limit for the number of
|
||||
internal matching function calls in a <b>pcre2_match()</b> execution. Further
|
||||
details are given with <b>pcre2_match()</b> below.
|
||||
<pre>
|
||||
PCRE2_CONFIG_NEWLINE
|
||||
</pre>
|
||||
The output is an integer whose value specifies the default character sequence
|
||||
that is recognized as meaning "newline". The values are:
|
||||
The output is a uint32_t integer whose value specifies the default character
|
||||
sequence that is recognized as meaning "newline". The values are:
|
||||
<pre>
|
||||
PCRE2_NEWLINE_CR Carriage return (CR)
|
||||
PCRE2_NEWLINE_LF Linefeed (LF)
|
||||
|
@ -866,7 +884,7 @@ operating system.
|
|||
<pre>
|
||||
PCRE2_CONFIG_PARENSLIMIT
|
||||
</pre>
|
||||
The output is an unsigned long integer that gives the maximum depth of nesting
|
||||
The output is a uint32_t integer that gives the maximum depth of nesting
|
||||
of parentheses (of any kind) in a pattern. This limit is imposed to cap the
|
||||
amount of system stack used when a pattern is compiled. It is specified when
|
||||
PCRE2 is built; the default is 250. This limit does not take into account the
|
||||
|
@ -875,16 +893,15 @@ over compilation stack usage, see <b>pcre2_set_compile_recursion_guard()</b>.
|
|||
<pre>
|
||||
PCRE2_CONFIG_RECURSIONLIMIT
|
||||
</pre>
|
||||
The output is an unsigned long integer that gives the default limit for the
|
||||
depth of recursion when calling the internal matching function in a
|
||||
<b>pcre2_match()</b> execution. Further details are given with
|
||||
<b>pcre2_match()</b> below.
|
||||
The output is a uint32_t integer that gives the default limit for the depth of
|
||||
recursion when calling the internal matching function in a <b>pcre2_match()</b>
|
||||
execution. Further details are given with <b>pcre2_match()</b> below.
|
||||
<pre>
|
||||
PCRE2_CONFIG_STACKRECURSE
|
||||
</pre>
|
||||
The output is an integer that is set to one if internal recursion when running
|
||||
<b>pcre2_match()</b> is implemented by recursive function calls that use the
|
||||
system stack to remember their state. This is the usual way that PCRE2 is
|
||||
The output is a uint32_t integer that is set to one if internal recursion when
|
||||
running <b>pcre2_match()</b> is implemented by recursive function calls that use
|
||||
the system stack to remember their state. This is the usual way that PCRE2 is
|
||||
compiled. The output is zero if PCRE2 was compiled to use blocks of data on the
|
||||
heap instead of recursive function calls.
|
||||
<pre>
|
||||
|
@ -900,8 +917,8 @@ string plus one unit for the terminating zero.
|
|||
<pre>
|
||||
PCRE2_CONFIG_UNICODE
|
||||
</pre>
|
||||
The output is an integer that is set to one if Unicode support is available;
|
||||
otherwise it is set to zero. Unicode support implies UTF support.
|
||||
The output is a uint32_t integer that is set to one if Unicode support is
|
||||
available; otherwise it is set to zero. Unicode support implies UTF support.
|
||||
<pre>
|
||||
PCRE2_CONFIG_VERSION
|
||||
</pre>
|
||||
|
@ -912,7 +929,7 @@ the PCRE2 version string, zero-terminated. The number of code units used is
|
|||
returned. This is the length of the string plus one unit for the terminating
|
||||
zero.
|
||||
<a name="compiling"></a></P>
|
||||
<br><a name="SEC16" href="#TOC1">COMPILING A PATTERN</a><br>
|
||||
<br><a name="SEC17" href="#TOC1">COMPILING A PATTERN</a><br>
|
||||
<P>
|
||||
<b>pcre2_code *pcre2_compile(PCRE2_SPTR <i>pattern</i>, PCRE2_SIZE <i>length</i>,</b>
|
||||
<b> uint32_t <i>options</i>, int *<i>errorcode</i>, PCRE2_SIZE *<i>erroroffset,</i></b>
|
||||
|
@ -1267,7 +1284,7 @@ the behaviour of PCRE2 are given in the
|
|||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">COMPILATION ERROR CODES</a><br>
|
||||
<br><a name="SEC18" href="#TOC1">COMPILATION ERROR CODES</a><br>
|
||||
<P>
|
||||
There are over 80 positive error codes that <b>pcre2_compile()</b> may return if
|
||||
it finds an error in the pattern. There are also some negative error codes that
|
||||
|
@ -1277,7 +1294,7 @@ are used for invalid UTF strings. These are the same as given by
|
|||
page. The <b>pcre2_get_error_message()</b> function can be called to obtain a
|
||||
textual error message from any error code.
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">JUST-IN-TIME (JIT) COMPILATION</a><br>
|
||||
<br><a name="SEC19" href="#TOC1">JUST-IN-TIME (JIT) COMPILATION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_jit_compile(pcre2_code *<i>code</i>, uint32_t <i>options</i>);</b>
|
||||
<br>
|
||||
|
@ -1315,7 +1332,7 @@ patterns to be analyzed, and for one-off matches and simple patterns the
|
|||
benefit of faster execution might be offset by a much slower compilation time.
|
||||
Most, but not all patterns can be optimized by the JIT compiler.
|
||||
<a name="localesupport"></a></P>
|
||||
<br><a name="SEC19" href="#TOC1">LOCALE SUPPORT</a><br>
|
||||
<br><a name="SEC20" href="#TOC1">LOCALE SUPPORT</a><br>
|
||||
<P>
|
||||
PCRE2 handles caseless matching, and determines whether characters are letters,
|
||||
digits, or whatever, by reference to a set of tables, indexed by character code
|
||||
|
@ -1371,7 +1388,7 @@ is saved with the compiled pattern, and the same tables are used by
|
|||
compilation, and matching all happen in the same locale, but different patterns
|
||||
can be processed in different locales.
|
||||
<a name="infoaboutpattern"></a></P>
|
||||
<br><a name="SEC20" href="#TOC1">INFORMATION ABOUT A COMPILED PATTERN</a><br>
|
||||
<br><a name="SEC21" href="#TOC1">INFORMATION ABOUT A COMPILED PATTERN</a><br>
|
||||
<P>
|
||||
<b>int pcre2_pattern_info(const pcre2_code *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
</P>
|
||||
|
@ -1660,7 +1677,7 @@ getting memory in which to place the compiled data is the value returned by
|
|||
this option plus the size of the <b>pcre2_code</b> structure. Processing a
|
||||
pattern with the JIT compiler does not alter the value returned by this option.
|
||||
<a name="matchdatablock"></a></P>
|
||||
<br><a name="SEC21" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
|
@ -1712,7 +1729,7 @@ and
|
|||
<a href="#matchotherdata">other match data</a>
|
||||
below.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
||||
<br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -1926,7 +1943,7 @@ examples, in the
|
|||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
||||
<P>
|
||||
When PCRE2 is built, a default newline convention is set; this is usually the
|
||||
standard convention for the operating system. The default can be overridden in
|
||||
|
@ -1961,7 +1978,7 @@ LF in the characters that it matches.
|
|||
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
||||
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
||||
<a name="matchedstrings"></a></P>
|
||||
<br><a name="SEC24" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
||||
<P>
|
||||
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
|
@ -2051,7 +2068,7 @@ parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by
|
|||
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
||||
had.
|
||||
<a name="matchotherdata"></a></P>
|
||||
<br><a name="SEC25" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
||||
<P>
|
||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
|
@ -2081,7 +2098,7 @@ UTF character when UTF checking fails. Details are given in the
|
|||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
<a name="errorlist"></a></P>
|
||||
<br><a name="SEC26" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||
<br><a name="SEC27" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||
<P>
|
||||
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
||||
converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative
|
||||
|
@ -2190,7 +2207,7 @@ is attempted.
|
|||
</pre>
|
||||
The internal recursion limit was reached.
|
||||
<a name="extractbynumber"></a></P>
|
||||
<br><a name="SEC27" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> unsigned int <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
||||
|
@ -2262,7 +2279,7 @@ no capturing group of that number in the pattern, or because the group with
|
|||
that number did not participate in the match, or because the ovector was too
|
||||
small to capture that group.
|
||||
</P>
|
||||
<br><a name="SEC28" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
||||
|
@ -2297,7 +2314,7 @@ can be distinguished from a genuine zero-length substring by inspecting the
|
|||
appropriate offset in the ovector, which contain PCRE2_UNSET for unset
|
||||
substrings.
|
||||
<a name="extractbyname"></a></P>
|
||||
<br><a name="SEC29" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
||||
<br><a name="SEC30" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
||||
<b> PCRE2_SPTR <i>name</i>);</b>
|
||||
|
@ -2349,7 +2366,7 @@ names are not included in the compiled code. The matching process uses only
|
|||
numbers. For this reason, the use of different names for subpatterns of the
|
||||
same number causes an error at compile time.
|
||||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -2410,7 +2427,7 @@ straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
|||
replacement string (unrecognized sequence following a dollar sign), and
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
||||
</P>
|
||||
<br><a name="SEC31" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<br><a name="SEC32" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
||||
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
||||
|
@ -2455,7 +2472,7 @@ The format of the name table is described above in the section entitled
|
|||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data.
|
||||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||
<br><a name="SEC33" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||
<P>
|
||||
The traditional matching function uses a similar algorithm to Perl, which stops
|
||||
when it finds the first match at a given point in the subject. If you want to
|
||||
|
@ -2473,7 +2490,7 @@ substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try
|
|||
other alternatives. Ultimately, when it runs out of matches,
|
||||
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
||||
<a name="dfamatch"></a></P>
|
||||
<br><a name="SEC33" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
||||
<br><a name="SEC34" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -2647,13 +2664,13 @@ some plausibility checks are made on the contents of the workspace, which
|
|||
should contain data about the previous partial match. If any of these checks
|
||||
fail, this error is given.
|
||||
</P>
|
||||
<br><a name="SEC34" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC35" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
||||
<b>pcre2sample</b>(3), <b>pcre2stack</b>(3), <b>pcre2unicode</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC35" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC36" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -2662,9 +2679,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC36" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC37" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 26 November 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -26,7 +26,7 @@ please consult the man page, in case the conversion went wrong.
|
|||
<b>#include &lt;pcre2.h&gt;</b>
|
||||
</P>
|
||||
<P>
|
||||
<b>int (*pcre2_callout)(pcre2_callout_block *);</b>
|
||||
<b>int (*pcre2_callout)(pcre2_callout_block *, void *);</b>
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||
<P>
|
||||
|
@ -137,14 +137,17 @@ callouts such as the example above are obeyed.
|
|||
<P>
|
||||
During matching, when PCRE2 reaches a callout point, if an external function is
|
||||
set in the match context, it is called. This applies to both normal and DFA
|
||||
matching. The only argument to the callout function is a pointer to a
|
||||
<b>pcre2_callout</b> block. This structure contains the following fields:
|
||||
matching. The first argument to the callout function is a pointer to a
|
||||
<b>pcre2_callout</b> block. The second argument is the void * callout data that
|
||||
was supplied when the callout was set up by calling <b>pcre2_set_callout()</b>
|
||||
(see the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation). The callout block structure contains the following fields:
|
||||
<pre>
|
||||
uint32_t <i>version</i>;
|
||||
uint32_t <i>callout_number</i>;
|
||||
uint32_t <i>capture_top</i>;
|
||||
uint32_t <i>capture_last</i>;
|
||||
void *<i>callout_data</i>;
|
||||
PCRE2_SIZE *<i>offset_vector</i>;
|
||||
PCRE2_SPTR <i>mark</i>;
|
||||
PCRE2_SPTR <i>subject</i>;
|
||||
|
@ -203,14 +206,6 @@ substrings have been captured, the value of <i>capture_last</i> is 0. This is
|
|||
always the case for the DFA matching functions.
|
||||
</P>
|
||||
<P>
|
||||
The <i>callout_data</i> field contains a value that is passed to a matching
|
||||
function specifically so that it can be passed back in callouts. It is set in
|
||||
the match context when the callout is set up by calling
|
||||
<b>pcre2_set_callout()</b> (see the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation).
|
||||
</P>
|
||||
<P>
|
||||
The <i>pattern_position</i> field contains the offset to the next item to be
|
||||
matched in the pattern string.
|
||||
</P>
|
||||
|
@ -259,7 +254,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 25 November 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -32,6 +32,21 @@ However, the speed of execution is slower. In the 32-bit library, the internal
|
|||
linkage size is always 4.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length (in code units) of a subject string is one less than the
|
||||
largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned
|
||||
integer type, usually defined as size_t. Its maximum value (that is
|
||||
~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings
|
||||
and unset offsets.
|
||||
</P>
|
||||
<P>
|
||||
Note that when using the traditional matching function, PCRE2 uses recursion to
|
||||
handle subpatterns and indefinite repetition. This means that the available
|
||||
stack space may limit the size of a subject string that can be processed by
|
||||
certain patterns. For a discussion of stack issues, see the
|
||||
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
All values in repeating quantifiers must be less than 65536.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -55,16 +70,6 @@ maximum number of named subpatterns is 10000.
|
|||
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb
|
||||
is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a subject string is the largest number a PCRE2_SIZE
|
||||
variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as
|
||||
size_t. However, when using the traditional matching function, PCRE2 uses
|
||||
recursion to handle subpatterns and indefinite repetition. This means that the
|
||||
available stack space may limit the size of a subject string that can be
|
||||
processed by certain patterns. For a discussion of stack issues, see the
|
||||
<a href="pcre2stack.html"><b>pcre2stack</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
</b><br>
|
||||
|
@ -80,7 +85,7 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 29 September 2014
|
||||
Last updated: 25 November 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
177
doc/pcre2.txt
177
doc/pcre2.txt
|
@ -248,7 +248,7 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS
|
|||
uint32_t value);
|
||||
|
||||
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard_function)(uint32_t));
|
||||
int (*guard_function)(uint32_t, void *), void *user_data);
|
||||
|
||||
|
||||
PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
||||
|
@ -262,7 +262,7 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
|||
void pcre2_match_context_free(pcre2_match_context *mcontext);
|
||||
|
||||
int pcre2_set_callout(pcre2_match_context *mcontext,
|
||||
int (*callout_function)(pcre2_callout_block *),
|
||||
int (*callout_function)(pcre2_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||
|
@ -492,6 +492,17 @@ PCRE2 API OVERVIEW
|
|||
which PCRE2 was built (pcre2_config()).
|
||||
|
||||
|
||||
STRING LENGTHS AND OFFSETS
|
||||
|
||||
The PCRE2 API uses string lengths and offsets into strings of code
|
||||
units in several places. These values are always of type PCRE2_SIZE,
|
||||
which is an unsigned integer type, currently always defined as size_t.
|
||||
The largest value that can be stored in such a type (that is
|
||||
~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated
|
||||
strings and unset offsets. Therefore, the longest string that can be
|
||||
handled is one less than this maximum.
|
||||
|
||||
|
||||
NEWLINES
|
||||
|
||||
PCRE2 supports five different conventions for indicating line breaks in
|
||||
|
@ -694,16 +705,20 @@ PCRE2 CONTEXTS
|
|||
rogue patterns using up too much system stack when being compiled.
|
||||
|
||||
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard_function)(uint32_t));
|
||||
int (*guard_function)(uint32_t, void *), void *user_data);
|
||||
|
||||
There is at least one application that runs PCRE2 in threads with very
|
||||
limited system stack, where running out of stack is to be avoided at
|
||||
all costs. The parenthesis limit above cannot take account of how much
|
||||
stack is actually available. For finer control, you can supply a
|
||||
function that is called whenever pcre2_compile() starts to compile a
|
||||
parenthesized part of a pattern. The argument to the function gives the
|
||||
current depth of nesting. The function should return zero if all is
|
||||
well, or non-zero to force an error.
|
||||
parenthesized part of a pattern. This function can check the actual
|
||||
stack size (or anything else that it wants to, of course).
|
||||
|
||||
The first argument to the guard function gives the current depth of
|
||||
nesting, and the second is user data that is set up by the last argu-
|
||||
ment of pcre2_set_compile_recursion_guard(). The guard function
|
||||
should return zero if all is well, or non-zero to force an error.
|
||||
|
||||
The match context
|
||||
|
||||
|
@ -734,7 +749,7 @@ PCRE2 CONTEXTS
|
|||
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
|
||||
int pcre2_set_callout(pcre2_match_context *mcontext,
|
||||
int (*callout_function)(pcre2_callout_block *),
|
||||
int (*callout_function)(pcre2_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
This sets up a "callout" function, which PCRE2 will call at specified
|
||||
|
@ -853,16 +868,16 @@ CHECKING BUILD-TIME OPTIONS
|
|||
|
||||
PCRE2_CONFIG_BSR
|
||||
|
||||
The output is an integer whose value indicates what character sequences
|
||||
the \R escape sequence matches by default. A value of PCRE2_BSR_UNICODE
|
||||
means that \R matches any Unicode line ending sequence; a value of
|
||||
PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The
|
||||
default can be overridden when a pattern is compiled.
|
||||
The output is a uint32_t integer whose value indicates what character
|
||||
sequences the \R escape sequence matches by default. A value of
|
||||
PCRE2_BSR_UNICODE means that \R matches any Unicode line ending
|
||||
sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR,
|
||||
LF, or CRLF. The default can be overridden when a pattern is compiled.
|
||||
|
||||
PCRE2_CONFIG_JIT
|
||||
|
||||
The output is an integer that is set to one if support for just-in-time
|
||||
compiling is available; otherwise it is set to zero.
|
||||
The output is a uint32_t integer that is set to one if support for
|
||||
just-in-time compiling is available; otherwise it is set to zero.
|
||||
|
||||
PCRE2_CONFIG_JITTARGET
|
||||
|
||||
|
@ -877,13 +892,13 @@ CHECKING BUILD-TIME OPTIONS
|
|||
|
||||
PCRE2_CONFIG_LINKSIZE
|
||||
|
||||
The output is an integer that contains the number of bytes used for
|
||||
internal linkage in compiled regular expressions. When PCRE2 is config-
|
||||
ured, the value can be set to 2, 3, or 4, with the default being 2.
|
||||
This is the value that is returned by pcre2_config(). However, when the
|
||||
16-bit library is compiled, a value of 3 is rounded up to 4, and when
|
||||
the 32-bit library is compiled, internal linkages always use 4 bytes,
|
||||
so the configured value is not relevant.
|
||||
The output is a uint32_t integer that contains the number of bytes used
|
||||
for internal linkage in compiled regular expressions. When PCRE2 is
|
||||
configured, the value can be set to 2, 3, or 4, with the default being
|
||||
2. This is the value that is returned by pcre2_config(). However, when
|
||||
the 16-bit library is compiled, a value of 3 is rounded up to 4, and
|
||||
when the 32-bit library is compiled, internal linkages always use 4
|
||||
bytes, so the configured value is not relevant.
|
||||
|
||||
The default value of 2 for the 8-bit and 16-bit libraries is sufficient
|
||||
for all but the most massive patterns, since it allows the size of the
|
||||
|
@ -893,14 +908,15 @@ CHECKING BUILD-TIME OPTIONS
|
|||
|
||||
PCRE2_CONFIG_MATCHLIMIT
|
||||
|
||||
The output is an unsigned long integer that gives the default limit for
|
||||
the number of internal matching function calls in a pcre2_match() exe-
|
||||
cution. Further details are given with pcre2_match() below.
|
||||
The output is a uint32_t integer that gives the default limit for the
|
||||
number of internal matching function calls in a pcre2_match() execu-
|
||||
tion. Further details are given with pcre2_match() below.
|
||||
|
||||
PCRE2_CONFIG_NEWLINE
|
||||
|
||||
The output is an integer whose value specifies the default character
|
||||
sequence that is recognized as meaning "newline". The values are:
|
||||
The output is a uint32_t integer whose value specifies the default
|
||||
character sequence that is recognized as meaning "newline". The values
|
||||
are:
|
||||
|
||||
PCRE2_NEWLINE_CR Carriage return (CR)
|
||||
PCRE2_NEWLINE_LF Linefeed (LF)
|
||||
|
@ -908,33 +924,34 @@ CHECKING BUILD-TIME OPTIONS
|
|||
PCRE2_NEWLINE_ANY Any Unicode line ending
|
||||
PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF
|
||||
|
||||
The default should normally correspond to the standard sequence for
|
||||
The default should normally correspond to the standard sequence for
|
||||
your operating system.
|
||||
|
||||
PCRE2_CONFIG_PARENSLIMIT
|
||||
|
||||
The output is an unsigned long integer that gives the maximum depth of
|
||||
nesting of parentheses (of any kind) in a pattern. This limit is
|
||||
imposed to cap the amount of system stack used when a pattern is com-
|
||||
piled. It is specified when PCRE2 is built; the default is 250. This
|
||||
limit does not take into account the stack that may already be used by
|
||||
the calling application. For finer control over compilation stack
|
||||
usage, see pcre2_set_compile_recursion_guard().
|
||||
The output is a uint32_t integer that gives the maximum depth of nest-
|
||||
ing of parentheses (of any kind) in a pattern. This limit is imposed to
|
||||
cap the amount of system stack used when a pattern is compiled. It is
|
||||
specified when PCRE2 is built; the default is 250. This limit does not
|
||||
take into account the stack that may already be used by the calling
|
||||
application. For finer control over compilation stack usage, see
|
||||
pcre2_set_compile_recursion_guard().
|
||||
|
||||
PCRE2_CONFIG_RECURSIONLIMIT
|
||||
|
||||
The output is an unsigned long integer that gives the default limit for
|
||||
the depth of recursion when calling the internal matching function in a
|
||||
pcre2_match() execution. Further details are given with pcre2_match()
|
||||
The output is a uint32_t integer that gives the default limit for the
|
||||
depth of recursion when calling the internal matching function in a
|
||||
pcre2_match() execution. Further details are given with pcre2_match()
|
||||
below.
|
||||
|
||||
PCRE2_CONFIG_STACKRECURSE
|
||||
|
||||
The output is an integer that is set to one if internal recursion when
|
||||
running pcre2_match() is implemented by recursive function calls that
|
||||
use the system stack to remember their state. This is the usual way
|
||||
that PCRE2 is compiled. The output is zero if PCRE2 was compiled to use
|
||||
blocks of data on the heap instead of recursive function calls.
|
||||
The output is a uint32_t integer that is set to one if internal recur-
|
||||
sion when running pcre2_match() is implemented by recursive function
|
||||
calls that use the system stack to remember their state. This is the
|
||||
usual way that PCRE2 is compiled. The output is zero if PCRE2 was com-
|
||||
piled to use blocks of data on the heap instead of recursive function
|
||||
calls.
|
||||
|
||||
PCRE2_CONFIG_UNICODE_VERSION
|
||||
|
||||
|
@ -948,8 +965,8 @@ CHECKING BUILD-TIME OPTIONS
|
|||
|
||||
PCRE2_CONFIG_UNICODE
|
||||
|
||||
The output is an integer that is set to one if Unicode support is
|
||||
available; otherwise it is set to zero. Unicode support implies UTF
|
||||
The output is a uint32_t integer that is set to one if Unicode support
|
||||
is available; otherwise it is set to zero. Unicode support implies UTF
|
||||
support.
|
||||
|
||||
PCRE2_CONFIG_VERSION
|
||||
|
@ -2605,7 +2622,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 26 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3076,7 +3093,7 @@ SYNOPSIS
|
|||
|
||||
#include <pcre2.h>
|
||||
|
||||
int (*pcre2_callout)(pcre2_callout_block *);
|
||||
int (*pcre2_callout)(pcre2_callout_block *, void *);
|
||||
|
||||
|
||||
DESCRIPTION
|
||||
|
@ -3183,15 +3200,16 @@ THE CALLOUT INTERFACE
|
|||
|
||||
During matching, when PCRE2 reaches a callout point, if an external
|
||||
function is set in the match context, it is called. This applies to
|
||||
both normal and DFA matching. The only argument to the callout function
|
||||
is a pointer to a pcre2_callout block. This structure contains the fol-
|
||||
lowing fields:
|
||||
both normal and DFA matching. The first argument to the callout func-
|
||||
tion is a pointer to a pcre2_callout block. The second argument is the
|
||||
void * callout data that was supplied when the callout was set up by
|
||||
calling pcre2_set_callout() (see the pcre2api documentation). The call-
|
||||
out block structure contains the following fields:
|
||||
|
||||
uint32_t version;
|
||||
uint32_t callout_number;
|
||||
uint32_t capture_top;
|
||||
uint32_t capture_last;
|
||||
void *callout_data;
|
||||
PCRE2_SIZE *offset_vector;
|
||||
PCRE2_SPTR mark;
|
||||
PCRE2_SPTR subject;
|
||||
|
@ -3242,28 +3260,23 @@ THE CALLOUT INTERFACE
|
|||
substrings. If no substrings have been captured, the value of cap-
|
||||
ture_last is 0. This is always the case for the DFA matching functions.
|
||||
|
||||
The callout_data field contains a value that is passed to a matching
|
||||
function specifically so that it can be passed back in callouts. It is
|
||||
set in the match context when the callout is set up by calling
|
||||
pcre2_set_callout() (see the pcre2api documentation).
|
||||
|
||||
The pattern_position field contains the offset to the next item to be
|
||||
The pattern_position field contains the offset to the next item to be
|
||||
matched in the pattern string.
|
||||
|
||||
The next_item_length field contains the length of the next item to be
|
||||
The next_item_length field contains the length of the next item to be
|
||||
matched in the pattern string. When the callout immediately precedes an
|
||||
alternation bar, a closing parenthesis, or the end of the pattern, the
|
||||
length is zero. When the callout precedes an opening parenthesis, the
|
||||
alternation bar, a closing parenthesis, or the end of the pattern, the
|
||||
length is zero. When the callout precedes an opening parenthesis, the
|
||||
length is that of the entire subpattern.
|
||||
|
||||
The pattern_position and next_item_length fields are intended to help
|
||||
in distinguishing between different automatic callouts, which all have
|
||||
The pattern_position and next_item_length fields are intended to help
|
||||
in distinguishing between different automatic callouts, which all have
|
||||
the same callout number. However, they are set for all callouts.
|
||||
|
||||
In callouts from pcre2_match() the mark field contains a pointer to the
|
||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||
(*THEN) item in the match, or NULL if no such items have been passed.
|
||||
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||
(*THEN) item in the match, or NULL if no such items have been passed.
|
||||
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
||||
previous (*MARK). In callouts from the DFA matching function this field
|
||||
always contains NULL.
|
||||
|
||||
|
@ -3271,16 +3284,16 @@ THE CALLOUT INTERFACE
|
|||
RETURN VALUES
|
||||
|
||||
The external callout function returns an integer to PCRE2. If the value
|
||||
is zero, matching proceeds as normal. If the value is greater than
|
||||
zero, matching fails at the current point, but the testing of other
|
||||
is zero, matching proceeds as normal. If the value is greater than
|
||||
zero, matching fails at the current point, but the testing of other
|
||||
matching possibilities goes ahead, just as if a lookahead assertion had
|
||||
failed. If the value is less than zero, the match is abandoned, and the
|
||||
matching function returns the negative value.
|
||||
|
||||
Negative values should normally be chosen from the set of
|
||||
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
||||
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
||||
reserved for use by callout functions; it will never be used by PCRE2
|
||||
Negative values should normally be chosen from the set of
|
||||
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
||||
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
||||
reserved for use by callout functions; it will never be used by PCRE2
|
||||
itself.
|
||||
|
||||
|
||||
|
@ -3293,7 +3306,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 25 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3891,6 +3904,18 @@ SIZE AND OTHER LIMITATIONS
|
|||
of execution is slower. In the 32-bit library, the internal linkage
|
||||
size is always 4.
|
||||
|
||||
The maximum length (in code units) of a subject string is one less than
|
||||
the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an
|
||||
unsigned integer type, usually defined as size_t. Its maximum value
|
||||
(that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-
|
||||
terminated strings and unset offsets.
|
||||
|
||||
Note that when using the traditional matching function, PCRE2 uses
|
||||
recursion to handle subpatterns and indefinite repetition. This means
|
||||
that the available stack space may limit the size of a subject string
|
||||
that can be processed by certain patterns. For a discussion of stack
|
||||
issues, see the pcre2stack documentation.
|
||||
|
||||
All values in repeating quantifiers must be less than 65536.
|
||||
|
||||
There is no limit to the number of parenthesized subpatterns, but there
|
||||
|
@ -3913,14 +3938,6 @@ SIZE AND OTHER LIMITATIONS
|
|||
(*THEN) verb is 255 for the 8-bit library and 65535 for the 16-bit and
|
||||
32-bit libraries.
|
||||
|
||||
The maximum length of a subject string is the largest number a
|
||||
PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type,
|
||||
usually defined as size_t. However, when using the traditional matching
|
||||
function, PCRE2 uses recursion to handle subpatterns and indefinite
|
||||
repetition. This means that the available stack space may limit the
|
||||
size of a subject string that can be processed by certain patterns. For
|
||||
a discussion of stack issues, see the pcre2stack documentation.
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
|
@ -3931,7 +3948,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 29 September 2014
|
||||
Last updated: 25 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.PP
|
||||
.nf
|
||||
.B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " int (*\fIguard_function\fP)(uint32_t));"
|
||||
.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
|
@ -16,11 +16,12 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.sp
|
||||
This function defines, within a compile context, a function that is called
|
||||
whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a
|
||||
pattern. The argument to the function gives the current depth of parenthesis
|
||||
nesting. The function should return zero if all is well, or non-zero to force
|
||||
an error. This feature is provided so that applications can check the available
|
||||
system stack space, in order to avoid running out. The result of this function
|
||||
is always zero.
|
||||
pattern. The first argument to the function gives the current depth of
|
||||
parenthesis nesting, and the second is user data that is supplied when the
|
||||
function is set up. The guard function should return zero if all is well, or
|
||||
non-zero to force an error. This feature is provided so that applications can
|
||||
check the available system stack space, in order to avoid running out. The
|
||||
result of \fBpcre2_set_compile_recursion_guard()\fP is always zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "25 November 2014" "PCRE2 10.00"
|
||||
.TH PCRE2API 3 "26 November 2014" "PCRE2 10.00"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -97,7 +97,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
.B " uint32_t \fIvalue\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " int (*\fIguard_function\fP)(uint32_t));"
|
||||
.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.
|
||||
.
|
||||
|
@ -375,11 +375,11 @@ PCRE2 was built (\fBpcre2_config()\fP).
|
|||
.sp
|
||||
The PCRE2 API uses string lengths and offsets into strings of code units in
|
||||
several places. These values are always of type PCRE2_SIZE, which is an
|
||||
unsigned integer type, currently always defined as \fIsize_t\fP. The largest
|
||||
value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
|
||||
as a special indicator for zero-terminated strings and unset offsets.
|
||||
Therefore, the longest string that can be handled is one less than this
|
||||
maximum.
|
||||
unsigned integer type, currently always defined as \fIsize_t\fP. The largest
|
||||
value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved
|
||||
as a special indicator for zero-terminated strings and unset offsets.
|
||||
Therefore, the longest string that can be handled is one less than this
|
||||
maximum.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="newlines"></a>
|
||||
|
@ -612,7 +612,7 @@ using up too much system stack when being compiled.
|
|||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " int (*\fIguard_function\fP)(uint32_t));"
|
||||
.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
There is at least one application that runs PCRE2 in threads with very limited
|
||||
|
@ -620,8 +620,13 @@ system stack, where running out of stack is to be avoided at all costs. The
|
|||
parenthesis limit above cannot take account of how much stack is actually
|
||||
available. For finer control, you can supply a function that is called
|
||||
whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a
|
||||
pattern. The argument to the function gives the current depth of nesting. The
|
||||
function should return zero if all is well, or non-zero to force an error.
|
||||
pattern. This function can check the actual stack size (or anything else that
|
||||
it wants to, of course).
|
||||
.P
|
||||
The first argument to the guard function gives the current depth of
|
||||
nesting, and the second is user data that is set up by the last argument of
|
||||
\fBpcre2_set_compile_recursion_guard()\fP. The guard function should return
|
||||
zero if all is well, or non-zero to force an error.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="matchcontext"></a>
|
||||
|
@ -2726,6 +2731,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 25 November 2014
|
||||
Last updated: 26 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
.fi
|
||||
|
|
41
maint/README
41
maint/README
|
@ -37,11 +37,11 @@ pcre2_chartables.c.non-standard
|
|||
|
||||
README This file.
|
||||
|
||||
Unicode.tables The files in this directory (CaseFolding.txt,
|
||||
Unicode.tables The files in this directory (CaseFolding.txt,
|
||||
DerivedGeneralCategory.txt, GraphemeBreakProperty.txt,
|
||||
Scripts.txt and UnicodeData.txt) were downloaded from the
|
||||
Unicode web site. They contain information about Unicode
|
||||
characters and scripts.
|
||||
characters and scripts.
|
||||
|
||||
ucptest.c A short C program for testing the Unicode property macros
|
||||
that do lookups in the pcre2_ucd.c data, mainly useful after
|
||||
|
@ -87,21 +87,21 @@ Note also that both the pcre2syntax.3 and pcre2pattern.3 man pages contain
|
|||
lists of Unicode script names.
|
||||
|
||||
|
||||
Preparing for a PCRE release
|
||||
============================
|
||||
Preparing for a PCRE2 release
|
||||
=============================
|
||||
|
||||
This section contains a checklist of things that I consult before building a
|
||||
distribution for a new release.
|
||||
|
||||
. Ensure that the version number and version date are correct in configure.ac.
|
||||
|
||||
. Update the library version numbers in configure.ac according to the rules
|
||||
. Update the library version numbers in configure.ac according to the rules
|
||||
given below.
|
||||
|
||||
. If new build options have been added, ensure that they are added to the CMake
|
||||
files as well as to the autoconf files. The relevant files are CMakeLists.txt
|
||||
and config-cmake.h.in. After making a release tarball, test it out with CMake
|
||||
if there have been changes here.
|
||||
and config-cmake.h.in. After making a release tarball, test it out with CMake
|
||||
if there have been changes here.
|
||||
|
||||
. Run ./autogen.sh to ensure everything is up-to-date.
|
||||
|
||||
|
@ -112,7 +112,7 @@ distribution for a new release.
|
|||
different configurations, and it also runs some of them with valgrind, all of
|
||||
which can take quite some time.
|
||||
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
. Run perltest.sh on the test data for tests 1 and 4. The output should match
|
||||
the PCRE2 test output, apart from the version identification at the start of
|
||||
each test. The other tests are not Perl-compatible (they use various
|
||||
PCRE2-specific features or options).
|
||||
|
@ -122,7 +122,7 @@ distribution for a new release.
|
|||
may see a number of "pcre2_memmove defined but not used" warnings for the
|
||||
modules in which there is no call to memmove(). These can be ignored.
|
||||
|
||||
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
|
||||
. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE,
|
||||
NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these
|
||||
won't need changing, but over the long term things do change.
|
||||
|
||||
|
@ -133,15 +133,15 @@ distribution for a new release.
|
|||
pcre2test to increase the stack size for test 2. Since I retired I can no
|
||||
longer do this, but instead I rely on putting out release candidates for
|
||||
folks on the pcre-dev list to test.
|
||||
|
||||
|
||||
. The buildbots at http://buildfarm.opencsw.org/ do some automated testing
|
||||
of PCRE2 and should be checked before putting out a release.
|
||||
|
||||
of PCRE2 and should be checked before putting out a release.
|
||||
|
||||
|
||||
Updating version info for libtool
|
||||
=================================
|
||||
|
||||
This set of rules for updating library version information came from a web page
|
||||
This set of rules for updating library version information came from a web page
|
||||
whose URL I have forgotten. The version information consists of three parts:
|
||||
(current, revision, age).
|
||||
|
||||
|
@ -194,7 +194,7 @@ and the zipball. Double-check with "svn status", then create an SVN tagged
|
|||
copy:
|
||||
|
||||
svn copy svn://vcs.exim.org/pcre2/code/trunk \
|
||||
svn://vcs.exim.org/pcre2/code/tags/pcre-10.xx
|
||||
svn://vcs.exim.org/pcre2/code/tags/pcre2-10.xx
|
||||
|
||||
When the new release is out, don't forget to tell webmaster@pcre.org and the
|
||||
mailing list. Also, update the list of version numbers in Bugzilla (edit
|
||||
|
@ -255,7 +255,7 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
. An option to convert results into character offsets and character lengths.
|
||||
|
||||
. An option for pcre2grep to scan only the start of a file. I am not keen -
|
||||
. An option for pcre2grep to scan only the start of a file. I am not keen -
|
||||
this is the job of "head".
|
||||
|
||||
. A (non-Unix) user wanted pcregrep options to (a) list a file name just once,
|
||||
|
@ -282,14 +282,14 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
|
||||
. Callouts with arguments: (?Cn:ARG) for instance.
|
||||
|
||||
. Write a function that generates random matching strings for a compiled
|
||||
. Write a function that generates random matching strings for a compiled
|
||||
pattern.
|
||||
|
||||
. Pcre2grep: an option to specify the output line separator, either as a string
|
||||
or select from a fixed list. This is not straightforward, because at the
|
||||
moment it outputs whatever is in the input file.
|
||||
|
||||
. Improve the code for duplicate checking in pcre_dfa_match(). An incomplete,
|
||||
. Improve the code for duplicate checking in pcre2_dfa_match(). An incomplete,
|
||||
non-thread-safe patch showed that this can help performance for patterns
|
||||
where there are many alternatives. However, a simple thread-safe
|
||||
implementation that I tried made things worse in many simple cases, so this
|
||||
|
@ -303,7 +303,12 @@ very sensible; some are rather wacky. Some have been on this list for years.
|
|||
. Instead of having #ifdef HAVE_CONFIG_H in each module, put #include
|
||||
"something" and then the #ifdef appears only in one place, in "something".
|
||||
|
||||
. Implement something like (?(R2+)... to check outer recursions.
|
||||
|
||||
. If Perl ever supports the POSIX notation [[.something.]] PCRE2 should try
|
||||
to follow.
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 18 November 2014
|
||||
Last updated: 26 November 2014
|
||||
|
|
|
@ -368,7 +368,8 @@ PCRE2_EXP_DECL int pcre2_set_newline(pcre2_compile_context *, uint32_t); \
|
|||
PCRE2_EXP_DECL int pcre2_set_parens_nest_limit(pcre2_compile_context *, \
|
||||
uint32_t); \
|
||||
PCRE2_EXP_DECL int pcre2_set_compile_recursion_guard(\
|
||||
pcre2_compile_context *, int (*)(uint32_t)); \
|
||||
pcre2_compile_context *, int (*)(uint32_t, void *), \
|
||||
void *);
|
||||
|
||||
#define PCRE2_MATCH_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL \
|
||||
|
|
|
@ -6646,7 +6646,8 @@ branch_chain bc;
|
|||
|
||||
/* If set, call the external function that checks for stack availability. */
|
||||
|
||||
if (cb->cx->stack_guard != NULL && cb->cx->stack_guard(cb->parens_depth))
|
||||
if (cb->cx->stack_guard != NULL &&
|
||||
cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
|
||||
{
|
||||
*errorcodeptr= ERR33;
|
||||
return FALSE;
|
||||
|
|
|
@ -133,6 +133,7 @@ when no context is supplied to the compile function. */
|
|||
const pcre2_compile_context PRIV(default_compile_context) = {
|
||||
{ default_malloc, default_free, NULL },
|
||||
NULL,
|
||||
NULL,
|
||||
PRIV(default_tables),
|
||||
BSR_DEFAULT,
|
||||
NEWLINE_DEFAULT,
|
||||
|
@ -320,9 +321,10 @@ return 0;
|
|||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard)(uint32_t))
|
||||
int (*guard)(uint32_t, void *), void *user_data)
|
||||
{
|
||||
ccontext->stack_guard = guard;
|
||||
ccontext->stack_guard_data = user_data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -2623,7 +2623,7 @@ for (;;)
|
|||
cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
|
||||
cb.pattern_position = GET(code, LINK_SIZE + 3);
|
||||
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
|
||||
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
|
||||
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
|
||||
return rrc; /* Abandon */
|
||||
}
|
||||
if (rrc > 0) break; /* Fail this thread */
|
||||
|
@ -2970,7 +2970,7 @@ for (;;)
|
|||
cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
|
||||
cb.pattern_position = GET(code, 2);
|
||||
cb.next_item_length = GET(code, 2 + LINK_SIZE);
|
||||
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
|
||||
if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
|
||||
return rrc; /* Abandon */
|
||||
}
|
||||
if (rrc == 0)
|
||||
|
|
|
@ -1877,10 +1877,10 @@ is available. */
|
|||
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
|
||||
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
|
||||
|
||||
extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
|
||||
extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
|
||||
const compile_block *);
|
||||
extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
|
||||
extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
|
||||
extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
|
||||
uint32_t *, BOOL);
|
||||
extern void _pcre2_jit_free(void *, pcre2_memctl *);
|
||||
extern size_t _pcre2_jit_get_size(void *);
|
||||
|
@ -1895,7 +1895,7 @@ extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t);
|
|||
extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t);
|
||||
extern int _pcre2_study(pcre2_real_code *);
|
||||
extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *);
|
||||
extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
|
||||
extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
|
||||
uint32_t *, BOOL);
|
||||
extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL);
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH */
|
||||
|
|
|
@ -552,41 +552,42 @@ code that uses them is simpler because it assumes this. */
|
|||
memory control. */
|
||||
|
||||
typedef struct pcre2_real_general_context {
|
||||
pcre2_memctl memctl;
|
||||
pcre2_memctl memctl;
|
||||
} pcre2_real_general_context;
|
||||
|
||||
/* The real compile context structure */
|
||||
|
||||
typedef struct pcre2_real_compile_context {
|
||||
pcre2_memctl memctl;
|
||||
int (*stack_guard)(uint32_t);
|
||||
pcre2_memctl memctl;
|
||||
int (*stack_guard)(uint32_t, void *);
|
||||
void *stack_guard_data;
|
||||
const uint8_t *tables;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
||||
typedef struct pcre2_real_match_context {
|
||||
pcre2_memctl memctl;
|
||||
pcre2_memctl memctl;
|
||||
#ifdef HEAP_MATCH_RECURSE
|
||||
pcre2_memctl stack_memctl;
|
||||
pcre2_memctl stack_memctl;
|
||||
#endif
|
||||
#ifdef SUPPORT_JIT
|
||||
pcre2_jit_callback jit_callback;
|
||||
void *jit_callback_data;
|
||||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
void *callout_data;
|
||||
uint32_t match_limit;
|
||||
uint32_t recursion_limit;
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
void *callout_data;
|
||||
uint32_t match_limit;
|
||||
uint32_t recursion_limit;
|
||||
} pcre2_real_match_context;
|
||||
|
||||
/* The real compiled code structure */
|
||||
|
||||
typedef struct pcre2_real_code {
|
||||
pcre2_memctl memctl; /* Memory control fields */
|
||||
pcre2_memctl memctl; /* Memory control fields */
|
||||
const uint8_t *tables; /* The character tables */
|
||||
void *executable_jit; /* Pointer to JIT code */
|
||||
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
|
||||
|
|
|
@ -1319,7 +1319,7 @@ for (;;)
|
|||
cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
|
||||
cb.pattern_position = GET(ecode, 2);
|
||||
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
|
||||
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
|
||||
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (rrc < 0) RRETURN(rrc);
|
||||
}
|
||||
|
@ -1723,7 +1723,7 @@ for (;;)
|
|||
cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject);
|
||||
cb.pattern_position = GET(ecode, 2);
|
||||
cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
|
||||
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
|
||||
if ((rrc = mb->callout(&cb, mb->callout_data)) > 0)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (rrc < 0) RRETURN(rrc);
|
||||
}
|
||||
|
|
|
@ -943,13 +943,13 @@ are supported. */
|
|||
else \
|
||||
pcre2_set_character_tables_32(G(a,32),b)
|
||||
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
pcre2_set_compile_recursion_guard_8(G(a,8),b); \
|
||||
pcre2_set_compile_recursion_guard_8(G(a,8),b,c); \
|
||||
else if (test_mode == PCRE16_MODE) \
|
||||
pcre2_set_compile_recursion_guard_16(G(a,16),b); \
|
||||
pcre2_set_compile_recursion_guard_16(G(a,16),b,c); \
|
||||
else \
|
||||
pcre2_set_compile_recursion_guard_32(G(a,32),b)
|
||||
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
|
||||
|
||||
#define PCRE2_SET_MATCH_LIMIT(a,b) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
|
@ -1315,11 +1315,11 @@ the three different cases. */
|
|||
else \
|
||||
G(pcre2_set_character_tables_,BITTWO)(G(a,BITTWO),b)
|
||||
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
G(pcre2_set_compile_recursion_guard_,BITONE)(G(a,BITONE),b); \
|
||||
G(pcre2_set_compile_recursion_guard_,BITONE)(G(a,BITONE),b,c); \
|
||||
else \
|
||||
G(pcre2_set_compile_recursion_guard_,BITTWO)(G(a,BITTWO),b)
|
||||
G(pcre2_set_compile_recursion_guard_,BITTWO)(G(a,BITTWO),b,c)
|
||||
|
||||
#define PCRE2_SET_MATCH_LIMIT(a,b) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
|
@ -1512,8 +1512,8 @@ the three different cases. */
|
|||
#define PCRE2_SET_CALLOUT(a,b,c) \
|
||||
pcre2_set_callout_8(G(a,8),(int (*)(pcre2_callout_block_8 *, void *))b,c)
|
||||
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
|
||||
pcre2_set_compile_recursion_guard_8(G(a,8),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
pcre2_set_compile_recursion_guard_8(G(a,8),b,c)
|
||||
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b)
|
||||
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_8(G(a,8),b)
|
||||
|
@ -1593,8 +1593,8 @@ the three different cases. */
|
|||
#define PCRE2_SET_CALLOUT(a,b,c) \
|
||||
pcre2_set_callout_16(G(a,16),(int (*)(pcre2_callout_block_16 *, void *))b,c);
|
||||
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_16(G(a,16),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
|
||||
pcre2_set_compile_recursion_guard_16(G(a,16),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
pcre2_set_compile_recursion_guard_16(G(a,16),b,c)
|
||||
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b)
|
||||
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_16(G(a,16),b)
|
||||
|
@ -1674,8 +1674,8 @@ the three different cases. */
|
|||
#define PCRE2_SET_CALLOUT(a,b,c) \
|
||||
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c);
|
||||
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \
|
||||
pcre2_set_compile_recursion_guard_32(G(a,32),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
|
||||
#define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b)
|
||||
#define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_32(G(a,32),b)
|
||||
|
@ -2104,8 +2104,9 @@ Returns: non-zero to kill the compilation
|
|||
*/
|
||||
|
||||
static int
|
||||
stack_guard(uint32_t depth)
|
||||
stack_guard(uint32_t depth, void *user_data)
|
||||
{
|
||||
(void)user_data;
|
||||
return depth > pat_patctl.stackguard_test;
|
||||
}
|
||||
|
||||
|
@ -3827,7 +3828,7 @@ PCRE2_SET_CHARACTER_TABLES(pat_context, use_tables);
|
|||
|
||||
if (pat_patctl.stackguard_test != 0)
|
||||
{
|
||||
PCRE2_SET_COMPILE_RECURSION_GUARD(pat_context, stack_guard);
|
||||
PCRE2_SET_COMPILE_RECURSION_GUARD(pat_context, stack_guard, NULL);
|
||||
}
|
||||
|
||||
/* Handle compiling via the POSIX interface, which doesn't support the
|
||||
|
@ -5686,13 +5687,13 @@ Returns: nothing
|
|||
*/
|
||||
|
||||
static void
|
||||
print_newline_config(unsigned int rc, BOOL isc)
|
||||
print_newline_config(uint32_t optval, BOOL isc)
|
||||
{
|
||||
if (!isc) printf(" Newline sequence is ");
|
||||
if (rc < sizeof(newlines)/sizeof(char *))
|
||||
printf("%s\n", newlines[rc]);
|
||||
if (optval < sizeof(newlines)/sizeof(char *))
|
||||
printf("%s\n", newlines[optval]);
|
||||
else
|
||||
printf("a non-standard value: %d\n", rc);
|
||||
printf("a non-standard value: %d\n", optval);
|
||||
}
|
||||
|
||||
|
||||
|
@ -5769,8 +5770,7 @@ Returns: the return code
|
|||
static int
|
||||
c_option(const char *arg)
|
||||
{
|
||||
unsigned long int lrc;
|
||||
int rc;
|
||||
uint32_t optval;
|
||||
int yield = 0;
|
||||
|
||||
if (arg != NULL)
|
||||
|
@ -5789,8 +5789,8 @@ if (arg != NULL)
|
|||
switch (coptlist[i].type)
|
||||
{
|
||||
case CONF_BSR:
|
||||
(void)PCRE2_CONFIG(coptlist[i].value, &rc);
|
||||
printf("%s\n", rc? "ANYCRLF" : "ANY");
|
||||
(void)PCRE2_CONFIG(coptlist[i].value, &optval);
|
||||
printf("%s\n", optval? "ANYCRLF" : "ANY");
|
||||
break;
|
||||
|
||||
case CONF_FIX:
|
||||
|
@ -5799,8 +5799,8 @@ if (arg != NULL)
|
|||
break;
|
||||
|
||||
case CONF_FIZ:
|
||||
rc = coptlist[i].value;
|
||||
printf("%d\n", rc);
|
||||
optval = coptlist[i].value;
|
||||
printf("%d\n", optval);
|
||||
break;
|
||||
|
||||
case CONF_INT:
|
||||
|
@ -5809,8 +5809,8 @@ if (arg != NULL)
|
|||
break;
|
||||
|
||||
case CONF_NL:
|
||||
(void)PCRE2_CONFIG(coptlist[i].value, &rc);
|
||||
print_newline_config(rc, TRUE);
|
||||
(void)PCRE2_CONFIG(coptlist[i].value, &optval);
|
||||
print_newline_config(optval, TRUE);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -5822,7 +5822,7 @@ if (arg != NULL)
|
|||
char ucname[16];
|
||||
strcpy(ucname, coptlist[i].name);
|
||||
for (i = 0; ucname[i] != 0; i++) ucname[i] = toupper(ucname[i]);
|
||||
vms_setsymbol(ucname, 0, rc);
|
||||
vms_setsymbol(ucname, 0, optval);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -5848,8 +5848,8 @@ printf(" 16-bit support\n");
|
|||
printf(" 32-bit support\n");
|
||||
#endif
|
||||
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &rc);
|
||||
if (rc != 0)
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &optval);
|
||||
if (optval != 0)
|
||||
{
|
||||
printf(" UTF and UCP support (");
|
||||
print_unicode_version(stdout);
|
||||
|
@ -5857,8 +5857,8 @@ if (rc != 0)
|
|||
}
|
||||
else printf(" No Unicode support\n");
|
||||
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc);
|
||||
if (rc != 0)
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &optval);
|
||||
if (optval != 0)
|
||||
{
|
||||
printf(" Just-in-time compiler support: ");
|
||||
print_jit_target(stdout);
|
||||
|
@ -5869,21 +5869,21 @@ else
|
|||
printf(" No just-in-time compiler support\n");
|
||||
}
|
||||
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_NEWLINE, &rc);
|
||||
print_newline_config(rc, FALSE);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &rc);
|
||||
printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" :
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_NEWLINE, &optval);
|
||||
print_newline_config(optval, FALSE);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval);
|
||||
printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" :
|
||||
"all Unicode newlines");
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &rc);
|
||||
printf(" Internal link size = %d\n", rc);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &lrc);
|
||||
printf(" Parentheses nest limit = %ld\n", lrc);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_MATCHLIMIT, &lrc);
|
||||
printf(" Default match limit = %ld\n", lrc);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_RECURSIONLIMIT, &lrc);
|
||||
printf(" Default recursion depth limit = %ld\n", lrc);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_STACKRECURSE, &rc);
|
||||
printf(" Match recursion uses %s", rc? "stack" : "heap");
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval);
|
||||
printf(" Internal link size = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval);
|
||||
printf(" Parentheses nest limit = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_MATCHLIMIT, &optval);
|
||||
printf(" Default match limit = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_RECURSIONLIMIT, &optval);
|
||||
printf(" Default recursion depth limit = %d\n", optval);
|
||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_STACKRECURSE, &optval);
|
||||
printf(" Match recursion uses %s", optval? "stack" : "heap");
|
||||
|
||||
printf("\n");
|
||||
return 0;
|
||||
|
|
Loading…
Reference in New Issue