Implement callouts from pcre2_substitute().
This commit is contained in:
parent
80adf9d165
commit
a69267246f
|
@ -12,6 +12,8 @@ partial matches.
|
||||||
2. Fix subject buffer overread in JIT when UTF is disabled and \X or \R has
|
2. Fix subject buffer overread in JIT when UTF is disabled and \X or \R has
|
||||||
a greater than 1 fixed quantifier. This issue was found by Yunho Kim.
|
a greater than 1 fixed quantifier. This issue was found by Yunho Kim.
|
||||||
|
|
||||||
|
3. Added support for callouts from pcre2_substitute().
|
||||||
|
|
||||||
|
|
||||||
Version 10.32 10-September-2018
|
Version 10.32 10-September-2018
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
|
@ -85,6 +85,7 @@ dist_html_DATA = \
|
||||||
doc/html/pcre2_set_parens_nest_limit.html \
|
doc/html/pcre2_set_parens_nest_limit.html \
|
||||||
doc/html/pcre2_set_recursion_limit.html \
|
doc/html/pcre2_set_recursion_limit.html \
|
||||||
doc/html/pcre2_set_recursion_memory_management.html \
|
doc/html/pcre2_set_recursion_memory_management.html \
|
||||||
|
doc/html/pcre2_set_substitute_callout.html \
|
||||||
doc/html/pcre2_substitute.html \
|
doc/html/pcre2_substitute.html \
|
||||||
doc/html/pcre2_substring_copy_byname.html \
|
doc/html/pcre2_substring_copy_byname.html \
|
||||||
doc/html/pcre2_substring_copy_bynumber.html \
|
doc/html/pcre2_substring_copy_bynumber.html \
|
||||||
|
@ -178,6 +179,7 @@ dist_man_MANS = \
|
||||||
doc/pcre2_set_parens_nest_limit.3 \
|
doc/pcre2_set_parens_nest_limit.3 \
|
||||||
doc/pcre2_set_recursion_limit.3 \
|
doc/pcre2_set_recursion_limit.3 \
|
||||||
doc/pcre2_set_recursion_memory_management.3 \
|
doc/pcre2_set_recursion_memory_management.3 \
|
||||||
|
doc/pcre2_set_substitute_callout.3 \
|
||||||
doc/pcre2_substitute.3 \
|
doc/pcre2_substitute.3 \
|
||||||
doc/pcre2_substring_copy_byname.3 \
|
doc/pcre2_substring_copy_byname.3 \
|
||||||
doc/pcre2_substring_copy_bynumber.3 \
|
doc/pcre2_substring_copy_bynumber.3 \
|
||||||
|
|
|
@ -162,7 +162,7 @@ listing), and the short pages for individual functions, are concatenated in
|
||||||
pcre2-config show PCRE2 installation configuration information
|
pcre2-config show PCRE2 installation configuration information
|
||||||
pcre2api details of PCRE2's native C API
|
pcre2api details of PCRE2's native C API
|
||||||
pcre2build building PCRE2
|
pcre2build building PCRE2
|
||||||
pcre2callout details of the callout feature
|
pcre2callout details of the pattern callout feature
|
||||||
pcre2compat discussion of Perl compatibility
|
pcre2compat discussion of Perl compatibility
|
||||||
pcre2convert details of pattern conversion functions
|
pcre2convert details of pattern conversion functions
|
||||||
pcre2demo a demonstration C program that uses PCRE2
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
|
@ -198,7 +198,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 11 July 2018
|
Last updated: 17 September 2018
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2018 University of Cambridge.
|
Copyright © 1997-2018 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2_set_substitute_callout specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2_set_substitute_callout man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<br><b>
|
||||||
|
SYNOPSIS
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
<b>#include <pcre2.h></b>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
|
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||||
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
DESCRIPTION
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
This function sets the substitute callout fields in a match context (the first
|
||||||
|
argument). The second argument specifies a callout function, and the third
|
||||||
|
argument is an opaque data item that is passed to it. The result of this
|
||||||
|
function is always zero.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||||
|
page.
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -182,6 +182,11 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b> void *<i>callout_data</i>);</b>
|
<b> void *<i>callout_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
|
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||||
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -912,12 +917,23 @@ PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
<b> void *<i>callout_data</i>);</b>
|
<b> void *<i>callout_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
This sets up a "callout" function for PCRE2 to call at specified points
|
This sets up a callout function for PCRE2 to call at specified points
|
||||||
during a matching operation. Details are given in the
|
during a matching operation. Details are given in the
|
||||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
documentation.
|
documentation.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
|
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||||
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
This sets up a callout function for PCRE2 to call after each substitution
|
||||||
|
made by <b>pcre2_substitute()</b>. Details are given in the section entitled
|
||||||
|
"Creating a new string with substitutions"
|
||||||
|
<a href="#substitutions">below.</a>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -3163,26 +3179,30 @@ page, you cannot use names to distinguish the different subpatterns, because
|
||||||
names are not included in the compiled code. The matching process uses only
|
names are not included in the compiled code. The matching process uses only
|
||||||
numbers. For this reason, the use of different names for subpatterns of the
|
numbers. For this reason, the use of different names for subpatterns of the
|
||||||
same number causes an error at compile time.
|
same number causes an error at compile time.
|
||||||
</P>
|
<a name="substitutions"></a></P>
|
||||||
<br><a name="SEC36" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
<br><a name="SEC36" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *\fIoutputbuffer\zfP,</b>
|
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
string in <i>outputbuffer</i>, replacing one or more parts that were matched
|
||||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
with the <i>replacement</i> string, whose length is supplied in <b>rlength</b>.
|
||||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
which a \K item in a lookahead in the pattern causes the match to end before
|
The default is to perform just one replacement, but there is an option that
|
||||||
it starts are not supported, and give rise to an error return. For global
|
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||||
replacements, matches in which \K in a lookbehind causes the match to start
|
</P>
|
||||||
earlier than the point that was reached in the previous iteration are also not
|
<P>
|
||||||
supported.
|
Matches in which a \K item in a lookahead in the pattern causes the match to
|
||||||
|
end before it starts are not supported, and give rise to an error return. For
|
||||||
|
global replacements, matches in which \K in a lookbehind causes the match to
|
||||||
|
start earlier than the point that was reached in the previous iteration are
|
||||||
|
also not supported.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||||
|
@ -3194,9 +3214,9 @@ allocate memory for the compiled code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If an external <i>match_data</i> block is provided, its contents afterwards
|
If an external <i>match_data</i> block is provided, its contents afterwards
|
||||||
are those set by the final call to <b>pcre2_match()</b>, which will have
|
are those set by the final call to <b>pcre2_match()</b>. For global changes,
|
||||||
ended in a matching error. The contents of the ovector within the match data
|
this will have ended in a matching error. The contents of the ovector within
|
||||||
block may or may not have been changed.
|
the match data block may or may not have been changed.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||||
|
@ -3220,12 +3240,12 @@ length is in code units, not bytes.
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||||
dollar character is an escape character that can specify the insertion of
|
dollar character is an escape character that can specify the insertion of
|
||||||
characters from capturing groups or (*MARK), (*PRUNE), or (*THEN) items in the
|
characters from capturing groups or names from (*MARK) or other control verbs
|
||||||
pattern. The following forms are always recognized:
|
in the pattern. The following forms are always recognized:
|
||||||
<pre>
|
<pre>
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> or ${<n>} insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
$*MARK or ${*MARK} insert a control verb name
|
||||||
</pre>
|
</pre>
|
||||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||||
required only if the following character would be interpreted as part of the
|
required only if the following character would be interpreted as part of the
|
||||||
|
@ -3234,12 +3254,13 @@ For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||||
string "+$1$0$1+", the result is "=+babcb+=".
|
string "+$1$0$1+", the result is "=+babcb+=".
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or (*THEN)
|
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||||
on the matching path that has a name. (*MARK) must always include a name, but
|
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name. (*MARK)
|
||||||
(*PRUNE) and (*THEN) need not. For example, in the case of (*MARK:A)(*PRUNE)
|
must always include a name, but the other verbs need not. For example, in
|
||||||
the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B".
|
the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
||||||
This facility can be used to perform simple simultaneous substitutions, as this
|
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to
|
||||||
<b>pcre2test</b> example shows:
|
perform simple simultaneous substitutions, as this <b>pcre2test</b> example
|
||||||
|
shows:
|
||||||
<pre>
|
<pre>
|
||||||
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
||||||
apple lemon
|
apple lemon
|
||||||
|
@ -3399,6 +3420,44 @@ obtained by calling the <b>pcre2_get_error_message()</b> function (see
|
||||||
"Obtaining a textual error message"
|
"Obtaining a textual error message"
|
||||||
<a href="#geterrormessage">above).</a>
|
<a href="#geterrormessage">above).</a>
|
||||||
</P>
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Substitution callouts
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
|
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||||
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
The <b>pcre2_set_substitute_callout()</b> function can be used to specify a
|
||||||
|
callout function for <b>pcre2_substitute()</b>. This information is passed in
|
||||||
|
a match context. The callout function is called after each substitution. It is
|
||||||
|
not called for simulated substitutions that happen as a result of the
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout function should not return
|
||||||
|
any value.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The first argument of the callout function is a pointer to a substitute callout
|
||||||
|
block structure, which contains the following fields, not necessarily in this
|
||||||
|
order:
|
||||||
|
<pre>
|
||||||
|
uint32_t <i>version</i>;
|
||||||
|
PCRE2_SIZE <i>input_offsets[2]</i>;
|
||||||
|
PCRE2_SIZE <i>output_offsets[2]</i>;
|
||||||
|
</pre>
|
||||||
|
The <i>version</i> field contains the version number of the block format. The
|
||||||
|
current version is 0. The version number will increase in future if more fields
|
||||||
|
are added, but the intention is never to remove any of the existing fields.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The <i>input_offsets</i> vector contains the code unit offsets in the input
|
||||||
|
string of the matched substring, and the <i>output_offsets</i> vector contains
|
||||||
|
the offsets of the replacement in the output string.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The second argument of the callout function is the value passed as
|
||||||
|
<i>callout_data</i> when the function was registered.
|
||||||
|
</P>
|
||||||
<br><a name="SEC37" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
<br><a name="SEC37" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
||||||
|
@ -3665,7 +3724,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 07 September 2018
|
Last updated: 18 September 2018
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2018 University of Cambridge.
|
Copyright © 1997-2018 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -44,6 +44,14 @@ a match context (see <b>pcre2_set_callout()</b> in the
|
||||||
documentation).
|
documentation).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
When using the <b>pcre2_substitute()</b> function, an additional callout feature
|
||||||
|
is available. This does a callout after each change to the subject string and
|
||||||
|
is described in the
|
||||||
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
|
documentation; the rest of this document is concerned with callouts during
|
||||||
|
pattern matching.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
Within a regular expression, (?C<arg>) indicates a point at which the external
|
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||||
function is to be called. Different callout points can be identified by putting
|
function is to be called. Different callout points can be identified by putting
|
||||||
a number less than 256 after the letter C. The default value is zero.
|
a number less than 256 after the letter C. The default value is zero.
|
||||||
|
@ -463,7 +471,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 26 April 2018
|
Last updated: 17 September 2018
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2018 University of Cambridge.
|
Copyright © 1997-2018 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -1041,6 +1041,7 @@ process.
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
|
allvector show the entire ovector
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
/g global global matching
|
/g global global matching
|
||||||
|
@ -1048,6 +1049,7 @@ process.
|
||||||
mark show mark values
|
mark show mark values
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1185,6 +1187,7 @@ pattern.
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
|
allvector show the entire ovector
|
||||||
allusedtext show all consulted text (non-JIT only)
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
|
@ -1214,6 +1217,7 @@ pattern.
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
startoffset=<n> same as offset=<n>
|
startoffset=<n> same as offset=<n>
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1281,10 +1285,28 @@ captured parentheses be output after a match. By default, only those up to the
|
||||||
highest one actually used in the match are output (corresponding to the return
|
highest one actually used in the match are output (corresponding to the return
|
||||||
code from <b>pcre2_match()</b>). Groups that did not take part in the match
|
code from <b>pcre2_match()</b>). Groups that did not take part in the match
|
||||||
are output as "<unset>". This modifier is not relevant for DFA matching (which
|
are output as "<unset>". This modifier is not relevant for DFA matching (which
|
||||||
does no capturing); it is ignored, with a warning message, if present.
|
does no capturing) and does not apply when <b>replace</b> is specified; it is
|
||||||
|
ignored, with a warning message, if present.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing callouts
|
Showing the entire ovector, for all outcomes
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
The <b>allvector</b> modifier requests that the entire ovector be shown,
|
||||||
|
whatever the outcome of the match. Compare <b>allcaptures</b>, which shows only
|
||||||
|
up to the maximum number of capture groups for the pattern, and then only for a
|
||||||
|
successful complete non-DFA match. This modifier, which acts after any match
|
||||||
|
result, and also for DFA matching, provides a means of checking that there are
|
||||||
|
no unexpected modifications to ovector fields. Before each match attempt, the
|
||||||
|
ovector is filled with a special value, and if this is found in both elements
|
||||||
|
of a capturing pair, "<unchanged>" is output. After a successful match, this
|
||||||
|
applies to all groups after the maximum capture group for the pattern. In other
|
||||||
|
cases it applies to the entire ovector. After a partial match, the first two
|
||||||
|
elements are the only ones that should be set. After a DFA match, the amount of
|
||||||
|
ovector that is used depends on the number of matches that were found.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Testing pattern callouts
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
A callout function is supplied when <b>pcre2test</b> calls the library matching
|
A callout function is supplied when <b>pcre2test</b> calls the library matching
|
||||||
|
@ -1292,6 +1314,9 @@ functions, unless <b>callout_none</b> is specified. Its behaviour can be
|
||||||
controlled by various modifiers listed above whose names begin with
|
controlled by various modifiers listed above whose names begin with
|
||||||
<b>callout_</b>. Details are given in the section entitled "Callouts"
|
<b>callout_</b>. Details are given in the section entitled "Callouts"
|
||||||
<a href="#callouts">below.</a>
|
<a href="#callouts">below.</a>
|
||||||
|
Testing callouts from <b>pcre2_substitute()</b> is described separately in
|
||||||
|
"Testing the substitution function"
|
||||||
|
<a href="#substitution">below.</a>
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Finding all matches in a string
|
Finding all matches in a string
|
||||||
|
@ -1343,7 +1368,7 @@ instead of a colon. This is in addition to the normal full list. The string
|
||||||
length (that is, the return from the extraction function) is given in
|
length (that is, the return from the extraction function) is given in
|
||||||
parentheses after each substring, followed by the name when the extraction was
|
parentheses after each substring, followed by the name when the extraction was
|
||||||
by name.
|
by name.
|
||||||
</P>
|
<a name="substitution"></a></P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
</b><br>
|
</b><br>
|
||||||
|
@ -1384,6 +1409,16 @@ simple example of a substitution test:
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
</pre>
|
</pre>
|
||||||
|
If the <b>substitute_callout</b> modifier is set, a substitution callout
|
||||||
|
function is set up. When it is called (after each substitution), the offsets in
|
||||||
|
the input and output strings are output. For example:
|
||||||
|
<pre>
|
||||||
|
/abc/g,replace=<$0>,substitute_callout
|
||||||
|
abcdefabcpqr
|
||||||
|
Old 0 3 New 0 5
|
||||||
|
Old 6 9 New 8 13
|
||||||
|
2: <abc>def<abc>pqr
|
||||||
|
</pre>
|
||||||
Subject and replacement strings should be kept relatively short (fewer than 256
|
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||||
characters) for substitution tests, as fixed-size buffers are used. To make it
|
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||||
easy to test for buffer overflow, if the replacement string starts with a
|
easy to test for buffer overflow, if the replacement string starts with a
|
||||||
|
@ -1401,10 +1436,10 @@ The default action of <b>pcre2_substitute()</b> is to return
|
||||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||||
<b>substitute_overflow_length</b> modifier), <b>pcre2_substitute()</b> continues
|
<b>substitute_overflow_length</b> modifier), <b>pcre2_substitute()</b> continues
|
||||||
to go through the motions of matching and substituting, in order to compute the
|
to go through the motions of matching and substituting (but not doing any
|
||||||
size of buffer that is required. When this happens, <b>pcre2test</b> shows the
|
callouts), in order to compute the size of buffer that is required. When this
|
||||||
required buffer length (which includes space for the trailing zero) as part of
|
happens, <b>pcre2test</b> shows the required buffer length (which includes space
|
||||||
the error message. For example:
|
for the trailing zero) as part of the error message. For example:
|
||||||
<pre>
|
<pre>
|
||||||
/abc/substitute_overflow_length
|
/abc/substitute_overflow_length
|
||||||
123abc123\=replace=[9]XYZ
|
123abc123\=replace=[9]XYZ
|
||||||
|
@ -2004,7 +2039,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 21 July 2018
|
Last updated: 17 September 2018
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2018 University of Cambridge.
|
Copyright © 1997-2018 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2 3 "11 July 2018" "PCRE2 10.32"
|
.TH PCRE2 3 "17 September 2018" "PCRE2 10.33"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH INTRODUCTION
|
.SH INTRODUCTION
|
||||||
|
@ -156,7 +156,7 @@ listing), and the short pages for individual functions, are concatenated in
|
||||||
pcre2-config show PCRE2 installation configuration information
|
pcre2-config show PCRE2 installation configuration information
|
||||||
pcre2api details of PCRE2's native C API
|
pcre2api details of PCRE2's native C API
|
||||||
pcre2build building PCRE2
|
pcre2build building PCRE2
|
||||||
pcre2callout details of the callout feature
|
pcre2callout details of the pattern callout feature
|
||||||
pcre2compat discussion of Perl compatibility
|
pcre2compat discussion of Perl compatibility
|
||||||
pcre2convert details of pattern conversion functions
|
pcre2convert details of pattern conversion functions
|
||||||
pcre2demo a demonstration C program that uses PCRE2
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
|
@ -197,6 +197,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 11 July 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
548
doc/pcre2.txt
548
doc/pcre2.txt
|
@ -141,7 +141,7 @@ USER DOCUMENTATION
|
||||||
pcre2-config show PCRE2 installation configuration information
|
pcre2-config show PCRE2 installation configuration information
|
||||||
pcre2api details of PCRE2's native C API
|
pcre2api details of PCRE2's native C API
|
||||||
pcre2build building PCRE2
|
pcre2build building PCRE2
|
||||||
pcre2callout details of the callout feature
|
pcre2callout details of the pattern callout feature
|
||||||
pcre2compat discussion of Perl compatibility
|
pcre2compat discussion of Perl compatibility
|
||||||
pcre2convert details of pattern conversion functions
|
pcre2convert details of pattern conversion functions
|
||||||
pcre2demo a demonstration C program that uses PCRE2
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
|
@ -177,7 +177,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 11 July 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -293,6 +293,10 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
||||||
int (*callout_function)(pcre2_callout_block *, void *),
|
int (*callout_function)(pcre2_callout_block *, void *),
|
||||||
void *callout_data);
|
void *callout_data);
|
||||||
|
|
||||||
|
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||||
|
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||||
|
void *callout_data);
|
||||||
|
|
||||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
||||||
PCRE2_SIZE value);
|
PCRE2_SIZE value);
|
||||||
|
|
||||||
|
@ -933,10 +937,18 @@ PCRE2 CONTEXTS
|
||||||
int (*callout_function)(pcre2_callout_block *, void *),
|
int (*callout_function)(pcre2_callout_block *, void *),
|
||||||
void *callout_data);
|
void *callout_data);
|
||||||
|
|
||||||
This sets up a "callout" function for PCRE2 to call at specified points
|
This sets up a callout function for PCRE2 to call at specified points
|
||||||
during a matching operation. Details are given in the pcre2callout doc-
|
during a matching operation. Details are given in the pcre2callout doc-
|
||||||
umentation.
|
umentation.
|
||||||
|
|
||||||
|
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||||
|
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||||
|
void *callout_data);
|
||||||
|
|
||||||
|
This sets up a callout function for PCRE2 to call after each substitu-
|
||||||
|
tion made by pcre2_substitute(). Details are given in the section enti-
|
||||||
|
tled "Creating a new string with substitutions" below.
|
||||||
|
|
||||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
||||||
PCRE2_SIZE value);
|
PCRE2_SIZE value);
|
||||||
|
|
||||||
|
@ -3083,18 +3095,22 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||||
uint32_t options, pcre2_match_data *match_data,
|
uint32_t options, pcre2_match_data *match_data,
|
||||||
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
||||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbufferP,
|
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
||||||
PCRE2_SIZE *outlengthptr);
|
PCRE2_SIZE *outlengthptr);
|
||||||
|
|
||||||
This function calls pcre2_match() and then makes a copy of the subject
|
This function calls pcre2_match() and then makes a copy of the subject
|
||||||
string in outputbuffer, replacing the part that was matched with the
|
string in outputbuffer, replacing one or more parts that were matched
|
||||||
replacement string, whose length is supplied in rlength. This can be
|
with the replacement string, whose length is supplied in rlength. This
|
||||||
given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
which a \K item in a lookahead in the pattern causes the match to end
|
The default is to perform just one replacement, but there is an option
|
||||||
before it starts are not supported, and give rise to an error return.
|
that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below
|
||||||
For global replacements, matches in which \K in a lookbehind causes the
|
for details).
|
||||||
match to start earlier than the point that was reached in the previous
|
|
||||||
iteration are also not supported.
|
Matches in which a \K item in a lookahead in the pattern causes the
|
||||||
|
match to end before it starts are not supported, and give rise to an
|
||||||
|
error return. For global replacements, matches in which \K in a lookbe-
|
||||||
|
hind causes the match to start earlier than the point that was reached
|
||||||
|
in the previous iteration are also not supported.
|
||||||
|
|
||||||
The first seven arguments of pcre2_substitute() are the same as for
|
The first seven arguments of pcre2_substitute() are the same as for
|
||||||
pcre2_match(), except that the partial matching options are not permit-
|
pcre2_match(), except that the partial matching options are not permit-
|
||||||
|
@ -3104,9 +3120,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
were used to allocate memory for the compiled code.
|
were used to allocate memory for the compiled code.
|
||||||
|
|
||||||
If an external match_data block is provided, its contents afterwards
|
If an external match_data block is provided, its contents afterwards
|
||||||
are those set by the final call to pcre2_match(), which will have ended
|
are those set by the final call to pcre2_match(). For global changes,
|
||||||
in a matching error. The contents of the ovector within the match data
|
this will have ended in a matching error. The contents of the ovector
|
||||||
block may or may not have been changed.
|
within the match data block may or may not have been changed.
|
||||||
|
|
||||||
The outlengthptr argument must point to a variable that contains the
|
The outlengthptr argument must point to a variable that contains the
|
||||||
length, in code units, of the output buffer. If the function is suc-
|
length, in code units, of the output buffer. If the function is suc-
|
||||||
|
@ -3128,13 +3144,13 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF
|
In the replacement string, which is interpreted as a UTF string in UTF
|
||||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||||
option is set, a dollar character is an escape character that can spec-
|
option is set, a dollar character is an escape character that can spec-
|
||||||
ify the insertion of characters from capturing groups or (*MARK),
|
ify the insertion of characters from capturing groups or names from
|
||||||
(*PRUNE), or (*THEN) items in the pattern. The following forms are
|
(*MARK) or other control verbs in the pattern. The following forms are
|
||||||
always recognized:
|
always recognized:
|
||||||
|
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> or ${<n>} insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
$*MARK or ${*MARK} insert a control verb name
|
||||||
|
|
||||||
Either a group number or a group name can be given for <n>. Curly
|
Either a group number or a group name can be given for <n>. Curly
|
||||||
brackets are required only if the following character would be inter-
|
brackets are required only if the following character would be inter-
|
||||||
|
@ -3143,11 +3159,11 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
||||||
is "=+babcb+=".
|
is "=+babcb+=".
|
||||||
|
|
||||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or
|
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||||
(*THEN) on the matching path that has a name. (*MARK) must always
|
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name.
|
||||||
include a name, but (*PRUNE) and (*THEN) need not. For example, in the
|
(*MARK) must always include a name, but the other verbs need not. For
|
||||||
case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but
|
||||||
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
|
for (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
|
||||||
used to perform simple simultaneous substitutions, as this pcre2test
|
used to perform simple simultaneous substitutions, as this pcre2test
|
||||||
example shows:
|
example shows:
|
||||||
|
|
||||||
|
@ -3302,62 +3318,95 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
obtained by calling the pcre2_get_error_message() function (see
|
obtained by calling the pcre2_get_error_message() function (see
|
||||||
"Obtaining a textual error message" above).
|
"Obtaining a textual error message" above).
|
||||||
|
|
||||||
|
Substitution callouts
|
||||||
|
|
||||||
|
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||||
|
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||||
|
void *callout_data);
|
||||||
|
|
||||||
|
The pcre2_set_substitute_callout() function can be used to specify a
|
||||||
|
callout function for pcre2_substitute(). This information is passed in
|
||||||
|
a match context. The callout function is called after each substitu-
|
||||||
|
tion. It is not called for simulated substitutions that happen as a
|
||||||
|
result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout func-
|
||||||
|
tion should not return any value.
|
||||||
|
|
||||||
|
The first argument of the callout function is a pointer to a substitute
|
||||||
|
callout block structure, which contains the following fields, not nec-
|
||||||
|
essarily in this order:
|
||||||
|
|
||||||
|
uint32_t version;
|
||||||
|
PCRE2_SIZE input_offsets[2];
|
||||||
|
PCRE2_SIZE output_offsets[2];
|
||||||
|
|
||||||
|
The version field contains the version number of the block format. The
|
||||||
|
current version is 0. The version number will increase in future if
|
||||||
|
more fields are added, but the intention is never to remove any of the
|
||||||
|
existing fields.
|
||||||
|
|
||||||
|
The input_offsets vector contains the code unit offsets in the input
|
||||||
|
string of the matched substring, and the output_offsets vector contains
|
||||||
|
the offsets of the replacement in the output string.
|
||||||
|
|
||||||
|
The second argument of the callout function is the value passed as
|
||||||
|
callout_data when the function was registered.
|
||||||
|
|
||||||
|
|
||||||
DUPLICATE SUBPATTERN NAMES
|
DUPLICATE SUBPATTERN NAMES
|
||||||
|
|
||||||
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
||||||
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
||||||
|
|
||||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||||
subpatterns are not required to be unique. Duplicate names are always
|
subpatterns are not required to be unique. Duplicate names are always
|
||||||
allowed for subpatterns with the same number, created by using the (?|
|
allowed for subpatterns with the same number, created by using the (?|
|
||||||
feature. Indeed, if such subpatterns are named, they are required to
|
feature. Indeed, if such subpatterns are named, they are required to
|
||||||
use the same names.
|
use the same names.
|
||||||
|
|
||||||
Normally, patterns with duplicate names are such that in any one match,
|
Normally, patterns with duplicate names are such that in any one match,
|
||||||
only one of the named subpatterns participates. An example is shown in
|
only one of the named subpatterns participates. An example is shown in
|
||||||
the pcre2pattern documentation.
|
the pcre2pattern documentation.
|
||||||
|
|
||||||
When duplicates are present, pcre2_substring_copy_byname() and
|
When duplicates are present, pcre2_substring_copy_byname() and
|
||||||
pcre2_substring_get_byname() return the first substring corresponding
|
pcre2_substring_get_byname() return the first substring corresponding
|
||||||
to the given name that is set. Only if none are set is
|
to the given name that is set. Only if none are set is
|
||||||
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
||||||
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
||||||
duplicate names.
|
duplicate names.
|
||||||
|
|
||||||
If you want to get full details of all captured substrings for a given
|
If you want to get full details of all captured substrings for a given
|
||||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||||
first argument is the compiled pattern, and the second is the name. If
|
first argument is the compiled pattern, and the second is the name. If
|
||||||
the third and fourth arguments are NULL, the function returns a group
|
the third and fourth arguments are NULL, the function returns a group
|
||||||
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
||||||
|
|
||||||
When the third and fourth arguments are not NULL, they must be pointers
|
When the third and fourth arguments are not NULL, they must be pointers
|
||||||
to variables that are updated by the function. After it has run, they
|
to variables that are updated by the function. After it has run, they
|
||||||
point to the first and last entries in the name-to-number table for the
|
point to the first and last entries in the name-to-number table for the
|
||||||
given name, and the function returns the length of each entry in code
|
given name, and the function returns the length of each entry in code
|
||||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||||
no entries for the given name.
|
no entries for the given name.
|
||||||
|
|
||||||
The format of the name table is described above in the section entitled
|
The format of the name table is described above in the section entitled
|
||||||
Information about a pattern. Given all the relevant entries for the
|
Information about a pattern. Given all the relevant entries for the
|
||||||
name, you can extract each of their numbers, and hence the captured
|
name, you can extract each of their numbers, and hence the captured
|
||||||
data.
|
data.
|
||||||
|
|
||||||
|
|
||||||
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
||||||
|
|
||||||
The traditional matching function uses a similar algorithm to Perl,
|
The traditional matching function uses a similar algorithm to Perl,
|
||||||
which stops when it finds the first match at a given point in the sub-
|
which stops when it finds the first match at a given point in the sub-
|
||||||
ject. If you want to find all possible matches, or the longest possible
|
ject. If you want to find all possible matches, or the longest possible
|
||||||
match at a given position, consider using the alternative matching
|
match at a given position, consider using the alternative matching
|
||||||
function (see below) instead. If you cannot use the alternative func-
|
function (see below) instead. If you cannot use the alternative func-
|
||||||
tion, you can kludge it up by making use of the callout facility, which
|
tion, you can kludge it up by making use of the callout facility, which
|
||||||
is described in the pcre2callout documentation.
|
is described in the pcre2callout documentation.
|
||||||
|
|
||||||
What you have to do is to insert a callout right at the end of the pat-
|
What you have to do is to insert a callout right at the end of the pat-
|
||||||
tern. When your callout function is called, extract and save the cur-
|
tern. When your callout function is called, extract and save the cur-
|
||||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||||
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
||||||
|
|
||||||
|
|
||||||
|
@ -3369,26 +3418,26 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
pcre2_match_context *mcontext,
|
pcre2_match_context *mcontext,
|
||||||
int *workspace, PCRE2_SIZE wscount);
|
int *workspace, PCRE2_SIZE wscount);
|
||||||
|
|
||||||
The function pcre2_dfa_match() is called to match a subject string
|
The function pcre2_dfa_match() is called to match a subject string
|
||||||
against a compiled pattern, using a matching algorithm that scans the
|
against a compiled pattern, using a matching algorithm that scans the
|
||||||
subject string just once (not counting lookaround assertions), and does
|
subject string just once (not counting lookaround assertions), and does
|
||||||
not backtrack. This has different characteristics to the normal algo-
|
not backtrack. This has different characteristics to the normal algo-
|
||||||
rithm, and is not compatible with Perl. Some of the features of PCRE2
|
rithm, and is not compatible with Perl. Some of the features of PCRE2
|
||||||
patterns are not supported. Nevertheless, there are times when this
|
patterns are not supported. Nevertheless, there are times when this
|
||||||
kind of matching can be useful. For a discussion of the two matching
|
kind of matching can be useful. For a discussion of the two matching
|
||||||
algorithms, and a list of features that pcre2_dfa_match() does not sup-
|
algorithms, and a list of features that pcre2_dfa_match() does not sup-
|
||||||
port, see the pcre2matching documentation.
|
port, see the pcre2matching documentation.
|
||||||
|
|
||||||
The arguments for the pcre2_dfa_match() function are the same as for
|
The arguments for the pcre2_dfa_match() function are the same as for
|
||||||
pcre2_match(), plus two extras. The ovector within the match data block
|
pcre2_match(), plus two extras. The ovector within the match data block
|
||||||
is used in a different way, and this is described below. The other com-
|
is used in a different way, and this is described below. The other com-
|
||||||
mon arguments are used in the same way as for pcre2_match(), so their
|
mon arguments are used in the same way as for pcre2_match(), so their
|
||||||
description is not repeated here.
|
description is not repeated here.
|
||||||
|
|
||||||
The two additional arguments provide workspace for the function. The
|
The two additional arguments provide workspace for the function. The
|
||||||
workspace vector should contain at least 20 elements. It is used for
|
workspace vector should contain at least 20 elements. It is used for
|
||||||
keeping track of multiple paths through the pattern tree. More
|
keeping track of multiple paths through the pattern tree. More
|
||||||
workspace is needed for patterns and subjects where there are a lot of
|
workspace is needed for patterns and subjects where there are a lot of
|
||||||
potential matches.
|
potential matches.
|
||||||
|
|
||||||
Here is an example of a simple call to pcre2_dfa_match():
|
Here is an example of a simple call to pcre2_dfa_match():
|
||||||
|
@ -3408,45 +3457,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
|
|
||||||
Option bits for pcre2_dfa_match()
|
Option bits for pcre2_dfa_match()
|
||||||
|
|
||||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_ENDAN-
|
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_ENDAN-
|
||||||
CHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
CHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||||
PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD,
|
PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD,
|
||||||
PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
|
PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
|
||||||
the last four of these are exactly the same as for pcre2_match(), so
|
the last four of these are exactly the same as for pcre2_match(), so
|
||||||
their description is not repeated here.
|
their description is not repeated here.
|
||||||
|
|
||||||
PCRE2_PARTIAL_HARD
|
PCRE2_PARTIAL_HARD
|
||||||
PCRE2_PARTIAL_SOFT
|
PCRE2_PARTIAL_SOFT
|
||||||
|
|
||||||
These have the same general effect as they do for pcre2_match(), but
|
These have the same general effect as they do for pcre2_match(), but
|
||||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||||
subject is reached and there is still at least one matching possibility
|
subject is reached and there is still at least one matching possibility
|
||||||
that requires additional characters. This happens even if some complete
|
that requires additional characters. This happens even if some complete
|
||||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||||
if the end of the subject is reached, there have been no complete
|
if the end of the subject is reached, there have been no complete
|
||||||
matches, but there is still at least one matching possibility. The por-
|
matches, but there is still at least one matching possibility. The por-
|
||||||
tion of the string that was inspected when the longest partial match
|
tion of the string that was inspected when the longest partial match
|
||||||
was found is set as the first matching string in both cases. There is a
|
was found is set as the first matching string in both cases. There is a
|
||||||
more detailed discussion of partial and multi-segment matching, with
|
more detailed discussion of partial and multi-segment matching, with
|
||||||
examples, in the pcre2partial documentation.
|
examples, in the pcre2partial documentation.
|
||||||
|
|
||||||
PCRE2_DFA_SHORTEST
|
PCRE2_DFA_SHORTEST
|
||||||
|
|
||||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||||
stop as soon as it has found one match. Because of the way the alterna-
|
stop as soon as it has found one match. Because of the way the alterna-
|
||||||
tive algorithm works, this is necessarily the shortest possible match
|
tive algorithm works, this is necessarily the shortest possible match
|
||||||
at the first possible matching point in the subject string.
|
at the first possible matching point in the subject string.
|
||||||
|
|
||||||
PCRE2_DFA_RESTART
|
PCRE2_DFA_RESTART
|
||||||
|
|
||||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||||
it again, with additional subject characters, and have it continue with
|
it again, with additional subject characters, and have it continue with
|
||||||
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
||||||
it is set, the workspace and wscount arguments must reference the same
|
it is set, the workspace and wscount arguments must reference the same
|
||||||
vector as before because data about the match so far is left in them
|
vector as before because data about the match so far is left in them
|
||||||
after a partial match. There is more discussion of this facility in the
|
after a partial match. There is more discussion of this facility in the
|
||||||
pcre2partial documentation.
|
pcre2partial documentation.
|
||||||
|
|
||||||
|
@ -3454,8 +3503,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
|
|
||||||
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
||||||
string in the subject. Note, however, that all the matches from one run
|
string in the subject. Note, however, that all the matches from one run
|
||||||
of the function start at the same point in the subject. The shorter
|
of the function start at the same point in the subject. The shorter
|
||||||
matches are all initial substrings of the longer matches. For example,
|
matches are all initial substrings of the longer matches. For example,
|
||||||
if the pattern
|
if the pattern
|
||||||
|
|
||||||
<.*>
|
<.*>
|
||||||
|
@ -3470,73 +3519,73 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
<something> <something else>
|
<something> <something else>
|
||||||
<something>
|
<something>
|
||||||
|
|
||||||
On success, the yield of the function is a number greater than zero,
|
On success, the yield of the function is a number greater than zero,
|
||||||
which is the number of matched substrings. The offsets of the sub-
|
which is the number of matched substrings. The offsets of the sub-
|
||||||
strings are returned in the ovector, and can be extracted by number in
|
strings are returned in the ovector, and can be extracted by number in
|
||||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||||
any capturing groups that may exist in the pattern, because DFA match-
|
any capturing groups that may exist in the pattern, because DFA match-
|
||||||
ing does not support group capture.
|
ing does not support group capture.
|
||||||
|
|
||||||
Calls to the convenience functions that extract substrings by name
|
Calls to the convenience functions that extract substrings by name
|
||||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||||
after a DFA match. The convenience functions that extract substrings by
|
after a DFA match. The convenience functions that extract substrings by
|
||||||
number never return PCRE2_ERROR_NOSUBSTRING.
|
number never return PCRE2_ERROR_NOSUBSTRING.
|
||||||
|
|
||||||
The matched strings are stored in the ovector in reverse order of
|
The matched strings are stored in the ovector in reverse order of
|
||||||
length; that is, the longest matching string is first. If there were
|
length; that is, the longest matching string is first. If there were
|
||||||
too many matches to fit into the ovector, the yield of the function is
|
too many matches to fit into the ovector, the yield of the function is
|
||||||
zero, and the vector is filled with the longest matches.
|
zero, and the vector is filled with the longest matches.
|
||||||
|
|
||||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||||
character repeats at the end of a pattern (as well as internally). For
|
character repeats at the end of a pattern (as well as internally). For
|
||||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||||
matching, this means that only one possible match is found. If you
|
matching, this means that only one possible match is found. If you
|
||||||
really do want multiple matches in such cases, either use an ungreedy
|
really do want multiple matches in such cases, either use an ungreedy
|
||||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||||
compiling.
|
compiling.
|
||||||
|
|
||||||
Error returns from pcre2_dfa_match()
|
Error returns from pcre2_dfa_match()
|
||||||
|
|
||||||
The pcre2_dfa_match() function returns a negative number when it fails.
|
The pcre2_dfa_match() function returns a negative number when it fails.
|
||||||
Many of the errors are the same as for pcre2_match(), as described
|
Many of the errors are the same as for pcre2_match(), as described
|
||||||
above. There are in addition the following errors that are specific to
|
above. There are in addition the following errors that are specific to
|
||||||
pcre2_dfa_match():
|
pcre2_dfa_match():
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_UITEM
|
PCRE2_ERROR_DFA_UITEM
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() encounters an item in the
|
This return is given if pcre2_dfa_match() encounters an item in the
|
||||||
pattern that it does not support, for instance, the use of \C in a UTF
|
pattern that it does not support, for instance, the use of \C in a UTF
|
||||||
mode or a backreference.
|
mode or a backreference.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_UCOND
|
PCRE2_ERROR_DFA_UCOND
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() encounters a condition item
|
This return is given if pcre2_dfa_match() encounters a condition item
|
||||||
that uses a backreference for the condition, or a test for recursion in
|
that uses a backreference for the condition, or a test for recursion in
|
||||||
a specific group. These are not supported.
|
a specific group. These are not supported.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_WSSIZE
|
PCRE2_ERROR_DFA_WSSIZE
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() runs out of space in the
|
This return is given if pcre2_dfa_match() runs out of space in the
|
||||||
workspace vector.
|
workspace vector.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_RECURSE
|
PCRE2_ERROR_DFA_RECURSE
|
||||||
|
|
||||||
When a recursive subpattern is processed, the matching function calls
|
When a recursive subpattern is processed, the matching function calls
|
||||||
itself recursively, using private memory for the ovector and workspace.
|
itself recursively, using private memory for the ovector and workspace.
|
||||||
This error is given if the internal ovector is not large enough. This
|
This error is given if the internal ovector is not large enough. This
|
||||||
should be extremely rare, as a vector of size 1000 is used.
|
should be extremely rare, as a vector of size 1000 is used.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_BADRESTART
|
PCRE2_ERROR_DFA_BADRESTART
|
||||||
|
|
||||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||||
some plausibility checks are made on the contents of the workspace,
|
some plausibility checks are made on the contents of the workspace,
|
||||||
which should contain data about the previous partial match. If any of
|
which should contain data about the previous partial match. If any of
|
||||||
these checks fail, this error is given.
|
these checks fail, this error is given.
|
||||||
|
|
||||||
|
|
||||||
SEE ALSO
|
SEE ALSO
|
||||||
|
|
||||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||||
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
|
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
|
||||||
|
|
||||||
|
|
||||||
|
@ -3549,7 +3598,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 07 September 2018
|
Last updated: 18 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -4135,21 +4184,26 @@ DESCRIPTION
|
||||||
its entry point in a match context (see pcre2_set_callout() in the
|
its entry point in a match context (see pcre2_set_callout() in the
|
||||||
pcre2api documentation).
|
pcre2api documentation).
|
||||||
|
|
||||||
Within a regular expression, (?C<arg>) indicates a point at which the
|
When using the pcre2_substitute() function, an additional callout fea-
|
||||||
external function is to be called. Different callout points can be
|
ture is available. This does a callout after each change to the subject
|
||||||
identified by putting a number less than 256 after the letter C. The
|
string and is described in the pcre2api documentation; the rest of this
|
||||||
default value is zero. Alternatively, the argument may be a delimited
|
document is concerned with callouts during pattern matching.
|
||||||
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
|
||||||
|
Within a regular expression, (?C<arg>) indicates a point at which the
|
||||||
|
external function is to be called. Different callout points can be
|
||||||
|
identified by putting a number less than 256 after the letter C. The
|
||||||
|
default value is zero. Alternatively, the argument may be a delimited
|
||||||
|
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
||||||
ending delimiter is the same as the start, except for {, where the end-
|
ending delimiter is the same as the start, except for {, where the end-
|
||||||
ing delimiter is }. If the ending delimiter is needed within the
|
ing delimiter is }. If the ending delimiter is needed within the
|
||||||
string, it must be doubled. For example, this pattern has two callout
|
string, it must be doubled. For example, this pattern has two callout
|
||||||
points:
|
points:
|
||||||
|
|
||||||
(?C1)abc(?C"some ""arbitrary"" text")def
|
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||||
|
|
||||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
||||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||||
item in the pattern except for immediately before or after an explicit
|
item in the pattern except for immediately before or after an explicit
|
||||||
callout. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
callout. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
||||||
|
|
||||||
A(?C3)B
|
A(?C3)B
|
||||||
|
@ -4166,36 +4220,36 @@ DESCRIPTION
|
||||||
|
|
||||||
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
||||||
|
|
||||||
Notice that there is a callout before and after each parenthesis and
|
Notice that there is a callout before and after each parenthesis and
|
||||||
alternation bar. If the pattern contains a conditional group whose con-
|
alternation bar. If the pattern contains a conditional group whose con-
|
||||||
dition is an assertion, an automatic callout is inserted immediately
|
dition is an assertion, an automatic callout is inserted immediately
|
||||||
before the condition. Such a callout may also be inserted explicitly,
|
before the condition. Such a callout may also be inserted explicitly,
|
||||||
for example:
|
for example:
|
||||||
|
|
||||||
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||||
|
|
||||||
This applies only to assertion conditions (because they are themselves
|
This applies only to assertion conditions (because they are themselves
|
||||||
independent groups).
|
independent groups).
|
||||||
|
|
||||||
Callouts can be useful for tracking the progress of pattern matching.
|
Callouts can be useful for tracking the progress of pattern matching.
|
||||||
The pcre2test program has a pattern qualifier (/auto_callout) that sets
|
The pcre2test program has a pattern qualifier (/auto_callout) that sets
|
||||||
automatic callouts. When any callouts are present, the output from
|
automatic callouts. When any callouts are present, the output from
|
||||||
pcre2test indicates how the pattern is being matched. This is useful
|
pcre2test indicates how the pattern is being matched. This is useful
|
||||||
information when you are trying to optimize the performance of a par-
|
information when you are trying to optimize the performance of a par-
|
||||||
ticular pattern.
|
ticular pattern.
|
||||||
|
|
||||||
|
|
||||||
MISSING CALLOUTS
|
MISSING CALLOUTS
|
||||||
|
|
||||||
You should be aware that, because of optimizations in the way PCRE2
|
You should be aware that, because of optimizations in the way PCRE2
|
||||||
compiles and matches patterns, callouts sometimes do not happen exactly
|
compiles and matches patterns, callouts sometimes do not happen exactly
|
||||||
as you might expect.
|
as you might expect.
|
||||||
|
|
||||||
Auto-possessification
|
Auto-possessification
|
||||||
|
|
||||||
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
||||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||||
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
||||||
to the string "aaaa" is:
|
to the string "aaaa" is:
|
||||||
|
|
||||||
|
@ -4204,11 +4258,11 @@ MISSING CALLOUTS
|
||||||
+2 ^ ^ [bc]
|
+2 ^ ^ [bc]
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This indicates that when matching [bc] fails, there is no backtracking
|
This indicates that when matching [bc] fails, there is no backtracking
|
||||||
into a+ (because it is being treated as a++) and therefore the callouts
|
into a+ (because it is being treated as a++) and therefore the callouts
|
||||||
that would be taken for the backtracks do not occur. You can disable
|
that would be taken for the backtracks do not occur. You can disable
|
||||||
the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
||||||
pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In
|
pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In
|
||||||
this case, the output changes to this:
|
this case, the output changes to this:
|
||||||
|
|
||||||
--->aaaa
|
--->aaaa
|
||||||
|
@ -4225,19 +4279,19 @@ MISSING CALLOUTS
|
||||||
Automatic .* anchoring
|
Automatic .* anchoring
|
||||||
|
|
||||||
By default, an optimization is applied when .* is the first significant
|
By default, an optimization is applied when .* is the first significant
|
||||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||||
is not set, a match can start only after an internal newline or at the
|
is not set, a match can start only after an internal newline or at the
|
||||||
beginning of the subject, and pcre2_compile() remembers this. If a pat-
|
beginning of the subject, and pcre2_compile() remembers this. If a pat-
|
||||||
tern has more than one top-level branch, automatic anchoring occurs if
|
tern has more than one top-level branch, automatic anchoring occurs if
|
||||||
all branches are anchorable.
|
all branches are anchorable.
|
||||||
|
|
||||||
This optimization is disabled, however, if .* is in an atomic group or
|
This optimization is disabled, however, if .* is in an atomic group or
|
||||||
if there is a backreference to the capturing group in which it appears.
|
if there is a backreference to the capturing group in which it appears.
|
||||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||||
ever, the presence of callouts does not affect it.
|
ever, the presence of callouts does not affect it.
|
||||||
|
|
||||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||||
and applied to the string "aa", the pcre2test output is:
|
and applied to the string "aa", the pcre2test output is:
|
||||||
|
|
||||||
--->aa
|
--->aa
|
||||||
|
@ -4247,10 +4301,10 @@ MISSING CALLOUTS
|
||||||
+2 ^ \d
|
+2 ^ \d
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This shows that all match attempts start at the beginning of the sub-
|
This shows that all match attempts start at the beginning of the sub-
|
||||||
ject. In other words, the pattern is anchored. You can disable this
|
ject. In other words, the pattern is anchored. You can disable this
|
||||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||||
put changes to:
|
put changes to:
|
||||||
|
|
||||||
--->aa
|
--->aa
|
||||||
|
@ -4263,42 +4317,42 @@ MISSING CALLOUTS
|
||||||
+2 ^ \d
|
+2 ^ \d
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This shows more match attempts, starting at the second subject charac-
|
This shows more match attempts, starting at the second subject charac-
|
||||||
ter. Another optimization, described in the next section, means that
|
ter. Another optimization, described in the next section, means that
|
||||||
there is no subsequent attempt to match with an empty subject.
|
there is no subsequent attempt to match with an empty subject.
|
||||||
|
|
||||||
Other optimizations
|
Other optimizations
|
||||||
|
|
||||||
Other optimizations that provide fast "no match" results also affect
|
Other optimizations that provide fast "no match" results also affect
|
||||||
callouts. For example, if the pattern is
|
callouts. For example, if the pattern is
|
||||||
|
|
||||||
ab(?C4)cd
|
ab(?C4)cd
|
||||||
|
|
||||||
PCRE2 knows that any matching string must contain the letter "d". If
|
PCRE2 knows that any matching string must contain the letter "d". If
|
||||||
the subject string is "abyz", the lack of "d" means that matching
|
the subject string is "abyz", the lack of "d" means that matching
|
||||||
doesn't ever start, and the callout is never reached. However, with
|
doesn't ever start, and the callout is never reached. However, with
|
||||||
"abyd", though the result is still no match, the callout is obeyed.
|
"abyd", though the result is still no match, the callout is obeyed.
|
||||||
|
|
||||||
For most patterns PCRE2 also knows the minimum length of a matching
|
For most patterns PCRE2 also knows the minimum length of a matching
|
||||||
string, and will immediately give a "no match" return without actually
|
string, and will immediately give a "no match" return without actually
|
||||||
running a match if the subject is not long enough, or, for unanchored
|
running a match if the subject is not long enough, or, for unanchored
|
||||||
patterns, if it has been scanned far enough.
|
patterns, if it has been scanned far enough.
|
||||||
|
|
||||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
||||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||||
that callouts such as the example above are obeyed.
|
that callouts such as the example above are obeyed.
|
||||||
|
|
||||||
|
|
||||||
THE CALLOUT INTERFACE
|
THE CALLOUT INTERFACE
|
||||||
|
|
||||||
During matching, when PCRE2 reaches a callout point, if an external
|
During matching, when PCRE2 reaches a callout point, if an external
|
||||||
function is provided in the match context, it is called. This applies
|
function is provided in the match context, it is called. This applies
|
||||||
to normal, DFA, and JIT matching. The first argument to the call-
|
to normal, DFA, and JIT matching. The first argument to the call-
|
||||||
out function is a pointer to a pcre2_callout block. The second argument
|
out function is a pointer to a pcre2_callout block. The second argument
|
||||||
is the void * callout data that was supplied when the callout was set
|
is the void * callout data that was supplied when the callout was set
|
||||||
up by calling pcre2_set_callout() (see the pcre2api documentation). The
|
up by calling pcre2_set_callout() (see the pcre2api documentation). The
|
||||||
callout block structure contains the following fields, not necessarily
|
callout block structure contains the following fields, not necessarily
|
||||||
in this order:
|
in this order:
|
||||||
|
|
||||||
uint32_t version;
|
uint32_t version;
|
||||||
|
@ -4318,118 +4372,118 @@ THE CALLOUT INTERFACE
|
||||||
PCRE2_SIZE callout_string_length;
|
PCRE2_SIZE callout_string_length;
|
||||||
PCRE2_SPTR callout_string;
|
PCRE2_SPTR callout_string;
|
||||||
|
|
||||||
The version field contains the version number of the block format. The
|
The version field contains the version number of the block format. The
|
||||||
current version is 2; the three callout string fields were added for
|
current version is 2; the three callout string fields were added for
|
||||||
version 1, and the callout_flags field for version 2. If you are writ-
|
version 1, and the callout_flags field for version 2. If you are writ-
|
||||||
ing an application that might use an earlier release of PCRE2, you
|
ing an application that might use an earlier release of PCRE2, you
|
||||||
should check the version number before accessing any of these fields.
|
should check the version number before accessing any of these fields.
|
||||||
The version number will increase in future if more fields are added,
|
The version number will increase in future if more fields are added,
|
||||||
but the intention is never to remove any of the existing fields.
|
but the intention is never to remove any of the existing fields.
|
||||||
|
|
||||||
Fields for numerical callouts
|
Fields for numerical callouts
|
||||||
|
|
||||||
For a numerical callout, callout_string is NULL, and callout_number
|
For a numerical callout, callout_string is NULL, and callout_number
|
||||||
contains the number of the callout, in the range 0-255. This is the
|
contains the number of the callout, in the range 0-255. This is the
|
||||||
number that follows (?C for callouts that are part of the pattern; it is
|
number that follows (?C for callouts that are part of the pattern; it is
|
||||||
255 for automatically generated callouts.
|
255 for automatically generated callouts.
|
||||||
|
|
||||||
Fields for string callouts
|
Fields for string callouts
|
||||||
|
|
||||||
For callouts with string arguments, callout_number is always zero, and
|
For callouts with string arguments, callout_number is always zero, and
|
||||||
callout_string points to the string that is contained within the com-
|
callout_string points to the string that is contained within the com-
|
||||||
piled pattern. Its length is given by callout_string_length. Duplicated
|
piled pattern. Its length is given by callout_string_length. Duplicated
|
||||||
ending delimiters that were present in the original pattern string have
|
ending delimiters that were present in the original pattern string have
|
||||||
been turned into single characters, but there is no other processing of
|
been turned into single characters, but there is no other processing of
|
||||||
the callout string argument. An additional code unit containing binary
|
the callout string argument. An additional code unit containing binary
|
||||||
zero is present after the string, but is not included in the length.
|
zero is present after the string, but is not included in the length.
|
||||||
The delimiter that was used to start the string is also stored within
|
The delimiter that was used to start the string is also stored within
|
||||||
the pattern, immediately before the string itself. You can access this
|
the pattern, immediately before the string itself. You can access this
|
||||||
delimiter as callout_string[-1] if you need it.
|
delimiter as callout_string[-1] if you need it.
|
||||||
|
|
||||||
The callout_string_offset field is the code unit offset to the start of
|
The callout_string_offset field is the code unit offset to the start of
|
||||||
the callout argument string within the original pattern string. This is
|
the callout argument string within the original pattern string. This is
|
||||||
provided for the benefit of applications such as script languages that
|
provided for the benefit of applications such as script languages that
|
||||||
might need to report errors in the callout string within the pattern.
|
might need to report errors in the callout string within the pattern.
|
||||||
|
|
||||||
Fields for all callouts
|
Fields for all callouts
|
||||||
|
|
||||||
The remaining fields in the callout block are the same for both kinds
|
The remaining fields in the callout block are the same for both kinds
|
||||||
of callout.
|
of callout.
|
||||||
|
|
||||||
The offset_vector field is a pointer to a vector of capturing offsets
|
The offset_vector field is a pointer to a vector of capturing offsets
|
||||||
(the "ovector"). You may read the elements in this vector, but you must
|
(the "ovector"). You may read the elements in this vector, but you must
|
||||||
not change any of them.
|
not change any of them.
|
||||||
|
|
||||||
For calls to pcre2_match(), the offset_vector field is not (since
|
For calls to pcre2_match(), the offset_vector field is not (since
|
||||||
release 10.30) a pointer to the actual ovector that was passed to the
|
release 10.30) a pointer to the actual ovector that was passed to the
|
||||||
matching function in the match data block. Instead it points to an
|
matching function in the match data block. Instead it points to an
|
||||||
internal ovector of a size large enough to hold all possible captured
|
internal ovector of a size large enough to hold all possible captured
|
||||||
substrings in the pattern. Note that whenever a recursion or subroutine
|
substrings in the pattern. Note that whenever a recursion or subroutine
|
||||||
call within a pattern completes, the capturing state is reset to what
|
call within a pattern completes, the capturing state is reset to what
|
||||||
it was before.
|
it was before.
|
||||||
|
|
||||||
The capture_last field contains the number of the most recently cap-
|
The capture_last field contains the number of the most recently cap-
|
||||||
tured substring, and the capture_top field contains one more than the
|
tured substring, and the capture_top field contains one more than the
|
||||||
number of the highest numbered captured substring so far. If no sub-
|
number of the highest numbered captured substring so far. If no sub-
|
||||||
strings have yet been captured, the value of capture_last is 0 and the
|
strings have yet been captured, the value of capture_last is 0 and the
|
||||||
value of capture_top is 1. The values of these fields do not always
|
value of capture_top is 1. The values of these fields do not always
|
||||||
differ by one; for example, when the callout in the pattern
|
differ by one; for example, when the callout in the pattern
|
||||||
((a)(b))(?C2) is taken, capture_last is 1 but capture_top is 4.
|
((a)(b))(?C2) is taken, capture_last is 1 but capture_top is 4.
|
||||||
|
|
||||||
The contents of ovector[2] to ovector[<capture_top>*2-1] can be
|
The contents of ovector[2] to ovector[<capture_top>*2-1] can be
|
||||||
inspected in order to extract substrings that have been matched so far,
|
inspected in order to extract substrings that have been matched so far,
|
||||||
in the same way as extracting substrings after a match has completed.
|
in the same way as extracting substrings after a match has completed.
|
||||||
The values in ovector[0] and ovector[1] are always PCRE2_UNSET because
|
The values in ovector[0] and ovector[1] are always PCRE2_UNSET because
|
||||||
the match is by definition not complete. Substrings that have not been
|
the match is by definition not complete. Substrings that have not been
|
||||||
captured but whose numbers are less than capture_top also have both of
|
captured but whose numbers are less than capture_top also have both of
|
||||||
their ovector slots set to PCRE2_UNSET.
|
their ovector slots set to PCRE2_UNSET.
|
||||||
|
|
||||||
For DFA matching, the offset_vector field points to the ovector that
|
For DFA matching, the offset_vector field points to the ovector that
|
||||||
was passed to the matching function in the match data block for call-
|
was passed to the matching function in the match data block for call-
|
||||||
outs at the top level, but to an internal ovector during the processing
|
outs at the top level, but to an internal ovector during the processing
|
||||||
of pattern recursions, lookarounds, and atomic groups. However, these
|
of pattern recursions, lookarounds, and atomic groups. However, these
|
||||||
ovectors hold no useful information because pcre2_dfa_match() does not
|
ovectors hold no useful information because pcre2_dfa_match() does not
|
||||||
support substring capturing. The value of capture_top is always 1 and
|
support substring capturing. The value of capture_top is always 1 and
|
||||||
the value of capture_last is always 0 for DFA matching.
|
the value of capture_last is always 0 for DFA matching.
|
||||||
|
|
||||||
The subject and subject_length fields contain copies of the values that
|
The subject and subject_length fields contain copies of the values that
|
||||||
were passed to the matching function.
|
were passed to the matching function.
|
||||||
|
|
||||||
The start_match field normally contains the offset within the subject
|
The start_match field normally contains the offset within the subject
|
||||||
at which the current match attempt started. However, if the escape
|
at which the current match attempt started. However, if the escape
|
||||||
sequence \K has been encountered, this value is changed to reflect the
|
sequence \K has been encountered, this value is changed to reflect the
|
||||||
modified starting point. If the pattern is not anchored, the callout
|
modified starting point. If the pattern is not anchored, the callout
|
||||||
function may be called several times from the same point in the pattern
|
function may be called several times from the same point in the pattern
|
||||||
for different starting points in the subject.
|
for different starting points in the subject.
|
||||||
|
|
||||||
The current_position field contains the offset within the subject of
|
The current_position field contains the offset within the subject of
|
||||||
the current match pointer.
|
the current match pointer.
|
||||||
|
|
||||||
The pattern_position field contains the offset in the pattern string to
|
The pattern_position field contains the offset in the pattern string to
|
||||||
the next item to be matched.
|
the next item to be matched.
|
||||||
|
|
||||||
The next_item_length field contains the length of the next item to be
|
The next_item_length field contains the length of the next item to be
|
||||||
processed in the pattern string. When the callout is at the end of the
|
processed in the pattern string. When the callout is at the end of the
|
||||||
pattern, the length is zero. When the callout precedes an opening
|
pattern, the length is zero. When the callout precedes an opening
|
||||||
parenthesis, the length includes meta characters that follow the paren-
|
parenthesis, the length includes meta characters that follow the paren-
|
||||||
thesis. For example, in a callout before an assertion such as (?=ab)
|
thesis. For example, in a callout before an assertion such as (?=ab)
|
||||||
the length is 3. For an alternation bar or a closing parenthesis,
|
the length is 3. For an alternation bar or a closing parenthesis,
|
||||||
the length is one, unless a closing parenthesis is followed by a quan-
|
the length is one, unless a closing parenthesis is followed by a quan-
|
||||||
tifier, in which case its length is included. (This changed in release
|
tifier, in which case its length is included. (This changed in release
|
||||||
10.23. In earlier releases, before an opening parenthesis the length
|
10.23. In earlier releases, before an opening parenthesis the length
|
||||||
was that of the entire subpattern, and before an alternation bar or a
|
was that of the entire subpattern, and before an alternation bar or a
|
||||||
closing parenthesis the length was zero.)
|
closing parenthesis the length was zero.)
|
||||||
|
|
||||||
The pattern_position and next_item_length fields are intended to help
|
The pattern_position and next_item_length fields are intended to help
|
||||||
in distinguishing between different automatic callouts, which all have
|
in distinguishing between different automatic callouts, which all have
|
||||||
the same callout number. However, they are set for all callouts, and
|
the same callout number. However, they are set for all callouts, and
|
||||||
are used by pcre2test to show the next item to be matched when display-
|
are used by pcre2test to show the next item to be matched when display-
|
||||||
ing callout information.
|
ing callout information.
|
||||||
|
|
||||||
In callouts from pcre2_match() the mark field contains a pointer to the
|
In callouts from pcre2_match() the mark field contains a pointer to the
|
||||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||||
(*THEN) item in the match, or NULL if no such items have been passed.
|
(*THEN) item in the match, or NULL if no such items have been passed.
|
||||||
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
||||||
previous (*MARK). In callouts from the DFA matching function this field
|
previous (*MARK). In callouts from the DFA matching function this field
|
||||||
always contains NULL.
|
always contains NULL.
|
||||||
|
|
||||||
|
@ -4439,25 +4493,25 @@ THE CALLOUT INTERFACE
|
||||||
|
|
||||||
PCRE2_CALLOUT_STARTMATCH
|
PCRE2_CALLOUT_STARTMATCH
|
||||||
|
|
||||||
This is set for the first callout after the start of matching for each
|
This is set for the first callout after the start of matching for each
|
||||||
new starting position in the subject.
|
new starting position in the subject.
|
||||||
|
|
||||||
PCRE2_CALLOUT_BACKTRACK
|
PCRE2_CALLOUT_BACKTRACK
|
||||||
|
|
||||||
This is set if there has been a matching backtrack since the previous
|
This is set if there has been a matching backtrack since the previous
|
||||||
callout, or since the start of matching if this is the first callout
|
callout, or since the start of matching if this is the first callout
|
||||||
from a pcre2_match() run.
|
from a pcre2_match() run.
|
||||||
|
|
||||||
Both bits are set when a backtrack has caused a "bumpalong" to a new
|
Both bits are set when a backtrack has caused a "bumpalong" to a new
|
||||||
starting position in the subject. Output from pcre2test does not indi-
|
starting position in the subject. Output from pcre2test does not indi-
|
||||||
cate the presence of these bits unless the callout_extra modifier is
|
cate the presence of these bits unless the callout_extra modifier is
|
||||||
set.
|
set.
|
||||||
|
|
||||||
The information in the callout_flags field is provided so that applica-
|
The information in the callout_flags field is provided so that applica-
|
||||||
tions can track and tell their users how matching with backtracking is
|
tions can track and tell their users how matching with backtracking is
|
||||||
done. This can be useful when trying to optimize patterns, or just to
|
done. This can be useful when trying to optimize patterns, or just to
|
||||||
understand how PCRE2 works. There is no support in pcre2_dfa_match()
|
understand how PCRE2 works. There is no support in pcre2_dfa_match()
|
||||||
because there is no backtracking in DFA matching, and there is no sup-
|
because there is no backtracking in DFA matching, and there is no sup-
|
||||||
port in JIT because JIT is all about maximizing matching performance.
|
port in JIT because JIT is all about maximizing matching performance.
|
||||||
In both these cases the callout_flags field is always zero.
|
In both these cases the callout_flags field is always zero.
|
||||||
|
|
||||||
|
@ -4465,16 +4519,16 @@ THE CALLOUT INTERFACE
|
||||||
RETURN VALUES FROM CALLOUTS
|
RETURN VALUES FROM CALLOUTS
|
||||||
|
|
||||||
The external callout function returns an integer to PCRE2. If the value
|
The external callout function returns an integer to PCRE2. If the value
|
||||||
is zero, matching proceeds as normal. If the value is greater than
|
is zero, matching proceeds as normal. If the value is greater than
|
||||||
zero, matching fails at the current point, but the testing of other
|
zero, matching fails at the current point, but the testing of other
|
||||||
matching possibilities goes ahead, just as if a lookahead assertion had
|
matching possibilities goes ahead, just as if a lookahead assertion had
|
||||||
failed. If the value is less than zero, the match is abandoned, and the
|
failed. If the value is less than zero, the match is abandoned, and the
|
||||||
matching function returns the negative value.
|
matching function returns the negative value.
|
||||||
|
|
||||||
Negative values should normally be chosen from the set of
|
Negative values should normally be chosen from the set of
|
||||||
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
||||||
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
||||||
reserved for use by callout functions; it will never be used by PCRE2
|
reserved for use by callout functions; it will never be used by PCRE2
|
||||||
itself.
|
itself.
|
||||||
|
|
||||||
|
|
||||||
|
@ -4485,14 +4539,14 @@ CALLOUT ENUMERATION
|
||||||
void *user_data);
|
void *user_data);
|
||||||
|
|
||||||
A script language that supports the use of string arguments in callouts
|
A script language that supports the use of string arguments in callouts
|
||||||
might like to scan all the callouts in a pattern before running the
|
might like to scan all the callouts in a pattern before running the
|
||||||
match. This can be done by calling pcre2_callout_enumerate(). The first
|
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||||
argument is a pointer to a compiled pattern, the second points to a
|
argument is a pointer to a compiled pattern, the second points to a
|
||||||
callback function, and the third is arbitrary user data. The callback
|
callback function, and the third is arbitrary user data. The callback
|
||||||
function is called for every callout in the pattern in the order in
|
function is called for every callout in the pattern in the order in
|
||||||
which they appear. Its first argument is a pointer to a callout enumer-
|
which they appear. Its first argument is a pointer to a callout enumer-
|
||||||
ation block, and its second argument is the user_data value that was
|
ation block, and its second argument is the user_data value that was
|
||||||
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
||||||
lowing fields:
|
lowing fields:
|
||||||
|
|
||||||
version Block version number
|
version Block version number
|
||||||
|
@ -4503,17 +4557,17 @@ CALLOUT ENUMERATION
|
||||||
callout_string_length Length of callout string
|
callout_string_length Length of callout string
|
||||||
callout_string Points to callout string or is NULL
|
callout_string Points to callout string or is NULL
|
||||||
|
|
||||||
The version number is currently 0. It will increase if new fields are
|
The version number is currently 0. It will increase if new fields are
|
||||||
ever added to the block. The remaining fields are the same as their
|
ever added to the block. The remaining fields are the same as their
|
||||||
namesakes in the pcre2_callout block that is used for callouts during
|
namesakes in the pcre2_callout block that is used for callouts during
|
||||||
matching, as described above.
|
matching, as described above.
|
||||||
|
|
||||||
Note that the value of pattern_position is unique for each callout.
|
Note that the value of pattern_position is unique for each callout.
|
||||||
However, if a callout occurs inside a group that is quantified with a
|
However, if a callout occurs inside a group that is quantified with a
|
||||||
non-zero minimum or a fixed maximum, the group is replicated inside the
|
non-zero minimum or a fixed maximum, the group is replicated inside the
|
||||||
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
||||||
as if it were /(a)(a)/. This means that the callout will be enumerated
|
as if it were /(a)(a)/. This means that the callout will be enumerated
|
||||||
more than once, but with the same value for pattern_position in each
|
more than once, but with the same value for pattern_position in each
|
||||||
case.
|
case.
|
||||||
|
|
||||||
The callback function should normally return zero. If it returns a non-
|
The callback function should normally return zero. If it returns a non-
|
||||||
|
@ -4530,7 +4584,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 26 April 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
.TH PCRE2_SET_SUBSTITUTE_CALLOUT 3 "17 September 2018" "PCRE2 10.33"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.B #include <pcre2.h>
|
||||||
|
.PP
|
||||||
|
.nf
|
||||||
|
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||||
|
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||||
|
.B " void *\fIcallout_data\fP);"
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
This function sets the substitute callout fields in a match context (the first
|
||||||
|
argument). The second argument specifies a callout function, and the third
|
||||||
|
argument is an opaque data item that is passed to it. The result of this
|
||||||
|
function is always zero.
|
||||||
|
.P
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2posix\fP
|
||||||
|
.\"
|
||||||
|
page.
|
107
doc/pcre2api.3
107
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "07 September 2018" "PCRE2 10.32"
|
.TH PCRE2API 3 "18 September 2018" "PCRE2 10.33"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -123,6 +123,10 @@ document for an overview of all the PCRE2 documentation.
|
||||||
.B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *),"
|
.B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *),"
|
||||||
.B " void *\fIcallout_data\fP);"
|
.B " void *\fIcallout_data\fP);"
|
||||||
.sp
|
.sp
|
||||||
|
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||||
|
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||||
|
.B " void *\fIcallout_data\fP);"
|
||||||
|
.sp
|
||||||
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
||||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||||
.sp
|
.sp
|
||||||
|
@ -847,7 +851,7 @@ PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
.B " void *\fIcallout_data\fP);"
|
.B " void *\fIcallout_data\fP);"
|
||||||
.fi
|
.fi
|
||||||
.sp
|
.sp
|
||||||
This sets up a "callout" function for PCRE2 to call at specified points
|
This sets up a callout function for PCRE2 to call at specified points
|
||||||
during a matching operation. Details are given in the
|
during a matching operation. Details are given in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2callout\fP
|
\fBpcre2callout\fP
|
||||||
|
@ -855,6 +859,20 @@ during a matching operation. Details are given in the
|
||||||
documentation.
|
documentation.
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
|
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||||
|
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||||
|
.B " void *\fIcallout_data\fP);"
|
||||||
|
.fi
|
||||||
|
.sp
|
||||||
|
This sets up a callout function for PCRE2 to call after each substitution
|
||||||
|
made by \fBpcre2_substitute()\fP. Details are given in the section entitled
|
||||||
|
"Creating a new string with substitutions"
|
||||||
|
.\" HTML <a href="#substitutions">
|
||||||
|
.\" </a>
|
||||||
|
below.
|
||||||
|
.\"
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
||||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||||
.fi
|
.fi
|
||||||
|
@ -3171,6 +3189,7 @@ numbers. For this reason, the use of different names for subpatterns of the
|
||||||
same number causes an error at compile time.
|
same number causes an error at compile time.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.\" HTML <a name="substitutions"></a>
|
||||||
.SH "CREATING A NEW STRING WITH SUBSTITUTIONS"
|
.SH "CREATING A NEW STRING WITH SUBSTITUTIONS"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -3179,19 +3198,22 @@ same number causes an error at compile time.
|
||||||
.B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
|
.B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
|
||||||
.B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
|
.B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
|
||||||
.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP,"
|
.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP,"
|
||||||
.B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\zfP,"
|
.B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP,"
|
||||||
.B " PCRE2_SIZE *\fIoutlengthptr\fP);"
|
.B " PCRE2_SIZE *\fIoutlengthptr\fP);"
|
||||||
.fi
|
.fi
|
||||||
.P
|
.P
|
||||||
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
||||||
string in \fIoutputbuffer\fP, replacing the part that was matched with the
|
string in \fIoutputbuffer\fP, replacing one or more parts that were matched
|
||||||
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
|
with the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP.
|
||||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
which a \eK item in a lookahead in the pattern causes the match to end before
|
The default is to perform just one replacement, but there is an option that
|
||||||
it starts are not supported, and give rise to an error return. For global
|
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||||
replacements, matches in which \eK in a lookbehind causes the match to start
|
.P
|
||||||
earlier than the point that was reached in the previous iteration are also not
|
Matches in which a \eK item in a lookahead in the pattern causes the match to
|
||||||
supported.
|
end before it starts are not supported, and give rise to an error return. For
|
||||||
|
global replacements, matches in which \eK in a lookbehind causes the match to
|
||||||
|
start earlier than the point that was reached in the previous iteration are
|
||||||
|
also not supported.
|
||||||
.P
|
.P
|
||||||
The first seven arguments of \fBpcre2_substitute()\fP are the same as for
|
The first seven arguments of \fBpcre2_substitute()\fP are the same as for
|
||||||
\fBpcre2_match()\fP, except that the partial matching options are not
|
\fBpcre2_match()\fP, except that the partial matching options are not
|
||||||
|
@ -3201,9 +3223,9 @@ functions from the match context, if provided, or else those that were used to
|
||||||
allocate memory for the compiled code.
|
allocate memory for the compiled code.
|
||||||
.P
|
.P
|
||||||
If an external \fImatch_data\fP block is provided, its contents afterwards
|
If an external \fImatch_data\fP block is provided, its contents afterwards
|
||||||
are those set by the final call to \fBpcre2_match()\fP, which will have
|
are those set by the final call to \fBpcre2_match()\fP. For global changes,
|
||||||
ended in a matching error. The contents of the ovector within the match data
|
this will have ended in a matching error. The contents of the ovector within
|
||||||
block may or may not have been changed.
|
the match data block may or may not have been changed.
|
||||||
.P
|
.P
|
||||||
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
||||||
length, in code units, of the output buffer. If the function is successful, the
|
length, in code units, of the output buffer. If the function is successful, the
|
||||||
|
@ -3224,12 +3246,12 @@ length is in code units, not bytes.
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||||
dollar character is an escape character that can specify the insertion of
|
dollar character is an escape character that can specify the insertion of
|
||||||
characters from capturing groups or (*MARK), (*PRUNE), or (*THEN) items in the
|
characters from capturing groups or names from (*MARK) or other control verbs
|
||||||
pattern. The following forms are always recognized:
|
in the pattern. The following forms are always recognized:
|
||||||
.sp
|
.sp
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> or ${<n>} insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
$*MARK or ${*MARK} insert a control verb name
|
||||||
.sp
|
.sp
|
||||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||||
required only if the following character would be interpreted as part of the
|
required only if the following character would be interpreted as part of the
|
||||||
|
@ -3237,12 +3259,13 @@ number or name. The number may be zero to include the entire matched string.
|
||||||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||||
string "+$1$0$1+", the result is "=+babcb+=".
|
string "+$1$0$1+", the result is "=+babcb+=".
|
||||||
.P
|
.P
|
||||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or (*THEN)
|
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||||
on the matching path that has a name. (*MARK) must always include a name, but
|
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name. (*MARK)
|
||||||
(*PRUNE) and (*THEN) need not. For example, in the case of (*MARK:A)(*PRUNE)
|
must always include a name, but the other verbs need not. For example, in
|
||||||
the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B".
|
the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
||||||
This facility can be used to perform simple simultaneous substitutions, as this
|
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to
|
||||||
\fBpcre2test\fP example shows:
|
perform simple simultaneous substitutions, as this \fBpcre2test\fP example
|
||||||
|
shows:
|
||||||
.sp
|
.sp
|
||||||
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
||||||
apple lemon
|
apple lemon
|
||||||
|
@ -3388,6 +3411,42 @@ above).
|
||||||
.\"
|
.\"
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.SS "Substitution callouts"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||||
|
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||||
|
.B " void *\fIcallout_data\fP);"
|
||||||
|
.fi
|
||||||
|
.sp
|
||||||
|
The \fBpcre2_set_substitute_callout()\fP function can be used to specify a
|
||||||
|
callout function for \fBpcre2_substitute()\fP. This information is passed in
|
||||||
|
a match context. The callout function is called after each substitution. It is
|
||||||
|
not called for simulated substitutions that happen as a result of the
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout function should not return
|
||||||
|
any value.
|
||||||
|
.P
|
||||||
|
The first argument of the callout function is a pointer to a substitute callout
|
||||||
|
block structure, which contains the following fields, not necessarily in this
|
||||||
|
order:
|
||||||
|
.sp
|
||||||
|
uint32_t \fIversion\fP;
|
||||||
|
PCRE2_SIZE \fIinput_offsets[2]\fP;
|
||||||
|
PCRE2_SIZE \fIoutput_offsets[2]\fP;
|
||||||
|
.sp
|
||||||
|
The \fIversion\fP field contains the version number of the block format. The
|
||||||
|
current version is 0. The version number will increase in future if more fields
|
||||||
|
are added, but the intention is never to remove any of the existing fields.
|
||||||
|
.P
|
||||||
|
The \fIinput_offsets\fP vector contains the code unit offsets in the input
|
||||||
|
string of the matched substring, and the \fIoutput_offsets\fP vector contains
|
||||||
|
the offsets of the replacement in the output string.
|
||||||
|
.P
|
||||||
|
The second argument of the callout function is the value passed as
|
||||||
|
\fIcallout_data\fP when the function was registered.
|
||||||
|
.
|
||||||
|
.
|
||||||
.SH "DUPLICATE SUBPATTERN NAMES"
|
.SH "DUPLICATE SUBPATTERN NAMES"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -3670,6 +3729,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 07 September 2018
|
Last updated: 18 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2CALLOUT 3 "26 April 2018" "PCRE2 10.32"
|
.TH PCRE2CALLOUT 3 "17 September 2018" "PCRE2 10.33"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -27,6 +27,15 @@ a match context (see \fBpcre2_set_callout()\fP in the
|
||||||
.\"
|
.\"
|
||||||
documentation).
|
documentation).
|
||||||
.P
|
.P
|
||||||
|
When using the \fBpcre2_substitute()\fP function, an additional callout feature
|
||||||
|
is available. This does a callout after each change to the subject string and
|
||||||
|
is described in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
documentation; the rest of this document is concerned with callouts during
|
||||||
|
pattern matching.
|
||||||
|
.P
|
||||||
Within a regular expression, (?C<arg>) indicates a point at which the external
|
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||||
function is to be called. Different callout points can be identified by putting
|
function is to be called. Different callout points can be identified by putting
|
||||||
a number less than 256 after the letter C. The default value is zero.
|
a number less than 256 after the letter C. The default value is zero.
|
||||||
|
@ -443,6 +452,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 26 April 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2TEST 1 "15 September 2018" "PCRE 10.33"
|
.TH PCRE2TEST 1 "17 September 2018" "PCRE 10.33"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -1011,6 +1011,7 @@ process.
|
||||||
mark show mark values
|
mark show mark values
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1185,6 +1186,7 @@ pattern.
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
startoffset=<n> same as offset=<n>
|
startoffset=<n> same as offset=<n>
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1271,7 +1273,7 @@ elements are the only ones that should be set. After a DFA match, the amount of
|
||||||
ovector that is used depends on the number of matches that were found.
|
ovector that is used depends on the number of matches that were found.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SS "Testing callouts"
|
.SS "Testing pattern callouts"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
A callout function is supplied when \fBpcre2test\fP calls the library matching
|
A callout function is supplied when \fBpcre2test\fP calls the library matching
|
||||||
|
@ -1282,6 +1284,12 @@ controlled by various modifiers listed above whose names begin with
|
||||||
.\" </a>
|
.\" </a>
|
||||||
below.
|
below.
|
||||||
.\"
|
.\"
|
||||||
|
Testing callouts from \fBpcre2_substitute()\fP is described separately in
|
||||||
|
"Testing the substitution function"
|
||||||
|
.\" HTML <a href="#substitution">
|
||||||
|
.\" </a>
|
||||||
|
below.
|
||||||
|
.\"
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SS "Finding all matches in a string"
|
.SS "Finding all matches in a string"
|
||||||
|
@ -1332,6 +1340,7 @@ parentheses after each substring, followed by the name when the extraction was
|
||||||
by name.
|
by name.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.\" HTML <a name="substitution"></a>
|
||||||
.SS "Testing the substitution function"
|
.SS "Testing the substitution function"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -1367,6 +1376,16 @@ simple example of a substitution test:
|
||||||
=abc=abc=\e=global
|
=abc=abc=\e=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
.sp
|
.sp
|
||||||
|
If the \fBsubstitute_callout\fP modifier is set, a substitution callout
|
||||||
|
function is set up. When it is called (after each substitution), the offsets in
|
||||||
|
the input and output strings are output. For example:
|
||||||
|
.sp
|
||||||
|
/abc/g,replace=<$0>,substitute_callout
|
||||||
|
abcdefabcpqr
|
||||||
|
Old 0 3 New 0 5
|
||||||
|
Old 6 9 New 8 13
|
||||||
|
2: <abc>def<abc>pqr
|
||||||
|
.sp
|
||||||
Subject and replacement strings should be kept relatively short (fewer than 256
|
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||||
characters) for substitution tests, as fixed-size buffers are used. To make it
|
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||||
easy to test for buffer overflow, if the replacement string starts with a
|
easy to test for buffer overflow, if the replacement string starts with a
|
||||||
|
@ -1384,10 +1403,10 @@ The default action of \fBpcre2_substitute()\fP is to return
|
||||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||||
\fBsubstitute_overflow_length\fP modifier), \fBpcre2_substitute()\fP continues
|
\fBsubstitute_overflow_length\fP modifier), \fBpcre2_substitute()\fP continues
|
||||||
to go through the motions of matching and substituting, in order to compute the
|
to go through the motions of matching and substituting (but not doing any
|
||||||
size of buffer that is required. When this happens, \fBpcre2test\fP shows the
|
callouts), in order to compute the size of buffer that is required. When this
|
||||||
required buffer length (which includes space for the trailing zero) as part of
|
happens, \fBpcre2test\fP shows the required buffer length (which includes space
|
||||||
the error message. For example:
|
for the trailing zero) as part of the error message. For example:
|
||||||
.sp
|
.sp
|
||||||
/abc/substitute_overflow_length
|
/abc/substitute_overflow_length
|
||||||
123abc123\e=replace=[9]XYZ
|
123abc123\e=replace=[9]XYZ
|
||||||
|
@ -2002,6 +2021,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 15 September 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -929,6 +929,7 @@ PATTERN MODIFIERS
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
|
allvector show the entire ovector
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
/g global global matching
|
/g global global matching
|
||||||
|
@ -936,6 +937,7 @@ PATTERN MODIFIERS
|
||||||
mark show mark values
|
mark show mark values
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1057,6 +1059,7 @@ SUBJECT MODIFIERS
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
|
allvector show the entire ovector
|
||||||
allusedtext show all consulted text (non-JIT only)
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
|
@ -1086,6 +1089,7 @@ SUBJECT MODIFIERS
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
startoffset=<n> same as offset=<n>
|
startoffset=<n> same as offset=<n>
|
||||||
|
substitute_callout use substitution callouts
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
@ -1150,76 +1154,95 @@ SUBJECT MODIFIERS
|
||||||
the highest one actually used in the match are output (corresponding to
|
the highest one actually used in the match are output (corresponding to
|
||||||
the return code from pcre2_match()). Groups that did not take part in
|
the return code from pcre2_match()). Groups that did not take part in
|
||||||
the match are output as "<unset>". This modifier is not relevant for
|
the match are output as "<unset>". This modifier is not relevant for
|
||||||
DFA matching (which does no capturing); it is ignored, with a warning
|
DFA matching (which does no capturing) and does not apply when replace
|
||||||
message, if present.
|
is specified; it is ignored, with a warning message, if present.
|
||||||
|
|
||||||
Testing callouts
|
Showing the entire ovector, for all outcomes
|
||||||
|
|
||||||
A callout function is supplied when pcre2test calls the library match-
|
The allvector modifier requests that the entire ovector be shown, what-
|
||||||
ing functions, unless callout_none is specified. Its behaviour can be
|
ever the outcome of the match. Compare allcaptures, which shows only up
|
||||||
controlled by various modifiers listed above whose names begin with
|
to the maximum number of capture groups for the pattern, and then only
|
||||||
callout_. Details are given in the section entitled "Callouts" below.
|
for a successful complete non-DFA match. This modifier, which acts
|
||||||
|
after any match result, and also for DFA matching, provides a means of
|
||||||
|
checking that there are no unexpected modifications to ovector fields.
|
||||||
|
Before each match attempt, the ovector is filled with a special value,
|
||||||
|
and if this is found in both elements of a capturing pair,
|
||||||
|
"<unchanged>" is output. After a successful match, this applies to all
|
||||||
|
groups after the maximum capture group for the pattern. In other cases
|
||||||
|
it applies to the entire ovector. After a partial match, the first two
|
||||||
|
elements are the only ones that should be set. After a DFA match, the
|
||||||
|
amount of ovector that is used depends on the number of matches that
|
||||||
|
were found.
|
||||||
|
|
||||||
|
Testing pattern callouts
|
||||||
|
|
||||||
|
A callout function is supplied when pcre2test calls the library match-
|
||||||
|
ing functions, unless callout_none is specified. Its behaviour can be
|
||||||
|
controlled by various modifiers listed above whose names begin with
|
||||||
|
callout_. Details are given in the section entitled "Callouts" below.
|
||||||
|
Testing callouts from pcre2_substitute() is described separately in
|
||||||
|
"Testing the substitution function" below.
|
||||||
|
|
||||||
Finding all matches in a string
|
Finding all matches in a string
|
||||||
|
|
||||||
Searching for all possible matches within a subject can be requested by
|
Searching for all possible matches within a subject can be requested by
|
||||||
the global or altglobal modifier. After finding a match, the matching
|
the global or altglobal modifier. After finding a match, the matching
|
||||||
function is called again to search the remainder of the subject. The
|
function is called again to search the remainder of the subject. The
|
||||||
difference between global and altglobal is that the former uses the
|
difference between global and altglobal is that the former uses the
|
||||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||||
searching at a new point within the entire string (which is what Perl
|
searching at a new point within the entire string (which is what Perl
|
||||||
does), whereas the latter passes over a shortened subject. This makes a
|
does), whereas the latter passes over a shortened subject. This makes a
|
||||||
difference to the matching process if the pattern begins with a lookbe-
|
difference to the matching process if the pattern begins with a lookbe-
|
||||||
hind assertion (including \b or \B).
|
hind assertion (including \b or \B).
|
||||||
|
|
||||||
If an empty string is matched, the next match is done with the
|
If an empty string is matched, the next match is done with the
|
||||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||||
for another, non-empty, match at the same point in the subject. If this
|
for another, non-empty, match at the same point in the subject. If this
|
||||||
match fails, the start offset is advanced, and the normal match is
|
match fails, the start offset is advanced, and the normal match is
|
||||||
retried. This imitates the way Perl handles such cases when using the
|
retried. This imitates the way Perl handles such cases when using the
|
||||||
/g modifier or the split() function. Normally, the start offset is
|
/g modifier or the split() function. Normally, the start offset is
|
||||||
advanced by one character, but if the newline convention recognizes
|
advanced by one character, but if the newline convention recognizes
|
||||||
CRLF as a newline, and the current character is CR followed by LF, an
|
CRLF as a newline, and the current character is CR followed by LF, an
|
||||||
advance of two characters occurs.
|
advance of two characters occurs.
|
||||||
|
|
||||||
Testing substring extraction functions
|
Testing substring extraction functions
|
||||||
|
|
||||||
The copy and get modifiers can be used to test the pcre2_sub-
|
The copy and get modifiers can be used to test the pcre2_sub-
|
||||||
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
||||||
given more than once, and each can specify a group name or number, for
|
given more than once, and each can specify a group name or number, for
|
||||||
example:
|
example:
|
||||||
|
|
||||||
abcd\=copy=1,copy=3,get=G1
|
abcd\=copy=1,copy=3,get=G1
|
||||||
|
|
||||||
If the #subject command is used to set default copy and/or get lists,
|
If the #subject command is used to set default copy and/or get lists,
|
||||||
these can be unset by specifying a negative number to cancel all num-
|
these can be unset by specifying a negative number to cancel all num-
|
||||||
bered groups and an empty name to cancel all named groups.
|
bered groups and an empty name to cancel all named groups.
|
||||||
|
|
||||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||||
all captured substrings.
|
all captured substrings.
|
||||||
|
|
||||||
If the subject line is successfully matched, the substrings extracted
|
If the subject line is successfully matched, the substrings extracted
|
||||||
by the convenience functions are output with C, G, or L after the
|
by the convenience functions are output with C, G, or L after the
|
||||||
string number instead of a colon. This is in addition to the normal
|
string number instead of a colon. This is in addition to the normal
|
||||||
full list. The string length (that is, the return from the extraction
|
full list. The string length (that is, the return from the extraction
|
||||||
function) is given in parentheses after each substring, followed by the
|
function) is given in parentheses after each substring, followed by the
|
||||||
name when the extraction was by name.
|
name when the extraction was by name.
|
||||||
|
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
|
|
||||||
If the replace modifier is set, the pcre2_substitute() function is
|
If the replace modifier is set, the pcre2_substitute() function is
|
||||||
called instead of one of the matching functions. Note that replacement
|
called instead of one of the matching functions. Note that replacement
|
||||||
strings cannot contain commas, because a comma signifies the end of a
|
strings cannot contain commas, because a comma signifies the end of a
|
||||||
modifier. This is not thought to be an issue in a test program.
|
modifier. This is not thought to be an issue in a test program.
|
||||||
|
|
||||||
Unlike subject strings, pcre2test does not process replacement strings
|
Unlike subject strings, pcre2test does not process replacement strings
|
||||||
for escape sequences. In UTF mode, a replacement string is checked to
|
for escape sequences. In UTF mode, a replacement string is checked to
|
||||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||||
a UTF string of the appropriate code unit width. If it is not a valid
|
a UTF string of the appropriate code unit width. If it is not a valid
|
||||||
UTF-8 string, the individual code units are copied directly. This pro-
|
UTF-8 string, the individual code units are copied directly. This pro-
|
||||||
vides a means of passing an invalid UTF-8 string for testing purposes.
|
vides a means of passing an invalid UTF-8 string for testing purposes.
|
||||||
|
|
||||||
The following modifiers set options (in addition to the normal match
|
The following modifiers set options (in addition to the normal match
|
||||||
options) for pcre2_substitute():
|
options) for pcre2_substitute():
|
||||||
|
|
||||||
global PCRE2_SUBSTITUTE_GLOBAL
|
global PCRE2_SUBSTITUTE_GLOBAL
|
||||||
|
@ -1229,8 +1252,8 @@ SUBJECT MODIFIERS
|
||||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
|
||||||
|
|
||||||
After a successful substitution, the modified string is output, pre-
|
After a successful substitution, the modified string is output, pre-
|
||||||
ceded by the number of replacements. This may be zero if there were no
|
ceded by the number of replacements. This may be zero if there were no
|
||||||
matches. Here is a simple example of a substitution test:
|
matches. Here is a simple example of a substitution test:
|
||||||
|
|
||||||
/abc/replace=xxx
|
/abc/replace=xxx
|
||||||
|
@ -1239,12 +1262,22 @@ SUBJECT MODIFIERS
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
|
|
||||||
Subject and replacement strings should be kept relatively short (fewer
|
If the substitute_callout modifier is set, a substitution callout func-
|
||||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
tion is set up. When it is called (after each substitution), the off-
|
||||||
used. To make it easy to test for buffer overflow, if the replacement
|
sets in the input and output strings are output. For example:
|
||||||
string starts with a number in square brackets, that number is passed
|
|
||||||
to pcre2_substitute() as the size of the output buffer, with the
|
/abc/g,replace=<$0>,substitute_callout
|
||||||
replacement string starting at the next character. Here is an example
|
abcdefabcpqr
|
||||||
|
Old 0 3 New 0 5
|
||||||
|
Old 6 9 New 8 13
|
||||||
|
2: <abc>def<abc>pqr
|
||||||
|
|
||||||
|
Subject and replacement strings should be kept relatively short (fewer
|
||||||
|
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||||
|
used. To make it easy to test for buffer overflow, if the replacement
|
||||||
|
string starts with a number in square brackets, that number is passed
|
||||||
|
to pcre2_substitute() as the size of the output buffer, with the
|
||||||
|
replacement string starting at the next character. Here is an example
|
||||||
that tests the edge case:
|
that tests the edge case:
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
|
@ -1253,14 +1286,15 @@ SUBJECT MODIFIERS
|
||||||
123abc123\=replace=[9]XYZ
|
123abc123\=replace=[9]XYZ
|
||||||
Failed: error -47: no more memory
|
Failed: error -47: no more memory
|
||||||
|
|
||||||
The default action of pcre2_substitute() is to return
|
The default action of pcre2_substitute() is to return
|
||||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
||||||
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
||||||
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
||||||
through the motions of matching and substituting, in order to compute
|
through the motions of matching and substituting (but not doing any
|
||||||
the size of buffer that is required. When this happens, pcre2test shows
|
callouts), in order to compute the size of buffer that is required.
|
||||||
the required buffer length (which includes space for the trailing zero)
|
When this happens, pcre2test shows the required buffer length (which
|
||||||
as part of the error message. For example:
|
includes space for the trailing zero) as part of the error message. For
|
||||||
|
example:
|
||||||
|
|
||||||
/abc/substitute_overflow_length
|
/abc/substitute_overflow_length
|
||||||
123abc123\=replace=[9]XYZ
|
123abc123\=replace=[9]XYZ
|
||||||
|
@ -1818,5 +1852,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 21 July 2018
|
Last updated: 17 September 2018
|
||||||
Copyright (c) 1997-2018 University of Cambridge.
|
Copyright (c) 1997-2018 University of Cambridge.
|
||||||
|
|
|
@ -505,10 +505,10 @@ typedef struct pcre2_real_jit_stack pcre2_jit_stack; \
|
||||||
typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *);
|
typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *);
|
||||||
|
|
||||||
|
|
||||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
/* The structures for passing out data via callout functions. We use structures
|
||||||
structure so that new fields can be added on the end in future versions,
|
so that new fields can be added on the end in future versions, without changing
|
||||||
without changing the API of the function, thereby allowing old clients to work
|
the API of the function, thereby allowing old clients to work without
|
||||||
without modification. Define the generic version in a macro; the width-specific
|
modification. Define the generic versions in a macro; the width-specific
|
||||||
versions are generated from this macro below. */
|
versions are generated from this macro below. */
|
||||||
|
|
||||||
/* Flags for the callout_flags field. These are cleared after a callout. */
|
/* Flags for the callout_flags field. These are cleared after a callout. */
|
||||||
|
@ -550,7 +550,15 @@ typedef struct pcre2_callout_enumerate_block { \
|
||||||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||||
/* ------------------------------------------------------------------ */ \
|
/* ------------------------------------------------------------------ */ \
|
||||||
} pcre2_callout_enumerate_block;
|
} pcre2_callout_enumerate_block; \
|
||||||
|
\
|
||||||
|
typedef struct pcre2_substitute_callout_block { \
|
||||||
|
uint32_t version; /* Identifies version of block */ \
|
||||||
|
/* ------------------------ Version 0 ------------------------------- */ \
|
||||||
|
PCRE2_SIZE input_offsets[2]; /* Matched portion of the input */ \
|
||||||
|
PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \
|
||||||
|
/* ------------------------------------------------------------------ */ \
|
||||||
|
} pcre2_substitute_callout_block;
|
||||||
|
|
||||||
|
|
||||||
/* List the generic forms of all other functions in macros, which will be
|
/* List the generic forms of all other functions in macros, which will be
|
||||||
|
@ -605,6 +613,9 @@ PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||||
pcre2_set_callout(pcre2_match_context *, \
|
pcre2_set_callout(pcre2_match_context *, \
|
||||||
int (*)(pcre2_callout_block *, void *), void *); \
|
int (*)(pcre2_callout_block *, void *), void *); \
|
||||||
|
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||||
|
pcre2_set_substitute_callout(pcre2_match_context *, \
|
||||||
|
void (*)(pcre2_substitute_callout_block *, void *), void *); \
|
||||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||||
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
|
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
|
||||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||||
|
@ -808,6 +819,7 @@ pcre2_compile are called by application code. */
|
||||||
|
|
||||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||||
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
||||||
|
#define pcre2_substitute_callout_block PCRE2_SUFFIX(pcre2_substitute_callout_block_)
|
||||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||||
#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_)
|
#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_)
|
||||||
|
@ -873,6 +885,7 @@ pcre2_compile are called by application code. */
|
||||||
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
|
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
|
||||||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||||
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
|
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
|
||||||
|
#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_)
|
||||||
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
||||||
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
||||||
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2017 University of Cambridge
|
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -163,11 +163,13 @@ when no context is supplied to a match function. */
|
||||||
const pcre2_match_context PRIV(default_match_context) = {
|
const pcre2_match_context PRIV(default_match_context) = {
|
||||||
{ default_malloc, default_free, NULL },
|
{ default_malloc, default_free, NULL },
|
||||||
#ifdef SUPPORT_JIT
|
#ifdef SUPPORT_JIT
|
||||||
NULL,
|
NULL, /* JIT callback */
|
||||||
NULL,
|
NULL, /* JIT callback data */
|
||||||
#endif
|
#endif
|
||||||
NULL,
|
NULL, /* Callout function */
|
||||||
NULL,
|
NULL, /* Callout data */
|
||||||
|
NULL, /* Substitute callout function */
|
||||||
|
NULL, /* Substitute callout data */
|
||||||
PCRE2_UNSET, /* Offset limit */
|
PCRE2_UNSET, /* Offset limit */
|
||||||
HEAP_LIMIT,
|
HEAP_LIMIT,
|
||||||
MATCH_LIMIT,
|
MATCH_LIMIT,
|
||||||
|
@ -403,6 +405,16 @@ mcontext->callout_data = callout_data;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||||
|
pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||||
|
void (*substitute_callout)(pcre2_substitute_callout_block *, void *),
|
||||||
|
void *substitute_callout_data)
|
||||||
|
{
|
||||||
|
mcontext->substitute_callout = substitute_callout;
|
||||||
|
mcontext->substitute_callout_data = substitute_callout_data;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||||
pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit)
|
pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||||
{
|
{
|
||||||
|
|
|
@ -585,6 +585,8 @@ typedef struct pcre2_real_match_context {
|
||||||
#endif
|
#endif
|
||||||
int (*callout)(pcre2_callout_block *, void *);
|
int (*callout)(pcre2_callout_block *, void *);
|
||||||
void *callout_data;
|
void *callout_data;
|
||||||
|
void (*substitute_callout)(pcre2_substitute_callout_block *, void *);
|
||||||
|
void *substitute_callout_data;
|
||||||
PCRE2_SIZE offset_limit;
|
PCRE2_SIZE offset_limit;
|
||||||
uint32_t heap_limit;
|
uint32_t heap_limit;
|
||||||
uint32_t match_limit;
|
uint32_t match_limit;
|
||||||
|
|
|
@ -239,7 +239,9 @@ PCRE2_SIZE extra_needed = 0;
|
||||||
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
|
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
|
||||||
PCRE2_SIZE *ovector;
|
PCRE2_SIZE *ovector;
|
||||||
PCRE2_SIZE ovecsave[3];
|
PCRE2_SIZE ovecsave[3];
|
||||||
|
pcre2_substitute_callout_block scb;
|
||||||
|
|
||||||
|
scb.version = 0;
|
||||||
buff_offset = 0;
|
buff_offset = 0;
|
||||||
lengthleft = buff_length = *blength;
|
lengthleft = buff_length = *blength;
|
||||||
*blength = PCRE2_UNSET;
|
*blength = PCRE2_UNSET;
|
||||||
|
@ -391,6 +393,11 @@ do
|
||||||
goto EXIT;
|
goto EXIT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Save the match point for a possible callout */
|
||||||
|
|
||||||
|
scb.input_offsets[0] = ovector[0];
|
||||||
|
scb.input_offsets[1] = ovector[1];
|
||||||
|
|
||||||
/* Count substitutions with a paranoid check for integer overflow; surely no
|
/* Count substitutions with a paranoid check for integer overflow; surely no
|
||||||
real call to this function would ever hit this! */
|
real call to this function would ever hit this! */
|
||||||
|
|
||||||
|
@ -401,11 +408,13 @@ do
|
||||||
}
|
}
|
||||||
subs++;
|
subs++;
|
||||||
|
|
||||||
/* Copy the text leading up to the match. */
|
/* Copy the text leading up to the match, and remember where the insert
|
||||||
|
begins. */
|
||||||
|
|
||||||
if (rc == 0) rc = ovector_count;
|
if (rc == 0) rc = ovector_count;
|
||||||
fraglength = ovector[0] - start_offset;
|
fraglength = ovector[0] - start_offset;
|
||||||
CHECKMEMCPY(subject + start_offset, fraglength);
|
CHECKMEMCPY(subject + start_offset, fraglength);
|
||||||
|
scb.output_offsets[0] = buff_offset;
|
||||||
|
|
||||||
/* Process the replacement string. Literal mode is set by \Q, but only in
|
/* Process the replacement string. Literal mode is set by \Q, but only in
|
||||||
extended mode when backslashes are being interpreted. In extended mode we
|
extended mode when backslashes are being interpreted. In extended mode we
|
||||||
|
@ -821,10 +830,19 @@ do
|
||||||
} /* End handling a literal code unit */
|
} /* End handling a literal code unit */
|
||||||
} /* End of loop for scanning the replacement. */
|
} /* End of loop for scanning the replacement. */
|
||||||
|
|
||||||
/* The replacement has been copied to the output. Save the details of this
|
/* The replacement has been copied to the output, or its size has been
|
||||||
match. See above for how this data is used. If we matched an empty string, do
|
remembered. Do the callout if there is one and we have done an actual
|
||||||
the magic for global matches. Finally, update the start offset to point to
|
replacement. */
|
||||||
the rest of the subject string. */
|
|
||||||
|
if (!overflowed && mcontext->substitute_callout != NULL)
|
||||||
|
{
|
||||||
|
scb.output_offsets[1] = buff_offset;
|
||||||
|
mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save the details of this match. See above for how this data is used. If we
|
||||||
|
matched an empty string, do the magic for global matches. Finally, update the
|
||||||
|
start offset to point to the rest of the subject string. */
|
||||||
|
|
||||||
ovecsave[0] = ovector[0];
|
ovecsave[0] = ovector[0];
|
||||||
ovecsave[1] = ovector[1];
|
ovecsave[1] = ovector[1];
|
||||||
|
|
100
src/pcre2test.c
100
src/pcre2test.c
|
@ -484,14 +484,15 @@ so many of them that they are split into two fields. */
|
||||||
|
|
||||||
/* Second control word */
|
/* Second control word */
|
||||||
|
|
||||||
#define CTL2_SUBSTITUTE_EXTENDED 0x00000001u
|
#define CTL2_SUBSTITUTE_CALLOUT 0x00000001u
|
||||||
#define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000002u
|
#define CTL2_SUBSTITUTE_EXTENDED 0x00000002u
|
||||||
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u
|
#define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000004u
|
||||||
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u
|
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000008u
|
||||||
#define CTL2_SUBJECT_LITERAL 0x00000010u
|
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000010u
|
||||||
#define CTL2_CALLOUT_NO_WHERE 0x00000020u
|
#define CTL2_SUBJECT_LITERAL 0x00000020u
|
||||||
#define CTL2_CALLOUT_EXTRA 0x00000040u
|
#define CTL2_CALLOUT_NO_WHERE 0x00000040u
|
||||||
#define CTL2_ALLVECTOR 0x00000080u
|
#define CTL2_CALLOUT_EXTRA 0x00000080u
|
||||||
|
#define CTL2_ALLVECTOR 0x00000100u
|
||||||
|
|
||||||
#define CTL2_NL_SET 0x40000000u /* Informational */
|
#define CTL2_NL_SET 0x40000000u /* Informational */
|
||||||
#define CTL2_BSR_SET 0x80000000u /* Informational */
|
#define CTL2_BSR_SET 0x80000000u /* Informational */
|
||||||
|
@ -511,7 +512,8 @@ different things in the two cases. */
|
||||||
CTL_STARTCHAR|\
|
CTL_STARTCHAR|\
|
||||||
CTL_UTF8_INPUT)
|
CTL_UTF8_INPUT)
|
||||||
|
|
||||||
#define CTL2_ALLPD (CTL2_SUBSTITUTE_EXTENDED|\
|
#define CTL2_ALLPD (CTL2_SUBSTITUTE_CALLOUT|\
|
||||||
|
CTL2_SUBSTITUTE_EXTENDED|\
|
||||||
CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\
|
CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\
|
||||||
CTL2_SUBSTITUTE_UNKNOWN_UNSET|\
|
CTL2_SUBSTITUTE_UNKNOWN_UNSET|\
|
||||||
CTL2_SUBSTITUTE_UNSET_EMPTY|\
|
CTL2_SUBSTITUTE_UNSET_EMPTY|\
|
||||||
|
@ -690,6 +692,7 @@ static modstruct modlist[] = {
|
||||||
{ "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) },
|
{ "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) },
|
||||||
{ "startoffset", MOD_DAT, MOD_INT, 0, DO(offset) },
|
{ "startoffset", MOD_DAT, MOD_INT, 0, DO(offset) },
|
||||||
{ "subject_literal", MOD_PATP, MOD_CTL, CTL2_SUBJECT_LITERAL, PO(control2) },
|
{ "subject_literal", MOD_PATP, MOD_CTL, CTL2_SUBJECT_LITERAL, PO(control2) },
|
||||||
|
{ "substitute_callout", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_CALLOUT, PO(control2) },
|
||||||
{ "substitute_extended", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_EXTENDED, PO(control2) },
|
{ "substitute_extended", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_EXTENDED, PO(control2) },
|
||||||
{ "substitute_overflow_length", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_OVERFLOW_LENGTH, PO(control2) },
|
{ "substitute_overflow_length", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_OVERFLOW_LENGTH, PO(control2) },
|
||||||
{ "substitute_unknown_unset", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNKNOWN_UNSET, PO(control2) },
|
{ "substitute_unknown_unset", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNKNOWN_UNSET, PO(control2) },
|
||||||
|
@ -1355,6 +1358,17 @@ are supported. */
|
||||||
else \
|
else \
|
||||||
pcre2_set_parens_nest_limit_32(G(a,32),b)
|
pcre2_set_parens_nest_limit_32(G(a,32),b)
|
||||||
|
|
||||||
|
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||||
|
if (test_mode == PCRE8_MODE) \
|
||||||
|
pcre2_set_substitute_callout_8(G(a,8), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_8 *, void *))b,c); \
|
||||||
|
else if (test_mode == PCRE16_MODE) \
|
||||||
|
pcre2_set_substitute_callout_16(G(a,16), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_16 *, void *))b,c); \
|
||||||
|
else \
|
||||||
|
pcre2_set_substitute_callout_32(G(a,32), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_32 *, void *))b,c)
|
||||||
|
|
||||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||||
if (test_mode == PCRE8_MODE) \
|
if (test_mode == PCRE8_MODE) \
|
||||||
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||||
|
@ -1824,6 +1838,14 @@ the three different cases. */
|
||||||
else \
|
else \
|
||||||
G(pcre2_set_parens_nest_limit_,BITTWO)(G(a,BITTWO),b)
|
G(pcre2_set_parens_nest_limit_,BITTWO)(G(a,BITTWO),b)
|
||||||
|
|
||||||
|
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||||
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
|
G(pcre2_set_substitute_callout_,BITONE)(G(a,BITONE), \
|
||||||
|
(void (*)(G(pcre2_substitute_callout_block_,BITONE) *, void *))b,c); \
|
||||||
|
else \
|
||||||
|
G(pcre2_set_substitute_callout_,BITTWO)(G(a,BITTWO), \
|
||||||
|
(void (*)(G(pcre2_substitute_callout_block_,BITTWO) *, void *))b,c)
|
||||||
|
|
||||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
a = G(pcre2_substitute_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \
|
a = G(pcre2_substitute_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \
|
||||||
|
@ -2025,6 +2047,9 @@ the three different cases. */
|
||||||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_8(G(a,8),b)
|
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_8(G(a,8),b)
|
||||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_8(G(a,8),b)
|
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_8(G(a,8),b)
|
||||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b)
|
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b)
|
||||||
|
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||||
|
pcre2_set_substitute_callout_8(G(a,8), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_8 *, void *))b,c)
|
||||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||||
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||||
(PCRE2_SPTR8)i,j,(PCRE2_UCHAR8 *)k,l)
|
(PCRE2_SPTR8)i,j,(PCRE2_UCHAR8 *)k,l)
|
||||||
|
@ -2129,6 +2154,9 @@ the three different cases. */
|
||||||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_16(G(a,16),b)
|
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_16(G(a,16),b)
|
||||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_16(G(a,16),b)
|
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_16(G(a,16),b)
|
||||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b)
|
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b)
|
||||||
|
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||||
|
pcre2_set_substitute_callout_16(G(a,16), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_16 *, void *))b,c)
|
||||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||||
a = pcre2_substitute_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16), \
|
a = pcre2_substitute_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16), \
|
||||||
(PCRE2_SPTR16)i,j,(PCRE2_UCHAR16 *)k,l)
|
(PCRE2_SPTR16)i,j,(PCRE2_UCHAR16 *)k,l)
|
||||||
|
@ -2221,7 +2249,7 @@ the three different cases. */
|
||||||
#define PCRE2_SERIALIZE_GET_NUMBER_OF_CODES(r,a) \
|
#define PCRE2_SERIALIZE_GET_NUMBER_OF_CODES(r,a) \
|
||||||
r = pcre2_serialize_get_number_of_codes_32(a)
|
r = pcre2_serialize_get_number_of_codes_32(a)
|
||||||
#define PCRE2_SET_CALLOUT(a,b,c) \
|
#define PCRE2_SET_CALLOUT(a,b,c) \
|
||||||
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c);
|
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c)
|
||||||
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
|
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
|
||||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||||
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
|
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
|
||||||
|
@ -2233,6 +2261,9 @@ the three different cases. */
|
||||||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_32(G(a,32),b)
|
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_32(G(a,32),b)
|
||||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_32(G(a,32),b)
|
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_32(G(a,32),b)
|
||||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b)
|
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b)
|
||||||
|
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||||
|
pcre2_set_substitute_callout_32(G(a,32), \
|
||||||
|
(void (*)(pcre2_substitute_callout_block_32 *, void *))b,c)
|
||||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||||
a = pcre2_substitute_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32), \
|
a = pcre2_substitute_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32), \
|
||||||
(PCRE2_SPTR32)i,j,(PCRE2_UCHAR32 *)k,l)
|
(PCRE2_SPTR32)i,j,(PCRE2_UCHAR32 *)k,l)
|
||||||
|
@ -4022,7 +4053,7 @@ Returns: nothing
|
||||||
static void
|
static void
|
||||||
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
||||||
{
|
{
|
||||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
before,
|
before,
|
||||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||||
|
@ -4058,6 +4089,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
||||||
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
||||||
((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "",
|
((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "",
|
||||||
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
||||||
|
((controls2 & CTL2_SUBSTITUTE_CALLOUT) != 0)? " substitute_callout" : "",
|
||||||
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
||||||
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
||||||
((controls2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) != 0)? " substitute_unknown_unset" : "",
|
((controls2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) != 0)? " substitute_unknown_unset" : "",
|
||||||
|
@ -5896,6 +5928,35 @@ return capcount;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Substitute callout function *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* Called from pcre2_substitute() when the substitute_callout modifier is set.
|
||||||
|
Print out the data that is passed back. The substitute callout block is
|
||||||
|
identical for all code unit widths, so we just pick one.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
scb pointer to substitute callout block
|
||||||
|
data_ptr callout data
|
||||||
|
|
||||||
|
Returns: nothing
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void
|
||||||
|
substitute_callout_function(pcre2_substitute_callout_block_8 *scb,
|
||||||
|
void *data_ptr)
|
||||||
|
{
|
||||||
|
(void)data_ptr; /* Not used */
|
||||||
|
fprintf(outfile, "Old %" SIZ_FORM " %" SIZ_FORM " New %" SIZ_FORM
|
||||||
|
" %" SIZ_FORM "\n",
|
||||||
|
SIZ_CAST scb->input_offsets[0],
|
||||||
|
SIZ_CAST scb->input_offsets[1],
|
||||||
|
SIZ_CAST scb->output_offsets[0],
|
||||||
|
SIZ_CAST scb->output_offsets[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Callout function *
|
* Callout function *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
@ -5907,8 +5968,11 @@ callout block for different code unit widths are that the pointers to the
|
||||||
subject, the most recent MARK, and a callout argument string point to strings
|
subject, the most recent MARK, and a callout argument string point to strings
|
||||||
of the appropriate width. Casts can be used to deal with this.
|
of the appropriate width. Casts can be used to deal with this.
|
||||||
|
|
||||||
Argument: a pointer to a callout block
|
Arguments:
|
||||||
Return:
|
cb a pointer to a callout block
|
||||||
|
callout_data_ptr the provided callout data
|
||||||
|
|
||||||
|
Returns: 0 or 1 or an error, as determined by settings
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -7158,6 +7222,16 @@ if (dat_datctl.replacement[0] != 0)
|
||||||
rlen = PCRE2_ZERO_TERMINATED;
|
rlen = PCRE2_ZERO_TERMINATED;
|
||||||
else
|
else
|
||||||
rlen = (CASTVAR(uint8_t *, r) - rbuffer)/code_unit_size;
|
rlen = (CASTVAR(uint8_t *, r) - rbuffer)/code_unit_size;
|
||||||
|
|
||||||
|
if ((dat_datctl.control2 & CTL2_SUBSTITUTE_CALLOUT) != 0)
|
||||||
|
{
|
||||||
|
PCRE2_SET_SUBSTITUTE_CALLOUT(dat_context, substitute_callout_function, NULL);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PCRE2_SET_SUBSTITUTE_CALLOUT(dat_context, NULL, NULL); /* No callout */
|
||||||
|
}
|
||||||
|
|
||||||
PCRE2_SUBSTITUTE(rc, compiled_code, pp, arg_ulen, dat_datctl.offset,
|
PCRE2_SUBSTITUTE(rc, compiled_code, pp, arg_ulen, dat_datctl.offset,
|
||||||
dat_datctl.options|xoptions, match_data, dat_context,
|
dat_datctl.options|xoptions, match_data, dat_context,
|
||||||
rbuffer, rlen, nbuffer, &nsize);
|
rbuffer, rlen, nbuffer, &nsize);
|
||||||
|
|
|
@ -476,4 +476,9 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
aaa
|
aaa
|
||||||
|
|
||||||
|
# Offsets are different in 8-bit mode.
|
||||||
|
|
||||||
|
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||||
|
123abcáyzabcdef789abcሴqr
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -382,4 +382,9 @@
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
aaa
|
aaa
|
||||||
|
|
||||||
|
# Offsets are different in 8-bit mode.
|
||||||
|
|
||||||
|
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||||
|
123abcáyzabcdef789abcሴqr
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -5514,4 +5514,7 @@ a)"xI
|
||||||
abcdef\=ovector=4
|
abcdef\=ovector=4
|
||||||
abxyz\=ovector=4
|
abxyz\=ovector=4
|
||||||
|
|
||||||
|
/a(b)c|xyz/g,replace=<$0>,substitute_callout
|
||||||
|
abcdefabcpqr
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -1626,4 +1626,14 @@ Subject length lower bound = 1
|
||||||
aaa
|
aaa
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
# Offsets are different in 8-bit mode.
|
||||||
|
|
||||||
|
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||||
|
123abcáyzabcdef789abcሴqr
|
||||||
|
Old 6 6 New 6 8
|
||||||
|
Old 13 13 New 15 17
|
||||||
|
Old 13 16 New 17 22
|
||||||
|
Old 22 22 New 28 30
|
||||||
|
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||||
|
|
||||||
# End of testinput10
|
# End of testinput10
|
||||||
|
|
|
@ -1471,4 +1471,14 @@ Subject length lower bound = 1
|
||||||
aaa
|
aaa
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
# Offsets are different in 8-bit mode.
|
||||||
|
|
||||||
|
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||||
|
123abcáyzabcdef789abcሴqr
|
||||||
|
Old 6 6 New 6 8
|
||||||
|
Old 12 12 New 14 16
|
||||||
|
Old 12 15 New 16 21
|
||||||
|
Old 21 21 New 27 29
|
||||||
|
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -1468,4 +1468,14 @@ Subject length lower bound = 1
|
||||||
aaa
|
aaa
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
# Offsets are different in 8-bit mode.
|
||||||
|
|
||||||
|
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||||
|
123abcáyzabcdef789abcሴqr
|
||||||
|
Old 6 6 New 6 8
|
||||||
|
Old 12 12 New 14 16
|
||||||
|
Old 12 15 New 16 21
|
||||||
|
Old 21 21 New 27 29
|
||||||
|
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||||
|
|
||||||
# End of testinput12
|
# End of testinput12
|
||||||
|
|
|
@ -16795,6 +16795,12 @@ Subject length lower bound = 1
|
||||||
2: <unchanged>
|
2: <unchanged>
|
||||||
3: <unchanged>
|
3: <unchanged>
|
||||||
|
|
||||||
|
/a(b)c|xyz/g,replace=<$0>,substitute_callout
|
||||||
|
abcdefabcpqr
|
||||||
|
Old 0 3 New 0 5
|
||||||
|
Old 6 9 New 8 13
|
||||||
|
2: <abc>def<abc>pqr
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
Loading…
Reference in New Issue