Implement callouts from pcre2_substitute().
This commit is contained in:
parent
80adf9d165
commit
a69267246f
|
@ -12,6 +12,8 @@ partial matches.
|
|||
2. Fix subject buffer overread in JIT when UTF is disabled and \X or \R has
|
||||
a greater than 1 fixed quantifier. This issue was found by Yunho Kim.
|
||||
|
||||
3. Added support for callouts from pcre2_substitute().
|
||||
|
||||
|
||||
Version 10.32 10-September-2018
|
||||
-------------------------------
|
||||
|
|
|
@ -85,6 +85,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2_set_parens_nest_limit.html \
|
||||
doc/html/pcre2_set_recursion_limit.html \
|
||||
doc/html/pcre2_set_recursion_memory_management.html \
|
||||
doc/html/pcre2_set_substitute_callout.html \
|
||||
doc/html/pcre2_substitute.html \
|
||||
doc/html/pcre2_substring_copy_byname.html \
|
||||
doc/html/pcre2_substring_copy_bynumber.html \
|
||||
|
@ -178,6 +179,7 @@ dist_man_MANS = \
|
|||
doc/pcre2_set_parens_nest_limit.3 \
|
||||
doc/pcre2_set_recursion_limit.3 \
|
||||
doc/pcre2_set_recursion_memory_management.3 \
|
||||
doc/pcre2_set_substitute_callout.3 \
|
||||
doc/pcre2_substitute.3 \
|
||||
doc/pcre2_substring_copy_byname.3 \
|
||||
doc/pcre2_substring_copy_bynumber.3 \
|
||||
|
|
|
@ -162,7 +162,7 @@ listing), and the short pages for individual functions, are concatenated in
|
|||
pcre2-config show PCRE2 installation configuration information
|
||||
pcre2api details of PCRE2's native C API
|
||||
pcre2build building PCRE2
|
||||
pcre2callout details of the callout feature
|
||||
pcre2callout details of the pattern callout feature
|
||||
pcre2compat discussion of Perl compatibility
|
||||
pcre2convert details of pattern conversion functions
|
||||
pcre2demo a demonstration C program that uses PCRE2
|
||||
|
@ -198,7 +198,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 11 July 2018
|
||||
Last updated: 17 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_set_substitute_callout specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_set_substitute_callout man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets the substitute callout fields in a match context (the first
|
||||
argument). The second argument specifies a callout function, and the third
|
||||
argument is an opaque data item that is passed to it. The result of this
|
||||
function is always zero.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -182,6 +182,11 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -912,12 +917,23 @@ PCRE2_ERROR_BADDATA if invalid data is detected.
|
|||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
This sets up a "callout" function for PCRE2 to call at specified points
|
||||
This sets up a callout function for PCRE2 to call at specified points
|
||||
during a matching operation. Details are given in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
This sets up a callout function for PCRE2 to call after each substitution
|
||||
made by <b>pcre2_substitute()</b>. Details are given in the section entitled
|
||||
"Creating a new string with substitutions"
|
||||
<a href="#substitutions">below.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -3163,26 +3179,30 @@ page, you cannot use names to distinguish the different subpatterns, because
|
|||
names are not included in the compiled code. The matching process uses only
|
||||
numbers. For this reason, the use of different names for subpatterns of the
|
||||
same number causes an error at compile time.
|
||||
</P>
|
||||
<a name="substitutions"></a></P>
|
||||
<br><a name="SEC36" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
<b> uint32_t <i>options</i>, pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>, PCRE2_SPTR <i>replacement</i>,</b>
|
||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *\fIoutputbuffer\zfP,</b>
|
||||
<b> PCRE2_SIZE <i>rlength</i>, PCRE2_UCHAR *<i>outputbuffer</i>,</b>
|
||||
<b> PCRE2_SIZE *<i>outlengthptr</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
which a \K item in a lookahead in the pattern causes the match to end before
|
||||
it starts are not supported, and give rise to an error return. For global
|
||||
replacements, matches in which \K in a lookbehind causes the match to start
|
||||
earlier than the point that was reached in the previous iteration are also not
|
||||
supported.
|
||||
string in <i>outputbuffer</i>, replacing one or more parts that were matched
|
||||
with the <i>replacement</i> string, whose length is supplied in <b>rlength</b>.
|
||||
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
The default is to perform just one replacement, but there is an option that
|
||||
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||
</P>
|
||||
<P>
|
||||
Matches in which a \K item in a lookahead in the pattern causes the match to
|
||||
end before it starts are not supported, and give rise to an error return. For
|
||||
global replacements, matches in which \K in a lookbehind causes the match to
|
||||
start earlier than the point that was reached in the previous iteration are
|
||||
also not supported.
|
||||
</P>
|
||||
<P>
|
||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||
|
@ -3194,9 +3214,9 @@ allocate memory for the compiled code.
|
|||
</P>
|
||||
<P>
|
||||
If an external <i>match_data</i> block is provided, its contents afterwards
|
||||
are those set by the final call to <b>pcre2_match()</b>, which will have
|
||||
ended in a matching error. The contents of the ovector within the match data
|
||||
block may or may not have been changed.
|
||||
are those set by the final call to <b>pcre2_match()</b>. For global changes,
|
||||
this will have ended in a matching error. The contents of the ovector within
|
||||
the match data block may or may not have been changed.
|
||||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
|
@ -3220,12 +3240,12 @@ length is in code units, not bytes.
|
|||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK), (*PRUNE), or (*THEN) items in the
|
||||
pattern. The following forms are always recognized:
|
||||
characters from capturing groups or names from (*MARK) or other control verbs
|
||||
in the pattern. The following forms are always recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
||||
$*MARK or ${*MARK} insert a control verb name
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
|
@ -3234,12 +3254,13 @@ For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
|||
string "+$1$0$1+", the result is "=+babcb+=".
|
||||
</P>
|
||||
<P>
|
||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or (*THEN)
|
||||
on the matching path that has a name. (*MARK) must always include a name, but
|
||||
(*PRUNE) and (*THEN) need not. For example, in the case of (*MARK:A)(*PRUNE)
|
||||
the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B".
|
||||
This facility can be used to perform simple simultaneous substitutions, as this
|
||||
<b>pcre2test</b> example shows:
|
||||
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name. (*MARK)
|
||||
must always include a name, but the other verbs need not. For example, in
|
||||
the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
||||
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to
|
||||
perform simple simultaneous substitutions, as this <b>pcre2test</b> example
|
||||
shows:
|
||||
<pre>
|
||||
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
|
@ -3399,6 +3420,44 @@ obtained by calling the <b>pcre2_get_error_message()</b> function (see
|
|||
"Obtaining a textual error message"
|
||||
<a href="#geterrormessage">above).</a>
|
||||
</P>
|
||||
<br><b>
|
||||
Substitution callouts
|
||||
</b><br>
|
||||
<P>
|
||||
<b>int pcre2_set_substitute_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> void (*<i>callout_function</i>)(pcre2_substitute_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
The <b>pcre2_set_substitute_callout()</b> function can be used to specify a
|
||||
callout function for <b>pcre2_substitute()</b>. This information is passed in
|
||||
a match context. The callout function is called after each substitution. It is
|
||||
not called for simulated substitutions that happen as a result of the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout function should not return
|
||||
any value.
|
||||
</P>
|
||||
<P>
|
||||
The first argument of the callout function is a pointer to a substitute callout
|
||||
block structure, which contains the following fields, not necessarily in this
|
||||
order:
|
||||
<pre>
|
||||
uint32_t <i>version</i>;
|
||||
PCRE2_SIZE <i>input_offsets[2]</i>;
|
||||
PCRE2_SIZE <i>output_offsets[2]</i>;
|
||||
</pre>
|
||||
The <i>version</i> field contains the version number of the block format. The
|
||||
current version is 0. The version number will increase in future if more fields
|
||||
are added, but the intention is never to remove any of the existing fields.
|
||||
</P>
|
||||
<P>
|
||||
The <i>input_offsets</i> vector contains the code unit offsets in the input
|
||||
string of the matched substring, and the <i>output_offsets</i> vector contains
|
||||
the offsets of the replacement in the output string.
|
||||
</P>
|
||||
<P>
|
||||
The second argument of the callout function is the value passed as
|
||||
<i>callout_data</i> when the function was registered.
|
||||
</P>
|
||||
<br><a name="SEC37" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
||||
|
@ -3665,7 +3724,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 07 September 2018
|
||||
Last updated: 18 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -44,6 +44,14 @@ a match context (see <b>pcre2_set_callout()</b> in the
|
|||
documentation).
|
||||
</P>
|
||||
<P>
|
||||
When using the <b>pcre2_substitute()</b> function, an additional callout feature
|
||||
is available. This does a callout after each change to the subject string and
|
||||
is described in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation; the rest of this document is concerned with callouts during
|
||||
pattern matching.
|
||||
</P>
|
||||
<P>
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||
function is to be called. Different callout points can be identified by putting
|
||||
a number less than 256 after the letter C. The default value is zero.
|
||||
|
@ -463,7 +471,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 26 April 2018
|
||||
Last updated: 17 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1041,6 +1041,7 @@ process.
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
|
@ -1048,6 +1049,7 @@ process.
|
|||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1185,6 +1187,7 @@ pattern.
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
|
@ -1214,6 +1217,7 @@ pattern.
|
|||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1281,10 +1285,28 @@ captured parentheses be output after a match. By default, only those up to the
|
|||
highest one actually used in the match are output (corresponding to the return
|
||||
code from <b>pcre2_match()</b>). Groups that did not take part in the match
|
||||
are output as "<unset>". This modifier is not relevant for DFA matching (which
|
||||
does no capturing); it is ignored, with a warning message, if present.
|
||||
does no capturing) and does not apply when <b>replace</b> is specified; it is
|
||||
ignored, with a warning message, if present.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing callouts
|
||||
Showing the entire ovector, for all outcomes
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>allvector</b> modifier requests that the entire ovector be shown,
|
||||
whatever the outcome of the match. Compare <b>allcaptures</b>, which shows only
|
||||
up to the maximum number of capture groups for the pattern, and then only for a
|
||||
successful complete non-DFA match. This modifier, which acts after any match
|
||||
result, and also for DFA matching, provides a means of checking that there are
|
||||
no unexpected modifications to ovector fields. Before each match attempt, the
|
||||
ovector is filled with a special value, and if this is found in both elements
|
||||
of a capturing pair, "<unchanged>" is output. After a successful match, this
|
||||
applies to all groups after the maximum capture group for the pattern. In other
|
||||
cases it applies to the entire ovector. After a partial match, the first two
|
||||
elements are the only ones that should be set. After a DFA match, the amount of
|
||||
ovector that is used depends on the number of matches that were found.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing pattern callouts
|
||||
</b><br>
|
||||
<P>
|
||||
A callout function is supplied when <b>pcre2test</b> calls the library matching
|
||||
|
@ -1292,6 +1314,9 @@ functions, unless <b>callout_none</b> is specified. Its behaviour can be
|
|||
controlled by various modifiers listed above whose names begin with
|
||||
<b>callout_</b>. Details are given in the section entitled "Callouts"
|
||||
<a href="#callouts">below.</a>
|
||||
Testing callouts from <b>pcre2_substitute()</b> is described separately in
|
||||
"Testing the substitution function"
|
||||
<a href="#substitution">below.</a>
|
||||
</P>
|
||||
<br><b>
|
||||
Finding all matches in a string
|
||||
|
@ -1343,7 +1368,7 @@ instead of a colon. This is in addition to the normal full list. The string
|
|||
length (that is, the return from the extraction function) is given in
|
||||
parentheses after each substring, followed by the name when the extraction was
|
||||
by name.
|
||||
</P>
|
||||
<a name="substitution"></a></P>
|
||||
<br><b>
|
||||
Testing the substitution function
|
||||
</b><br>
|
||||
|
@ -1384,6 +1409,16 @@ simple example of a substitution test:
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
</pre>
|
||||
If the <b>substitute_callout</b> modifier is set, a substitution callout
|
||||
function is set up. When it is called (after each substitution), the offsets in
|
||||
the input and output strings are output. For example:
|
||||
<pre>
|
||||
/abc/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
Old 0 3 New 0 5
|
||||
Old 6 9 New 8 13
|
||||
2: <abc>def<abc>pqr
|
||||
</pre>
|
||||
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||
easy to test for buffer overflow, if the replacement string starts with a
|
||||
|
@ -1401,10 +1436,10 @@ The default action of <b>pcre2_substitute()</b> is to return
|
|||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||
<b>substitute_overflow_length</b> modifier), <b>pcre2_substitute()</b> continues
|
||||
to go through the motions of matching and substituting, in order to compute the
|
||||
size of buffer that is required. When this happens, <b>pcre2test</b> shows the
|
||||
required buffer length (which includes space for the trailing zero) as part of
|
||||
the error message. For example:
|
||||
to go through the motions of matching and substituting (but not doing any
|
||||
callouts), in order to compute the size of buffer that is required. When this
|
||||
happens, <b>pcre2test</b> shows the required buffer length (which includes space
|
||||
for the trailing zero) as part of the error message. For example:
|
||||
<pre>
|
||||
/abc/substitute_overflow_length
|
||||
123abc123\=replace=[9]XYZ
|
||||
|
@ -2004,7 +2039,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 21 July 2018
|
||||
Last updated: 17 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2 3 "11 July 2018" "PCRE2 10.32"
|
||||
.TH PCRE2 3 "17 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH INTRODUCTION
|
||||
|
@ -156,7 +156,7 @@ listing), and the short pages for individual functions, are concatenated in
|
|||
pcre2-config show PCRE2 installation configuration information
|
||||
pcre2api details of PCRE2's native C API
|
||||
pcre2build building PCRE2
|
||||
pcre2callout details of the callout feature
|
||||
pcre2callout details of the pattern callout feature
|
||||
pcre2compat discussion of Perl compatibility
|
||||
pcre2convert details of pattern conversion functions
|
||||
pcre2demo a demonstration C program that uses PCRE2
|
||||
|
@ -197,6 +197,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 11 July 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
548
doc/pcre2.txt
548
doc/pcre2.txt
|
@ -141,7 +141,7 @@ USER DOCUMENTATION
|
|||
pcre2-config show PCRE2 installation configuration information
|
||||
pcre2api details of PCRE2's native C API
|
||||
pcre2build building PCRE2
|
||||
pcre2callout details of the callout feature
|
||||
pcre2callout details of the pattern callout feature
|
||||
pcre2compat discussion of Perl compatibility
|
||||
pcre2convert details of pattern conversion functions
|
||||
pcre2demo a demonstration C program that uses PCRE2
|
||||
|
@ -177,7 +177,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 11 July 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -293,6 +293,10 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
|||
int (*callout_function)(pcre2_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
||||
PCRE2_SIZE value);
|
||||
|
||||
|
@ -933,10 +937,18 @@ PCRE2 CONTEXTS
|
|||
int (*callout_function)(pcre2_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
This sets up a "callout" function for PCRE2 to call at specified points
|
||||
This sets up a callout function for PCRE2 to call at specified points
|
||||
during a matching operation. Details are given in the pcre2callout doc-
|
||||
umentation.
|
||||
|
||||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
This sets up a callout function for PCRE2 to call after each substitu-
|
||||
tion made by pcre2_substitute(). Details are given in the section enti-
|
||||
tled "Creating a new string with substitutions" below.
|
||||
|
||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
|
||||
PCRE2_SIZE value);
|
||||
|
||||
|
@ -3083,18 +3095,22 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||
uint32_t options, pcre2_match_data *match_data,
|
||||
pcre2_match_context *mcontext, PCRE2_SPTR replacement,
|
||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbufferP,
|
||||
PCRE2_SIZE rlength, PCRE2_UCHAR *outputbuffer,
|
||||
PCRE2_SIZE *outlengthptr);
|
||||
|
||||
This function calls pcre2_match() and then makes a copy of the subject
|
||||
string in outputbuffer, replacing the part that was matched with the
|
||||
replacement string, whose length is supplied in rlength. This can be
|
||||
given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
which a \K item in a lookahead in the pattern causes the match to end
|
||||
before it starts are not supported, and give rise to an error return.
|
||||
For global replacements, matches in which \K in a lookbehind causes the
|
||||
match to start earlier than the point that was reached in the previous
|
||||
iteration are also not supported.
|
||||
string in outputbuffer, replacing one or more parts that were matched
|
||||
with the replacement string, whose length is supplied in rlength. This
|
||||
can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
The default is to perform just one replacement, but there is an option
|
||||
that requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below
|
||||
for details).
|
||||
|
||||
Matches in which a \K item in a lookahead in the pattern causes the
|
||||
match to end before it starts are not supported, and give rise to an
|
||||
error return. For global replacements, matches in which \K in a lookbe-
|
||||
hind causes the match to start earlier than the point that was reached
|
||||
in the previous iteration are also not supported.
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
|
@ -3104,9 +3120,9 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
were used to allocate memory for the compiled code.
|
||||
|
||||
If an external match_data block is provided, its contents afterwards
|
||||
are those set by the final call to pcre2_match(), which will have ended
|
||||
in a matching error. The contents of the ovector within the match data
|
||||
block may or may not have been changed.
|
||||
are those set by the final call to pcre2_match(). For global changes,
|
||||
this will have ended in a matching error. The contents of the ovector
|
||||
within the match data block may or may not have been changed.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is suc-
|
||||
|
@ -3128,13 +3144,13 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
In the replacement string, which is interpreted as a UTF string in UTF
|
||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||
option is set, a dollar character is an escape character that can spec-
|
||||
ify the insertion of characters from capturing groups or (*MARK),
|
||||
(*PRUNE), or (*THEN) items in the pattern. The following forms are
|
||||
ify the insertion of characters from capturing groups or names from
|
||||
(*MARK) or other control verbs in the pattern. The following forms are
|
||||
always recognized:
|
||||
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
||||
$*MARK or ${*MARK} insert a control verb name
|
||||
|
||||
Either a group number or a group name can be given for <n>. Curly
|
||||
brackets are required only if the following character would be inter-
|
||||
|
@ -3143,11 +3159,11 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
matched with "=abc=" and the replacement string "+$1$0$1+", the result
|
||||
is "=+babcb+=".
|
||||
|
||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or
|
||||
(*THEN) on the matching path that has a name. (*MARK) must always
|
||||
include a name, but (*PRUNE) and (*THEN) need not. For example, in the
|
||||
case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
||||
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
|
||||
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name.
|
||||
(*MARK) must always include a name, but the other verbs need not. For
|
||||
example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but
|
||||
for (*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be
|
||||
used to perform simple simultaneous substitutions, as this pcre2test
|
||||
example shows:
|
||||
|
||||
|
@ -3302,62 +3318,95 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
obtained by calling the pcre2_get_error_message() function (see
|
||||
"Obtaining a textual error message" above).
|
||||
|
||||
Substitution callouts
|
||||
|
||||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||
void (*callout_function)(pcre2_substitute_callout_block *, void *),
|
||||
void *callout_data);
|
||||
|
||||
The pcre2_set_substitute_callout() function can be used to specify a
|
||||
callout function for pcre2_substitute(). This information is passed in
|
||||
a match context. The callout function is called after each substitu-
|
||||
tion. It is not called for simulated substitutions that happen as a
|
||||
result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout func-
|
||||
tion should not return any value.
|
||||
|
||||
The first argument of the callout function is a pointer to a substitute
|
||||
callout block structure, which contains the following fields, not nec-
|
||||
essarily in this order:
|
||||
|
||||
uint32_t version;
|
||||
PCRE2_SIZE input_offsets[2];
|
||||
PCRE2_SIZE output_offsets[2];
|
||||
|
||||
The version field contains the version number of the block format. The
|
||||
current version is 0. The version number will increase in future if
|
||||
more fields are added, but the intention is never to remove any of the
|
||||
existing fields.
|
||||
|
||||
The input_offsets vector contains the code unit offsets in the input
|
||||
string of the matched substring, and the output_offsets vector contains
|
||||
the offsets of the replacement in the output string.
|
||||
|
||||
The second argument of the callout function is the value passed as
|
||||
callout_data when the function was registered.
|
||||
|
||||
|
||||
DUPLICATE SUBPATTERN NAMES
|
||||
|
||||
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
||||
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
||||
|
||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||
subpatterns are not required to be unique. Duplicate names are always
|
||||
allowed for subpatterns with the same number, created by using the (?|
|
||||
feature. Indeed, if such subpatterns are named, they are required to
|
||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||
subpatterns are not required to be unique. Duplicate names are always
|
||||
allowed for subpatterns with the same number, created by using the (?|
|
||||
feature. Indeed, if such subpatterns are named, they are required to
|
||||
use the same names.
|
||||
|
||||
Normally, patterns with duplicate names are such that in any one match,
|
||||
only one of the named subpatterns participates. An example is shown in
|
||||
only one of the named subpatterns participates. An example is shown in
|
||||
the pcre2pattern documentation.
|
||||
|
||||
When duplicates are present, pcre2_substring_copy_byname() and
|
||||
pcre2_substring_get_byname() return the first substring corresponding
|
||||
to the given name that is set. Only if none are set is
|
||||
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
||||
When duplicates are present, pcre2_substring_copy_byname() and
|
||||
pcre2_substring_get_byname() return the first substring corresponding
|
||||
to the given name that is set. Only if none are set is
|
||||
PCRE2_ERROR_UNSET returned. The pcre2_substring_number_from_name()
|
||||
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
||||
duplicate names.
|
||||
|
||||
If you want to get full details of all captured substrings for a given
|
||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||
first argument is the compiled pattern, and the second is the name. If
|
||||
the third and fourth arguments are NULL, the function returns a group
|
||||
If you want to get full details of all captured substrings for a given
|
||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||
first argument is the compiled pattern, and the second is the name. If
|
||||
the third and fourth arguments are NULL, the function returns a group
|
||||
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
||||
|
||||
When the third and fourth arguments are not NULL, they must be pointers
|
||||
to variables that are updated by the function. After it has run, they
|
||||
to variables that are updated by the function. After it has run, they
|
||||
point to the first and last entries in the name-to-number table for the
|
||||
given name, and the function returns the length of each entry in code
|
||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||
given name, and the function returns the length of each entry in code
|
||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||
no entries for the given name.
|
||||
|
||||
The format of the name table is described above in the section entitled
|
||||
Information about a pattern. Given all the relevant entries for the
|
||||
name, you can extract each of their numbers, and hence the captured
|
||||
Information about a pattern. Given all the relevant entries for the
|
||||
name, you can extract each of their numbers, and hence the captured
|
||||
data.
|
||||
|
||||
|
||||
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
||||
|
||||
The traditional matching function uses a similar algorithm to Perl,
|
||||
which stops when it finds the first match at a given point in the sub-
|
||||
The traditional matching function uses a similar algorithm to Perl,
|
||||
which stops when it finds the first match at a given point in the sub-
|
||||
ject. If you want to find all possible matches, or the longest possible
|
||||
match at a given position, consider using the alternative matching
|
||||
function (see below) instead. If you cannot use the alternative func-
|
||||
match at a given position, consider using the alternative matching
|
||||
function (see below) instead. If you cannot use the alternative func-
|
||||
tion, you can kludge it up by making use of the callout facility, which
|
||||
is described in the pcre2callout documentation.
|
||||
|
||||
What you have to do is to insert a callout right at the end of the pat-
|
||||
tern. When your callout function is called, extract and save the cur-
|
||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||
tern. When your callout function is called, extract and save the cur-
|
||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
||||
|
||||
|
||||
|
@ -3369,26 +3418,26 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
pcre2_match_context *mcontext,
|
||||
int *workspace, PCRE2_SIZE wscount);
|
||||
|
||||
The function pcre2_dfa_match() is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the
|
||||
The function pcre2_dfa_match() is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the
|
||||
subject string just once (not counting lookaround assertions), and does
|
||||
not backtrack. This has different characteristics to the normal algo-
|
||||
rithm, and is not compatible with Perl. Some of the features of PCRE2
|
||||
patterns are not supported. Nevertheless, there are times when this
|
||||
kind of matching can be useful. For a discussion of the two matching
|
||||
not backtrack. This has different characteristics to the normal algo-
|
||||
rithm, and is not compatible with Perl. Some of the features of PCRE2
|
||||
patterns are not supported. Nevertheless, there are times when this
|
||||
kind of matching can be useful. For a discussion of the two matching
|
||||
algorithms, and a list of features that pcre2_dfa_match() does not sup-
|
||||
port, see the pcre2matching documentation.
|
||||
|
||||
The arguments for the pcre2_dfa_match() function are the same as for
|
||||
The arguments for the pcre2_dfa_match() function are the same as for
|
||||
pcre2_match(), plus two extras. The ovector within the match data block
|
||||
is used in a different way, and this is described below. The other com-
|
||||
mon arguments are used in the same way as for pcre2_match(), so their
|
||||
mon arguments are used in the same way as for pcre2_match(), so their
|
||||
description is not repeated here.
|
||||
|
||||
The two additional arguments provide workspace for the function. The
|
||||
workspace vector should contain at least 20 elements. It is used for
|
||||
The two additional arguments provide workspace for the function. The
|
||||
workspace vector should contain at least 20 elements. It is used for
|
||||
keeping track of multiple paths through the pattern tree. More
|
||||
workspace is needed for patterns and subjects where there are a lot of
|
||||
workspace is needed for patterns and subjects where there are a lot of
|
||||
potential matches.
|
||||
|
||||
Here is an example of a simple call to pcre2_dfa_match():
|
||||
|
@ -3408,45 +3457,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
|
||||
Option bits for pcre2_dfa_match()
|
||||
|
||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_ENDAN-
|
||||
CHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_ENDAN-
|
||||
CHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD,
|
||||
PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
|
||||
the last four of these are exactly the same as for pcre2_match(), so
|
||||
PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
|
||||
the last four of these are exactly the same as for pcre2_match(), so
|
||||
their description is not repeated here.
|
||||
|
||||
PCRE2_PARTIAL_HARD
|
||||
PCRE2_PARTIAL_SOFT
|
||||
|
||||
These have the same general effect as they do for pcre2_match(), but
|
||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||
These have the same general effect as they do for pcre2_match(), but
|
||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||
subject is reached and there is still at least one matching possibility
|
||||
that requires additional characters. This happens even if some complete
|
||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||
if the end of the subject is reached, there have been no complete
|
||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||
if the end of the subject is reached, there have been no complete
|
||||
matches, but there is still at least one matching possibility. The por-
|
||||
tion of the string that was inspected when the longest partial match
|
||||
tion of the string that was inspected when the longest partial match
|
||||
was found is set as the first matching string in both cases. There is a
|
||||
more detailed discussion of partial and multi-segment matching, with
|
||||
more detailed discussion of partial and multi-segment matching, with
|
||||
examples, in the pcre2partial documentation.
|
||||
|
||||
PCRE2_DFA_SHORTEST
|
||||
|
||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||
stop as soon as it has found one match. Because of the way the alterna-
|
||||
tive algorithm works, this is necessarily the shortest possible match
|
||||
tive algorithm works, this is necessarily the shortest possible match
|
||||
at the first possible matching point in the subject string.
|
||||
|
||||
PCRE2_DFA_RESTART
|
||||
|
||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||
it again, with additional subject characters, and have it continue with
|
||||
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
||||
it is set, the workspace and wscount arguments must reference the same
|
||||
vector as before because data about the match so far is left in them
|
||||
it is set, the workspace and wscount arguments must reference the same
|
||||
vector as before because data about the match so far is left in them
|
||||
after a partial match. There is more discussion of this facility in the
|
||||
pcre2partial documentation.
|
||||
|
||||
|
@ -3454,8 +3503,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
|
||||
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
||||
string in the subject. Note, however, that all the matches from one run
|
||||
of the function start at the same point in the subject. The shorter
|
||||
matches are all initial substrings of the longer matches. For example,
|
||||
of the function start at the same point in the subject. The shorter
|
||||
matches are all initial substrings of the longer matches. For example,
|
||||
if the pattern
|
||||
|
||||
<.*>
|
||||
|
@ -3470,73 +3519,73 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
<something> <something else>
|
||||
<something>
|
||||
|
||||
On success, the yield of the function is a number greater than zero,
|
||||
which is the number of matched substrings. The offsets of the sub-
|
||||
strings are returned in the ovector, and can be extracted by number in
|
||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||
any capturing groups that may exist in the pattern, because DFA match-
|
||||
On success, the yield of the function is a number greater than zero,
|
||||
which is the number of matched substrings. The offsets of the sub-
|
||||
strings are returned in the ovector, and can be extracted by number in
|
||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||
any capturing groups that may exist in the pattern, because DFA match-
|
||||
ing does not support group capture.
|
||||
|
||||
Calls to the convenience functions that extract substrings by name
|
||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||
Calls to the convenience functions that extract substrings by name
|
||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||
after a DFA match. The convenience functions that extract substrings by
|
||||
number never return PCRE2_ERROR_NOSUBSTRING.
|
||||
|
||||
The matched strings are stored in the ovector in reverse order of
|
||||
length; that is, the longest matching string is first. If there were
|
||||
too many matches to fit into the ovector, the yield of the function is
|
||||
The matched strings are stored in the ovector in reverse order of
|
||||
length; that is, the longest matching string is first. If there were
|
||||
too many matches to fit into the ovector, the yield of the function is
|
||||
zero, and the vector is filled with the longest matches.
|
||||
|
||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||
character repeats at the end of a pattern (as well as internally). For
|
||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||
matching, this means that only one possible match is found. If you
|
||||
really do want multiple matches in such cases, either use an ungreedy
|
||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||
character repeats at the end of a pattern (as well as internally). For
|
||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||
matching, this means that only one possible match is found. If you
|
||||
really do want multiple matches in such cases, either use an ungreedy
|
||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
compiling.
|
||||
|
||||
Error returns from pcre2_dfa_match()
|
||||
|
||||
The pcre2_dfa_match() function returns a negative number when it fails.
|
||||
Many of the errors are the same as for pcre2_match(), as described
|
||||
Many of the errors are the same as for pcre2_match(), as described
|
||||
above. There are in addition the following errors that are specific to
|
||||
pcre2_dfa_match():
|
||||
|
||||
PCRE2_ERROR_DFA_UITEM
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C in a UTF
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C in a UTF
|
||||
mode or a backreference.
|
||||
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters a condition item
|
||||
This return is given if pcre2_dfa_match() encounters a condition item
|
||||
that uses a backreference for the condition, or a test for recursion in
|
||||
a specific group. These are not supported.
|
||||
|
||||
PCRE2_ERROR_DFA_WSSIZE
|
||||
|
||||
This return is given if pcre2_dfa_match() runs out of space in the
|
||||
This return is given if pcre2_dfa_match() runs out of space in the
|
||||
workspace vector.
|
||||
|
||||
PCRE2_ERROR_DFA_RECURSE
|
||||
|
||||
When a recursive subpattern is processed, the matching function calls
|
||||
When a recursive subpattern is processed, the matching function calls
|
||||
itself recursively, using private memory for the ovector and workspace.
|
||||
This error is given if the internal ovector is not large enough. This
|
||||
This error is given if the internal ovector is not large enough. This
|
||||
should be extremely rare, as a vector of size 1000 is used.
|
||||
|
||||
PCRE2_ERROR_DFA_BADRESTART
|
||||
|
||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||
some plausibility checks are made on the contents of the workspace,
|
||||
which should contain data about the previous partial match. If any of
|
||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||
some plausibility checks are made on the contents of the workspace,
|
||||
which should contain data about the previous partial match. If any of
|
||||
these checks fail, this error is given.
|
||||
|
||||
|
||||
SEE ALSO
|
||||
|
||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2unicode(3).
|
||||
|
||||
|
||||
|
@ -3549,7 +3598,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 07 September 2018
|
||||
Last updated: 18 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -4135,21 +4184,26 @@ DESCRIPTION
|
|||
its entry point in a match context (see pcre2_set_callout() in the
|
||||
pcre2api documentation).
|
||||
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the
|
||||
external function is to be called. Different callout points can be
|
||||
identified by putting a number less than 256 after the letter C. The
|
||||
default value is zero. Alternatively, the argument may be a delimited
|
||||
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
||||
When using the pcre2_substitute() function, an additional callout fea-
|
||||
ture is available. This does a callout after each change to the subject
|
||||
string and is described in the pcre2api documentation; the rest of this
|
||||
document is concerned with callouts during pattern matching.
|
||||
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the
|
||||
external function is to be called. Different callout points can be
|
||||
identified by putting a number less than 256 after the letter C. The
|
||||
default value is zero. Alternatively, the argument may be a delimited
|
||||
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
||||
ending delimiter is the same as the start, except for {, where the end-
|
||||
ing delimiter is }. If the ending delimiter is needed within the
|
||||
string, it must be doubled. For example, this pattern has two callout
|
||||
ing delimiter is }. If the ending delimiter is needed within the
|
||||
string, it must be doubled. For example, this pattern has two callout
|
||||
points:
|
||||
|
||||
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||
|
||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||
item in the pattern except for immediately before or after an explicit
|
||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||
item in the pattern except for immediately before or after an explicit
|
||||
callout. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
||||
|
||||
A(?C3)B
|
||||
|
@ -4166,36 +4220,36 @@ DESCRIPTION
|
|||
|
||||
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
||||
|
||||
Notice that there is a callout before and after each parenthesis and
|
||||
Notice that there is a callout before and after each parenthesis and
|
||||
alternation bar. If the pattern contains a conditional group whose con-
|
||||
dition is an assertion, an automatic callout is inserted immediately
|
||||
before the condition. Such a callout may also be inserted explicitly,
|
||||
dition is an assertion, an automatic callout is inserted immediately
|
||||
before the condition. Such a callout may also be inserted explicitly,
|
||||
for example:
|
||||
|
||||
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||
|
||||
This applies only to assertion conditions (because they are themselves
|
||||
This applies only to assertion conditions (because they are themselves
|
||||
independent groups).
|
||||
|
||||
Callouts can be useful for tracking the progress of pattern matching.
|
||||
Callouts can be useful for tracking the progress of pattern matching.
|
||||
The pcre2test program has a pattern qualifier (/auto_callout) that sets
|
||||
automatic callouts. When any callouts are present, the output from
|
||||
pcre2test indicates how the pattern is being matched. This is useful
|
||||
information when you are trying to optimize the performance of a par-
|
||||
automatic callouts. When any callouts are present, the output from
|
||||
pcre2test indicates how the pattern is being matched. This is useful
|
||||
information when you are trying to optimize the performance of a par-
|
||||
ticular pattern.
|
||||
|
||||
|
||||
MISSING CALLOUTS
|
||||
|
||||
You should be aware that, because of optimizations in the way PCRE2
|
||||
You should be aware that, because of optimizations in the way PCRE2
|
||||
compiles and matches patterns, callouts sometimes do not happen exactly
|
||||
as you might expect.
|
||||
|
||||
Auto-possessification
|
||||
|
||||
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
||||
to the string "aaaa" is:
|
||||
|
||||
|
@ -4204,11 +4258,11 @@ MISSING CALLOUTS
|
|||
+2 ^ ^ [bc]
|
||||
No match
|
||||
|
||||
This indicates that when matching [bc] fails, there is no backtracking
|
||||
This indicates that when matching [bc] fails, there is no backtracking
|
||||
into a+ (because it is being treated as a++) and therefore the callouts
|
||||
that would be taken for the backtracks do not occur. You can disable
|
||||
the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
||||
pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In
|
||||
that would be taken for the backtracks do not occur. You can disable
|
||||
the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
||||
pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In
|
||||
this case, the output changes to this:
|
||||
|
||||
--->aaaa
|
||||
|
@ -4225,19 +4279,19 @@ MISSING CALLOUTS
|
|||
Automatic .* anchoring
|
||||
|
||||
By default, an optimization is applied when .* is the first significant
|
||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||
is not set, a match can start only after an internal newline or at the
|
||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||
is not set, a match can start only after an internal newline or at the
|
||||
beginning of the subject, and pcre2_compile() remembers this. If a pat-
|
||||
tern has more than one top-level branch, automatic anchoring occurs if
|
||||
tern has more than one top-level branch, automatic anchoring occurs if
|
||||
all branches are anchorable.
|
||||
|
||||
This optimization is disabled, however, if .* is in an atomic group or
|
||||
This optimization is disabled, however, if .* is in an atomic group or
|
||||
if there is a backreference to the capturing group in which it appears.
|
||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||
ever, the presence of callouts does not affect it.
|
||||
|
||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||
and applied to the string "aa", the pcre2test output is:
|
||||
|
||||
--->aa
|
||||
|
@ -4247,10 +4301,10 @@ MISSING CALLOUTS
|
|||
+2 ^ \d
|
||||
No match
|
||||
|
||||
This shows that all match attempts start at the beginning of the sub-
|
||||
ject. In other words, the pattern is anchored. You can disable this
|
||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||
This shows that all match attempts start at the beginning of the sub-
|
||||
ject. In other words, the pattern is anchored. You can disable this
|
||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||
put changes to:
|
||||
|
||||
--->aa
|
||||
|
@ -4263,42 +4317,42 @@ MISSING CALLOUTS
|
|||
+2 ^ \d
|
||||
No match
|
||||
|
||||
This shows more match attempts, starting at the second subject charac-
|
||||
ter. Another optimization, described in the next section, means that
|
||||
This shows more match attempts, starting at the second subject charac-
|
||||
ter. Another optimization, described in the next section, means that
|
||||
there is no subsequent attempt to match with an empty subject.
|
||||
|
||||
Other optimizations
|
||||
|
||||
Other optimizations that provide fast "no match" results also affect
|
||||
Other optimizations that provide fast "no match" results also affect
|
||||
callouts. For example, if the pattern is
|
||||
|
||||
ab(?C4)cd
|
||||
|
||||
PCRE2 knows that any matching string must contain the letter "d". If
|
||||
the subject string is "abyz", the lack of "d" means that matching
|
||||
doesn't ever start, and the callout is never reached. However, with
|
||||
PCRE2 knows that any matching string must contain the letter "d". If
|
||||
the subject string is "abyz", the lack of "d" means that matching
|
||||
doesn't ever start, and the callout is never reached. However, with
|
||||
"abyd", though the result is still no match, the callout is obeyed.
|
||||
|
||||
For most patterns PCRE2 also knows the minimum length of a matching
|
||||
string, and will immediately give a "no match" return without actually
|
||||
running a match if the subject is not long enough, or, for unanchored
|
||||
For most patterns PCRE2 also knows the minimum length of a matching
|
||||
string, and will immediately give a "no match" return without actually
|
||||
running a match if the subject is not long enough, or, for unanchored
|
||||
patterns, if it has been scanned far enough.
|
||||
|
||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||
that callouts such as the example above are obeyed.
|
||||
|
||||
|
||||
THE CALLOUT INTERFACE
|
||||
|
||||
During matching, when PCRE2 reaches a callout point, if an external
|
||||
function is provided in the match context, it is called. This applies
|
||||
to normal, DFA, and JIT matching. The first argument to the call-
|
||||
During matching, when PCRE2 reaches a callout point, if an external
|
||||
function is provided in the match context, it is called. This applies
|
||||
to normal, DFA, and JIT matching. The first argument to the call-
|
||||
out function is a pointer to a pcre2_callout block. The second argument
|
||||
is the void * callout data that was supplied when the callout was set
|
||||
is the void * callout data that was supplied when the callout was set
|
||||
up by calling pcre2_set_callout() (see the pcre2api documentation). The
|
||||
callout block structure contains the following fields, not necessarily
|
||||
callout block structure contains the following fields, not necessarily
|
||||
in this order:
|
||||
|
||||
uint32_t version;
|
||||
|
@ -4318,118 +4372,118 @@ THE CALLOUT INTERFACE
|
|||
PCRE2_SIZE callout_string_length;
|
||||
PCRE2_SPTR callout_string;
|
||||
|
||||
The version field contains the version number of the block format. The
|
||||
current version is 2; the three callout string fields were added for
|
||||
version 1, and the callout_flags field for version 2. If you are writ-
|
||||
ing an application that might use an earlier release of PCRE2, you
|
||||
should check the version number before accessing any of these fields.
|
||||
The version number will increase in future if more fields are added,
|
||||
The version field contains the version number of the block format. The
|
||||
current version is 2; the three callout string fields were added for
|
||||
version 1, and the callout_flags field for version 2. If you are writ-
|
||||
ing an application that might use an earlier release of PCRE2, you
|
||||
should check the version number before accessing any of these fields.
|
||||
The version number will increase in future if more fields are added,
|
||||
but the intention is never to remove any of the existing fields.
|
||||
|
||||
Fields for numerical callouts
|
||||
|
||||
For a numerical callout, callout_string is NULL, and callout_number
|
||||
contains the number of the callout, in the range 0-255. This is the
|
||||
number that follows (?C for callouts that are part of the pattern; it is
|
||||
For a numerical callout, callout_string is NULL, and callout_number
|
||||
contains the number of the callout, in the range 0-255. This is the
|
||||
number that follows (?C for callouts that are part of the pattern; it is
|
||||
255 for automatically generated callouts.
|
||||
|
||||
Fields for string callouts
|
||||
|
||||
For callouts with string arguments, callout_number is always zero, and
|
||||
callout_string points to the string that is contained within the com-
|
||||
For callouts with string arguments, callout_number is always zero, and
|
||||
callout_string points to the string that is contained within the com-
|
||||
piled pattern. Its length is given by callout_string_length. Duplicated
|
||||
ending delimiters that were present in the original pattern string have
|
||||
been turned into single characters, but there is no other processing of
|
||||
the callout string argument. An additional code unit containing binary
|
||||
zero is present after the string, but is not included in the length.
|
||||
The delimiter that was used to start the string is also stored within
|
||||
the pattern, immediately before the string itself. You can access this
|
||||
the callout string argument. An additional code unit containing binary
|
||||
zero is present after the string, but is not included in the length.
|
||||
The delimiter that was used to start the string is also stored within
|
||||
the pattern, immediately before the string itself. You can access this
|
||||
delimiter as callout_string[-1] if you need it.
|
||||
|
||||
The callout_string_offset field is the code unit offset to the start of
|
||||
the callout argument string within the original pattern string. This is
|
||||
provided for the benefit of applications such as script languages that
|
||||
provided for the benefit of applications such as script languages that
|
||||
might need to report errors in the callout string within the pattern.
|
||||
|
||||
Fields for all callouts
|
||||
|
||||
The remaining fields in the callout block are the same for both kinds
|
||||
The remaining fields in the callout block are the same for both kinds
|
||||
of callout.
|
||||
|
||||
The offset_vector field is a pointer to a vector of capturing offsets
|
||||
The offset_vector field is a pointer to a vector of capturing offsets
|
||||
(the "ovector"). You may read the elements in this vector, but you must
|
||||
not change any of them.
|
||||
|
||||
For calls to pcre2_match(), the offset_vector field is not (since
|
||||
release 10.30) a pointer to the actual ovector that was passed to the
|
||||
matching function in the match data block. Instead it points to an
|
||||
internal ovector of a size large enough to hold all possible captured
|
||||
For calls to pcre2_match(), the offset_vector field is not (since
|
||||
release 10.30) a pointer to the actual ovector that was passed to the
|
||||
matching function in the match data block. Instead it points to an
|
||||
internal ovector of a size large enough to hold all possible captured
|
||||
substrings in the pattern. Note that whenever a recursion or subroutine
|
||||
call within a pattern completes, the capturing state is reset to what
|
||||
call within a pattern completes, the capturing state is reset to what
|
||||
it was before.
|
||||
|
||||
The capture_last field contains the number of the most recently cap-
|
||||
tured substring, and the capture_top field contains one more than the
|
||||
number of the highest numbered captured substring so far. If no sub-
|
||||
strings have yet been captured, the value of capture_last is 0 and the
|
||||
value of capture_top is 1. The values of these fields do not always
|
||||
differ by one; for example, when the callout in the pattern
|
||||
The capture_last field contains the number of the most recently cap-
|
||||
tured substring, and the capture_top field contains one more than the
|
||||
number of the highest numbered captured substring so far. If no sub-
|
||||
strings have yet been captured, the value of capture_last is 0 and the
|
||||
value of capture_top is 1. The values of these fields do not always
|
||||
differ by one; for example, when the callout in the pattern
|
||||
((a)(b))(?C2) is taken, capture_last is 1 but capture_top is 4.
|
||||
|
||||
The contents of ovector[2] to ovector[<capture_top>*2-1] can be
|
||||
The contents of ovector[2] to ovector[<capture_top>*2-1] can be
|
||||
inspected in order to extract substrings that have been matched so far,
|
||||
in the same way as extracting substrings after a match has completed.
|
||||
The values in ovector[0] and ovector[1] are always PCRE2_UNSET because
|
||||
the match is by definition not complete. Substrings that have not been
|
||||
captured but whose numbers are less than capture_top also have both of
|
||||
in the same way as extracting substrings after a match has completed.
|
||||
The values in ovector[0] and ovector[1] are always PCRE2_UNSET because
|
||||
the match is by definition not complete. Substrings that have not been
|
||||
captured but whose numbers are less than capture_top also have both of
|
||||
their ovector slots set to PCRE2_UNSET.
|
||||
|
||||
For DFA matching, the offset_vector field points to the ovector that
|
||||
was passed to the matching function in the match data block for call-
|
||||
For DFA matching, the offset_vector field points to the ovector that
|
||||
was passed to the matching function in the match data block for call-
|
||||
outs at the top level, but to an internal ovector during the processing
|
||||
of pattern recursions, lookarounds, and atomic groups. However, these
|
||||
ovectors hold no useful information because pcre2_dfa_match() does not
|
||||
support substring capturing. The value of capture_top is always 1 and
|
||||
of pattern recursions, lookarounds, and atomic groups. However, these
|
||||
ovectors hold no useful information because pcre2_dfa_match() does not
|
||||
support substring capturing. The value of capture_top is always 1 and
|
||||
the value of capture_last is always 0 for DFA matching.
|
||||
|
||||
The subject and subject_length fields contain copies of the values that
|
||||
were passed to the matching function.
|
||||
|
||||
The start_match field normally contains the offset within the subject
|
||||
at which the current match attempt started. However, if the escape
|
||||
sequence \K has been encountered, this value is changed to reflect the
|
||||
modified starting point. If the pattern is not anchored, the callout
|
||||
The start_match field normally contains the offset within the subject
|
||||
at which the current match attempt started. However, if the escape
|
||||
sequence \K has been encountered, this value is changed to reflect the
|
||||
modified starting point. If the pattern is not anchored, the callout
|
||||
function may be called several times from the same point in the pattern
|
||||
for different starting points in the subject.
|
||||
|
||||
The current_position field contains the offset within the subject of
|
||||
The current_position field contains the offset within the subject of
|
||||
the current match pointer.
|
||||
|
||||
The pattern_position field contains the offset in the pattern string to
|
||||
the next item to be matched.
|
||||
|
||||
The next_item_length field contains the length of the next item to be
|
||||
processed in the pattern string. When the callout is at the end of the
|
||||
pattern, the length is zero. When the callout precedes an opening
|
||||
The next_item_length field contains the length of the next item to be
|
||||
processed in the pattern string. When the callout is at the end of the
|
||||
pattern, the length is zero. When the callout precedes an opening
|
||||
parenthesis, the length includes meta characters that follow the paren-
|
||||
thesis. For example, in a callout before an assertion such as (?=ab)
|
||||
the length is 3. For an alternation bar or a closing parenthesis,
|
||||
the length is one, unless a closing parenthesis is followed by a quan-
|
||||
thesis. For example, in a callout before an assertion such as (?=ab)
|
||||
the length is 3. For an alternation bar or a closing parenthesis,
|
||||
the length is one, unless a closing parenthesis is followed by a quan-
|
||||
tifier, in which case its length is included. (This changed in release
|
||||
10.23. In earlier releases, before an opening parenthesis the length
|
||||
was that of the entire subpattern, and before an alternation bar or a
|
||||
10.23. In earlier releases, before an opening parenthesis the length
|
||||
was that of the entire subpattern, and before an alternation bar or a
|
||||
closing parenthesis the length was zero.)
|
||||
|
||||
The pattern_position and next_item_length fields are intended to help
|
||||
in distinguishing between different automatic callouts, which all have
|
||||
the same callout number. However, they are set for all callouts, and
|
||||
The pattern_position and next_item_length fields are intended to help
|
||||
in distinguishing between different automatic callouts, which all have
|
||||
the same callout number. However, they are set for all callouts, and
|
||||
are used by pcre2test to show the next item to be matched when display-
|
||||
ing callout information.
|
||||
|
||||
In callouts from pcre2_match() the mark field contains a pointer to the
|
||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||
(*THEN) item in the match, or NULL if no such items have been passed.
|
||||
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||
(*THEN) item in the match, or NULL if no such items have been passed.
|
||||
Instances of (*PRUNE) or (*THEN) without a name do not obliterate a
|
||||
previous (*MARK). In callouts from the DFA matching function this field
|
||||
always contains NULL.
|
||||
|
||||
|
@ -4439,25 +4493,25 @@ THE CALLOUT INTERFACE
|
|||
|
||||
PCRE2_CALLOUT_STARTMATCH
|
||||
|
||||
This is set for the first callout after the start of matching for each
|
||||
This is set for the first callout after the start of matching for each
|
||||
new starting position in the subject.
|
||||
|
||||
PCRE2_CALLOUT_BACKTRACK
|
||||
|
||||
This is set if there has been a matching backtrack since the previous
|
||||
callout, or since the start of matching if this is the first callout
|
||||
This is set if there has been a matching backtrack since the previous
|
||||
callout, or since the start of matching if this is the first callout
|
||||
from a pcre2_match() run.
|
||||
|
||||
Both bits are set when a backtrack has caused a "bumpalong" to a new
|
||||
starting position in the subject. Output from pcre2test does not indi-
|
||||
cate the presence of these bits unless the callout_extra modifier is
|
||||
Both bits are set when a backtrack has caused a "bumpalong" to a new
|
||||
starting position in the subject. Output from pcre2test does not indi-
|
||||
cate the presence of these bits unless the callout_extra modifier is
|
||||
set.
|
||||
|
||||
The information in the callout_flags field is provided so that applica-
|
||||
tions can track and tell their users how matching with backtracking is
|
||||
done. This can be useful when trying to optimize patterns, or just to
|
||||
understand how PCRE2 works. There is no support in pcre2_dfa_match()
|
||||
because there is no backtracking in DFA matching, and there is no sup-
|
||||
tions can track and tell their users how matching with backtracking is
|
||||
done. This can be useful when trying to optimize patterns, or just to
|
||||
understand how PCRE2 works. There is no support in pcre2_dfa_match()
|
||||
because there is no backtracking in DFA matching, and there is no sup-
|
||||
port in JIT because JIT is all about maximizing matching performance.
|
||||
In both these cases the callout_flags field is always zero.
|
||||
|
||||
|
@ -4465,16 +4519,16 @@ THE CALLOUT INTERFACE
|
|||
RETURN VALUES FROM CALLOUTS
|
||||
|
||||
The external callout function returns an integer to PCRE2. If the value
|
||||
is zero, matching proceeds as normal. If the value is greater than
|
||||
zero, matching fails at the current point, but the testing of other
|
||||
is zero, matching proceeds as normal. If the value is greater than
|
||||
zero, matching fails at the current point, but the testing of other
|
||||
matching possibilities goes ahead, just as if a lookahead assertion had
|
||||
failed. If the value is less than zero, the match is abandoned, and the
|
||||
matching function returns the negative value.
|
||||
|
||||
Negative values should normally be chosen from the set of
|
||||
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
||||
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
||||
reserved for use by callout functions; it will never be used by PCRE2
|
||||
Negative values should normally be chosen from the set of
|
||||
PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a
|
||||
standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is
|
||||
reserved for use by callout functions; it will never be used by PCRE2
|
||||
itself.
|
||||
|
||||
|
||||
|
@ -4485,14 +4539,14 @@ CALLOUT ENUMERATION
|
|||
void *user_data);
|
||||
|
||||
A script language that supports the use of string arguments in callouts
|
||||
might like to scan all the callouts in a pattern before running the
|
||||
might like to scan all the callouts in a pattern before running the
|
||||
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||
argument is a pointer to a compiled pattern, the second points to a
|
||||
callback function, and the third is arbitrary user data. The callback
|
||||
function is called for every callout in the pattern in the order in
|
||||
argument is a pointer to a compiled pattern, the second points to a
|
||||
callback function, and the third is arbitrary user data. The callback
|
||||
function is called for every callout in the pattern in the order in
|
||||
which they appear. Its first argument is a pointer to a callout enumer-
|
||||
ation block, and its second argument is the user_data value that was
|
||||
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
||||
ation block, and its second argument is the user_data value that was
|
||||
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
||||
lowing fields:
|
||||
|
||||
version Block version number
|
||||
|
@ -4503,17 +4557,17 @@ CALLOUT ENUMERATION
|
|||
callout_string_length Length of callout string
|
||||
callout_string Points to callout string or is NULL
|
||||
|
||||
The version number is currently 0. It will increase if new fields are
|
||||
ever added to the block. The remaining fields are the same as their
|
||||
namesakes in the pcre2_callout block that is used for callouts during
|
||||
The version number is currently 0. It will increase if new fields are
|
||||
ever added to the block. The remaining fields are the same as their
|
||||
namesakes in the pcre2_callout block that is used for callouts during
|
||||
matching, as described above.
|
||||
|
||||
Note that the value of pattern_position is unique for each callout.
|
||||
However, if a callout occurs inside a group that is quantified with a
|
||||
Note that the value of pattern_position is unique for each callout.
|
||||
However, if a callout occurs inside a group that is quantified with a
|
||||
non-zero minimum or a fixed maximum, the group is replicated inside the
|
||||
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
||||
as if it were /(a)(a)/. This means that the callout will be enumerated
|
||||
more than once, but with the same value for pattern_position in each
|
||||
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
||||
as if it were /(a)(a)/. This means that the callout will be enumerated
|
||||
more than once, but with the same value for pattern_position in each
|
||||
case.
|
||||
|
||||
The callback function should normally return zero. If it returns a non-
|
||||
|
@ -4530,7 +4584,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 26 April 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
.TH PCRE2_SET_SUBSTITUTE_CALLOUT 3 "17 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function sets the substitute callout fields in a match context (the first
|
||||
argument). The second argument specifies a callout function, and the third
|
||||
argument is an opaque data item that is passed to it. The result of this
|
||||
function is always zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
107
doc/pcre2api.3
107
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "07 September 2018" "PCRE2 10.32"
|
||||
.TH PCRE2API 3 "18 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -123,6 +123,10 @@ document for an overview of all the PCRE2 documentation.
|
|||
.B " int (*\fIcallout_function\fP)(pcre2_callout_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.sp
|
||||
|
@ -847,7 +851,7 @@ PCRE2_ERROR_BADDATA if invalid data is detected.
|
|||
.B " void *\fIcallout_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
This sets up a "callout" function for PCRE2 to call at specified points
|
||||
This sets up a callout function for PCRE2 to call at specified points
|
||||
during a matching operation. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcre2callout\fP
|
||||
|
@ -855,6 +859,20 @@ during a matching operation. Details are given in the
|
|||
documentation.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
This sets up a callout function for PCRE2 to call after each substitution
|
||||
made by \fBpcre2_substitute()\fP. Details are given in the section entitled
|
||||
"Creating a new string with substitutions"
|
||||
.\" HTML <a href="#substitutions">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_offset_limit(pcre2_match_context *\fImcontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.fi
|
||||
|
@ -3171,6 +3189,7 @@ numbers. For this reason, the use of different names for subpatterns of the
|
|||
same number causes an error at compile time.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="substitutions"></a>
|
||||
.SH "CREATING A NEW STRING WITH SUBSTITUTIONS"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -3179,19 +3198,22 @@ same number causes an error at compile time.
|
|||
.B " PCRE2_SIZE \fIlength\fP, PCRE2_SIZE \fIstartoffset\fP,"
|
||||
.B " uint32_t \fIoptions\fP, pcre2_match_data *\fImatch_data\fP,"
|
||||
.B " pcre2_match_context *\fImcontext\fP, PCRE2_SPTR \fIreplacement\fP,"
|
||||
.B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\zfP,"
|
||||
.B " PCRE2_SIZE \fIrlength\fP, PCRE2_UCHAR *\fIoutputbuffer\fP,"
|
||||
.B " PCRE2_SIZE *\fIoutlengthptr\fP);"
|
||||
.fi
|
||||
.P
|
||||
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
||||
string in \fIoutputbuffer\fP, replacing the part that was matched with the
|
||||
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
which a \eK item in a lookahead in the pattern causes the match to end before
|
||||
it starts are not supported, and give rise to an error return. For global
|
||||
replacements, matches in which \eK in a lookbehind causes the match to start
|
||||
earlier than the point that was reached in the previous iteration are also not
|
||||
supported.
|
||||
string in \fIoutputbuffer\fP, replacing one or more parts that were matched
|
||||
with the \fIreplacement\fP string, whose length is supplied in \fIrlength\fP.
|
||||
This can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||
The default is to perform just one replacement, but there is an option that
|
||||
requests multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below for details).
|
||||
.P
|
||||
Matches in which a \eK item in a lookahead in the pattern causes the match to
|
||||
end before it starts are not supported, and give rise to an error return. For
|
||||
global replacements, matches in which \eK in a lookbehind causes the match to
|
||||
start earlier than the point that was reached in the previous iteration are
|
||||
also not supported.
|
||||
.P
|
||||
The first seven arguments of \fBpcre2_substitute()\fP are the same as for
|
||||
\fBpcre2_match()\fP, except that the partial matching options are not
|
||||
|
@ -3201,9 +3223,9 @@ functions from the match context, if provided, or else those that were used to
|
|||
allocate memory for the compiled code.
|
||||
.P
|
||||
If an external \fImatch_data\fP block is provided, its contents afterwards
|
||||
are those set by the final call to \fBpcre2_match()\fP, which will have
|
||||
ended in a matching error. The contents of the ovector within the match data
|
||||
block may or may not have been changed.
|
||||
are those set by the final call to \fBpcre2_match()\fP. For global changes,
|
||||
this will have ended in a matching error. The contents of the ovector within
|
||||
the match data block may or may not have been changed.
|
||||
.P
|
||||
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful, the
|
||||
|
@ -3224,12 +3246,12 @@ length is in code units, not bytes.
|
|||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups or (*MARK), (*PRUNE), or (*THEN) items in the
|
||||
pattern. The following forms are always recognized:
|
||||
characters from capturing groups or names from (*MARK) or other control verbs
|
||||
in the pattern. The following forms are always recognized:
|
||||
.sp
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert a (*MARK), (*PRUNE), or (*THEN) name
|
||||
$*MARK or ${*MARK} insert a control verb name
|
||||
.sp
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
|
@ -3237,12 +3259,13 @@ number or name. The number may be zero to include the entire matched string.
|
|||
For example, if the pattern a(b)c is matched with "=abc=" and the replacement
|
||||
string "+$1$0$1+", the result is "=+babcb+=".
|
||||
.P
|
||||
$*MARK inserts the name from the last encountered (*MARK), (*PRUNE), or (*THEN)
|
||||
on the matching path that has a name. (*MARK) must always include a name, but
|
||||
(*PRUNE) and (*THEN) need not. For example, in the case of (*MARK:A)(*PRUNE)
|
||||
the name inserted is "A", but for (*MARK:A)(*PRUNE:B) the relevant name is "B".
|
||||
This facility can be used to perform simple simultaneous substitutions, as this
|
||||
\fBpcre2test\fP example shows:
|
||||
$*MARK inserts the name from the last encountered (*ACCEPT), (*COMMIT),
|
||||
(*MARK), (*PRUNE), or (*THEN) on the matching path that has a name. (*MARK)
|
||||
must always include a name, but the other verbs need not. For example, in
|
||||
the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for
|
||||
(*MARK:A)(*PRUNE:B) the relevant name is "B". This facility can be used to
|
||||
perform simple simultaneous substitutions, as this \fBpcre2test\fP example
|
||||
shows:
|
||||
.sp
|
||||
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
|
@ -3388,6 +3411,42 @@ above).
|
|||
.\"
|
||||
.
|
||||
.
|
||||
.SS "Substitution callouts"
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_substitute_callout(pcre2_match_context *\fImcontext\fP,
|
||||
.B " void (*\fIcallout_function\fP)(pcre2_substitute_callout_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
The \fBpcre2_set_substitute_callout()\fP function can be used to specify a
|
||||
callout function for \fBpcre2_substitute()\fP. This information is passed in
|
||||
a match context. The callout function is called after each substitution. It is
|
||||
not called for simulated substitutions that happen as a result of the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. A callout function should not return
|
||||
any value.
|
||||
.P
|
||||
The first argument of the callout function is a pointer to a substitute callout
|
||||
block structure, which contains the following fields, not necessarily in this
|
||||
order:
|
||||
.sp
|
||||
uint32_t \fIversion\fP;
|
||||
PCRE2_SIZE \fIinput_offsets[2]\fP;
|
||||
PCRE2_SIZE \fIoutput_offsets[2]\fP;
|
||||
.sp
|
||||
The \fIversion\fP field contains the version number of the block format. The
|
||||
current version is 0. The version number will increase in future if more fields
|
||||
are added, but the intention is never to remove any of the existing fields.
|
||||
.P
|
||||
The \fIinput_offsets\fP vector contains the code unit offsets in the input
|
||||
string of the matched substring, and the \fIoutput_offsets\fP vector contains
|
||||
the offsets of the replacement in the output string.
|
||||
.P
|
||||
The second argument of the callout function is the value passed as
|
||||
\fIcallout_data\fP when the function was registered.
|
||||
.
|
||||
.
|
||||
.SH "DUPLICATE SUBPATTERN NAMES"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -3670,6 +3729,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 07 September 2018
|
||||
Last updated: 18 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2CALLOUT 3 "26 April 2018" "PCRE2 10.32"
|
||||
.TH PCRE2CALLOUT 3 "17 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -27,6 +27,15 @@ a match context (see \fBpcre2_set_callout()\fP in the
|
|||
.\"
|
||||
documentation).
|
||||
.P
|
||||
When using the \fBpcre2_substitute()\fP function, an additional callout feature
|
||||
is available. This does a callout after each change to the subject string and
|
||||
is described in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
documentation; the rest of this document is concerned with callouts during
|
||||
pattern matching.
|
||||
.P
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||
function is to be called. Different callout points can be identified by putting
|
||||
a number less than 256 after the letter C. The default value is zero.
|
||||
|
@ -443,6 +452,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 26 April 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "15 September 2018" "PCRE 10.33"
|
||||
.TH PCRE2TEST 1 "17 September 2018" "PCRE 10.33"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -1011,6 +1011,7 @@ process.
|
|||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1185,6 +1186,7 @@ pattern.
|
|||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1271,7 +1273,7 @@ elements are the only ones that should be set. After a DFA match, the amount of
|
|||
ovector that is used depends on the number of matches that were found.
|
||||
.
|
||||
.
|
||||
.SS "Testing callouts"
|
||||
.SS "Testing pattern callouts"
|
||||
.rs
|
||||
.sp
|
||||
A callout function is supplied when \fBpcre2test\fP calls the library matching
|
||||
|
@ -1282,6 +1284,12 @@ controlled by various modifiers listed above whose names begin with
|
|||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
Testing callouts from \fBpcre2_substitute()\fP is described separately in
|
||||
"Testing the substitution function"
|
||||
.\" HTML <a href="#substitution">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
.
|
||||
.
|
||||
.SS "Finding all matches in a string"
|
||||
|
@ -1332,6 +1340,7 @@ parentheses after each substring, followed by the name when the extraction was
|
|||
by name.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="substitution"></a>
|
||||
.SS "Testing the substitution function"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -1367,6 +1376,16 @@ simple example of a substitution test:
|
|||
=abc=abc=\e=global
|
||||
2: =xxx=xxx=
|
||||
.sp
|
||||
If the \fBsubstitute_callout\fP modifier is set, a substitution callout
|
||||
function is set up. When it is called (after each substitution), the offsets in
|
||||
the input and output strings are output. For example:
|
||||
.sp
|
||||
/abc/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
Old 0 3 New 0 5
|
||||
Old 6 9 New 8 13
|
||||
2: <abc>def<abc>pqr
|
||||
.sp
|
||||
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||
easy to test for buffer overflow, if the replacement string starts with a
|
||||
|
@ -1384,10 +1403,10 @@ The default action of \fBpcre2_substitute()\fP is to return
|
|||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||
\fBsubstitute_overflow_length\fP modifier), \fBpcre2_substitute()\fP continues
|
||||
to go through the motions of matching and substituting, in order to compute the
|
||||
size of buffer that is required. When this happens, \fBpcre2test\fP shows the
|
||||
required buffer length (which includes space for the trailing zero) as part of
|
||||
the error message. For example:
|
||||
to go through the motions of matching and substituting (but not doing any
|
||||
callouts), in order to compute the size of buffer that is required. When this
|
||||
happens, \fBpcre2test\fP shows the required buffer length (which includes space
|
||||
for the trailing zero) as part of the error message. For example:
|
||||
.sp
|
||||
/abc/substitute_overflow_length
|
||||
123abc123\e=replace=[9]XYZ
|
||||
|
@ -2002,6 +2021,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 15 September 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -929,6 +929,7 @@ PATTERN MODIFIERS
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text
|
||||
altglobal alternative global matching
|
||||
/g global global matching
|
||||
|
@ -936,6 +937,7 @@ PATTERN MODIFIERS
|
|||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1057,6 +1059,7 @@ SUBJECT MODIFIERS
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allvector show the entire ovector
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
|
@ -1086,6 +1089,7 @@ SUBJECT MODIFIERS
|
|||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_callout use substitution callouts
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
|
@ -1150,76 +1154,95 @@ SUBJECT MODIFIERS
|
|||
the highest one actually used in the match are output (corresponding to
|
||||
the return code from pcre2_match()). Groups that did not take part in
|
||||
the match are output as "<unset>". This modifier is not relevant for
|
||||
DFA matching (which does no capturing); it is ignored, with a warning
|
||||
message, if present.
|
||||
DFA matching (which does no capturing) and does not apply when replace
|
||||
is specified; it is ignored, with a warning message, if present.
|
||||
|
||||
Testing callouts
|
||||
Showing the entire ovector, for all outcomes
|
||||
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. Its behaviour can be
|
||||
controlled by various modifiers listed above whose names begin with
|
||||
callout_. Details are given in the section entitled "Callouts" below.
|
||||
The allvector modifier requests that the entire ovector be shown, what-
|
||||
ever the outcome of the match. Compare allcaptures, which shows only up
|
||||
to the maximum number of capture groups for the pattern, and then only
|
||||
for a successful complete non-DFA match. This modifier, which acts
|
||||
after any match result, and also for DFA matching, provides a means of
|
||||
checking that there are no unexpected modifications to ovector fields.
|
||||
Before each match attempt, the ovector is filled with a special value,
|
||||
and if this is found in both elements of a capturing pair,
|
||||
"<unchanged>" is output. After a successful match, this applies to all
|
||||
groups after the maximum capture group for the pattern. In other cases
|
||||
it applies to the entire ovector. After a partial match, the first two
|
||||
elements are the only ones that should be set. After a DFA match, the
|
||||
amount of ovector that is used depends on the number of matches that
|
||||
were found.
|
||||
|
||||
Testing pattern callouts
|
||||
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. Its behaviour can be
|
||||
controlled by various modifiers listed above whose names begin with
|
||||
callout_. Details are given in the section entitled "Callouts" below.
|
||||
Testing callouts from pcre2_substitute() is described separately in
|
||||
"Testing the substitution function" below.
|
||||
|
||||
Finding all matches in a string
|
||||
|
||||
Searching for all possible matches within a subject can be requested by
|
||||
the global or altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
the global or altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
does), whereas the latter passes over a shortened subject. This makes a
|
||||
difference to the matching process if the pattern begins with a lookbe-
|
||||
hind assertion (including \b or \B).
|
||||
|
||||
If an empty string is matched, the next match is done with the
|
||||
If an empty string is matched, the next match is done with the
|
||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||
for another, non-empty, match at the same point in the subject. If this
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
advance of two characters occurs.
|
||||
|
||||
Testing substring extraction functions
|
||||
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
||||
given more than once, and each can specify a group name or number, for
|
||||
given more than once, and each can specify a group name or number, for
|
||||
example:
|
||||
|
||||
abcd\=copy=1,copy=3,get=G1
|
||||
|
||||
If the #subject command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all num-
|
||||
If the #subject command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all num-
|
||||
bered groups and an empty name to cancel all named groups.
|
||||
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
all captured substrings.
|
||||
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
function) is given in parentheses after each substring, followed by the
|
||||
name when the extraction was by name.
|
||||
|
||||
Testing the substitution function
|
||||
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Note that replacement
|
||||
strings cannot contain commas, because a comma signifies the end of a
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Note that replacement
|
||||
strings cannot contain commas, because a comma signifies the end of a
|
||||
modifier. This is not thought to be an issue in a test program.
|
||||
|
||||
Unlike subject strings, pcre2test does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to
|
||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||
a UTF string of the appropriate code unit width. If it is not a valid
|
||||
UTF-8 string, the individual code units are copied directly. This pro-
|
||||
Unlike subject strings, pcre2test does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to
|
||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||
a UTF string of the appropriate code unit width. If it is not a valid
|
||||
UTF-8 string, the individual code units are copied directly. This pro-
|
||||
vides a means of passing an invalid UTF-8 string for testing purposes.
|
||||
|
||||
The following modifiers set options (in addition to the normal match
|
||||
The following modifiers set options (in addition to the normal match
|
||||
options) for pcre2_substitute():
|
||||
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
|
@ -1229,8 +1252,8 @@ SUBJECT MODIFIERS
|
|||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
|
||||
After a successful substitution, the modified string is output, pre-
|
||||
ceded by the number of replacements. This may be zero if there were no
|
||||
After a successful substitution, the modified string is output, pre-
|
||||
ceded by the number of replacements. This may be zero if there were no
|
||||
matches. Here is a simple example of a substitution test:
|
||||
|
||||
/abc/replace=xxx
|
||||
|
@ -1239,12 +1262,22 @@ SUBJECT MODIFIERS
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
|
||||
Subject and replacement strings should be kept relatively short (fewer
|
||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||
used. To make it easy to test for buffer overflow, if the replacement
|
||||
string starts with a number in square brackets, that number is passed
|
||||
to pcre2_substitute() as the size of the output buffer, with the
|
||||
replacement string starting at the next character. Here is an example
|
||||
If the substitute_callout modifier is set, a substitution callout func-
|
||||
tion is set up. When it is called (after each substitution), the off-
|
||||
sets in the input and output strings are output. For example:
|
||||
|
||||
/abc/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
Old 0 3 New 0 5
|
||||
Old 6 9 New 8 13
|
||||
2: <abc>def<abc>pqr
|
||||
|
||||
Subject and replacement strings should be kept relatively short (fewer
|
||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||
used. To make it easy to test for buffer overflow, if the replacement
|
||||
string starts with a number in square brackets, that number is passed
|
||||
to pcre2_substitute() as the size of the output buffer, with the
|
||||
replacement string starting at the next character. Here is an example
|
||||
that tests the edge case:
|
||||
|
||||
/abc/
|
||||
|
@ -1253,14 +1286,15 @@ SUBJECT MODIFIERS
|
|||
123abc123\=replace=[9]XYZ
|
||||
Failed: error -47: no more memory
|
||||
|
||||
The default action of pcre2_substitute() is to return
|
||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
||||
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
||||
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
||||
through the motions of matching and substituting, in order to compute
|
||||
the size of buffer that is required. When this happens, pcre2test shows
|
||||
the required buffer length (which includes space for the trailing zero)
|
||||
as part of the error message. For example:
|
||||
The default action of pcre2_substitute() is to return
|
||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
||||
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
||||
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
||||
through the motions of matching and substituting (but not doing any
|
||||
callouts), in order to compute the size of buffer that is required.
|
||||
When this happens, pcre2test shows the required buffer length (which
|
||||
includes space for the trailing zero) as part of the error message. For
|
||||
example:
|
||||
|
||||
/abc/substitute_overflow_length
|
||||
123abc123\=replace=[9]XYZ
|
||||
|
@ -1818,5 +1852,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 21 July 2018
|
||||
Last updated: 17 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
|
|
|
@ -505,10 +505,10 @@ typedef struct pcre2_real_jit_stack pcre2_jit_stack; \
|
|||
typedef pcre2_jit_stack *(*pcre2_jit_callback)(void *);
|
||||
|
||||
|
||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
||||
structure so that new fields can be added on the end in future versions,
|
||||
without changing the API of the function, thereby allowing old clients to work
|
||||
without modification. Define the generic version in a macro; the width-specific
|
||||
/* The structures for passing out data via callout functions. We use structures
|
||||
so that new fields can be added on the end in future versions, without changing
|
||||
the API of the function, thereby allowing old clients to work without
|
||||
modification. Define the generic versions in a macro; the width-specific
|
||||
versions are generated from this macro below. */
|
||||
|
||||
/* Flags for the callout_flags field. These are cleared after a callout. */
|
||||
|
@ -550,7 +550,15 @@ typedef struct pcre2_callout_enumerate_block { \
|
|||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_callout_enumerate_block;
|
||||
} pcre2_callout_enumerate_block; \
|
||||
\
|
||||
typedef struct pcre2_substitute_callout_block { \
|
||||
uint32_t version; /* Identifies version of block */ \
|
||||
/* ------------------------ Version 0 ------------------------------- */ \
|
||||
PCRE2_SIZE input_offsets[2]; /* Matched portion of the input */ \
|
||||
PCRE2_SIZE output_offsets[2]; /* Changed portion of the output */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_substitute_callout_block;
|
||||
|
||||
|
||||
/* List the generic forms of all other functions in macros, which will be
|
||||
|
@ -605,6 +613,9 @@ PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
|||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_callout(pcre2_match_context *, \
|
||||
int (*)(pcre2_callout_block *, void *), void *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_substitute_callout(pcre2_match_context *, \
|
||||
void (*)(pcre2_substitute_callout_block *, void *), void *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_depth_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
|
@ -808,6 +819,7 @@ pcre2_compile are called by application code. */
|
|||
|
||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
||||
#define pcre2_substitute_callout_block PCRE2_SUFFIX(pcre2_substitute_callout_block_)
|
||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||
#define pcre2_convert_context PCRE2_SUFFIX(pcre2_convert_context_)
|
||||
|
@ -873,6 +885,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
|
||||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
|
||||
#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_)
|
||||
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
||||
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
||||
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
|||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2017 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -163,11 +163,13 @@ when no context is supplied to a match function. */
|
|||
const pcre2_match_context PRIV(default_match_context) = {
|
||||
{ default_malloc, default_free, NULL },
|
||||
#ifdef SUPPORT_JIT
|
||||
NULL,
|
||||
NULL,
|
||||
NULL, /* JIT callback */
|
||||
NULL, /* JIT callback data */
|
||||
#endif
|
||||
NULL,
|
||||
NULL,
|
||||
NULL, /* Callout function */
|
||||
NULL, /* Callout data */
|
||||
NULL, /* Substitute callout function */
|
||||
NULL, /* Substitute callout data */
|
||||
PCRE2_UNSET, /* Offset limit */
|
||||
HEAP_LIMIT,
|
||||
MATCH_LIMIT,
|
||||
|
@ -403,6 +405,16 @@ mcontext->callout_data = callout_data;
|
|||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_substitute_callout(pcre2_match_context *mcontext,
|
||||
void (*substitute_callout)(pcre2_substitute_callout_block *, void *),
|
||||
void *substitute_callout_data)
|
||||
{
|
||||
mcontext->substitute_callout = substitute_callout;
|
||||
mcontext->substitute_callout_data = substitute_callout_data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_heap_limit(pcre2_match_context *mcontext, uint32_t limit)
|
||||
{
|
||||
|
|
|
@ -585,6 +585,8 @@ typedef struct pcre2_real_match_context {
|
|||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
void *callout_data;
|
||||
void (*substitute_callout)(pcre2_substitute_callout_block *, void *);
|
||||
void *substitute_callout_data;
|
||||
PCRE2_SIZE offset_limit;
|
||||
uint32_t heap_limit;
|
||||
uint32_t match_limit;
|
||||
|
|
|
@ -239,7 +239,9 @@ PCRE2_SIZE extra_needed = 0;
|
|||
PCRE2_SIZE buff_offset, buff_length, lengthleft, fraglength;
|
||||
PCRE2_SIZE *ovector;
|
||||
PCRE2_SIZE ovecsave[3];
|
||||
pcre2_substitute_callout_block scb;
|
||||
|
||||
scb.version = 0;
|
||||
buff_offset = 0;
|
||||
lengthleft = buff_length = *blength;
|
||||
*blength = PCRE2_UNSET;
|
||||
|
@ -391,6 +393,11 @@ do
|
|||
goto EXIT;
|
||||
}
|
||||
|
||||
/* Save the match point for a possible callout */
|
||||
|
||||
scb.input_offsets[0] = ovector[0];
|
||||
scb.input_offsets[1] = ovector[1];
|
||||
|
||||
/* Count substitutions with a paranoid check for integer overflow; surely no
|
||||
real call to this function would ever hit this! */
|
||||
|
||||
|
@ -401,11 +408,13 @@ do
|
|||
}
|
||||
subs++;
|
||||
|
||||
/* Copy the text leading up to the match. */
|
||||
/* Copy the text leading up to the match, and remember where the insert
|
||||
begins. */
|
||||
|
||||
if (rc == 0) rc = ovector_count;
|
||||
fraglength = ovector[0] - start_offset;
|
||||
CHECKMEMCPY(subject + start_offset, fraglength);
|
||||
scb.output_offsets[0] = buff_offset;
|
||||
|
||||
/* Process the replacement string. Literal mode is set by \Q, but only in
|
||||
extended mode when backslashes are being interpreted. In extended mode we
|
||||
|
@ -821,10 +830,19 @@ do
|
|||
} /* End handling a literal code unit */
|
||||
} /* End of loop for scanning the replacement. */
|
||||
|
||||
/* The replacement has been copied to the output. Save the details of this
|
||||
match. See above for how this data is used. If we matched an empty string, do
|
||||
the magic for global matches. Finally, update the start offset to point to
|
||||
the rest of the subject string. */
|
||||
/* The replacement has been copied to the output, or its size has been
|
||||
remembered. Do the callout if there is one and we have done an actual
|
||||
replacement. */
|
||||
|
||||
if (!overflowed && mcontext->substitute_callout != NULL)
|
||||
{
|
||||
scb.output_offsets[1] = buff_offset;
|
||||
mcontext->substitute_callout(&scb, mcontext->substitute_callout_data);
|
||||
}
|
||||
|
||||
/* Save the details of this match. See above for how this data is used. If we
|
||||
matched an empty string, do the magic for global matches. Finally, update the
|
||||
start offset to point to the rest of the subject string. */
|
||||
|
||||
ovecsave[0] = ovector[0];
|
||||
ovecsave[1] = ovector[1];
|
||||
|
|
100
src/pcre2test.c
100
src/pcre2test.c
|
@ -484,14 +484,15 @@ so many of them that they are split into two fields. */
|
|||
|
||||
/* Second control word */
|
||||
|
||||
#define CTL2_SUBSTITUTE_EXTENDED 0x00000001u
|
||||
#define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000002u
|
||||
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u
|
||||
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u
|
||||
#define CTL2_SUBJECT_LITERAL 0x00000010u
|
||||
#define CTL2_CALLOUT_NO_WHERE 0x00000020u
|
||||
#define CTL2_CALLOUT_EXTRA 0x00000040u
|
||||
#define CTL2_ALLVECTOR 0x00000080u
|
||||
#define CTL2_SUBSTITUTE_CALLOUT 0x00000001u
|
||||
#define CTL2_SUBSTITUTE_EXTENDED 0x00000002u
|
||||
#define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000004u
|
||||
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000008u
|
||||
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000010u
|
||||
#define CTL2_SUBJECT_LITERAL 0x00000020u
|
||||
#define CTL2_CALLOUT_NO_WHERE 0x00000040u
|
||||
#define CTL2_CALLOUT_EXTRA 0x00000080u
|
||||
#define CTL2_ALLVECTOR 0x00000100u
|
||||
|
||||
#define CTL2_NL_SET 0x40000000u /* Informational */
|
||||
#define CTL2_BSR_SET 0x80000000u /* Informational */
|
||||
|
@ -511,7 +512,8 @@ different things in the two cases. */
|
|||
CTL_STARTCHAR|\
|
||||
CTL_UTF8_INPUT)
|
||||
|
||||
#define CTL2_ALLPD (CTL2_SUBSTITUTE_EXTENDED|\
|
||||
#define CTL2_ALLPD (CTL2_SUBSTITUTE_CALLOUT|\
|
||||
CTL2_SUBSTITUTE_EXTENDED|\
|
||||
CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\
|
||||
CTL2_SUBSTITUTE_UNKNOWN_UNSET|\
|
||||
CTL2_SUBSTITUTE_UNSET_EMPTY|\
|
||||
|
@ -690,6 +692,7 @@ static modstruct modlist[] = {
|
|||
{ "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) },
|
||||
{ "startoffset", MOD_DAT, MOD_INT, 0, DO(offset) },
|
||||
{ "subject_literal", MOD_PATP, MOD_CTL, CTL2_SUBJECT_LITERAL, PO(control2) },
|
||||
{ "substitute_callout", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_CALLOUT, PO(control2) },
|
||||
{ "substitute_extended", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_EXTENDED, PO(control2) },
|
||||
{ "substitute_overflow_length", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_OVERFLOW_LENGTH, PO(control2) },
|
||||
{ "substitute_unknown_unset", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNKNOWN_UNSET, PO(control2) },
|
||||
|
@ -1355,6 +1358,17 @@ are supported. */
|
|||
else \
|
||||
pcre2_set_parens_nest_limit_32(G(a,32),b)
|
||||
|
||||
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
pcre2_set_substitute_callout_8(G(a,8), \
|
||||
(void (*)(pcre2_substitute_callout_block_8 *, void *))b,c); \
|
||||
else if (test_mode == PCRE16_MODE) \
|
||||
pcre2_set_substitute_callout_16(G(a,16), \
|
||||
(void (*)(pcre2_substitute_callout_block_16 *, void *))b,c); \
|
||||
else \
|
||||
pcre2_set_substitute_callout_32(G(a,32), \
|
||||
(void (*)(pcre2_substitute_callout_block_32 *, void *))b,c)
|
||||
|
||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||
|
@ -1824,6 +1838,14 @@ the three different cases. */
|
|||
else \
|
||||
G(pcre2_set_parens_nest_limit_,BITTWO)(G(a,BITTWO),b)
|
||||
|
||||
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
G(pcre2_set_substitute_callout_,BITONE)(G(a,BITONE), \
|
||||
(void (*)(G(pcre2_substitute_callout_block_,BITONE) *, void *))b,c); \
|
||||
else \
|
||||
G(pcre2_set_substitute_callout_,BITTWO)(G(a,BITTWO), \
|
||||
(void (*)(G(pcre2_substitute_callout_block_,BITTWO) *, void *))b,c)
|
||||
|
||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
a = G(pcre2_substitute_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \
|
||||
|
@ -2025,6 +2047,9 @@ the three different cases. */
|
|||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_8(G(a,8),b)
|
||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_8(G(a,8),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b)
|
||||
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||
pcre2_set_substitute_callout_8(G(a,8), \
|
||||
(void (*)(pcre2_substitute_callout_block_8 *, void *))b,c)
|
||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||
a = pcre2_substitute_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||
(PCRE2_SPTR8)i,j,(PCRE2_UCHAR8 *)k,l)
|
||||
|
@ -2129,6 +2154,9 @@ the three different cases. */
|
|||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_16(G(a,16),b)
|
||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_16(G(a,16),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b)
|
||||
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||
pcre2_set_substitute_callout_16(G(a,16), \
|
||||
(void (*)(pcre2_substitute_callout_block_16 *, void *))b,c)
|
||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||
a = pcre2_substitute_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16), \
|
||||
(PCRE2_SPTR16)i,j,(PCRE2_UCHAR16 *)k,l)
|
||||
|
@ -2221,7 +2249,7 @@ the three different cases. */
|
|||
#define PCRE2_SERIALIZE_GET_NUMBER_OF_CODES(r,a) \
|
||||
r = pcre2_serialize_get_number_of_codes_32(a)
|
||||
#define PCRE2_SET_CALLOUT(a,b,c) \
|
||||
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c);
|
||||
pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c)
|
||||
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
|
||||
#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \
|
||||
pcre2_set_compile_recursion_guard_32(G(a,32),b,c)
|
||||
|
@ -2233,6 +2261,9 @@ the three different cases. */
|
|||
#define PCRE2_SET_MAX_PATTERN_LENGTH(a,b) pcre2_set_max_pattern_length_32(G(a,32),b)
|
||||
#define PCRE2_SET_OFFSET_LIMIT(a,b) pcre2_set_offset_limit_32(G(a,32),b)
|
||||
#define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b)
|
||||
#define PCRE2_SET_SUBSTITUTE_CALLOUT(a,b,c) \
|
||||
pcre2_set_substitute_callout_32(G(a,32), \
|
||||
(void (*)(pcre2_substitute_callout_block_32 *, void *))b,c)
|
||||
#define PCRE2_SUBSTITUTE(a,b,c,d,e,f,g,h,i,j,k,l) \
|
||||
a = pcre2_substitute_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32), \
|
||||
(PCRE2_SPTR32)i,j,(PCRE2_UCHAR32 *)k,l)
|
||||
|
@ -4022,7 +4053,7 @@ Returns: nothing
|
|||
static void
|
||||
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
||||
{
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||
|
@ -4058,6 +4089,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
|||
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
||||
((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "",
|
||||
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_CALLOUT) != 0)? " substitute_callout" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) != 0)? " substitute_unknown_unset" : "",
|
||||
|
@ -5896,6 +5928,35 @@ return capcount;
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Substitute callout function *
|
||||
*************************************************/
|
||||
|
||||
/* Called from pcre2_substitute() when the substitute_callout modifier is set.
|
||||
Print out the data that is passed back. The substitute callout block is
|
||||
identical for all code unit widths, so we just pick one.
|
||||
|
||||
Arguments:
|
||||
scb pointer to substitute callout block
|
||||
data_ptr callout data
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
substitute_callout_function(pcre2_substitute_callout_block_8 *scb,
|
||||
void *data_ptr)
|
||||
{
|
||||
(void)data_ptr; /* Not used */
|
||||
fprintf(outfile, "Old %" SIZ_FORM " %" SIZ_FORM " New %" SIZ_FORM
|
||||
" %" SIZ_FORM "\n",
|
||||
SIZ_CAST scb->input_offsets[0],
|
||||
SIZ_CAST scb->input_offsets[1],
|
||||
SIZ_CAST scb->output_offsets[0],
|
||||
SIZ_CAST scb->output_offsets[1]);
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Callout function *
|
||||
*************************************************/
|
||||
|
@ -5907,8 +5968,11 @@ callout block for different code unit widths are that the pointers to the
|
|||
subject, the most recent MARK, and a callout argument string point to strings
|
||||
of the appropriate width. Casts can be used to deal with this.
|
||||
|
||||
Argument: a pointer to a callout block
|
||||
Return:
|
||||
Arguments:
|
||||
cb a pointer to a callout block
|
||||
callout_data_ptr the provided callout data
|
||||
|
||||
Returns: 0 or 1 or an error, as determined by settings
|
||||
*/
|
||||
|
||||
static int
|
||||
|
@ -7158,6 +7222,16 @@ if (dat_datctl.replacement[0] != 0)
|
|||
rlen = PCRE2_ZERO_TERMINATED;
|
||||
else
|
||||
rlen = (CASTVAR(uint8_t *, r) - rbuffer)/code_unit_size;
|
||||
|
||||
if ((dat_datctl.control2 & CTL2_SUBSTITUTE_CALLOUT) != 0)
|
||||
{
|
||||
PCRE2_SET_SUBSTITUTE_CALLOUT(dat_context, substitute_callout_function, NULL);
|
||||
}
|
||||
else
|
||||
{
|
||||
PCRE2_SET_SUBSTITUTE_CALLOUT(dat_context, NULL, NULL); /* No callout */
|
||||
}
|
||||
|
||||
PCRE2_SUBSTITUTE(rc, compiled_code, pp, arg_ulen, dat_datctl.offset,
|
||||
dat_datctl.options|xoptions, match_data, dat_context,
|
||||
rbuffer, rlen, nbuffer, &nsize);
|
||||
|
|
|
@ -476,4 +476,9 @@
|
|||
\= Expect no match
|
||||
aaa
|
||||
|
||||
# Offsets are different in 8-bit mode.
|
||||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -382,4 +382,9 @@
|
|||
\= Expect no match
|
||||
aaa
|
||||
|
||||
# Offsets are different in 8-bit mode.
|
||||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -5514,4 +5514,7 @@ a)"xI
|
|||
abcdef\=ovector=4
|
||||
abxyz\=ovector=4
|
||||
|
||||
/a(b)c|xyz/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -1626,4 +1626,14 @@ Subject length lower bound = 1
|
|||
aaa
|
||||
No match
|
||||
|
||||
# Offsets are different in 8-bit mode.
|
||||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
Old 6 6 New 6 8
|
||||
Old 13 13 New 15 17
|
||||
Old 13 16 New 17 22
|
||||
Old 22 22 New 28 30
|
||||
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -1471,4 +1471,14 @@ Subject length lower bound = 1
|
|||
aaa
|
||||
No match
|
||||
|
||||
# Offsets are different in 8-bit mode.
|
||||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
Old 6 6 New 6 8
|
||||
Old 12 12 New 14 16
|
||||
Old 12 15 New 16 21
|
||||
Old 21 21 New 27 29
|
||||
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1468,4 +1468,14 @@ Subject length lower bound = 1
|
|||
aaa
|
||||
No match
|
||||
|
||||
# Offsets are different in 8-bit mode.
|
||||
|
||||
/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
|
||||
123abcáyzabcdef789abcሴqr
|
||||
Old 6 6 New 6 8
|
||||
Old 12 12 New 14 16
|
||||
Old 12 15 New 16 21
|
||||
Old 21 21 New 27 29
|
||||
4: 123abc<>\x{e1}yzabc<><def>789abc<>\x{1234}qr
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -16795,6 +16795,12 @@ Subject length lower bound = 1
|
|||
2: <unchanged>
|
||||
3: <unchanged>
|
||||
|
||||
/a(b)c|xyz/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
Old 0 3 New 0 5
|
||||
Old 6 9 New 8 13
|
||||
2: <abc>def<abc>pqr
|
||||
|
||||
# End of testinput2
|
||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error -62: bad serialized data
|
||||
|
|
Loading…
Reference in New Issue