Implement pcre2_callout_enumerate().
This commit is contained in:
parent
b15698b077
commit
4e61019ffe
|
@ -10,7 +10,9 @@ Version 10.20 xx-xx-2015
|
||||||
|
|
||||||
3. The invalid pattern (?(?C) has a missing assertion condition at the end. The
|
3. The invalid pattern (?(?C) has a missing assertion condition at the end. The
|
||||||
pcre2_compile() function read past the end of the input before diagnosing an
|
pcre2_compile() function read past the end of the input before diagnosing an
|
||||||
error.
|
error. This bug was discovered by the LLVM fuzzer.
|
||||||
|
|
||||||
|
4. Implemented pcre2_callout_enumerate().
|
||||||
|
|
||||||
|
|
||||||
Version 10.10 06-March-2015
|
Version 10.10 06-March-2015
|
||||||
|
|
|
@ -24,6 +24,7 @@ dist_html_DATA = \
|
||||||
doc/html/index.html \
|
doc/html/index.html \
|
||||||
doc/html/pcre2-config.html \
|
doc/html/pcre2-config.html \
|
||||||
doc/html/pcre2.html \
|
doc/html/pcre2.html \
|
||||||
|
doc/html/pcre2_callout_enumerate.html \
|
||||||
doc/html/pcre2_code_free.html \
|
doc/html/pcre2_code_free.html \
|
||||||
doc/html/pcre2_compile.html \
|
doc/html/pcre2_compile.html \
|
||||||
doc/html/pcre2_compile_context_copy.html \
|
doc/html/pcre2_compile_context_copy.html \
|
||||||
|
@ -102,6 +103,7 @@ dist_html_DATA = \
|
||||||
dist_man_MANS = \
|
dist_man_MANS = \
|
||||||
doc/pcre2-config.1 \
|
doc/pcre2-config.1 \
|
||||||
doc/pcre2.3 \
|
doc/pcre2.3 \
|
||||||
|
doc/pcre2_callout_enumerate.3 \
|
||||||
doc/pcre2_code_free.3 \
|
doc/pcre2_code_free.3 \
|
||||||
doc/pcre2_compile.3 \
|
doc/pcre2_compile.3 \
|
||||||
doc/pcre2_compile_context_copy.3 \
|
doc/pcre2_compile_context_copy.3 \
|
||||||
|
|
|
@ -88,6 +88,9 @@ in the library.
|
||||||
|
|
||||||
<table>
|
<table>
|
||||||
|
|
||||||
|
<tr><td><a href="pcre2_callout_enumerate.html">pcre2_callout_enumerate</a></td>
|
||||||
|
<td> Enumerate callouts in a compiled pattern</td></tr>
|
||||||
|
|
||||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||||
<td> Free a compiled pattern</td></tr>
|
<td> Free a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2_callout_enumerate specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2_callout_enumerate man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<br><b>
|
||||||
|
SYNOPSIS
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
<b>#include &#60;pcre2.h&#62;</b>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||||
|
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||||
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
DESCRIPTION
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
This function scans a compiled regular expression and calls the <i>callback()</i>
|
||||||
|
function for each callout within the pattern. The yield of the function is zero
|
||||||
|
for success and non-zero otherwise. The arguments are:
|
||||||
|
<pre>
|
||||||
|
<i>code</i> Points to the compiled pattern
|
||||||
|
<i>callback</i> The callback function
|
||||||
|
<i>callout_data</i> User data that is passed to the callback
|
||||||
|
</pre>
|
||||||
|
The <i>callback()</i> function is passed a pointer to a data block containing
|
||||||
|
the following fields:
|
||||||
|
<pre>
|
||||||
|
<i>version</i> Block version number
|
||||||
|
<i>pattern_position</i> Offset to next item in pattern
|
||||||
|
<i>next_item_length</i> Length of next item in pattern
|
||||||
|
<i>callout_number</i> Number for numbered callouts
|
||||||
|
<i>callout_string_offset</i> Offset to string within pattern
|
||||||
|
<i>callout_string_length</i> Length of callout string
|
||||||
|
<i>callout_string</i> Points to callout string or is NULL
|
||||||
|
</pre>
|
||||||
|
The second argument is the callout data that was passed to
|
||||||
|
<b>pcre2_callout_enumerate()</b>. The <b>callback()</b> function must return zero
|
||||||
|
for success. Any other value causes the pattern scan to stop, with the value
|
||||||
|
being passed back as the result of <b>pcre2_callout_enumerate()</b>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||||
|
page.
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -35,23 +35,24 @@ please consult the man page, in case the conversion went wrong.
|
||||||
<li><a name="TOC20" href="#SEC20">JUST-IN-TIME (JIT) COMPILATION</a>
|
<li><a name="TOC20" href="#SEC20">JUST-IN-TIME (JIT) COMPILATION</a>
|
||||||
<li><a name="TOC21" href="#SEC21">LOCALE SUPPORT</a>
|
<li><a name="TOC21" href="#SEC21">LOCALE SUPPORT</a>
|
||||||
<li><a name="TOC22" href="#SEC22">INFORMATION ABOUT A COMPILED PATTERN</a>
|
<li><a name="TOC22" href="#SEC22">INFORMATION ABOUT A COMPILED PATTERN</a>
|
||||||
<li><a name="TOC23" href="#SEC23">SERIALIZATION AND PRECOMPILING</a>
|
<li><a name="TOC23" href="#SEC23">INFORMATION ABOUT A PATTERN'S CALLOUTS</a>
|
||||||
<li><a name="TOC24" href="#SEC24">THE MATCH DATA BLOCK</a>
|
<li><a name="TOC24" href="#SEC24">SERIALIZATION AND PRECOMPILING</a>
|
||||||
<li><a name="TOC25" href="#SEC25">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
<li><a name="TOC25" href="#SEC25">THE MATCH DATA BLOCK</a>
|
||||||
<li><a name="TOC26" href="#SEC26">NEWLINE HANDLING WHEN MATCHING</a>
|
<li><a name="TOC26" href="#SEC26">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
||||||
<li><a name="TOC27" href="#SEC27">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
<li><a name="TOC27" href="#SEC27">NEWLINE HANDLING WHEN MATCHING</a>
|
||||||
<li><a name="TOC28" href="#SEC28">OTHER INFORMATION ABOUT A MATCH</a>
|
<li><a name="TOC28" href="#SEC28">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
||||||
<li><a name="TOC29" href="#SEC29">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
<li><a name="TOC29" href="#SEC29">OTHER INFORMATION ABOUT A MATCH</a>
|
||||||
<li><a name="TOC30" href="#SEC30">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
<li><a name="TOC30" href="#SEC30">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
||||||
<li><a name="TOC31" href="#SEC31">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
<li><a name="TOC31" href="#SEC31">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
||||||
<li><a name="TOC32" href="#SEC32">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
<li><a name="TOC32" href="#SEC32">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
||||||
<li><a name="TOC33" href="#SEC33">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
<li><a name="TOC33" href="#SEC33">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
||||||
<li><a name="TOC34" href="#SEC34">DUPLICATE SUBPATTERN NAMES</a>
|
<li><a name="TOC34" href="#SEC34">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
||||||
<li><a name="TOC35" href="#SEC35">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
<li><a name="TOC35" href="#SEC35">DUPLICATE SUBPATTERN NAMES</a>
|
||||||
<li><a name="TOC36" href="#SEC36">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
<li><a name="TOC36" href="#SEC36">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
||||||
<li><a name="TOC37" href="#SEC37">SEE ALSO</a>
|
<li><a name="TOC37" href="#SEC37">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||||
<li><a name="TOC38" href="#SEC38">AUTHOR</a>
|
<li><a name="TOC38" href="#SEC38">SEE ALSO</a>
|
||||||
<li><a name="TOC39" href="#SEC39">REVISION</a>
|
<li><a name="TOC39" href="#SEC39">AUTHOR</a>
|
||||||
|
<li><a name="TOC40" href="#SEC40">REVISION</a>
|
||||||
</ul>
|
</ul>
|
||||||
<P>
|
<P>
|
||||||
<b>#include &#60;pcre2.h&#62;</b>
|
<b>#include &#60;pcre2.h&#62;</b>
|
||||||
|
@ -291,6 +292,11 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||||
|
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||||
|
<b> void *<i>user_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>);</b>
|
<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC11" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
|
<br><a name="SEC11" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
|
||||||
|
@ -1433,14 +1439,16 @@ can be processed in different locales.
|
||||||
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_pattern_info()</b> function returns information about a compiled
|
The <b>pcre2_pattern_info()</b> function returns general information about a
|
||||||
pattern. The first argument is a pointer to the compiled pattern. The second
|
compiled pattern. For information about callouts, see the
|
||||||
argument specifies which piece of information is required, and the third
|
<a href="#infoaboutcallouts">next section.</a>
|
||||||
argument is a pointer to a variable to receive the data. If the third argument
|
The first argument for <b>pcre2_pattern_info()</b> is a pointer to the compiled
|
||||||
is NULL, the first argument is ignored, and the function returns the size in
|
pattern. The second argument specifies which piece of information is required,
|
||||||
bytes of the variable that is required for the information requested.
|
and the third argument is a pointer to a variable to receive the data. If the
|
||||||
Otherwise, The yield of the function is zero for success, or one of the
|
third argument is NULL, the first argument is ignored, and the function returns
|
||||||
following negative numbers:
|
the size in bytes of the variable that is required for the information
|
||||||
|
requested. Otherwise, the yield of the function is zero for success, or one of
|
||||||
|
the following negative numbers:
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_ERROR_NULL the argument <i>code</i> was NULL
|
PCRE2_ERROR_NULL the argument <i>code</i> was NULL
|
||||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||||
|
@ -1719,8 +1727,27 @@ memory in which to place the compiled pattern may be slightly larger than the
|
||||||
value returned by this option, because there are cases where the code that
|
value returned by this option, because there are cases where the code that
|
||||||
calculates the size has to over-estimate. Processing a pattern with the JIT
|
calculates the size has to over-estimate. Processing a pattern with the JIT
|
||||||
compiler does not alter the value returned by this option.
|
compiler does not alter the value returned by this option.
|
||||||
|
<a name="infoaboutcallouts"></a></P>
|
||||||
|
<br><a name="SEC23" href="#TOC1">INFORMATION ABOUT A PATTERN'S CALLOUTS</a><br>
|
||||||
|
<P>
|
||||||
|
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||||
|
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||||
|
<b> void *<i>user_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
A script language that supports the use of string arguments in callouts might
|
||||||
|
like to scan all the callouts in a pattern before running the match. This can
|
||||||
|
be done by calling <b>pcre2_callout_enumerate()</b>. The first argument is a
|
||||||
|
pointer to a compiled pattern, the second points to a callback function, and
|
||||||
|
the third is arbitrary user data. The callback function is called for every
|
||||||
|
callout in the pattern in the order in which they appear. Its first argument is
|
||||||
|
a pointer to a callout enumeration block, and its second argument is the
|
||||||
|
<i>user_data</i> value that was passed to <b>pcre2_callout_enumerate()</b>. The
|
||||||
|
contents of the callout enumeration block are described in the
|
||||||
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
|
documentation, which also gives further details about callouts.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC23" href="#TOC1">SERIALIZATION AND PRECOMPILING</a><br>
|
<br><a name="SEC24" href="#TOC1">SERIALIZATION AND PRECOMPILING</a><br>
|
||||||
<P>
|
<P>
|
||||||
It is possible to save compiled patterns on disc or elsewhere, and reload them
|
It is possible to save compiled patterns on disc or elsewhere, and reload them
|
||||||
later, subject to a number of restrictions. The functions whose names begin
|
later, subject to a number of restrictions. The functions whose names begin
|
||||||
|
@ -1729,7 +1756,7 @@ the
|
||||||
<a href="pcre2serialize.html"><b>pcre2serialize</b></a>
|
<a href="pcre2serialize.html"><b>pcre2serialize</b></a>
|
||||||
documentation.
|
documentation.
|
||||||
<a name="matchdatablock"></a></P>
|
<a name="matchdatablock"></a></P>
|
||||||
<br><a name="SEC24" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
|
@ -1800,7 +1827,7 @@ match data block (for that match) have taken place.
|
||||||
When a match data block itself is no longer needed, it should be freed by
|
When a match data block itself is no longer needed, it should be freed by
|
||||||
calling <b>pcre2_match_data_free()</b>.
|
calling <b>pcre2_match_data_free()</b>.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC25" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
<br><a name="SEC26" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||||
|
@ -2014,7 +2041,7 @@ examples, in the
|
||||||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||||
documentation.
|
documentation.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC26" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
<br><a name="SEC27" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is built, a default newline convention is set; this is usually the
|
When PCRE2 is built, a default newline convention is set; this is usually the
|
||||||
standard convention for the operating system. The default can be overridden in
|
standard convention for the operating system. The default can be overridden in
|
||||||
|
@ -2049,7 +2076,7 @@ LF in the characters that it matches.
|
||||||
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
||||||
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
||||||
<a name="matchedstrings"></a></P>
|
<a name="matchedstrings"></a></P>
|
||||||
<br><a name="SEC27" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
<br><a name="SEC28" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -2151,7 +2178,7 @@ parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by
|
||||||
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
||||||
had.
|
had.
|
||||||
<a name="matchotherdata"></a></P>
|
<a name="matchotherdata"></a></P>
|
||||||
<br><a name="SEC28" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
<br><a name="SEC29" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -2195,7 +2222,7 @@ the code unit offset of the invalid UTF character. Details are given in the
|
||||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||||
page.
|
page.
|
||||||
<a name="errorlist"></a></P>
|
<a name="errorlist"></a></P>
|
||||||
<br><a name="SEC29" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
<br><a name="SEC30" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||||
<P>
|
<P>
|
||||||
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
||||||
converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative
|
converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative
|
||||||
|
@ -2246,8 +2273,8 @@ of the subject.
|
||||||
PCRE2_ERROR_CALLOUT
|
PCRE2_ERROR_CALLOUT
|
||||||
</pre>
|
</pre>
|
||||||
This error is never generated by <b>pcre2_match()</b> itself. It is provided for
|
This error is never generated by <b>pcre2_match()</b> itself. It is provided for
|
||||||
use by callout functions that want to cause <b>pcre2_match()</b> to return a
|
use by callout functions that want to cause <b>pcre2_match()</b> or
|
||||||
distinctive error code. See the
|
<b>pcre2_callout_enumerate()</b> to return a distinctive error code. See the
|
||||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
documentation for details.
|
documentation for details.
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -2304,7 +2331,7 @@ is attempted.
|
||||||
</pre>
|
</pre>
|
||||||
The internal recursion limit was reached.
|
The internal recursion limit was reached.
|
||||||
<a name="extractbynumber"></a></P>
|
<a name="extractbynumber"></a></P>
|
||||||
<br><a name="SEC30" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
<br><a name="SEC31" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
||||||
<b> uint32_t <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
<b> uint32_t <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
||||||
|
@ -2401,7 +2428,7 @@ The substring did not participate in the match. For example, if the pattern is
|
||||||
(abc)|(def) and the subject is "def", and the ovector contains at least two
|
(abc)|(def) and the subject is "def", and the ovector contains at least two
|
||||||
capturing slots, substring number 1 is unset.
|
capturing slots, substring number 1 is unset.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC31" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
<br><a name="SEC32" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
||||||
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
||||||
|
@ -2440,7 +2467,7 @@ can be distinguished from a genuine zero-length substring by inspecting the
|
||||||
appropriate offset in the ovector, which contains PCRE2_UNSET for unset
|
appropriate offset in the ovector, which contains PCRE2_UNSET for unset
|
||||||
substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
|
substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
|
||||||
<a name="extractbyname"></a></P>
|
<a name="extractbyname"></a></P>
|
||||||
<br><a name="SEC32" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
<br><a name="SEC33" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
||||||
<b> PCRE2_SPTR <i>name</i>);</b>
|
<b> PCRE2_SPTR <i>name</i>);</b>
|
||||||
|
@ -2500,7 +2527,7 @@ names are not included in the compiled code. The matching process uses only
|
||||||
numbers. For this reason, the use of different names for subpatterns of the
|
numbers. For this reason, the use of different names for subpatterns of the
|
||||||
same number causes an error at compile time.
|
same number causes an error at compile time.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC33" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
<br><a name="SEC34" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||||
|
@ -2561,7 +2588,7 @@ straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
||||||
replacement string (unrecognized sequence following a dollar sign), and
|
replacement string (unrecognized sequence following a dollar sign), and
|
||||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC34" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
||||||
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
||||||
|
@ -2606,7 +2633,7 @@ The format of the name table is described above in the section entitled
|
||||||
Given all the relevant entries for the name, you can extract each of their
|
Given all the relevant entries for the name, you can extract each of their
|
||||||
numbers, and hence the captured data.
|
numbers, and hence the captured data.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC35" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
<br><a name="SEC36" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||||
<P>
|
<P>
|
||||||
The traditional matching function uses a similar algorithm to Perl, which stops
|
The traditional matching function uses a similar algorithm to Perl, which stops
|
||||||
when it finds the first match at a given point in the subject. If you want to
|
when it finds the first match at a given point in the subject. If you want to
|
||||||
|
@ -2624,7 +2651,7 @@ substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try
|
||||||
other alternatives. Ultimately, when it runs out of matches,
|
other alternatives. Ultimately, when it runs out of matches,
|
||||||
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
||||||
<a name="dfamatch"></a></P>
|
<a name="dfamatch"></a></P>
|
||||||
<br><a name="SEC36" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
<br><a name="SEC37" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||||
|
@ -2819,13 +2846,13 @@ some plausibility checks are made on the contents of the workspace, which
|
||||||
should contain data about the previous partial match. If any of these checks
|
should contain data about the previous partial match. If any of these checks
|
||||||
fail, this error is given.
|
fail, this error is given.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC37" href="#TOC1">SEE ALSO</a><br>
|
<br><a name="SEC38" href="#TOC1">SEE ALSO</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
||||||
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
||||||
<b>pcre2sample</b>(3), <b>pcre2stack</b>(3), <b>pcre2unicode</b>(3).
|
<b>pcre2sample</b>(3), <b>pcre2stack</b>(3), <b>pcre2unicode</b>(3).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC38" href="#TOC1">AUTHOR</a><br>
|
<br><a name="SEC39" href="#TOC1">AUTHOR</a><br>
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
|
@ -2834,9 +2861,9 @@ University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC39" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 23 January 2015
|
Last updated: 23 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -17,9 +17,10 @@ please consult the man page, in case the conversion went wrong.
|
||||||
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||||
<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a>
|
<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a>
|
||||||
<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a>
|
<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a>
|
||||||
<li><a name="TOC5" href="#SEC5">RETURN VALUES</a>
|
<li><a name="TOC5" href="#SEC5">RETURN VALUES FROM CALLOUTS</a>
|
||||||
<li><a name="TOC6" href="#SEC6">AUTHOR</a>
|
<li><a name="TOC6" href="#SEC6">CALLOUT ENUMERATION</a>
|
||||||
<li><a name="TOC7" href="#SEC7">REVISION</a>
|
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
|
||||||
|
<li><a name="TOC8" href="#SEC8">REVISION</a>
|
||||||
</ul>
|
</ul>
|
||||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -27,23 +28,32 @@ please consult the man page, in case the conversion went wrong.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>int (*pcre2_callout)(pcre2_callout_block *, void *);</b>
|
<b>int (*pcre2_callout)(pcre2_callout_block *, void *);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||||
|
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||||
|
<b> void *<i>user_data</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||||
<P>
|
<P>
|
||||||
PCRE2 provides a feature called "callout", which is a means of temporarily
|
PCRE2 provides a feature called "callout", which is a means of temporarily
|
||||||
passing control to the caller of PCRE2 in the middle of pattern matching. The
|
passing control to the caller of PCRE2 in the middle of pattern matching. The
|
||||||
caller of PCRE2 provides an external function by putting its entry point in
|
caller of PCRE2 provides an external function by putting its entry point in
|
||||||
a match context (see <b>pcre2_set_callout()</b>) in the
|
a match context (see <b>pcre2_set_callout()</b> in the
|
||||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
documentation).
|
documentation).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Within a regular expression, (?C) indicates the points at which the external
|
Within a regular expression, (?C&#60;arg&#62;) indicates a point at which the external
|
||||||
function is to be called. Different callout points can be identified by putting
|
function is to be called. Different callout points can be identified by putting
|
||||||
a number less than 256 after the letter C. The default value is zero.
|
a number less than 256 after the letter C. The default value is zero.
|
||||||
For example, this pattern has two callout points:
|
Alternatively, the argument may be a delimited string. The starting delimiter
|
||||||
|
must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the
|
||||||
|
start, except for {, where the ending delimiter is }. If the ending delimiter
|
||||||
|
is needed within the string, it must be doubled. For example, this pattern has
|
||||||
|
two callout points:
|
||||||
<pre>
|
<pre>
|
||||||
(?C1)abc(?C2)def
|
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||||
</pre>
|
</pre>
|
||||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
||||||
automatically inserts callouts, all with number 255, before each item in the
|
automatically inserts callouts, all with number 255, before each item in the
|
||||||
|
@ -62,19 +72,18 @@ alternation bar. If the pattern contains a conditional group whose condition is
|
||||||
an assertion, an automatic callout is inserted immediately before the
|
an assertion, an automatic callout is inserted immediately before the
|
||||||
condition. Such a callout may also be inserted explicitly, for example:
|
condition. Such a callout may also be inserted explicitly, for example:
|
||||||
<pre>
|
<pre>
|
||||||
(?(?C9)(?=a)ab|de)
|
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||||
</pre>
|
</pre>
|
||||||
This applies only to assertion conditions (because they are themselves
|
This applies only to assertion conditions (because they are themselves
|
||||||
independent groups).
|
independent groups).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Automatic callouts can be used for tracking the progress of pattern matching.
|
Callouts can be useful for tracking the progress of pattern matching. The
|
||||||
The
|
|
||||||
<a href="pcre2test.html"><b>pcre2test</b></a>
|
<a href="pcre2test.html"><b>pcre2test</b></a>
|
||||||
program has a pattern qualifier (/auto_callout) that sets automatic callouts;
|
program has a pattern qualifier (/auto_callout) that sets automatic callouts.
|
||||||
when it is used, the output indicates how the pattern is being matched. This is
|
When any callouts are present, the output from <b>pcre2test</b> indicates how
|
||||||
useful information when you are trying to optimize the performance of a
|
the pattern is being matched. This is useful information when you are trying to
|
||||||
particular pattern.
|
optimize the performance of a particular pattern.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
|
<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -185,7 +194,7 @@ You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
|
||||||
option to <b>pcre2_compile()</b>, or by starting the pattern with
|
option to <b>pcre2_compile()</b>, or by starting the pattern with
|
||||||
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
||||||
callouts such as the example above are obeyed.
|
callouts such as the example above are obeyed.
|
||||||
</P>
|
<a name="calloutinterface"></a></P>
|
||||||
<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br>
|
<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br>
|
||||||
<P>
|
<P>
|
||||||
During matching, when PCRE2 reaches a callout point, if an external function is
|
During matching, when PCRE2 reaches a callout point, if an external function is
|
||||||
|
@ -209,16 +218,53 @@ documentation). The callout block structure contains the following fields:
|
||||||
PCRE2_SIZE <i>current_position</i>;
|
PCRE2_SIZE <i>current_position</i>;
|
||||||
PCRE2_SIZE <i>pattern_position</i>;
|
PCRE2_SIZE <i>pattern_position</i>;
|
||||||
PCRE2_SIZE <i>next_item_length</i>;
|
PCRE2_SIZE <i>next_item_length</i>;
|
||||||
|
PCRE2_SIZE <i>callout_string_offset</i>;
|
||||||
|
PCRE2_SIZE <i>callout_string_length</i>;
|
||||||
|
PCRE2_SPTR <i>callout_string</i>;
|
||||||
</pre>
|
</pre>
|
||||||
The <i>version</i> field contains the version number of the block format. The
|
The <i>version</i> field contains the version number of the block format. The
|
||||||
current version is 0. The version number will change in future if additional
|
current version is 1; the three callout string fields were added for this
|
||||||
fields are added, but the intention is never to remove any of the existing
|
version. If you are writing an application that might use an earlier release of
|
||||||
fields.
|
PCRE2, you should check the version number before accessing any of these
|
||||||
|
fields. The version number will increase in future if more fields are added,
|
||||||
|
but the intention is never to remove any of the existing fields.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Fields for numerical callouts
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
For a numerical callout, <i>callout_string</i> is NULL, and <i>callout_number</i>
|
||||||
|
contains the number of the callout, in the range 0-255. This is the number
|
||||||
|
that follows (?C for manual callouts; it is 255 for automatically generated
|
||||||
|
callouts.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Fields for string callouts
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
For callouts with string arguments, <i>callout_number</i> is always zero, and
|
||||||
|
<i>callout_string</i> points to the string that is contained within the compiled
|
||||||
|
pattern. Its length is given by <i>callout_string_length</i>. Duplicated ending
|
||||||
|
delimiters that were present in the original pattern string have been turned
|
||||||
|
into single characters, but there is no other processing of the callout string
|
||||||
|
argument. An additional code unit containing binary zero is present after the
|
||||||
|
string, but is not included in the length. The delimiter that was used to start
|
||||||
|
the string is also stored within the pattern, immediately before the string
|
||||||
|
itself. You can access this delimiter as <i>callout_string</i>[-1] if you need
|
||||||
|
it.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>callout_number</i> field contains the number of the callout, as compiled
|
The <i>callout_string_offset</i> field is the code unit offset to the start of
|
||||||
into the pattern (that is, the number after ?C for manual callouts, and 255 for
|
the callout argument string within the original pattern string. This is
|
||||||
automatically generated callouts).
|
provided for the benefit of applications such as script languages that might
|
||||||
|
need to report errors in the callout string within the pattern.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Fields for all callouts
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
The remaining fields in the callout block are the same for both kinds of
|
||||||
|
callout.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>offset_vector</i> field is a pointer to the vector of capturing offsets
|
The <i>offset_vector</i> field is a pointer to the vector of capturing offsets
|
||||||
|
@ -259,8 +305,8 @@ substrings have been captured, the value of <i>capture_last</i> is 0. This is
|
||||||
always the case for the DFA matching functions.
|
always the case for the DFA matching functions.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>pattern_position</i> field contains the offset to the next item to be
|
The <i>pattern_position</i> field contains the offset in the pattern string to
|
||||||
matched in the pattern string.
|
the next item to be matched.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>next_item_length</i> field contains the length of the next item to be
|
The <i>next_item_length</i> field contains the length of the next item to be
|
||||||
|
@ -272,7 +318,9 @@ of the entire subpattern.
|
||||||
<P>
|
<P>
|
||||||
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
||||||
help in distinguishing between different automatic callouts, which all have the
|
help in distinguishing between different automatic callouts, which all have the
|
||||||
same callout number. However, they are set for all callouts.
|
same callout number. However, they are set for all callouts, and are used by
|
||||||
|
<b>pcre2test</b> to show the next item to be matched when displaying callout
|
||||||
|
information.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to
|
In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to
|
||||||
|
@ -281,7 +329,7 @@ the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||||
of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
|
of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
|
||||||
callouts from the DFA matching function this field always contains NULL.
|
callouts from the DFA matching function this field always contains NULL.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">RETURN VALUES</a><br>
|
<br><a name="SEC5" href="#TOC1">RETURN VALUES FROM CALLOUTS</a><br>
|
||||||
<P>
|
<P>
|
||||||
The external callout function returns an integer to PCRE2. If the value is
|
The external callout function returns an integer to PCRE2. If the value is
|
||||||
zero, matching proceeds as normal. If the value is greater than zero, matching
|
zero, matching proceeds as normal. If the value is greater than zero, matching
|
||||||
|
@ -296,7 +344,51 @@ values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match"
|
||||||
failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
|
failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
|
||||||
functions; it will never be used by PCRE2 itself.
|
functions; it will never be used by PCRE2 itself.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC6" href="#TOC1">AUTHOR</a><br>
|
<br><a name="SEC6" href="#TOC1">CALLOUT ENUMERATION</a><br>
|
||||||
|
<P>
|
||||||
|
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||||
|
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||||
|
<b> void *<i>user_data</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
A script language that supports the use of string arguments in callouts might
|
||||||
|
like to scan all the callouts in a pattern before running the match. This can
|
||||||
|
be done by calling <b>pcre2_callout_enumerate()</b>. The first argument is a
|
||||||
|
pointer to a compiled pattern, the second points to a callback function, and
|
||||||
|
the third is arbitrary user data. The callback function is called for every
|
||||||
|
callout in the pattern in the order in which they appear. Its first argument is
|
||||||
|
a pointer to a callout enumeration block, and its second argument is the
|
||||||
|
<i>user_data</i> value that was passed to <b>pcre2_callout_enumerate()</b>. The
|
||||||
|
data block contains the following fields:
|
||||||
|
<pre>
|
||||||
|
<i>version</i> Block version number
|
||||||
|
<i>pattern_position</i> Offset to next item in pattern
|
||||||
|
<i>next_item_length</i> Length of next item in pattern
|
||||||
|
<i>callout_number</i> Number for numbered callouts
|
||||||
|
<i>callout_string_offset</i> Offset to string within pattern
|
||||||
|
<i>callout_string_length</i> Length of callout string
|
||||||
|
<i>callout_string</i> Points to callout string or is NULL
|
||||||
|
</pre>
|
||||||
|
The version number is currently 0. It will increase if new fields are ever
|
||||||
|
added to the block. The remaining fields are the same as their namesakes in the
|
||||||
|
<b>pcre2_callout</b> block that is used for callouts during matching, as
|
||||||
|
described
|
||||||
|
<a href="#calloutinterface">above.</a>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Note that the value of <i>pattern_position</i> is unique for each callout.
|
||||||
|
However, if a callout occurs inside a group that is quantified with a non-zero
|
||||||
|
minimum or a fixed maximum, the group is replicated inside the compiled
|
||||||
|
pattern. For example, a pattern such as /(a){2}/ is compiled as if it were
|
||||||
|
/(a)(a)/. This means that the callout will be enumerated more than once, but
|
||||||
|
with the same value for <i>pattern_position</i> in each case.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The callback function should normally return zero. If it returns a non-zero
|
||||||
|
value, scanning the pattern stops, and that value is returned from
|
||||||
|
<b>pcre2_callout_enumerate()</b>.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
|
@ -305,9 +397,9 @@ University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC7" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 02 January 2015
|
Last updated: 23 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -83,11 +83,11 @@ the
|
||||||
documentation for details.
|
documentation for details.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
8. Subpatterns that are called as subroutines (whether or not recursively) are
|
8. Subroutine calls (whether recursive or not) are treated as atomic groups.
|
||||||
always treated as atomic groups in PCRE2. This is like Python, but unlike Perl.
|
Atomic recursion is like Python, but unlike Perl. Captured values that are set
|
||||||
Captured values that are set outside a subroutine call can be reference from
|
outside a subroutine call can be referenced from inside in PCRE2, but not in
|
||||||
inside in PCRE2, but not in Perl. There is a discussion that explains these
|
Perl. There is a discussion that explains these differences in more detail in
|
||||||
differences in more detail in the
|
the
|
||||||
<a href="pcre2pattern.html#recursiondifference">section on recursion differences from Perl</a>
|
<a href="pcre2pattern.html#recursiondifference">section on recursion differences from Perl</a>
|
||||||
in the
|
in the
|
||||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||||
|
@ -214,9 +214,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 September 2014
|
Last updated: 15 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -2786,43 +2786,70 @@ same pair of parentheses when there is a repetition.
|
||||||
PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl
|
PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl
|
||||||
code. The feature is called "callout". The caller of PCRE2 provides an external
|
code. The feature is called "callout". The caller of PCRE2 provides an external
|
||||||
function by putting its entry point in a match context using the function
|
function by putting its entry point in a match context using the function
|
||||||
<b>pcre2_set_callout()</b> and passing the context to <b>pcre2_match()</b> or
|
<b>pcre2_set_callout()</b>, and then passing that context to <b>pcre2_match()</b>
|
||||||
<b>pcre2_dfa_match()</b>. If no match context is passed, or if the callout entry
|
or <b>pcre2_dfa_match()</b>. If no match context is passed, or if the callout
|
||||||
point is set to NULL, callouts are disabled.
|
entry point is set to NULL, callouts are disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Within a regular expression, (?C) indicates the points at which the external
|
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||||
function is to be called. If you want to identify different callout points, you
|
function is to be called. There are two kinds of callout: those with a
|
||||||
can put a number less than 256 after the letter C. The default value is zero.
|
numerical argument and those with a string argument. (?C) on its own with no
|
||||||
For example, this pattern has two callout points:
|
argument is treated as (?C0). A numerical argument allows the application to
|
||||||
|
distinguish between different callouts. String arguments were added for release
|
||||||
|
10.20 to make it possible for script languages that use PCRE2 to embed short
|
||||||
|
scripts within patterns in a similar way to Perl.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
During matching, when PCRE2 reaches a callout point, the external function is
|
||||||
|
called. It is provided with the number or string argument of the callout, the
|
||||||
|
position in the pattern, and one item of data that is also set in the match
|
||||||
|
block. The callout function may cause matching to proceed, to backtrack, or to
|
||||||
|
fail.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
By default, PCRE2 implements a number of optimizations at matching time, and
|
||||||
|
one side-effect is that sometimes callouts are skipped. If you need all
|
||||||
|
possible callouts to happen, you need to set options that disable the relevant
|
||||||
|
optimizations. More details, including a complete description of the
|
||||||
|
programming interface to the callout function, are given in the
|
||||||
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
|
documentation.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Callouts with numerical arguments
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
If you just want to have a means of identifying different callout points, put a
|
||||||
|
number less than 256 after the letter C. For example, this pattern has two
|
||||||
|
callout points:
|
||||||
<pre>
|
<pre>
|
||||||
(?C1)abc(?C2)def
|
(?C1)abc(?C2)def
|
||||||
</pre>
|
</pre>
|
||||||
If the PCRE2_AUTO_CALLOUT flag is passed to <b>pcre2_compile()</b>, callouts are
|
If the PCRE2_AUTO_CALLOUT flag is passed to <b>pcre2_compile()</b>, numerical
|
||||||
automatically installed before each item in the pattern. They are all numbered
|
callouts are automatically installed before each item in the pattern. They are
|
||||||
255. If there is a conditional group in the pattern whose condition is an
|
all numbered 255. If there is a conditional group in the pattern whose
|
||||||
assertion, an additional callout is inserted just before the condition. An
|
condition is an assertion, an additional callout is inserted just before the
|
||||||
explicit callout may also be set at this position, as in this example:
|
condition. An explicit callout may also be set at this position, as in this
|
||||||
|
example:
|
||||||
<pre>
|
<pre>
|
||||||
(?(?C9)(?=a)abc|def)
|
(?(?C9)(?=a)abc|def)
|
||||||
</pre>
|
</pre>
|
||||||
Note that this applies only to assertion conditions, not to other types of
|
Note that this applies only to assertion conditions, not to other types of
|
||||||
condition.
|
condition.
|
||||||
</P>
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Callouts with string arguments
|
||||||
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
During matching, when PCRE2 reaches a callout point, the external function is
|
A delimited string may be used instead of a number as a callout argument. The
|
||||||
called. It is provided with the number of the callout, the position in the
|
starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is
|
||||||
pattern, and one item of data that is also set in the match block. The callout
|
the same as the start, except for {, where the ending delimiter is }. If the
|
||||||
function may cause matching to proceed, to backtrack, or to fail.
|
ending delimiter is needed within the string, it must be doubled. For
|
||||||
</P>
|
example:
|
||||||
<P>
|
<pre>
|
||||||
By default, PCRE2 implements a number of optimizations at matching time, and
|
(?C'ab ''c'' d')xyz(?C{any text})pqr
|
||||||
one side-effect is that sometimes callouts are skipped. If you need all
|
</pre>
|
||||||
possible callouts to happen, you need to set options that disable the relevant
|
The doubling is removed before the string is passed to the callout function.
|
||||||
optimizations. More details, and a complete description of the interface to the
|
|
||||||
callout function, are given in the
|
|
||||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
|
||||||
documentation.
|
|
||||||
<a name="backtrackcontrol"></a></P>
|
<a name="backtrackcontrol"></a></P>
|
||||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -3258,7 +3285,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 January 2015
|
Last updated: 15 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -535,9 +535,13 @@ pattern is not anchored.
|
||||||
<br><a name="SEC24" href="#TOC1">CALLOUTS</a><br>
|
<br><a name="SEC24" href="#TOC1">CALLOUTS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<pre>
|
<pre>
|
||||||
(?C) callout
|
(?C) callout (assumed number 0)
|
||||||
(?Cn) callout with data n
|
(?Cn) callout with numerical data n
|
||||||
</PRE>
|
(?C"text") callout with string data
|
||||||
|
</pre>
|
||||||
|
The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
||||||
|
start and the end), and the starting delimiter { matched with the ending
|
||||||
|
delimiter }. To encode the ending delimiter within the string, double it.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC25" href="#TOC1">SEE ALSO</a><br>
|
<br><a name="SEC25" href="#TOC1">SEE ALSO</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -555,7 +559,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 26 January 2015
|
Last updated: 15 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -90,11 +90,18 @@ names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
||||||
<P>
|
<P>
|
||||||
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
||||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> library (see
|
library's <b>fgets()</b> function, or via the <b>libreadline</b> library (see
|
||||||
below). In Unix-like environments, <b>fgets()</b> treats any bytes other than
|
below). The input is processed using C's string functions, so must not
|
||||||
newline as data characters. However, in some Windows environments character 26
|
contain binary zeroes, even though in Unix-like environments, <b>fgets()</b>
|
||||||
(hex 1A) causes an immediate end of file, and no further data is read. For
|
treats any bytes other than newline as data characters. In some Windows
|
||||||
maximum portability, therefore, it is safest to avoid non-printing characters
|
environments character 26 (hex 1A) causes an immediate end of file, and no
|
||||||
in <b>pcre2test</b> input files.
|
further data is read.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
|
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
||||||
|
pattern's characters as hexadecimal pairs, thus making it possible to include
|
||||||
|
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
||||||
|
for backslash escapes, which makes it possible to include any data value.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -499,6 +506,7 @@ about the pattern:
|
||||||
<pre>
|
<pre>
|
||||||
bsr=[anycrlf|unicode] specify \R handling
|
bsr=[anycrlf|unicode] specify \R handling
|
||||||
/B bincode show binary code without lengths
|
/B bincode show binary code without lengths
|
||||||
|
callout_info show callout information
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
|
@ -580,6 +588,12 @@ unit" is the last literal code unit that must be present in any match. This is
|
||||||
not necessarily the last character. These lines are omitted if no starting or
|
not necessarily the last character. These lines are omitted if no starting or
|
||||||
ending code units are recorded.
|
ending code units are recorded.
|
||||||
</P>
|
</P>
|
||||||
|
<P>
|
||||||
|
The <b>callout_info</b> modifier requests information about all the callouts in
|
||||||
|
the pattern. A list of them is output at the end of any other information that
|
||||||
|
is requested. For each callout, either its number or string is given, followed
|
||||||
|
by the item that follows it in the pattern.
|
||||||
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
</b><br>
|
</b><br>
|
||||||
|
@ -907,12 +921,15 @@ set, the current captured groups are output when a callout occurs.
|
||||||
The <b>callout_fail</b> modifier can be given one or two numbers. If there is
|
The <b>callout_fail</b> modifier can be given one or two numbers. If there is
|
||||||
only one number, 1 is returned instead of 0 when a callout of that number is
|
only one number, 1 is returned instead of 0 when a callout of that number is
|
||||||
reached. If two numbers are given, 1 is returned when callout <n> is reached
|
reached. If two numbers are given, 1 is returned when callout <n> is reached
|
||||||
for the <m>th time.
|
for the <m>th time. Note that callouts with string arguments are always given
|
||||||
|
the number zero. See "Callouts" below for a description of the output when a
|
||||||
|
callout is taken.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>callout_data</b> modifier can be given an unsigned or a negative number.
|
The <b>callout_data</b> modifier can be given an unsigned or a negative number.
|
||||||
Any value other than zero is used as a return from <b>pcre2test</b>'s callout
|
This is set as the "user data" that is passed to the matching function, and
|
||||||
function.
|
passed back when the callout function is invoked. Any value other than zero is
|
||||||
|
used as a return from <b>pcre2test</b>'s callout function.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Finding all matches in a string
|
Finding all matches in a string
|
||||||
|
@ -1262,10 +1279,32 @@ documentation.
|
||||||
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
||||||
<P>
|
<P>
|
||||||
If the pattern contains any callout requests, <b>pcre2test</b>'s callout
|
If the pattern contains any callout requests, <b>pcre2test</b>'s callout
|
||||||
function is called during matching. This works with both matching functions. By
|
function is called during matching unless <b>callout_none</b> is specified.
|
||||||
default, the called function displays the callout number, the start and current
|
This works with both matching functions.
|
||||||
positions in the text at the callout time, and the next pattern item to be
|
</P>
|
||||||
tested. For example:
|
<P>
|
||||||
|
The callout function in <b>pcre2test</b> returns zero (carry on matching) by
|
||||||
|
default, but you can use a <b>callout_fail</b> modifier in a subject line (as
|
||||||
|
described above) to change this and other parameters of the callout.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Inserting callouts can be helpful when using <b>pcre2test</b> to check
|
||||||
|
complicated regular expressions. For further information about callouts, see
|
||||||
|
the
|
||||||
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
|
documentation.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The output for callouts with numerical arguments and those with string
|
||||||
|
arguments is slightly different.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Callouts with numerical arguments
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
By default, the callout function displays the callout number, the start and
|
||||||
|
current positions in the subject text at the callout time, and the next pattern
|
||||||
|
item to be tested. For example:
|
||||||
<pre>
|
<pre>
|
||||||
--->pqrabcdef
|
--->pqrabcdef
|
||||||
0 ^ ^ \d
|
0 ^ ^ \d
|
||||||
|
@ -1308,17 +1347,27 @@ The mark changes between matching "a" and "b", but stays the same for the rest
|
||||||
of the match, so nothing more is output. If, as a result of backtracking, the
|
of the match, so nothing more is output. If, as a result of backtracking, the
|
||||||
mark reverts to being unset, the text "<unset>" is output.
|
mark reverts to being unset, the text "<unset>" is output.
|
||||||
</P>
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Callouts with string arguments
|
||||||
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The callout function in <b>pcre2test</b> returns zero (carry on matching) by
|
The output for a callout with a string argument is similar, except that instead
|
||||||
default, but you can use a <b>callout_fail</b> modifier in a subject line (as
|
of outputting a callout number before the position indicators, the callout
|
||||||
described above) to change this and other parameters of the callout.
|
string and its offset in the pattern string are output before the reflection of
|
||||||
</P>
|
the subject string, and the subject string is reflected for each callout. For
|
||||||
<P>
|
example:
|
||||||
Inserting callouts can be helpful when using <b>pcre2test</b> to check
|
<pre>
|
||||||
complicated regular expressions. For further information about callouts, see
|
re> /^ab(?C'first')cd(?C"second")ef/
|
||||||
the
|
data> abcdefg
|
||||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
Callout (7): 'first'
|
||||||
documentation.
|
--->abcdefg
|
||||||
|
^ ^ c
|
||||||
|
Callout (20): "second"
|
||||||
|
--->abcdefg
|
||||||
|
^ ^ e
|
||||||
|
0: abcdef
|
||||||
|
|
||||||
|
</PRE>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC17" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
|
<br><a name="SEC17" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -1411,7 +1460,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 23 January 2015
|
Last updated: 22 March 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -88,6 +88,9 @@ in the library.
|
||||||
|
|
||||||
<table>
|
<table>
|
||||||
|
|
||||||
|
<tr><td><a href="pcre2_callout_enumerate.html">pcre2_callout_enumerate</a></td>
|
||||||
|
<td> Enumerate callouts in a compiled pattern</td></tr>
|
||||||
|
|
||||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||||
<td> Free a compiled pattern</td></tr>
|
<td> Free a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
|
352
doc/pcre2.txt
352
doc/pcre2.txt
|
@ -367,6 +367,10 @@ PCRE2 NATIVE API AUXILIARY FUNCTIONS
|
||||||
|
|
||||||
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
||||||
|
|
||||||
|
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||||
|
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||||
|
void *user_data);
|
||||||
|
|
||||||
int pcre2_config(uint32_t what, void *where);
|
int pcre2_config(uint32_t what, void *where);
|
||||||
|
|
||||||
|
|
||||||
|
@ -1452,14 +1456,16 @@ INFORMATION ABOUT A COMPILED PATTERN
|
||||||
|
|
||||||
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
||||||
|
|
||||||
The pcre2_pattern_info() function returns information about a compiled
|
The pcre2_pattern_info() function returns general information about a
|
||||||
pattern. The first argument is a pointer to the compiled pattern. The
|
compiled pattern. For information about callouts, see the next section.
|
||||||
second argument specifies which piece of information is required, and
|
The first argument for pcre2_pattern_info() is a pointer to the com-
|
||||||
the third argument is a pointer to a variable to receive the data. If
|
piled pattern. The second argument specifies which piece of information
|
||||||
the third argument is NULL, the first argument is ignored, and the
|
is required, and the third argument is a pointer to a variable to
|
||||||
function returns the size in bytes of the variable that is required for
|
receive the data. If the third argument is NULL, the first argument is
|
||||||
the information requested. Otherwise, The yield of the function is
|
ignored, and the function returns the size in bytes of the variable
|
||||||
zero for success, or one of the following negative numbers:
|
that is required for the information requested. Otherwise, the yield of
|
||||||
|
the function is zero for success, or one of the following negative num-
|
||||||
|
bers:
|
||||||
|
|
||||||
PCRE2_ERROR_NULL the argument code was NULL
|
PCRE2_ERROR_NULL the argument code was NULL
|
||||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||||
|
@ -1744,6 +1750,25 @@ INFORMATION ABOUT A COMPILED PATTERN
|
||||||
alter the value returned by this option.
|
alter the value returned by this option.
|
||||||
|
|
||||||
|
|
||||||
|
INFORMATION ABOUT A PATTERN'S CALLOUTS
|
||||||
|
|
||||||
|
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||||
|
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||||
|
void *user_data);
|
||||||
|
|
||||||
|
A script language that supports the use of string arguments in callouts
|
||||||
|
might like to scan all the callouts in a pattern before running the
|
||||||
|
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||||
|
argument is a pointer to a compiled pattern, the second points to a
|
||||||
|
callback function, and the third is arbitrary user data. The callback
|
||||||
|
function is called for every callout in the pattern in the order in
|
||||||
|
which they appear. Its first argument is a pointer to a callout enumer-
|
||||||
|
ation block, and its second argument is the user_data value that was
|
||||||
|
passed to pcre2_callout_enumerate(). The contents of the callout enu-
|
||||||
|
meration block are described in the pcre2callout documentation, which
|
||||||
|
also gives further details about callouts.
|
||||||
|
|
||||||
|
|
||||||
SERIALIZATION AND PRECOMPILING
|
SERIALIZATION AND PRECOMPILING
|
||||||
|
|
||||||
It is possible to save compiled patterns on disc or elsewhere, and
|
It is possible to save compiled patterns on disc or elsewhere, and
|
||||||
|
@ -2221,9 +2246,9 @@ ERROR RETURNS FROM pcre2_match()
|
||||||
PCRE2_ERROR_CALLOUT
|
PCRE2_ERROR_CALLOUT
|
||||||
|
|
||||||
This error is never generated by pcre2_match() itself. It is provided
|
This error is never generated by pcre2_match() itself. It is provided
|
||||||
for use by callout functions that want to cause pcre2_match() to return
|
for use by callout functions that want to cause pcre2_match() or
|
||||||
a distinctive error code. See the pcre2callout documentation for
|
pcre2_callout_enumerate() to return a distinctive error code. See the
|
||||||
details.
|
pcre2callout documentation for details.
|
||||||
|
|
||||||
PCRE2_ERROR_INTERNAL
|
PCRE2_ERROR_INTERNAL
|
||||||
|
|
||||||
|
@ -2771,7 +2796,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 23 January 2015
|
Last updated: 23 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -3250,22 +3275,30 @@ SYNOPSIS
|
||||||
|
|
||||||
int (*pcre2_callout)(pcre2_callout_block *, void *);
|
int (*pcre2_callout)(pcre2_callout_block *, void *);
|
||||||
|
|
||||||
|
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||||
|
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||||
|
void *user_data);
|
||||||
|
|
||||||
|
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
|
|
||||||
PCRE2 provides a feature called "callout", which is a means of tempo-
|
PCRE2 provides a feature called "callout", which is a means of tempo-
|
||||||
rarily passing control to the caller of PCRE2 in the middle of pattern
|
rarily passing control to the caller of PCRE2 in the middle of pattern
|
||||||
matching. The caller of PCRE2 provides an external function by putting
|
matching. The caller of PCRE2 provides an external function by putting
|
||||||
its entry point in a match context (see pcre2_set_callout()) in the
|
its entry point in a match context (see pcre2_set_callout() in the
|
||||||
pcre2api documentation).
|
pcre2api documentation).
|
||||||
|
|
||||||
Within a regular expression, (?C) indicates the points at which the
|
Within a regular expression, (?C<arg>) indicates a point at which the
|
||||||
external function is to be called. Different callout points can be
|
external function is to be called. Different callout points can be
|
||||||
identified by putting a number less than 256 after the letter C. The
|
identified by putting a number less than 256 after the letter C. The
|
||||||
default value is zero. For example, this pattern has two callout
|
default value is zero. Alternatively, the argument may be a delimited
|
||||||
|
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
||||||
|
ending delimiter is the same as the start, except for {, where the end-
|
||||||
|
ing delimiter is }. If the ending delimiter is needed within the
|
||||||
|
string, it must be doubled. For example, this pattern has two callout
|
||||||
points:
|
points:
|
||||||
|
|
||||||
(?C1)abc(?C2)def
|
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||||
|
|
||||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
||||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||||
|
@ -3284,29 +3317,30 @@ DESCRIPTION
|
||||||
before the condition. Such a callout may also be inserted explicitly,
|
before the condition. Such a callout may also be inserted explicitly,
|
||||||
for example:
|
for example:
|
||||||
|
|
||||||
(?(?C9)(?=a)ab|de)
|
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||||
|
|
||||||
This applies only to assertion conditions (because they are themselves
|
This applies only to assertion conditions (because they are themselves
|
||||||
independent groups).
|
independent groups).
|
||||||
|
|
||||||
Automatic callouts can be used for tracking the progress of pattern
|
Callouts can be useful for tracking the progress of pattern matching.
|
||||||
matching. The pcre2test program has a pattern qualifier (/auto_call-
|
The pcre2test program has a pattern qualifier (/auto_callout) that sets
|
||||||
out) that sets automatic callouts; when it is used, the output indi-
|
automatic callouts. When any callouts are present, the output from
|
||||||
cates how the pattern is being matched. This is useful information when
|
pcre2test indicates how the pattern is being matched. This is useful
|
||||||
you are trying to optimize the performance of a particular pattern.
|
information when you are trying to optimize the performance of a par-
|
||||||
|
ticular pattern.
|
||||||
|
|
||||||
|
|
||||||
MISSING CALLOUTS
|
MISSING CALLOUTS
|
||||||
|
|
||||||
You should be aware that, because of optimizations in the way PCRE2
|
You should be aware that, because of optimizations in the way PCRE2
|
||||||
compiles and matches patterns, callouts sometimes do not happen exactly
|
compiles and matches patterns, callouts sometimes do not happen exactly
|
||||||
as you might expect.
|
as you might expect.
|
||||||
|
|
||||||
Auto-possessification
|
Auto-possessification
|
||||||
|
|
||||||
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
||||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||||
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
||||||
to the string "aaaa" is:
|
to the string "aaaa" is:
|
||||||
|
|
||||||
|
@ -3315,10 +3349,10 @@ MISSING CALLOUTS
|
||||||
+2 ^ ^ [bc]
|
+2 ^ ^ [bc]
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This indicates that when matching [bc] fails, there is no backtracking
|
This indicates that when matching [bc] fails, there is no backtracking
|
||||||
into a+ and therefore the callouts that would be taken for the back-
|
into a+ and therefore the callouts that would be taken for the back-
|
||||||
tracks do not occur. You can disable the auto-possessify feature by
|
tracks do not occur. You can disable the auto-possessify feature by
|
||||||
passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
|
passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
|
||||||
tern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
tern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
||||||
|
|
||||||
--->aaaa
|
--->aaaa
|
||||||
|
@ -3335,16 +3369,16 @@ MISSING CALLOUTS
|
||||||
Automatic .* anchoring
|
Automatic .* anchoring
|
||||||
|
|
||||||
By default, an optimization is applied when .* is the first significant
|
By default, an optimization is applied when .* is the first significant
|
||||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||||
is not set, a match can start only after an internal newline or at the
|
is not set, a match can start only after an internal newline or at the
|
||||||
beginning of the subject, and pcre2_compile() remembers this. This
|
beginning of the subject, and pcre2_compile() remembers this. This
|
||||||
optimization is disabled, however, if .* is in an atomic group or if
|
optimization is disabled, however, if .* is in an atomic group or if
|
||||||
there is a back reference to the capturing group in which it appears.
|
there is a back reference to the capturing group in which it appears.
|
||||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||||
ever, the presence of callouts does not affect it.
|
ever, the presence of callouts does not affect it.
|
||||||
|
|
||||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||||
and applied to the string "aa", the pcre2test output is:
|
and applied to the string "aa", the pcre2test output is:
|
||||||
|
|
||||||
--->aa
|
--->aa
|
||||||
|
@ -3354,10 +3388,10 @@ MISSING CALLOUTS
|
||||||
+2 ^ \d
|
+2 ^ \d
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This shows that all match attempts start at the beginning of the sub-
|
This shows that all match attempts start at the beginning of the sub-
|
||||||
ject. In other words, the pattern is anchored. You can disable this
|
ject. In other words, the pattern is anchored. You can disable this
|
||||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||||
put changes to:
|
put changes to:
|
||||||
|
|
||||||
--->aa
|
--->aa
|
||||||
|
@ -3370,43 +3404,43 @@ MISSING CALLOUTS
|
||||||
+2 ^ \d
|
+2 ^ \d
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This shows more match attempts, starting at the second subject charac-
|
This shows more match attempts, starting at the second subject charac-
|
||||||
ter. Another optimization, described in the next section, means that
|
ter. Another optimization, described in the next section, means that
|
||||||
there is no subsequent attempt to match with an empty subject.
|
there is no subsequent attempt to match with an empty subject.
|
||||||
|
|
||||||
If a pattern has more than one top-level branch, automatic anchoring
|
If a pattern has more than one top-level branch, automatic anchoring
|
||||||
occurs if all branches are anchorable.
|
occurs if all branches are anchorable.
|
||||||
|
|
||||||
Other optimizations
|
Other optimizations
|
||||||
|
|
||||||
Other optimizations that provide fast "no match" results also affect
|
Other optimizations that provide fast "no match" results also affect
|
||||||
callouts. For example, if the pattern is
|
callouts. For example, if the pattern is
|
||||||
|
|
||||||
ab(?C4)cd
|
ab(?C4)cd
|
||||||
|
|
||||||
PCRE2 knows that any matching string must contain the letter "d". If
|
PCRE2 knows that any matching string must contain the letter "d". If
|
||||||
the subject string is "abyz", the lack of "d" means that matching
|
the subject string is "abyz", the lack of "d" means that matching
|
||||||
doesn't ever start, and the callout is never reached. However, with
|
doesn't ever start, and the callout is never reached. However, with
|
||||||
"abyd", though the result is still no match, the callout is obeyed.
|
"abyd", though the result is still no match, the callout is obeyed.
|
||||||
|
|
||||||
PCRE2 also knows the minimum length of a matching string, and will
|
PCRE2 also knows the minimum length of a matching string, and will
|
||||||
immediately give a "no match" return without actually running a match
|
immediately give a "no match" return without actually running a match
|
||||||
if the subject is not long enough, or, for unanchored patterns, if it
|
if the subject is not long enough, or, for unanchored patterns, if it
|
||||||
has been scanned far enough.
|
has been scanned far enough.
|
||||||
|
|
||||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
||||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||||
that callouts such as the example above are obeyed.
|
that callouts such as the example above are obeyed.
|
||||||
|
|
||||||
|
|
||||||
THE CALLOUT INTERFACE
|
THE CALLOUT INTERFACE
|
||||||
|
|
||||||
During matching, when PCRE2 reaches a callout point, if an external
|
During matching, when PCRE2 reaches a callout point, if an external
|
||||||
function is set in the match context, it is called. This applies to
|
function is set in the match context, it is called. This applies to
|
||||||
both normal and DFA matching. The first argument to the callout func-
|
both normal and DFA matching. The first argument to the callout func-
|
||||||
tion is a pointer to a pcre2_callout block. The second argument is the
|
tion is a pointer to a pcre2_callout block. The second argument is the
|
||||||
void * callout data that was supplied when the callout was set up by
|
void * callout data that was supplied when the callout was set up by
|
||||||
calling pcre2_set_callout() (see the pcre2api documentation). The call-
|
calling pcre2_set_callout() (see the pcre2api documentation). The call-
|
||||||
out block structure contains the following fields:
|
out block structure contains the following fields:
|
||||||
|
|
||||||
|
@ -3422,15 +3456,47 @@ THE CALLOUT INTERFACE
|
||||||
PCRE2_SIZE current_position;
|
PCRE2_SIZE current_position;
|
||||||
PCRE2_SIZE pattern_position;
|
PCRE2_SIZE pattern_position;
|
||||||
PCRE2_SIZE next_item_length;
|
PCRE2_SIZE next_item_length;
|
||||||
|
PCRE2_SIZE callout_string_offset;
|
||||||
|
PCRE2_SIZE callout_string_length;
|
||||||
|
PCRE2_SPTR callout_string;
|
||||||
|
|
||||||
The version field contains the version number of the block format. The
|
The version field contains the version number of the block format. The
|
||||||
current version is 0. The version number will change in future if addi-
|
current version is 1; the three callout string fields were added for
|
||||||
tional fields are added, but the intention is never to remove any of
|
this version. If you are writing an application that might use an ear-
|
||||||
the existing fields.
|
lier release of PCRE2, you should check the version number before
|
||||||
|
accessing any of these fields. The version number will increase in
|
||||||
|
future if more fields are added, but the intention is never to remove
|
||||||
|
any of the existing fields.
|
||||||
|
|
||||||
The callout_number field contains the number of the callout, as com-
|
Fields for numerical callouts
|
||||||
piled into the pattern (that is, the number after ?C for manual call-
|
|
||||||
outs, and 255 for automatically generated callouts).
|
For a numerical callout, callout_string is NULL, and callout_number
|
||||||
|
contains the number of the callout, in the range 0-255. This is the
|
||||||
|
number that follows (?C for manual callouts; it is 255 for automati-
|
||||||
|
cally generated callouts.
|
||||||
|
|
||||||
|
Fields for string callouts
|
||||||
|
|
||||||
|
For callouts with string arguments, callout_number is always zero, and
|
||||||
|
callout_string points to the string that is contained within the com-
|
||||||
|
piled pattern. Its length is given by callout_string_length. Duplicated
|
||||||
|
ending delimiters that were present in the original pattern string have
|
||||||
|
been turned into single characters, but there is no other processing of
|
||||||
|
the callout string argument. An additional code unit containing binary
|
||||||
|
zero is present after the string, but is not included in the length.
|
||||||
|
The delimiter that was used to start the string is also stored within
|
||||||
|
the pattern, immediately before the string itself. You can access this
|
||||||
|
delimiter as callout_string[-1] if you need it.
|
||||||
|
|
||||||
|
The callout_string_offset field is the code unit offset to the start of
|
||||||
|
the callout argument string within the original pattern string. This is
|
||||||
|
provided for the benefit of applications such as script languages that
|
||||||
|
might need to report errors in the callout string within the pattern.
|
||||||
|
|
||||||
|
Fields for all callouts
|
||||||
|
|
||||||
|
The remaining fields in the callout block are the same for both kinds
|
||||||
|
of callout.
|
||||||
|
|
||||||
The offset_vector field is a pointer to the vector of capturing offsets
|
The offset_vector field is a pointer to the vector of capturing offsets
|
||||||
(the "ovector") that was passed to the matching function in the match
|
(the "ovector") that was passed to the matching function in the match
|
||||||
|
@ -3464,8 +3530,8 @@ THE CALLOUT INTERFACE
|
||||||
substrings. If no substrings have been captured, the value of cap-
|
substrings. If no substrings have been captured, the value of cap-
|
||||||
ture_last is 0. This is always the case for the DFA matching functions.
|
ture_last is 0. This is always the case for the DFA matching functions.
|
||||||
|
|
||||||
The pattern_position field contains the offset to the next item to be
|
The pattern_position field contains the offset in the pattern string to
|
||||||
matched in the pattern string.
|
the next item to be matched.
|
||||||
|
|
||||||
The next_item_length field contains the length of the next item to be
|
The next_item_length field contains the length of the next item to be
|
||||||
matched in the pattern string. When the callout immediately precedes an
|
matched in the pattern string. When the callout immediately precedes an
|
||||||
|
@ -3475,7 +3541,9 @@ THE CALLOUT INTERFACE
|
||||||
|
|
||||||
The pattern_position and next_item_length fields are intended to help
|
The pattern_position and next_item_length fields are intended to help
|
||||||
in distinguishing between different automatic callouts, which all have
|
in distinguishing between different automatic callouts, which all have
|
||||||
the same callout number. However, they are set for all callouts.
|
the same callout number. However, they are set for all callouts, and
|
||||||
|
are used by pcre2test to show the next item to be matched when display-
|
||||||
|
ing callout information.
|
||||||
|
|
||||||
In callouts from pcre2_match() the mark field contains a pointer to the
|
In callouts from pcre2_match() the mark field contains a pointer to the
|
||||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||||
|
@ -3485,7 +3553,7 @@ THE CALLOUT INTERFACE
|
||||||
always contains NULL.
|
always contains NULL.
|
||||||
|
|
||||||
|
|
||||||
RETURN VALUES
|
RETURN VALUES FROM CALLOUTS
|
||||||
|
|
||||||
The external callout function returns an integer to PCRE2. If the value
|
The external callout function returns an integer to PCRE2. If the value
|
||||||
is zero, matching proceeds as normal. If the value is greater than
|
is zero, matching proceeds as normal. If the value is greater than
|
||||||
|
@ -3501,6 +3569,49 @@ RETURN VALUES
|
||||||
itself.
|
itself.
|
||||||
|
|
||||||
|
|
||||||
|
CALLOUT ENUMERATION
|
||||||
|
|
||||||
|
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||||
|
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||||
|
void *user_data);
|
||||||
|
|
||||||
|
A script language that supports the use of string arguments in callouts
|
||||||
|
might like to scan all the callouts in a pattern before running the
|
||||||
|
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||||
|
argument is a pointer to a compiled pattern, the second points to a
|
||||||
|
callback function, and the third is arbitrary user data. The callback
|
||||||
|
function is called for every callout in the pattern in the order in
|
||||||
|
which they appear. Its first argument is a pointer to a callout enumer-
|
||||||
|
ation block, and its second argument is the user_data value that was
|
||||||
|
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
||||||
|
lowing fields:
|
||||||
|
|
||||||
|
version Block version number
|
||||||
|
pattern_position Offset to next item in pattern
|
||||||
|
next_item_length Length of next item in pattern
|
||||||
|
callout_number Number for numbered callouts
|
||||||
|
callout_string_offset Offset to string within pattern
|
||||||
|
callout_string_length Length of callout string
|
||||||
|
callout_string Points to callout string or is NULL
|
||||||
|
|
||||||
|
The version number is currently 0. It will increase if new fields are
|
||||||
|
ever added to the block. The remaining fields are the same as their
|
||||||
|
namesakes in the pcre2_callout block that is used for callouts during
|
||||||
|
matching, as described above.
|
||||||
|
|
||||||
|
Note that the value of pattern_position is unique for each callout.
|
||||||
|
However, if a callout occurs inside a group that is quantified with a
|
||||||
|
non-zero minimum or a fixed maximum, the group is replicated inside the
|
||||||
|
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
||||||
|
as if it were /(a)(a)/. This means that the callout will be enumerated
|
||||||
|
more than once, but with the same value for pattern_position in each
|
||||||
|
case.
|
||||||
|
|
||||||
|
The callback function should normally return zero. If it returns a non-
|
||||||
|
zero value, scanning the pattern stops, and that value is returned from
|
||||||
|
pcre2_callout_enumerate().
|
||||||
|
|
||||||
|
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
|
@ -3510,7 +3621,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 02 January 2015
|
Last updated: 23 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -3585,104 +3696,103 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||||
during pattern matching. See the pcre2callout documentation for
|
during pattern matching. See the pcre2callout documentation for
|
||||||
details.
|
details.
|
||||||
|
|
||||||
8. Subpatterns that are called as subroutines (whether or not recur-
|
8. Subroutine calls (whether recursive or not) are treated as atomic
|
||||||
sively) are always treated as atomic groups in PCRE2. This is like
|
groups. Atomic recursion is like Python, but unlike Perl. Captured
|
||||||
Python, but unlike Perl. Captured values that are set outside a sub-
|
values that are set outside a subroutine call can be referenced from
|
||||||
routine call can be reference from inside in PCRE2, but not in Perl.
|
inside in PCRE2, but not in Perl. There is a discussion that explains
|
||||||
There is a discussion that explains these differences in more detail in
|
these differences in more detail in the section on recursion differ-
|
||||||
the section on recursion differences from Perl in the pcre2pattern
|
ences from Perl in the pcre2pattern page.
|
||||||
page.
|
|
||||||
|
|
||||||
9. If any of the backtracking control verbs are used in a subpattern
|
9. If any of the backtracking control verbs are used in a subpattern
|
||||||
that is called as a subroutine (whether or not recursively), their
|
that is called as a subroutine (whether or not recursively), their
|
||||||
effect is confined to that subpattern; it does not extend to the sur-
|
effect is confined to that subpattern; it does not extend to the sur-
|
||||||
rounding pattern. This is not always the case in Perl. In particular,
|
rounding pattern. This is not always the case in Perl. In particular,
|
||||||
if (*THEN) is present in a group that is called as a subroutine, its
|
if (*THEN) is present in a group that is called as a subroutine, its
|
||||||
action is limited to that group, even if the group does not contain any
|
action is limited to that group, even if the group does not contain any
|
||||||
| characters. Note that such subpatterns are processed as anchored at
|
| characters. Note that such subpatterns are processed as anchored at
|
||||||
the point where they are tested.
|
the point where they are tested.
|
||||||
|
|
||||||
10. If a pattern contains more than one backtracking control verb, the
|
10. If a pattern contains more than one backtracking control verb, the
|
||||||
first one that is backtracked onto acts. For example, in the pattern
|
first one that is backtracked onto acts. For example, in the pattern
|
||||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
||||||
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
||||||
it is the same as PCRE2, but there are examples where it differs.
|
it is the same as PCRE2, but there are examples where it differs.
|
||||||
|
|
||||||
11. Most backtracking verbs in assertions have their normal actions.
|
11. Most backtracking verbs in assertions have their normal actions.
|
||||||
They are not confined to the assertion.
|
They are not confined to the assertion.
|
||||||
|
|
||||||
12. There are some differences that are concerned with the settings of
|
12. There are some differences that are concerned with the settings of
|
||||||
captured strings when part of a pattern is repeated. For example,
|
captured strings when part of a pattern is repeated. For example,
|
||||||
matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
|
matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
|
||||||
unset, but in PCRE2 it is set to "b".
|
unset, but in PCRE2 it is set to "b".
|
||||||
|
|
||||||
13. PCRE2's handling of duplicate subpattern numbers and duplicate sub-
|
13. PCRE2's handling of duplicate subpattern numbers and duplicate sub-
|
||||||
pattern names is not as general as Perl's. This is a consequence of the
|
pattern names is not as general as Perl's. This is a consequence of the
|
||||||
fact that PCRE2 works internally just with numbers, using an external
|
fact that PCRE2 works internally just with numbers, using an external
|
||||||
table to translate between numbers and names. In particular, a pattern
|
table to translate between numbers and names. In particular, a pattern
|
||||||
such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
|
such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
|
||||||
the same number but different names, is not supported, and causes an
|
the same number but different names, is not supported, and causes an
|
||||||
error at compile time. If it were allowed, it would not be possible to
|
error at compile time. If it were allowed, it would not be possible to
|
||||||
distinguish which parentheses matched, because both names map to cap-
|
distinguish which parentheses matched, because both names map to cap-
|
||||||
turing subpattern number 1. To avoid this confusing situation, an error
|
turing subpattern number 1. To avoid this confusing situation, an error
|
||||||
is given at compile time.
|
is given at compile time.
|
||||||
|
|
||||||
14. Perl recognizes comments in some places that PCRE2 does not, for
|
14. Perl recognizes comments in some places that PCRE2 does not, for
|
||||||
example, between the ( and ? at the start of a subpattern. If the /x
|
example, between the ( and ? at the start of a subpattern. If the /x
|
||||||
modifier is set, Perl allows white space between ( and ? (though cur-
|
modifier is set, Perl allows white space between ( and ? (though cur-
|
||||||
rent Perls warn that this is deprecated) but PCRE2 never does, even if
|
rent Perls warn that this is deprecated) but PCRE2 never does, even if
|
||||||
the PCRE2_EXTENDED option is set.
|
the PCRE2_EXTENDED option is set.
|
||||||
|
|
||||||
15. Perl, when in warning mode, gives warnings for character classes
|
15. Perl, when in warning mode, gives warnings for character classes
|
||||||
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
||||||
als. PCRE2 has no warning features, so it gives an error in these cases
|
als. PCRE2 has no warning features, so it gives an error in these cases
|
||||||
because they are almost certainly user mistakes.
|
because they are almost certainly user mistakes.
|
||||||
|
|
||||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are
|
16. In PCRE2, the upper/lower case character properties Lu and Ll are
|
||||||
not affected when case-independent matching is specified. For example,
|
not affected when case-independent matching is specified. For example,
|
||||||
\p{Lu} always matches an upper case letter. I think Perl has changed in
|
\p{Lu} always matches an upper case letter. I think Perl has changed in
|
||||||
this respect; in the release at the time of writing (5.16), \p{Lu} and
|
this respect; in the release at the time of writing (5.16), \p{Lu} and
|
||||||
\p{Ll} match all letters, regardless of case, when case independence is
|
\p{Ll} match all letters, regardless of case, when case independence is
|
||||||
specified.
|
specified.
|
||||||
|
|
||||||
17. PCRE2 provides some extensions to the Perl regular expression
|
17. PCRE2 provides some extensions to the Perl regular expression
|
||||||
facilities. Perl 5.10 includes new features that are not in earlier
|
facilities. Perl 5.10 includes new features that are not in earlier
|
||||||
versions of Perl, some of which (such as named parentheses) have been
|
versions of Perl, some of which (such as named parentheses) have been
|
||||||
in PCRE2 for some time. This list is with respect to Perl 5.10:
|
in PCRE2 for some time. This list is with respect to Perl 5.10:
|
||||||
|
|
||||||
(a) Although lookbehind assertions in PCRE2 must match fixed length
|
(a) Although lookbehind assertions in PCRE2 must match fixed length
|
||||||
strings, each alternative branch of a lookbehind assertion can match a
|
strings, each alternative branch of a lookbehind assertion can match a
|
||||||
different length of string. Perl requires them all to have the same
|
different length of string. Perl requires them all to have the same
|
||||||
length.
|
length.
|
||||||
|
|
||||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
||||||
$ meta-character matches only at the very end of the string.
|
$ meta-character matches only at the very end of the string.
|
||||||
|
|
||||||
(c) A backslash followed by a letter with no special meaning is
|
(c) A backslash followed by a letter with no special meaning is
|
||||||
faulted. (Perl can be made to issue a warning.)
|
faulted. (Perl can be made to issue a warning.)
|
||||||
|
|
||||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
||||||
fiers is inverted, that is, by default they are not greedy, but if fol-
|
fiers is inverted, that is, by default they are not greedy, but if fol-
|
||||||
lowed by a question mark they are.
|
lowed by a question mark they are.
|
||||||
|
|
||||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
||||||
be tried only at the first matching position in the subject string.
|
be tried only at the first matching position in the subject string.
|
||||||
|
|
||||||
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||||
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
||||||
equivalents.
|
equivalents.
|
||||||
|
|
||||||
(g) The \R escape sequence can be restricted to match only CR, LF, or
|
(g) The \R escape sequence can be restricted to match only CR, LF, or
|
||||||
CRLF by the PCRE2_BSR_ANYCRLF option.
|
CRLF by the PCRE2_BSR_ANYCRLF option.
|
||||||
|
|
||||||
(h) The callout facility is PCRE2-specific.
|
(h) The callout facility is PCRE2-specific.
|
||||||
|
|
||||||
(i) The partial matching facility is PCRE2-specific.
|
(i) The partial matching facility is PCRE2-specific.
|
||||||
|
|
||||||
(j) The alternative matching function (pcre2_dfa_match()) matches in a
|
(j) The alternative matching function (pcre2_dfa_match()) matches in a
|
||||||
different way and is not Perl-compatible.
|
different way and is not Perl-compatible.
|
||||||
|
|
||||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start
|
(k) PCRE2 recognizes some special sequences such as (*CR) at the start
|
||||||
of a pattern that set overall options that cannot be changed within the
|
of a pattern that set overall options that cannot be changed within the
|
||||||
pattern.
|
pattern.
|
||||||
|
|
||||||
|
@ -3696,8 +3806,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 28 September 2014
|
Last updated: 15 March 2015
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
.TH PCRE2_CALLOUT_ENUMERATE 3 "23 March 2015" "PCRE2 10.20"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.B #include <pcre2.h>
|
||||||
|
.PP
|
||||||
|
.nf
|
||||||
|
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||||
|
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||||
|
.B " void *\fIcallout_data\fP);"
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
This function scans a compiled regular expression and calls the \fIcallback()\fP
|
||||||
|
function for each callout within the pattern. The yield of the function is zero
|
||||||
|
for success and non-zero otherwise. The arguments are:
|
||||||
|
.sp
|
||||||
|
\fIcode\fP Points to the compiled pattern
|
||||||
|
\fIcallback\fP The callback function
|
||||||
|
\fIcallout_data\fP User data that is passed to the callback
|
||||||
|
.sp
|
||||||
|
The \fIcallback()\fP function is passed a pointer to a data block containing
|
||||||
|
the following fields:
|
||||||
|
.sp
|
||||||
|
\fIversion\fP Block version number
|
||||||
|
\fIpattern_position\fP Offset to next item in pattern
|
||||||
|
\fInext_item_length\fP Length of next item in pattern
|
||||||
|
\fIcallout_number\fP Number for numbered callouts
|
||||||
|
\fIcallout_string_offset\fP Offset to string within pattern
|
||||||
|
\fIcallout_string_length\fP Length of callout string
|
||||||
|
\fIcallout_string\fP Points to callout string or is NULL
|
||||||
|
.sp
|
||||||
|
The second argument is the callout data that was passed to
|
||||||
|
\fBpcre2_callout_enumerate()\fP. The \fBcallback()\fP function must return zero
|
||||||
|
for success. Any other value causes the pattern scan to stop, with the value
|
||||||
|
being passed back as the result of \fBpcre2_callout_enumerate()\fP.
|
||||||
|
.P
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2posix\fP
|
||||||
|
.\"
|
||||||
|
page.
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "23 January 2015" "PCRE2 10.10"
|
.TH PCRE2API 3 "23 March 2015" "PCRE2 10.20"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -234,6 +234,10 @@ document for an overview of all the PCRE2 documentation.
|
||||||
.sp
|
.sp
|
||||||
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||||
.sp
|
.sp
|
||||||
|
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||||
|
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||||
|
.B " void *\fIuser_data\fP);"
|
||||||
|
.sp
|
||||||
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||||
.fi
|
.fi
|
||||||
.
|
.
|
||||||
|
@ -1427,14 +1431,19 @@ can be processed in different locales.
|
||||||
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||||
.fi
|
.fi
|
||||||
.P
|
.P
|
||||||
The \fBpcre2_pattern_info()\fP function returns information about a compiled
|
The \fBpcre2_pattern_info()\fP function returns general information about a
|
||||||
pattern. The first argument is a pointer to the compiled pattern. The second
|
compiled pattern. For information about callouts, see the
|
||||||
argument specifies which piece of information is required, and the third
|
.\" HTML <a href="#infoaboutcallouts">
|
||||||
argument is a pointer to a variable to receive the data. If the third argument
|
.\" </a>
|
||||||
is NULL, the first argument is ignored, and the function returns the size in
|
next section.
|
||||||
bytes of the variable that is required for the information requested.
|
.\"
|
||||||
Otherwise, the yield of the function is zero for success, or one of the
|
The first argument for \fBpcre2_pattern_info()\fP is a pointer to the compiled
|
||||||
following negative numbers:
|
pattern. The second argument specifies which piece of information is required,
|
||||||
|
and the third argument is a pointer to a variable to receive the data. If the
|
||||||
|
third argument is NULL, the first argument is ignored, and the function returns
|
||||||
|
the size in bytes of the variable that is required for the information
|
||||||
|
requested. Otherwise, the yield of the function is zero for success, or one of
|
||||||
|
the following negative numbers:
|
||||||
.sp
|
.sp
|
||||||
PCRE2_ERROR_NULL the argument \fIcode\fP was NULL
|
PCRE2_ERROR_NULL the argument \fIcode\fP was NULL
|
||||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||||
|
@ -1716,6 +1725,31 @@ calculates the size has to over-estimate. Processing a pattern with the JIT
|
||||||
compiler does not alter the value returned by this option.
|
compiler does not alter the value returned by this option.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.\" HTML <a name="infoaboutcallouts"></a>
|
||||||
|
.SH "INFORMATION ABOUT A PATTERN'S CALLOUTS"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||||
|
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||||
|
.B " void *\fIuser_data\fP);"
|
||||||
|
.fi
|
||||||
|
.sp
|
||||||
|
A script language that supports the use of string arguments in callouts might
|
||||||
|
like to scan all the callouts in a pattern before running the match. This can
|
||||||
|
be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a
|
||||||
|
pointer to a compiled pattern, the second points to a callback function, and
|
||||||
|
the third is arbitrary user data. The callback function is called for every
|
||||||
|
callout in the pattern in the order in which they appear. Its first argument is
|
||||||
|
a pointer to a callout enumeration block, and its second argument is the
|
||||||
|
\fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The
|
||||||
|
contents of the callout enumeration block are described in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2callout\fP
|
||||||
|
.\"
|
||||||
|
documentation, which also gives further details about callouts.
|
||||||
|
.
|
||||||
|
.
|
||||||
.SH "SERIALIZATION AND PRECOMPILING"
|
.SH "SERIALIZATION AND PRECOMPILING"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -2275,8 +2309,8 @@ of the subject.
|
||||||
PCRE2_ERROR_CALLOUT
|
PCRE2_ERROR_CALLOUT
|
||||||
.sp
|
.sp
|
||||||
This error is never generated by \fBpcre2_match()\fP itself. It is provided for
|
This error is never generated by \fBpcre2_match()\fP itself. It is provided for
|
||||||
use by callout functions that want to cause \fBpcre2_match()\fP to return a
|
use by callout functions that want to cause \fBpcre2_match()\fP or
|
||||||
distinctive error code. See the
|
\fBpcre2_callout_enumerate()\fP to return a distinctive error code. See the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2callout\fP
|
\fBpcre2callout\fP
|
||||||
.\"
|
.\"
|
||||||
|
@ -2885,6 +2919,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 23 January 2015
|
Last updated: 23 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2CALLOUT 3 "16 March 2015" "PCRE2 10.20"
|
.TH PCRE2CALLOUT 3 "23 March 2015" "PCRE2 10.20"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -7,7 +7,13 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.B #include <pcre2.h>
|
.B #include <pcre2.h>
|
||||||
.PP
|
.PP
|
||||||
.SM
|
.SM
|
||||||
|
.nf
|
||||||
.B int (*pcre2_callout)(pcre2_callout_block *, void *);
|
.B int (*pcre2_callout)(pcre2_callout_block *, void *);
|
||||||
|
.sp
|
||||||
|
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||||
|
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||||
|
.B " void *\fIuser_data\fP);"
|
||||||
|
.fi
|
||||||
.
|
.
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
.rs
|
.rs
|
||||||
|
@ -170,6 +176,7 @@ option to \fBpcre2_compile()\fP, or by starting the pattern with
|
||||||
callouts such as the example above are obeyed.
|
callouts such as the example above are obeyed.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.\" HTML <a name="calloutinterface"></a>
|
||||||
.SH "THE CALLOUT INTERFACE"
|
.SH "THE CALLOUT INTERFACE"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -199,7 +206,6 @@ documentation). The callout block structure contains the following fields:
|
||||||
PCRE2_SIZE \fIcallout_string_offset\fP;
|
PCRE2_SIZE \fIcallout_string_offset\fP;
|
||||||
PCRE2_SIZE \fIcallout_string_length\fP;
|
PCRE2_SIZE \fIcallout_string_length\fP;
|
||||||
PCRE2_SPTR \fIcallout_string\fP;
|
PCRE2_SPTR \fIcallout_string\fP;
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
The \fIversion\fP field contains the version number of the block format. The
|
The \fIversion\fP field contains the version number of the block format. The
|
||||||
current version is 1; the three callout string fields were added for this
|
current version is 1; the three callout string fields were added for this
|
||||||
|
@ -276,8 +282,8 @@ outside the recursion, as do the values of all captured substrings. If no
|
||||||
substrings have been captured, the value of \fIcapture_last\fP is 0. This is
|
substrings have been captured, the value of \fIcapture_last\fP is 0. This is
|
||||||
always the case for the DFA matching functions.
|
always the case for the DFA matching functions.
|
||||||
.P
|
.P
|
||||||
The \fIpattern_position\fP field contains the offset to the next item to be
|
The \fIpattern_position\fP field contains the offset in the pattern string to
|
||||||
matched in the pattern string.
|
the next item to be matched.
|
||||||
.P
|
.P
|
||||||
The \fInext_item_length\fP field contains the length of the next item to be
|
The \fInext_item_length\fP field contains the length of the next item to be
|
||||||
matched in the pattern string. When the callout immediately precedes an
|
matched in the pattern string. When the callout immediately precedes an
|
||||||
|
@ -298,7 +304,7 @@ of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
|
||||||
callouts from the DFA matching function this field always contains NULL.
|
callouts from the DFA matching function this field always contains NULL.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "RETURN VALUES"
|
.SH "RETURN VALUES FROM CALLOUTS"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The external callout function returns an integer to PCRE2. If the value is
|
The external callout function returns an integer to PCRE2. If the value is
|
||||||
|
@ -314,6 +320,54 @@ failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
|
||||||
functions; it will never be used by PCRE2 itself.
|
functions; it will never be used by PCRE2 itself.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
.SH "CALLOUT ENUMERATION"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||||
|
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||||
|
.B " void *\fIuser_data\fP);"
|
||||||
|
.fi
|
||||||
|
.sp
|
||||||
|
A script language that supports the use of string arguments in callouts might
|
||||||
|
like to scan all the callouts in a pattern before running the match. This can
|
||||||
|
be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a
|
||||||
|
pointer to a compiled pattern, the second points to a callback function, and
|
||||||
|
the third is arbitrary user data. The callback function is called for every
|
||||||
|
callout in the pattern in the order in which they appear. Its first argument is
|
||||||
|
a pointer to a callout enumeration block, and its second argument is the
|
||||||
|
\fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The
|
||||||
|
data block contains the following fields:
|
||||||
|
.sp
|
||||||
|
\fIversion\fP Block version number
|
||||||
|
\fIpattern_position\fP Offset to next item in pattern
|
||||||
|
\fInext_item_length\fP Length of next item in pattern
|
||||||
|
\fIcallout_number\fP Number for numbered callouts
|
||||||
|
\fIcallout_string_offset\fP Offset to string within pattern
|
||||||
|
\fIcallout_string_length\fP Length of callout string
|
||||||
|
\fIcallout_string\fP Points to callout string or is NULL
|
||||||
|
.sp
|
||||||
|
The version number is currently 0. It will increase if new fields are ever
|
||||||
|
added to the block. The remaining fields are the same as their namesakes in the
|
||||||
|
\fBpcre2_callout\fP block that is used for callouts during matching, as
|
||||||
|
described
|
||||||
|
.\" HTML <a href="#calloutinterface">
|
||||||
|
.\" </a>
|
||||||
|
above.
|
||||||
|
.\"
|
||||||
|
.P
|
||||||
|
Note that the value of \fIpattern_position\fP is unique for each callout.
|
||||||
|
However, if a callout occurs inside a group that is quantified with a non-zero
|
||||||
|
minimum or a fixed maximum, the group is replicated inside the compiled
|
||||||
|
pattern. For example, a pattern such as /(a){2}/ is compiled as if it were
|
||||||
|
/(a)(a)/. This means that the callout will be enumerated more than once, but
|
||||||
|
with the same value for \fIpattern_position\fP in each case.
|
||||||
|
.P
|
||||||
|
The callback function should normally return zero. If it returns a non-zero
|
||||||
|
value, scanning the pattern stops, and that value is returned from
|
||||||
|
\fBpcre2_callout_enumerate()\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
.SH AUTHOR
|
.SH AUTHOR
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
|
@ -328,6 +382,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 16 March 2015
|
Last updated: 23 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2TEST 1 "16 March 2015" "PCRE2 10.20"
|
.TH PCRE2TEST 1 "22 March 2015" "PCRE2 10.20"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -473,6 +473,7 @@ about the pattern:
|
||||||
.sp
|
.sp
|
||||||
bsr=[anycrlf|unicode] specify \eR handling
|
bsr=[anycrlf|unicode] specify \eR handling
|
||||||
/B bincode show binary code without lengths
|
/B bincode show binary code without lengths
|
||||||
|
callout_info show callout information
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
|
@ -549,6 +550,11 @@ if there is more than one they are listed as "starting code units". "Last code
|
||||||
unit" is the last literal code unit that must be present in any match. This is
|
unit" is the last literal code unit that must be present in any match. This is
|
||||||
not necessarily the last character. These lines are omitted if no starting or
|
not necessarily the last character. These lines are omitted if no starting or
|
||||||
ending code units are recorded.
|
ending code units are recorded.
|
||||||
|
.P
|
||||||
|
The \fBcallout_info\fP modifier requests information about all the callouts in
|
||||||
|
the pattern. A list of them is output at the end of any other information that
|
||||||
|
is requested. For each callout, either its number or string is given, followed
|
||||||
|
by the item that follows it in the pattern.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SS "Specifying a pattern in hex"
|
.SS "Specifying a pattern in hex"
|
||||||
|
@ -1437,6 +1443,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 16 March 2015
|
Last updated: 22 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -59,41 +59,48 @@ INPUT ENCODING
|
||||||
|
|
||||||
Input to pcre2test is processed line by line, either by calling the C
|
Input to pcre2test is processed line by line, either by calling the C
|
||||||
library's fgets() function, or via the libreadline library (see below).
|
library's fgets() function, or via the libreadline library (see below).
|
||||||
In Unix-like environments, fgets() treats any bytes other than newline
|
The input is processed using C's string functions, so must not
|
||||||
as data characters. However, in some Windows environments character 26
|
contain binary zeroes, even though in Unix-like environments, fgets()
|
||||||
(hex 1A) causes an immediate end of file, and no further data is read.
|
treats any bytes other than newline as data characters. In some Windows
|
||||||
For maximum portability, therefore, it is safest to avoid non-printing
|
environments character 26 (hex 1A) causes an immediate end of file, and
|
||||||
characters in pcre2test input files.
|
no further data is read.
|
||||||
|
|
||||||
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
|
characters in pcre2test input files. There is a facility for specifying
|
||||||
|
a pattern's characters as hexadecimal pairs, thus making it possible to
|
||||||
|
include binary zeroes in a pattern for testing purposes. Subject lines
|
||||||
|
are processed for backslash escapes, which makes it possible to include
|
||||||
|
any data value.
|
||||||
|
|
||||||
|
|
||||||
COMMAND LINE OPTIONS
|
COMMAND LINE OPTIONS
|
||||||
|
|
||||||
-8 If the 8-bit library has been built, this option causes it to
|
-8 If the 8-bit library has been built, this option causes it to
|
||||||
be used (this is the default). If the 8-bit library has not
|
be used (this is the default). If the 8-bit library has not
|
||||||
been built, this option causes an error.
|
been built, this option causes an error.
|
||||||
|
|
||||||
-16 If the 16-bit library has been built, this option causes it
|
-16 If the 16-bit library has been built, this option causes it
|
||||||
to be used. If only the 16-bit library has been built, this
|
to be used. If only the 16-bit library has been built, this
|
||||||
is the default. If the 16-bit library has not been built,
|
is the default. If the 16-bit library has not been built,
|
||||||
this option causes an error.
|
this option causes an error.
|
||||||
|
|
||||||
-32 If the 32-bit library has been built, this option causes it
|
-32 If the 32-bit library has been built, this option causes it
|
||||||
to be used. If only the 32-bit library has been built, this
|
to be used. If only the 32-bit library has been built, this
|
||||||
is the default. If the 32-bit library has not been built,
|
is the default. If the 32-bit library has not been built,
|
||||||
this option causes an error.
|
this option causes an error.
|
||||||
|
|
||||||
-b Behave as if each pattern has the /fullbincode modifier; the
|
-b Behave as if each pattern has the /fullbincode modifier; the
|
||||||
full internal binary form of the pattern is output after com-
|
full internal binary form of the pattern is output after com-
|
||||||
pilation.
|
pilation.
|
||||||
|
|
||||||
-C Output the version number of the PCRE2 library, and all
|
-C Output the version number of the PCRE2 library, and all
|
||||||
available information about the optional features that are
|
available information about the optional features that are
|
||||||
included, and then exit with zero exit code. All other
|
included, and then exit with zero exit code. All other
|
||||||
options are ignored.
|
options are ignored.
|
||||||
|
|
||||||
-C option Output information about a specific build-time option, then
|
-C option Output information about a specific build-time option, then
|
||||||
exit. This functionality is intended for use in scripts such
|
exit. This functionality is intended for use in scripts such
|
||||||
as RunTest. The following options output the value and set
|
as RunTest. The following options output the value and set
|
||||||
the exit code as indicated:
|
the exit code as indicated:
|
||||||
|
|
||||||
ebcdic-nl the code for LF (= NL) in an EBCDIC environment:
|
ebcdic-nl the code for LF (= NL) in an EBCDIC environment:
|
||||||
|
@ -109,7 +116,7 @@ COMMAND LINE OPTIONS
|
||||||
ANYCRLF or ANY
|
ANYCRLF or ANY
|
||||||
exit code is always 0
|
exit code is always 0
|
||||||
|
|
||||||
The following options output 1 for true or 0 for false, and
|
The following options output 1 for true or 0 for false, and
|
||||||
set the exit code to the same value:
|
set the exit code to the same value:
|
||||||
|
|
||||||
ebcdic compiled for an EBCDIC environment
|
ebcdic compiled for an EBCDIC environment
|
||||||
|
@ -119,15 +126,15 @@ COMMAND LINE OPTIONS
|
||||||
pcre2-8 the 8-bit library was built
|
pcre2-8 the 8-bit library was built
|
||||||
unicode Unicode support is available
|
unicode Unicode support is available
|
||||||
|
|
||||||
If an unknown option is given, an error message is output;
|
If an unknown option is given, an error message is output;
|
||||||
the exit code is 0.
|
the exit code is 0.
|
||||||
|
|
||||||
-d Behave as if each pattern has the debug modifier; the inter-
|
-d Behave as if each pattern has the debug modifier; the inter-
|
||||||
nal form and information about the compiled pattern is output
|
nal form and information about the compiled pattern is output
|
||||||
after compilation; -d is equivalent to -b -i.
|
after compilation; -d is equivalent to -b -i.
|
||||||
|
|
||||||
-dfa Behave as if each subject line has the dfa modifier; matching
|
-dfa Behave as if each subject line has the dfa modifier; matching
|
||||||
is done using the pcre2_dfa_match() function instead of the
|
is done using the pcre2_dfa_match() function instead of the
|
||||||
default pcre2_match().
|
default pcre2_match().
|
||||||
|
|
||||||
-help Output a brief summary of these options and then exit.
|
-help Output a brief summary of these options and then exit.
|
||||||
|
@ -135,8 +142,8 @@ COMMAND LINE OPTIONS
|
||||||
-i Behave as if each pattern has the /info modifier; information
|
-i Behave as if each pattern has the /info modifier; information
|
||||||
about the compiled pattern is given after compilation.
|
about the compiled pattern is given after compilation.
|
||||||
|
|
||||||
-jit Behave as if each pattern line has the jit modifier; after
|
-jit Behave as if each pattern line has the jit modifier; after
|
||||||
successful compilation, each pattern is passed to the just-
|
successful compilation, each pattern is passed to the just-
|
||||||
in-time compiler, if available.
|
in-time compiler, if available.
|
||||||
|
|
||||||
-pattern modifier-list
|
-pattern modifier-list
|
||||||
|
@ -145,25 +152,25 @@ COMMAND LINE OPTIONS
|
||||||
-q Do not output the version number of pcre2test at the start of
|
-q Do not output the version number of pcre2test at the start of
|
||||||
execution.
|
execution.
|
||||||
|
|
||||||
-S size On Unix-like systems, set the size of the run-time stack to
|
-S size On Unix-like systems, set the size of the run-time stack to
|
||||||
size megabytes.
|
size megabytes.
|
||||||
|
|
||||||
-subject modifier-list
|
-subject modifier-list
|
||||||
Behave as if each subject line contains the given modifiers.
|
Behave as if each subject line contains the given modifiers.
|
||||||
|
|
||||||
-t Run each compile and match many times with a timer, and out-
|
-t Run each compile and match many times with a timer, and out-
|
||||||
put the resulting times per compile or match. When JIT is
|
put the resulting times per compile or match. When JIT is
|
||||||
used, separate times are given for the initial compile and
|
used, separate times are given for the initial compile and
|
||||||
the JIT compile. You can control the number of iterations
|
the JIT compile. You can control the number of iterations
|
||||||
that are used for timing by following -t with a number (as a
|
that are used for timing by following -t with a number (as a
|
||||||
separate item on the command line). For example, "-t 1000"
|
separate item on the command line). For example, "-t 1000"
|
||||||
iterates 1000 times. The default is to iterate 500,000 times.
|
iterates 1000 times. The default is to iterate 500,000 times.
|
||||||
|
|
||||||
-tm This is like -t except that it times only the matching phase,
|
-tm This is like -t except that it times only the matching phase,
|
||||||
not the compile phase.
|
not the compile phase.
|
||||||
|
|
||||||
-T -TM These behave like -t and -tm, but in addition, at the end of
|
-T -TM These behave like -t and -tm, but in addition, at the end of
|
||||||
a run, the total times for all compiles and matches are out-
|
a run, the total times for all compiles and matches are out-
|
||||||
put.
|
put.
|
||||||
|
|
||||||
-version Output the PCRE2 version number and then exit.
|
-version Output the PCRE2 version number and then exit.
|
||||||
|
@ -171,158 +178,158 @@ COMMAND LINE OPTIONS
|
||||||
|
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
|
|
||||||
If pcre2test is given two filename arguments, it reads from the first
|
If pcre2test is given two filename arguments, it reads from the first
|
||||||
and writes to the second. If the first name is "-", input is taken from
|
and writes to the second. If the first name is "-", input is taken from
|
||||||
the standard input. If pcre2test is given only one argument, it reads
|
the standard input. If pcre2test is given only one argument, it reads
|
||||||
from that file and writes to stdout. Otherwise, it reads from stdin and
|
from that file and writes to stdout. Otherwise, it reads from stdin and
|
||||||
writes to stdout.
|
writes to stdout.
|
||||||
|
|
||||||
When pcre2test is built, a configuration option can specify that it
|
When pcre2test is built, a configuration option can specify that it
|
||||||
should be linked with the libreadline or libedit library. When this is
|
should be linked with the libreadline or libedit library. When this is
|
||||||
done, if the input is from a terminal, it is read using the readline()
|
done, if the input is from a terminal, it is read using the readline()
|
||||||
function. This provides line-editing and history facilities. The output
|
function. This provides line-editing and history facilities. The output
|
||||||
from the -help option states whether or not readline() will be used.
|
from the -help option states whether or not readline() will be used.
|
||||||
|
|
||||||
The program handles any number of tests, each of which consists of a
|
The program handles any number of tests, each of which consists of a
|
||||||
set of input lines. Each set starts with a regular expression pattern,
|
set of input lines. Each set starts with a regular expression pattern,
|
||||||
followed by any number of subject lines to be matched against that pat-
|
followed by any number of subject lines to be matched against that pat-
|
||||||
tern. In between sets of test data, command lines that begin with # may
|
tern. In between sets of test data, command lines that begin with # may
|
||||||
appear. This file format, with some restrictions, can also be processed
|
appear. This file format, with some restrictions, can also be processed
|
||||||
by the perltest.sh script that is distributed with PCRE2 as a means of
|
by the perltest.sh script that is distributed with PCRE2 as a means of
|
||||||
checking that the behaviour of PCRE2 and Perl is the same.
|
checking that the behaviour of PCRE2 and Perl is the same.
|
||||||
|
|
||||||
When the input is a terminal, pcre2test prompts for each line of input,
|
When the input is a terminal, pcre2test prompts for each line of input,
|
||||||
using "re>" to prompt for regular expression patterns, and "data>" to
|
using "re>" to prompt for regular expression patterns, and "data>" to
|
||||||
prompt for subject lines. Command lines starting with # can be entered
|
prompt for subject lines. Command lines starting with # can be entered
|
||||||
only in response to the "re>" prompt.
|
only in response to the "re>" prompt.
|
||||||
|
|
||||||
Each subject line is matched separately and independently. If you want
|
Each subject line is matched separately and independently. If you want
|
||||||
to do multi-line matches, you have to use the \n escape sequence (or \r
|
to do multi-line matches, you have to use the \n escape sequence (or \r
|
||||||
or \r\n, etc., depending on the newline setting) in a single line of
|
or \r\n, etc., depending on the newline setting) in a single line of
|
||||||
input to encode the newline sequences. There is no limit on the length
|
input to encode the newline sequences. There is no limit on the length
|
||||||
of subject lines; the input buffer is automatically extended if it is
|
of subject lines; the input buffer is automatically extended if it is
|
||||||
too small. There is a replication feature that makes it possible to
|
too small. There is a replication feature that makes it possible to
|
||||||
generate long subject lines without having to supply them explicitly.
|
generate long subject lines without having to supply them explicitly.
|
||||||
|
|
||||||
An empty line or the end of the file signals the end of the subject
|
An empty line or the end of the file signals the end of the subject
|
||||||
lines for a test, at which point a new pattern or command line is
|
lines for a test, at which point a new pattern or command line is
|
||||||
expected if there is still input to be read.
|
expected if there is still input to be read.
|
||||||
|
|
||||||
|
|
||||||
COMMAND LINES
|
COMMAND LINES
|
||||||
|
|
||||||
In between sets of test data, a line that begins with # is interpreted
|
In between sets of test data, a line that begins with # is interpreted
|
||||||
as a command line. If the first character is followed by white space or
|
as a command line. If the first character is followed by white space or
|
||||||
an exclamation mark, the line is treated as a comment, and ignored.
|
an exclamation mark, the line is treated as a comment, and ignored.
|
||||||
Otherwise, the following commands are recognized:
|
Otherwise, the following commands are recognized:
|
||||||
|
|
||||||
#forbid_utf
|
#forbid_utf
|
||||||
|
|
||||||
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
||||||
PCRE2_NEVER_UCP options set, which locks out the use of UTF and Unicode
|
PCRE2_NEVER_UCP options set, which locks out the use of UTF and Unicode
|
||||||
property features. This is a trigger guard that is used in test files
|
property features. This is a trigger guard that is used in test files
|
||||||
to ensure that UTF or Unicode property tests are not accidentally added
|
to ensure that UTF or Unicode property tests are not accidentally added
|
||||||
to files that are used when Unicode support is not included in the
|
to files that are used when Unicode support is not included in the
|
||||||
library. This effect can also be obtained by the use of #pattern; the
|
library. This effect can also be obtained by the use of #pattern; the
|
||||||
difference is that #forbid_utf cannot be unset, and the automatic
|
difference is that #forbid_utf cannot be unset, and the automatic
|
||||||
options are not displayed in pattern information, to avoid cluttering
|
options are not displayed in pattern information, to avoid cluttering
|
||||||
up test output.
|
up test output.
|
||||||
|
|
||||||
#load <filename>
|
#load <filename>
|
||||||
|
|
||||||
This command is used to load a set of precompiled patterns from a file,
|
This command is used to load a set of precompiled patterns from a file,
|
||||||
as described in the section entitled "Saving and restoring compiled
|
as described in the section entitled "Saving and restoring compiled
|
||||||
patterns" below.
|
patterns" below.
|
||||||
|
|
||||||
#pattern <modifier-list>
|
#pattern <modifier-list>
|
||||||
|
|
||||||
This command sets a default modifier list that applies to all subse-
|
This command sets a default modifier list that applies to all subse-
|
||||||
quent patterns. Modifiers on a pattern can change these settings.
|
quent patterns. Modifiers on a pattern can change these settings.
|
||||||
|
|
||||||
#perltest
|
#perltest
|
||||||
|
|
||||||
The appearance of this line causes all subsequent modifier settings to
|
The appearance of this line causes all subsequent modifier settings to
|
||||||
be checked for compatibility with the perltest.sh script, which is used
|
be checked for compatibility with the perltest.sh script, which is used
|
||||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||||
comment lines, none of the other command lines are permitted, because
|
comment lines, none of the other command lines are permitted, because
|
||||||
they and many of the modifiers are specific to pcre2test, and should
|
they and many of the modifiers are specific to pcre2test, and should
|
||||||
not be used in test files that are also processed by perltest.sh. The
|
not be used in test files that are also processed by perltest.sh. The
|
||||||
#perltest command helps detect tests that are accidentally put in the
|
#perltest command helps detect tests that are accidentally put in the
|
||||||
wrong file.
|
wrong file.
|
||||||
|
|
||||||
#pop [<modifiers>]
|
#pop [<modifiers>]
|
||||||
|
|
||||||
This command is used to manipulate the stack of compiled patterns, as
|
This command is used to manipulate the stack of compiled patterns, as
|
||||||
described in the section entitled "Saving and restoring compiled pat-
|
described in the section entitled "Saving and restoring compiled pat-
|
||||||
terns" below.
|
terns" below.
|
||||||
|
|
||||||
#save <filename>
|
#save <filename>
|
||||||
|
|
||||||
This command is used to save a set of compiled patterns to a file, as
|
This command is used to save a set of compiled patterns to a file, as
|
||||||
described in the section entitled "Saving and restoring compiled pat-
|
described in the section entitled "Saving and restoring compiled pat-
|
||||||
terns" below.
|
terns" below.
|
||||||
|
|
||||||
#subject <modifier-list>
|
#subject <modifier-list>
|
||||||
|
|
||||||
This command sets a default modifier list that applies to all subse-
|
This command sets a default modifier list that applies to all subse-
|
||||||
quent subject lines. Modifiers on a subject line can change these set-
|
quent subject lines. Modifiers on a subject line can change these set-
|
||||||
tings.
|
tings.
|
||||||
|
|
||||||
|
|
||||||
MODIFIER SYNTAX
|
MODIFIER SYNTAX
|
||||||
|
|
||||||
Modifier lists are used with both pattern and subject lines. Items in a
|
Modifier lists are used with both pattern and subject lines. Items in a
|
||||||
list are separated by commas and optional white space. Some modifiers
|
list are separated by commas and optional white space. Some modifiers
|
||||||
may be given for both patterns and subject lines, whereas others are
|
may be given for both patterns and subject lines, whereas others are
|
||||||
valid for one or the other only. Each modifier has a long name, for
|
valid for one or the other only. Each modifier has a long name, for
|
||||||
example "anchored", and some of them must be followed by an equals sign
|
example "anchored", and some of them must be followed by an equals sign
|
||||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||||
ues may be preceded by a minus sign to turn off a previous setting.
|
ues may be preceded by a minus sign to turn off a previous setting.
|
||||||
|
|
||||||
A few of the more common modifiers can also be specified as single let-
|
A few of the more common modifiers can also be specified as single let-
|
||||||
ters, for example "i" for "caseless". In documentation, following the
|
ters, for example "i" for "caseless". In documentation, following the
|
||||||
Perl convention, these are written with a slash ("the /i modifier") for
|
Perl convention, these are written with a slash ("the /i modifier") for
|
||||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||||
item of a modifier list. If the first item is not recognized as a long
|
item of a modifier list. If the first item is not recognized as a long
|
||||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
/abc/ig,newline=cr,jit=3
|
/abc/ig,newline=cr,jit=3
|
||||||
|
|
||||||
This is a pattern line whose modifier list starts with two one-letter
|
This is a pattern line whose modifier list starts with two one-letter
|
||||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||||
same as used in Perl.
|
same as used in Perl.
|
||||||
|
|
||||||
|
|
||||||
PATTERN SYNTAX
|
PATTERN SYNTAX
|
||||||
|
|
||||||
A pattern line must start with one of the following characters (common
|
A pattern line must start with one of the following characters (common
|
||||||
symbols, excluding pattern meta-characters):
|
symbols, excluding pattern meta-characters):
|
||||||
|
|
||||||
/ ! " ' ` - = _ : ; , % & @ ~
|
/ ! " ' ` - = _ : ; , % & @ ~
|
||||||
|
|
||||||
This is interpreted as the pattern's delimiter. A regular expression
|
This is interpreted as the pattern's delimiter. A regular expression
|
||||||
may be continued over several input lines, in which case the newline
|
may be continued over several input lines, in which case the newline
|
||||||
characters are included within it. It is possible to include the delim-
|
characters are included within it. It is possible to include the delim-
|
||||||
iter within the pattern by escaping it with a backslash, for example
|
iter within the pattern by escaping it with a backslash, for example
|
||||||
|
|
||||||
/abc\/def/
|
/abc\/def/
|
||||||
|
|
||||||
If you do this, the escape and the delimiter form part of the pattern,
|
If you do this, the escape and the delimiter form part of the pattern,
|
||||||
but since the delimiters are all non-alphanumeric, this does not affect
|
but since the delimiters are all non-alphanumeric, this does not affect
|
||||||
its interpretation. If the terminating delimiter is immediately fol-
|
its interpretation. If the terminating delimiter is immediately fol-
|
||||||
lowed by a backslash, for example,
|
lowed by a backslash, for example,
|
||||||
|
|
||||||
/abc/\
|
/abc/\
|
||||||
|
|
||||||
then a backslash is added to the end of the pattern. This is done to
|
then a backslash is added to the end of the pattern. This is done to
|
||||||
provide a way of testing the error condition that arises if a pattern
|
provide a way of testing the error condition that arises if a pattern
|
||||||
finishes with a backslash, because
|
finishes with a backslash, because
|
||||||
|
|
||||||
/abc\/
|
/abc\/
|
||||||
|
|
||||||
is interpreted as the first line of a pattern that starts with "abc/",
|
is interpreted as the first line of a pattern that starts with "abc/",
|
||||||
causing pcre2test to read the next line as a continuation of the regu-
|
causing pcre2test to read the next line as a continuation of the regu-
|
||||||
lar expression.
|
lar expression.
|
||||||
|
|
||||||
A pattern can be followed by a modifier list (details below).
|
A pattern can be followed by a modifier list (details below).
|
||||||
|
@ -330,7 +337,7 @@ PATTERN SYNTAX
|
||||||
|
|
||||||
SUBJECT LINE SYNTAX
|
SUBJECT LINE SYNTAX
|
||||||
|
|
||||||
Before each subject line is passed to pcre2_match() or
|
Before each subject line is passed to pcre2_match() or
|
||||||
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
||||||
line is scanned for backslash escapes. The following provide a means of
|
line is scanned for backslash escapes. The following provide a means of
|
||||||
encoding non-printing characters in a visible way:
|
encoding non-printing characters in a visible way:
|
||||||
|
@ -350,23 +357,23 @@ SUBJECT LINE SYNTAX
|
||||||
\x{hh...} hexadecimal character (any number of hex digits)
|
\x{hh...} hexadecimal character (any number of hex digits)
|
||||||
|
|
||||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
||||||
the pattern. It is recognized always. There may be any number of hexa-
|
the pattern. It is recognized always. There may be any number of hexa-
|
||||||
decimal digits inside the braces; invalid values provoke error mes-
|
decimal digits inside the braces; invalid values provoke error mes-
|
||||||
sages.
|
sages.
|
||||||
|
|
||||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||||
character in UTF-8 mode, generating more than one byte if the value is
|
character in UTF-8 mode, generating more than one byte if the value is
|
||||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||||
\x{hh} generates one byte for values less than 256, and causes an error
|
\x{hh} generates one byte for values less than 256, and causes an error
|
||||||
for greater values.
|
for greater values.
|
||||||
|
|
||||||
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
||||||
possible to construct invalid UTF-16 sequences for testing purposes.
|
possible to construct invalid UTF-16 sequences for testing purposes.
|
||||||
|
|
||||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||||
makes it possible to construct invalid UTF-32 sequences for testing
|
makes it possible to construct invalid UTF-32 sequences for testing
|
||||||
purposes.
|
purposes.
|
||||||
|
|
||||||
There is a special backslash sequence that specifies replication of one
|
There is a special backslash sequence that specifies replication of one
|
||||||
|
@ -374,38 +381,38 @@ SUBJECT LINE SYNTAX
|
||||||
|
|
||||||
\[<characters>]{<count>}
|
\[<characters>]{<count>}
|
||||||
|
|
||||||
This makes it possible to test long strings without having to provide
|
This makes it possible to test long strings without having to provide
|
||||||
them as part of the file. For example:
|
them as part of the file. For example:
|
||||||
|
|
||||||
\[abc]{4}
|
\[abc]{4}
|
||||||
|
|
||||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||||
To include a closing square bracket in the characters, code it as \x5D.
|
To include a closing square bracket in the characters, code it as \x5D.
|
||||||
|
|
||||||
A backslash followed by an equals sign marks the end of the subject
|
A backslash followed by an equals sign marks the end of the subject
|
||||||
string and the start of a modifier list. For example:
|
string and the start of a modifier list. For example:
|
||||||
|
|
||||||
abc\=notbol,notempty
|
abc\=notbol,notempty
|
||||||
|
|
||||||
A backslash followed by any other non-alphanumeric character just
|
A backslash followed by any other non-alphanumeric character just
|
||||||
escapes that character. A backslash followed by anything else causes an
|
escapes that character. A backslash followed by anything else causes an
|
||||||
error. However, if the very last character in the line is a backslash
|
error. However, if the very last character in the line is a backslash
|
||||||
(and there is no modifier list), it is ignored. This gives a way of
|
(and there is no modifier list), it is ignored. This gives a way of
|
||||||
passing an empty line as data, since a real empty line terminates the
|
passing an empty line as data, since a real empty line terminates the
|
||||||
data input.
|
data input.
|
||||||
|
|
||||||
|
|
||||||
PATTERN MODIFIERS
|
PATTERN MODIFIERS
|
||||||
|
|
||||||
There are three types of modifier that can appear in pattern lines, two
|
There are three types of modifier that can appear in pattern lines, two
|
||||||
of which may also be used in a #pattern command. A pattern's modifier
|
of which may also be used in a #pattern command. A pattern's modifier
|
||||||
list can add to or override default modifiers that were set by a previ-
|
list can add to or override default modifiers that were set by a previ-
|
||||||
ous #pattern command.
|
ous #pattern command.
|
||||||
|
|
||||||
Setting compilation options
|
Setting compilation options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_compile(). The most com-
|
The following modifiers set options for pcre2_compile(). The most com-
|
||||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||||
tion of their effects.
|
tion of their effects.
|
||||||
|
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
|
@ -432,17 +439,18 @@ PATTERN MODIFIERS
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
|
|
||||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||||
non-printing characters in output strings to be printed using the
|
non-printing characters in output strings to be printed using the
|
||||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||||
without the curly brackets.
|
without the curly brackets.
|
||||||
|
|
||||||
Setting compilation controls
|
Setting compilation controls
|
||||||
|
|
||||||
The following modifiers affect the compilation process or request
|
The following modifiers affect the compilation process or request
|
||||||
information about the pattern:
|
information about the pattern:
|
||||||
|
|
||||||
bsr=[anycrlf|unicode] specify \R handling
|
bsr=[anycrlf|unicode] specify \R handling
|
||||||
/B bincode show binary code without lengths
|
/B bincode show binary code without lengths
|
||||||
|
callout_info show callout information
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
|
@ -463,34 +471,34 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Newline and \R handling
|
Newline and \R handling
|
||||||
|
|
||||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||||
"unicode", \R matches any Unicode newline sequence. The default is
|
"unicode", \R matches any Unicode newline sequence. The default is
|
||||||
specified when PCRE2 is built, with the default default being Unicode.
|
specified when PCRE2 is built, with the default default being Unicode.
|
||||||
|
|
||||||
The newline modifier specifies which characters are to be interpreted
|
The newline modifier specifies which characters are to be interpreted
|
||||||
as newlines, both in the pattern and in subject lines. The type must be
|
as newlines, both in the pattern and in subject lines. The type must be
|
||||||
one of CR, LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
one of CR, LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||||
|
|
||||||
Information about a pattern
|
Information about a pattern
|
||||||
|
|
||||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||||
available information.
|
available information.
|
||||||
|
|
||||||
The bincode modifier causes a representation of the compiled code to be
|
The bincode modifier causes a representation of the compiled code to be
|
||||||
output after compilation. This information does not contain length and
|
output after compilation. This information does not contain length and
|
||||||
offset values, which ensures that the same output is generated for dif-
|
offset values, which ensures that the same output is generated for dif-
|
||||||
ferent internal link sizes and different code unit widths. By using
|
ferent internal link sizes and different code unit widths. By using
|
||||||
bincode, the same regression tests can be used in different environ-
|
bincode, the same regression tests can be used in different environ-
|
||||||
ments.
|
ments.
|
||||||
|
|
||||||
The fullbincode modifier, by contrast, does include length and offset
|
The fullbincode modifier, by contrast, does include length and offset
|
||||||
values. This is used in a few special tests that run only for specific
|
values. This is used in a few special tests that run only for specific
|
||||||
code unit widths and link sizes, and is also useful for one-off tests.
|
code unit widths and link sizes, and is also useful for one-off tests.
|
||||||
|
|
||||||
The info modifier requests information about the compiled pattern
|
The info modifier requests information about the compiled pattern
|
||||||
(whether it is anchored, has a fixed first character, and so on). The
|
(whether it is anchored, has a fixed first character, and so on). The
|
||||||
information is obtained from the pcre2_pattern_info() function. Here
|
information is obtained from the pcre2_pattern_info() function. Here
|
||||||
are some typical examples:
|
are some typical examples:
|
||||||
|
|
||||||
re> /(?i)(^a|^b)/m,info
|
re> /(?i)(^a|^b)/m,info
|
||||||
|
@ -508,16 +516,21 @@ PATTERN MODIFIERS
|
||||||
Last code unit = 'c' (caseless)
|
Last code unit = 'c' (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
"Compile options" are those specified by modifiers; "overall options"
|
"Compile options" are those specified by modifiers; "overall options"
|
||||||
have added options that are taken or deduced from the pattern. If both
|
have added options that are taken or deduced from the pattern. If both
|
||||||
sets of options are the same, just a single "options" line is output;
|
sets of options are the same, just a single "options" line is output;
|
||||||
if there are no options, the line is omitted. "First code unit" is
|
if there are no options, the line is omitted. "First code unit" is
|
||||||
where any match must start; if there is more than one they are listed
|
where any match must start; if there is more than one they are listed
|
||||||
as "starting code units". "Last code unit" is the last literal code
|
as "starting code units". "Last code unit" is the last literal code
|
||||||
unit that must be present in any match. This is not necessarily the
|
unit that must be present in any match. This is not necessarily the
|
||||||
last character. These lines are omitted if no starting or ending code
|
last character. These lines are omitted if no starting or ending code
|
||||||
units are recorded.
|
units are recorded.
|
||||||
|
|
||||||
|
The callout_info modifier requests information about all the callouts
|
||||||
|
in the pattern. A list of them is output at the end of any other infor-
|
||||||
|
mation that is requested. For each callout, either its number or string
|
||||||
|
is given, followed by the item that follows it in the pattern.
|
||||||
|
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
|
|
||||||
The hex modifier specifies that the characters of the pattern are to be
|
The hex modifier specifies that the characters of the pattern are to be
|
||||||
|
@ -808,11 +821,15 @@ SUBJECT MODIFIERS
|
||||||
The callout_fail modifier can be given one or two numbers. If there is
|
The callout_fail modifier can be given one or two numbers. If there is
|
||||||
only one number, 1 is returned instead of 0 when a callout of that num-
|
only one number, 1 is returned instead of 0 when a callout of that num-
|
||||||
ber is reached. If two numbers are given, 1 is returned when callout
|
ber is reached. If two numbers are given, 1 is returned when callout
|
||||||
<n> is reached for the <m>th time.
|
<n> is reached for the <m>th time. Note that callouts with string argu-
|
||||||
|
ments are always given the number zero. See "Callouts" below for a
|
||||||
|
description of the output when a callout is taken.
|
||||||
|
|
||||||
The callout_data modifier can be given an unsigned or a negative num-
|
The callout_data modifier can be given an unsigned or a negative num-
|
||||||
ber. Any value other than zero is used as a return from pcre2test's
|
ber. This is set as the "user data" that is passed to the matching
|
||||||
callout function.
|
function, and passed back when the callout function is invoked. Any
|
||||||
|
value other than zero is used as a return from pcre2test's callout
|
||||||
|
function.
|
||||||
|
|
||||||
Finding all matches in a string
|
Finding all matches in a string
|
||||||
|
|
||||||
|
@ -1136,22 +1153,37 @@ RESTARTING AFTER A PARTIAL MATCH
|
||||||
CALLOUTS
|
CALLOUTS
|
||||||
|
|
||||||
If the pattern contains any callout requests, pcre2test's callout func-
|
If the pattern contains any callout requests, pcre2test's callout func-
|
||||||
tion is called during matching. This works with both matching func-
|
tion is called during matching unless callout_none is specified. This
|
||||||
tions. By default, the called function displays the callout number, the
|
works with both matching functions.
|
||||||
start and current positions in the text at the callout time, and the
|
|
||||||
|
The callout function in pcre2test returns zero (carry on matching) by
|
||||||
|
default, but you can use a callout_fail modifier in a subject line (as
|
||||||
|
described above) to change this and other parameters of the callout.
|
||||||
|
|
||||||
|
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||||
|
cated regular expressions. For further information about callouts, see
|
||||||
|
the pcre2callout documentation.
|
||||||
|
|
||||||
|
The output for callouts with numerical arguments and those with string
|
||||||
|
arguments is slightly different.
|
||||||
|
|
||||||
|
Callouts with numerical arguments
|
||||||
|
|
||||||
|
By default, the callout function displays the callout number, the start
|
||||||
|
and current positions in the subject text at the callout time, and the
|
||||||
next pattern item to be tested. For example:
|
next pattern item to be tested. For example:
|
||||||
|
|
||||||
--->pqrabcdef
|
--->pqrabcdef
|
||||||
0 ^ ^ \d
|
0 ^ ^ \d
|
||||||
|
|
||||||
This output indicates that callout number 0 occurred for a match
|
This output indicates that callout number 0 occurred for a match
|
||||||
attempt starting at the fourth character of the subject string, when
|
attempt starting at the fourth character of the subject string, when
|
||||||
the pointer was at the seventh character, and when the next pattern
|
the pointer was at the seventh character, and when the next pattern
|
||||||
item was \d. Just one circumflex is output if the start and current
|
item was \d. Just one circumflex is output if the start and current
|
||||||
positions are the same.
|
positions are the same.
|
||||||
|
|
||||||
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
||||||
a result of the /auto_callout pattern modifier. In this case, instead
|
a result of the /auto_callout pattern modifier. In this case, instead
|
||||||
of showing the callout number, the offset in the pattern, preceded by a
|
of showing the callout number, the offset in the pattern, preceded by a
|
||||||
plus, is output. For example:
|
plus, is output. For example:
|
||||||
|
|
||||||
|
@ -1165,7 +1197,7 @@ CALLOUTS
|
||||||
0: E*
|
0: E*
|
||||||
|
|
||||||
If a pattern contains (*MARK) items, an additional line is output when-
|
If a pattern contains (*MARK) items, an additional line is output when-
|
||||||
ever a change of latest mark is passed to the callout function. For
|
ever a change of latest mark is passed to the callout function. For
|
||||||
example:
|
example:
|
||||||
|
|
||||||
re> /a(*MARK:X)bc/auto_callout
|
re> /a(*MARK:X)bc/auto_callout
|
||||||
|
@ -1179,76 +1211,86 @@ CALLOUTS
|
||||||
+12 ^ ^
|
+12 ^ ^
|
||||||
0: abc
|
0: abc
|
||||||
|
|
||||||
The mark changes between matching "a" and "b", but stays the same for
|
The mark changes between matching "a" and "b", but stays the same for
|
||||||
the rest of the match, so nothing more is output. If, as a result of
|
the rest of the match, so nothing more is output. If, as a result of
|
||||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||||
output.
|
output.
|
||||||
|
|
||||||
The callout function in pcre2test returns zero (carry on matching) by
|
Callouts with string arguments
|
||||||
default, but you can use a callout_fail modifier in a subject line (as
|
|
||||||
described above) to change this and other parameters of the callout.
|
|
||||||
|
|
||||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
The output for a callout with a string argument is similar, except that
|
||||||
cated regular expressions. For further information about callouts, see
|
instead of outputting a callout number before the position indicators,
|
||||||
the pcre2callout documentation.
|
the callout string and its offset in the pattern string are output
|
||||||
|
before the reflection of the subject string, and the subject string is
|
||||||
|
reflected for each callout. For example:
|
||||||
|
|
||||||
|
re> /^ab(?C'first')cd(?C"second")ef/
|
||||||
|
data> abcdefg
|
||||||
|
Callout (7): 'first'
|
||||||
|
--->abcdefg
|
||||||
|
^ ^ c
|
||||||
|
Callout (20): "second"
|
||||||
|
--->abcdefg
|
||||||
|
^ ^ e
|
||||||
|
0: abcdef
|
||||||
|
|
||||||
|
|
||||||
NON-PRINTING CHARACTERS
|
NON-PRINTING CHARACTERS
|
||||||
|
|
||||||
When pcre2test is outputting text in the compiled version of a pattern,
|
When pcre2test is outputting text in the compiled version of a pattern,
|
||||||
bytes other than 32-126 are always treated as non-printing characters
|
bytes other than 32-126 are always treated as non-printing characters
|
||||||
and are therefore shown as hex escapes.
|
and are therefore shown as hex escapes.
|
||||||
|
|
||||||
When pcre2test is outputting text that is a matched part of a subject
|
When pcre2test is outputting text that is a matched part of a subject
|
||||||
string, it behaves in the same way, unless a different locale has been
|
string, it behaves in the same way, unless a different locale has been
|
||||||
set for the pattern (using the /locale modifier). In this case, the
|
set for the pattern (using the /locale modifier). In this case, the
|
||||||
isprint() function is used to distinguish printing and non-printing
|
isprint() function is used to distinguish printing and non-printing
|
||||||
characters.
|
characters.
|
||||||
|
|
||||||
|
|
||||||
SAVING AND RESTORING COMPILED PATTERNS
|
SAVING AND RESTORING COMPILED PATTERNS
|
||||||
|
|
||||||
It is possible to save compiled patterns on disc or elsewhere, and
|
It is possible to save compiled patterns on disc or elsewhere, and
|
||||||
reload them later, subject to a number of restrictions. JIT data cannot
|
reload them later, subject to a number of restrictions. JIT data cannot
|
||||||
be saved. The host on which the patterns are reloaded must be running
|
be saved. The host on which the patterns are reloaded must be running
|
||||||
the same version of PCRE2, with the same code unit width, and must also
|
the same version of PCRE2, with the same code unit width, and must also
|
||||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||||
compiled patterns can be saved they must be serialized, that is, con-
|
compiled patterns can be saved they must be serialized, that is, con-
|
||||||
verted to a stream of bytes. A single byte stream may contain any num-
|
verted to a stream of bytes. A single byte stream may contain any num-
|
||||||
ber of compiled patterns, but they must all use the same character
|
ber of compiled patterns, but they must all use the same character
|
||||||
tables. A single copy of the tables is included in the byte stream (its
|
tables. A single copy of the tables is included in the byte stream (its
|
||||||
size is 1088 bytes).
|
size is 1088 bytes).
|
||||||
|
|
||||||
The functions whose names begin with pcre2_serialize_ are used for
|
The functions whose names begin with pcre2_serialize_ are used for
|
||||||
serializing and de-serializing. They are described in the pcre2serial-
|
serializing and de-serializing. They are described in the pcre2serial-
|
||||||
ize documentation. In this section we describe the features of
|
ize documentation. In this section we describe the features of
|
||||||
pcre2test that can be used to test these functions.
|
pcre2test that can be used to test these functions.
|
||||||
|
|
||||||
When a pattern with push modifier is successfully compiled, it is
|
When a pattern with push modifier is successfully compiled, it is
|
||||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||||
next line to contain a new pattern (or command) instead of a subject
|
next line to contain a new pattern (or command) instead of a subject
|
||||||
line. By this means, a number of patterns can be compiled and retained.
|
line. By this means, a number of patterns can be compiled and retained.
|
||||||
The push modifier is incompatible with posix, and control modifiers
|
The push modifier is incompatible with posix, and control modifiers
|
||||||
that act at match time are ignored (with a message). The jitverify mod-
|
that act at match time are ignored (with a message). The jitverify mod-
|
||||||
ifier applies only at compile time. The command
|
ifier applies only at compile time. The command
|
||||||
|
|
||||||
#save <filename>
|
#save <filename>
|
||||||
|
|
||||||
causes all the stacked patterns to be serialized and the result written
|
causes all the stacked patterns to be serialized and the result written
|
||||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||||
command
|
command
|
||||||
|
|
||||||
#load <filename>
|
#load <filename>
|
||||||
|
|
||||||
reads the data in the file, and then arranges for it to be de-serial-
|
reads the data in the file, and then arranges for it to be de-serial-
|
||||||
ized, with the resulting compiled patterns added to the pattern stack.
|
ized, with the resulting compiled patterns added to the pattern stack.
|
||||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||||
mand, which must be followed by lines of subjects that are to be
|
mand, which must be followed by lines of subjects that are to be
|
||||||
matched with the pattern, terminated as usual by an empty line or end
|
matched with the pattern, terminated as usual by an empty line or end
|
||||||
of file. This command may be followed by a modifier list containing
|
of file. This command may be followed by a modifier list containing
|
||||||
only control modifiers that act after a pattern has been compiled. In
|
only control modifiers that act after a pattern has been compiled. In
|
||||||
particular, hex, posix, and push are not allowed, nor are any option-
|
particular, hex, posix, and push are not allowed, nor are any option-
|
||||||
setting modifiers. The JIT modifiers are, however, permitted. Here is
|
setting modifiers. The JIT modifiers are, however, permitted. Here is
|
||||||
an example that saves and reloads two patterns.
|
an example that saves and reloads two patterns.
|
||||||
|
|
||||||
/abc/push
|
/abc/push
|
||||||
|
@ -1261,7 +1303,7 @@ SAVING AND RESTORING COMPILED PATTERNS
|
||||||
#pop jit,bincode
|
#pop jit,bincode
|
||||||
abc
|
abc
|
||||||
|
|
||||||
If jitverify is used with #pop, it does not automatically imply jit,
|
If jitverify is used with #pop, it does not automatically imply jit,
|
||||||
which is different behaviour from when it is used on a pattern.
|
which is different behaviour from when it is used on a pattern.
|
||||||
|
|
||||||
|
|
||||||
|
@ -1280,5 +1322,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 23 January 2015
|
Last updated: 22 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
|
|
|
@ -342,7 +342,19 @@ typedef struct pcre2_callout_block { \
|
||||||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||||
/* ------------------------------------------------------------------ */ \
|
/* ------------------------------------------------------------------ */ \
|
||||||
} pcre2_callout_block;
|
} pcre2_callout_block; \
|
||||||
|
\
|
||||||
|
typedef struct pcre2_callout_enumerate_block { \
|
||||||
|
uint32_t version; /* Identifies version of block */ \
|
||||||
|
/* ------------------------ Version 0 ------------------------------- */ \
|
||||||
|
PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \
|
||||||
|
PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \
|
||||||
|
uint32_t callout_number; /* Number compiled into pattern */ \
|
||||||
|
PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \
|
||||||
|
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||||
|
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||||
|
/* ------------------------------------------------------------------ */ \
|
||||||
|
} pcre2_callout_enumerate_block;
|
||||||
|
|
||||||
|
|
||||||
/* List the generic forms of all other functions in macros, which will be
|
/* List the generic forms of all other functions in macros, which will be
|
||||||
|
@ -410,6 +422,9 @@ PCRE2_EXP_DECL void pcre2_code_free(pcre2_code *);
|
||||||
|
|
||||||
#define PCRE2_PATTERN_INFO_FUNCTIONS \
|
#define PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||||
PCRE2_EXP_DECL int pcre2_pattern_info(const pcre2_code *, uint32_t, \
|
PCRE2_EXP_DECL int pcre2_pattern_info(const pcre2_code *, uint32_t, \
|
||||||
|
void *); \
|
||||||
|
PCRE2_EXP_DECL int pcre2_callout_enumerate(const pcre2_code *, \
|
||||||
|
int (*)(pcre2_callout_enumerate_block *, void *), \
|
||||||
void *);
|
void *);
|
||||||
|
|
||||||
|
|
||||||
|
@ -538,15 +553,17 @@ pcre2_compile are called by application code. */
|
||||||
|
|
||||||
/* Data blocks */
|
/* Data blocks */
|
||||||
|
|
||||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
||||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||||
#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_)
|
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||||
#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_)
|
#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_)
|
||||||
|
#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_)
|
||||||
|
|
||||||
|
|
||||||
/* Functions: the complete list in alphabetical order */
|
/* Functions: the complete list in alphabetical order */
|
||||||
|
|
||||||
|
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||||
|
@ -554,7 +571,6 @@ pcre2_compile are called by application code. */
|
||||||
#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_)
|
#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_)
|
||||||
#define pcre2_config PCRE2_SUFFIX(pcre2_config_)
|
#define pcre2_config PCRE2_SUFFIX(pcre2_config_)
|
||||||
#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_)
|
#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_)
|
||||||
#define pcre2_match PCRE2_SUFFIX(pcre2_match_)
|
|
||||||
#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_)
|
#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_)
|
||||||
#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_)
|
#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_)
|
||||||
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
|
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
|
||||||
|
@ -570,6 +586,7 @@ pcre2_compile are called by application code. */
|
||||||
#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_)
|
#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_)
|
||||||
#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_)
|
#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_)
|
||||||
#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_)
|
#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_)
|
||||||
|
#define pcre2_match PCRE2_SUFFIX(pcre2_match_)
|
||||||
#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_)
|
#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_)
|
||||||
#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_)
|
#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_)
|
||||||
#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_)
|
#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_)
|
||||||
|
|
|
@ -225,4 +225,181 @@ switch(what)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Callout enumerator *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
Arguments:
|
||||||
|
code points to compiled code
|
||||||
|
callback function called for each callout block
|
||||||
|
callout_data user data passed to the callback
|
||||||
|
|
||||||
|
Returns: 0 when successfully completed
|
||||||
|
< 0 on local error
|
||||||
|
!= 0 for callback error
|
||||||
|
*/
|
||||||
|
|
||||||
|
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||||
|
pcre2_callout_enumerate(const pcre2_code *code,
|
||||||
|
int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data)
|
||||||
|
{
|
||||||
|
pcre2_real_code *re = (pcre2_real_code *)code;
|
||||||
|
pcre2_callout_enumerate_block cb;
|
||||||
|
PCRE2_SPTR cc;
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (re == NULL) return PCRE2_ERROR_NULL;
|
||||||
|
|
||||||
|
/* Check that the first field in the block is the magic number. If it is not,
|
||||||
|
return with PCRE2_ERROR_BADMAGIC. */
|
||||||
|
|
||||||
|
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
|
||||||
|
|
||||||
|
/* Check that this pattern was compiled in the correct bit mode */
|
||||||
|
|
||||||
|
if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE;
|
||||||
|
|
||||||
|
cb.version = 0;
|
||||||
|
cc = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code))
|
||||||
|
+ re->name_count * re->name_entry_size;
|
||||||
|
|
||||||
|
while (TRUE)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
switch (*cc)
|
||||||
|
{
|
||||||
|
case OP_END:
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
case OP_CHAR:
|
||||||
|
case OP_CHARI:
|
||||||
|
case OP_NOT:
|
||||||
|
case OP_NOTI:
|
||||||
|
case OP_STAR:
|
||||||
|
case OP_MINSTAR:
|
||||||
|
case OP_PLUS:
|
||||||
|
case OP_MINPLUS:
|
||||||
|
case OP_QUERY:
|
||||||
|
case OP_MINQUERY:
|
||||||
|
case OP_UPTO:
|
||||||
|
case OP_MINUPTO:
|
||||||
|
case OP_EXACT:
|
||||||
|
case OP_POSSTAR:
|
||||||
|
case OP_POSPLUS:
|
||||||
|
case OP_POSQUERY:
|
||||||
|
case OP_POSUPTO:
|
||||||
|
case OP_STARI:
|
||||||
|
case OP_MINSTARI:
|
||||||
|
case OP_PLUSI:
|
||||||
|
case OP_MINPLUSI:
|
||||||
|
case OP_QUERYI:
|
||||||
|
case OP_MINQUERYI:
|
||||||
|
case OP_UPTOI:
|
||||||
|
case OP_MINUPTOI:
|
||||||
|
case OP_EXACTI:
|
||||||
|
case OP_POSSTARI:
|
||||||
|
case OP_POSPLUSI:
|
||||||
|
case OP_POSQUERYI:
|
||||||
|
case OP_POSUPTOI:
|
||||||
|
case OP_NOTSTAR:
|
||||||
|
case OP_NOTMINSTAR:
|
||||||
|
case OP_NOTPLUS:
|
||||||
|
case OP_NOTMINPLUS:
|
||||||
|
case OP_NOTQUERY:
|
||||||
|
case OP_NOTMINQUERY:
|
||||||
|
case OP_NOTUPTO:
|
||||||
|
case OP_NOTMINUPTO:
|
||||||
|
case OP_NOTEXACT:
|
||||||
|
case OP_NOTPOSSTAR:
|
||||||
|
case OP_NOTPOSPLUS:
|
||||||
|
case OP_NOTPOSQUERY:
|
||||||
|
case OP_NOTPOSUPTO:
|
||||||
|
case OP_NOTSTARI:
|
||||||
|
case OP_NOTMINSTARI:
|
||||||
|
case OP_NOTPLUSI:
|
||||||
|
case OP_NOTMINPLUSI:
|
||||||
|
case OP_NOTQUERYI:
|
||||||
|
case OP_NOTMINQUERYI:
|
||||||
|
case OP_NOTUPTOI:
|
||||||
|
case OP_NOTMINUPTOI:
|
||||||
|
case OP_NOTEXACTI:
|
||||||
|
case OP_NOTPOSSTARI:
|
||||||
|
case OP_NOTPOSPLUSI:
|
||||||
|
case OP_NOTPOSQUERYI:
|
||||||
|
case OP_NOTPOSUPTOI:
|
||||||
|
cc += PRIV(OP_lengths)[*cc];
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OP_TYPESTAR:
|
||||||
|
case OP_TYPEMINSTAR:
|
||||||
|
case OP_TYPEPLUS:
|
||||||
|
case OP_TYPEMINPLUS:
|
||||||
|
case OP_TYPEQUERY:
|
||||||
|
case OP_TYPEMINQUERY:
|
||||||
|
case OP_TYPEUPTO:
|
||||||
|
case OP_TYPEMINUPTO:
|
||||||
|
case OP_TYPEEXACT:
|
||||||
|
case OP_TYPEPOSSTAR:
|
||||||
|
case OP_TYPEPOSPLUS:
|
||||||
|
case OP_TYPEPOSQUERY:
|
||||||
|
case OP_TYPEPOSUPTO:
|
||||||
|
cc += PRIV(OP_lengths)[*cc];
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (cc[-1] == OP_PROP || cc[-1] == OP_NOTPROP) cc += 2;
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
|
||||||
|
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
||||||
|
case OP_XCLASS:
|
||||||
|
cc += GET(cc, 1);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
case OP_MARK:
|
||||||
|
case OP_PRUNE_ARG:
|
||||||
|
case OP_SKIP_ARG:
|
||||||
|
case OP_THEN_ARG:
|
||||||
|
cc += PRIV(OP_lengths)[*cc] + cc[1];
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OP_CALLOUT:
|
||||||
|
cb.pattern_position = GET(cc, 1);
|
||||||
|
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||||
|
cb.callout_number = cc[1 + 2*LINK_SIZE];
|
||||||
|
cb.callout_string_offset = 0;
|
||||||
|
cb.callout_string_length = 0;
|
||||||
|
cb.callout_string = NULL;
|
||||||
|
rc = callback(&cb, callout_data);
|
||||||
|
if (rc != 0) return rc;
|
||||||
|
cc += PRIV(OP_lengths)[*cc];
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OP_CALLOUT_STR:
|
||||||
|
cb.pattern_position = GET(cc, 1);
|
||||||
|
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||||
|
cb.callout_number = 0;
|
||||||
|
cb.callout_string_offset = GET(cc, 1 + 3*LINK_SIZE);
|
||||||
|
cb.callout_string_length =
|
||||||
|
GET(cc, 1 + 2*LINK_SIZE) - (1 + 4*LINK_SIZE) - 2;
|
||||||
|
cb.callout_string = cc + (1 + 4*LINK_SIZE) + 1;
|
||||||
|
rc = callback(&cb, callout_data);
|
||||||
|
if (rc != 0) return rc;
|
||||||
|
cc += GET(cc, 1 + 2*LINK_SIZE);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
cc += PRIV(OP_lengths)[*cc];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* End of pcre2_pattern_info.c */
|
/* End of pcre2_pattern_info.c */
|
||||||
|
|
141
src/pcre2test.c
141
src/pcre2test.c
|
@ -382,28 +382,29 @@ either on a pattern or a data line, so they must all be distinct. */
|
||||||
#define CTL_ALTGLOBAL 0x00000010u
|
#define CTL_ALTGLOBAL 0x00000010u
|
||||||
#define CTL_BINCODE 0x00000020u
|
#define CTL_BINCODE 0x00000020u
|
||||||
#define CTL_CALLOUT_CAPTURE 0x00000040u
|
#define CTL_CALLOUT_CAPTURE 0x00000040u
|
||||||
#define CTL_CALLOUT_NONE 0x00000080u
|
#define CTL_CALLOUT_INFO 0x00000080u
|
||||||
#define CTL_DFA 0x00000100u
|
#define CTL_CALLOUT_NONE 0x00000100u
|
||||||
#define CTL_FINDLIMITS 0x00000200u
|
#define CTL_DFA 0x00000200u
|
||||||
#define CTL_FULLBINCODE 0x00000400u
|
#define CTL_FINDLIMITS 0x00000400u
|
||||||
#define CTL_GETALL 0x00000800u
|
#define CTL_FULLBINCODE 0x00000800u
|
||||||
#define CTL_GLOBAL 0x00001000u
|
#define CTL_GETALL 0x00001000u
|
||||||
#define CTL_HEXPAT 0x00002000u
|
#define CTL_GLOBAL 0x00002000u
|
||||||
#define CTL_INFO 0x00004000u
|
#define CTL_HEXPAT 0x00004000u
|
||||||
#define CTL_JITFAST 0x00008000u
|
#define CTL_INFO 0x00008000u
|
||||||
#define CTL_JITVERIFY 0x00010000u
|
#define CTL_JITFAST 0x00010000u
|
||||||
#define CTL_MARK 0x00020000u
|
#define CTL_JITVERIFY 0x00020000u
|
||||||
#define CTL_MEMORY 0x00040000u
|
#define CTL_MARK 0x00040000u
|
||||||
#define CTL_POSIX 0x00080000u
|
#define CTL_MEMORY 0x00080000u
|
||||||
#define CTL_PUSH 0x00100000u
|
#define CTL_POSIX 0x00100000u
|
||||||
#define CTL_STARTCHAR 0x00200000u
|
#define CTL_PUSH 0x00200000u
|
||||||
#define CTL_ZERO_TERMINATE 0x00400000u
|
#define CTL_STARTCHAR 0x00400000u
|
||||||
|
#define CTL_ZERO_TERMINATE 0x00800000u
|
||||||
|
|
||||||
#define CTL_BSR_SET 0x80000000u /* This is informational */
|
#define CTL_BSR_SET 0x80000000u /* This is informational */
|
||||||
#define CTL_NL_SET 0x40000000u /* This is informational */
|
#define CTL_NL_SET 0x40000000u /* This is informational */
|
||||||
|
|
||||||
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
||||||
#define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE) /* For testing */
|
#define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE|CTL_CALLOUT_INFO)
|
||||||
#define CTL_ANYGLOB (CTL_ALTGLOBAL|CTL_GLOBAL)
|
#define CTL_ANYGLOB (CTL_ALTGLOBAL|CTL_GLOBAL)
|
||||||
|
|
||||||
/* These are all the controls that may be set either on a pattern or on a
|
/* These are all the controls that may be set either on a pattern or on a
|
||||||
|
@ -431,7 +432,7 @@ typedef struct patctl { /* Structure for pattern modifiers. */
|
||||||
uint32_t jit;
|
uint32_t jit;
|
||||||
uint32_t stackguard_test;
|
uint32_t stackguard_test;
|
||||||
uint32_t tables_id;
|
uint32_t tables_id;
|
||||||
uint8_t locale[LOCALESIZE];
|
uint8_t locale[LOCALESIZE];
|
||||||
} patctl;
|
} patctl;
|
||||||
|
|
||||||
#define MAXCPYGET 10
|
#define MAXCPYGET 10
|
||||||
|
@ -494,6 +495,7 @@ static modstruct modlist[] = {
|
||||||
{ "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) },
|
{ "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) },
|
||||||
{ "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) },
|
{ "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) },
|
||||||
{ "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) },
|
{ "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) },
|
||||||
|
{ "callout_info", MOD_PAT, MOD_CTL, CTL_CALLOUT_INFO, PO(control) },
|
||||||
{ "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) },
|
{ "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) },
|
||||||
{ "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) },
|
{ "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) },
|
||||||
{ "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) },
|
{ "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) },
|
||||||
|
@ -578,8 +580,8 @@ static modstruct modlist[] = {
|
||||||
/* Control bits that are not ignored with 'push'. */
|
/* Control bits that are not ignored with 'push'. */
|
||||||
|
|
||||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
||||||
CTL_BINCODE|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO|CTL_JITVERIFY| \
|
CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
|
||||||
CTL_MEMORY|CTL_PUSH|CTL_BSR_SET|CTL_NL_SET)
|
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_BSR_SET|CTL_NL_SET)
|
||||||
|
|
||||||
/* Controls that apply only at compile time with 'push'. */
|
/* Controls that apply only at compile time with 'push'. */
|
||||||
|
|
||||||
|
@ -841,6 +843,17 @@ are supported. */
|
||||||
else \
|
else \
|
||||||
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||||
|
|
||||||
|
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||||
|
if (test_mode == PCRE8_MODE) \
|
||||||
|
a = pcre2_callout_enumerate_8(compiled_code8, \
|
||||||
|
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c); \
|
||||||
|
else if (test_mode == PCRE16_MODE) \
|
||||||
|
a = pcre2_callout_enumerate_16(compiled_code16, \
|
||||||
|
(int(*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c); \
|
||||||
|
else \
|
||||||
|
a = pcre2_callout_enumerate_32(compiled_code32, \
|
||||||
|
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||||
|
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
if (test_mode == PCRE8_MODE) \
|
if (test_mode == PCRE8_MODE) \
|
||||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8)); \
|
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8)); \
|
||||||
|
@ -1268,6 +1281,14 @@ the three different cases. */
|
||||||
else \
|
else \
|
||||||
(void)G(pchars,BITTWO)((G(PCRE2_SPTR,BITTWO))(p)+offset, len, utf, f)
|
(void)G(pchars,BITTWO)((G(PCRE2_SPTR,BITTWO))(p)+offset, len, utf, f)
|
||||||
|
|
||||||
|
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||||
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
|
a = G(pcre2_callout_enumerate,BITONE)(G(compiled_code,BITONE), \
|
||||||
|
(int (*)(struct G(pcre2_callout_enumerate_block_,BITONE) *, void *))b,c); \
|
||||||
|
else \
|
||||||
|
a = G(pcre2_callout_enumerate,BITTWO)(G(compiled_code,BITTWO), \
|
||||||
|
(int (*)(struct G(pcre2_callout_enumerate_block_,BITTWO) *, void *))b,c)
|
||||||
|
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,G(g,BITONE)); \
|
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,G(g,BITONE)); \
|
||||||
|
@ -1588,6 +1609,9 @@ the three different cases. */
|
||||||
lv = pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
lv = pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||||
#define PCHARSV(p, offset, len, utf, f) \
|
#define PCHARSV(p, offset, len, utf, f) \
|
||||||
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||||
|
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||||
|
a = pcre2_callout_enumerate_8(compiled_code8, \
|
||||||
|
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c)
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8))
|
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8))
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -1676,6 +1700,9 @@ the three different cases. */
|
||||||
lv = pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
lv = pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
||||||
#define PCHARSV(p, offset, len, utf, f) \
|
#define PCHARSV(p, offset, len, utf, f) \
|
||||||
(void)pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
(void)pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
||||||
|
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||||
|
a = pcre2_callout_enumerate_16(compiled_code16, \
|
||||||
|
(int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c)
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16))
|
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16))
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -1764,6 +1791,9 @@ the three different cases. */
|
||||||
lv = pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
lv = pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
||||||
#define PCHARSV(p, offset, len, utf, f) \
|
#define PCHARSV(p, offset, len, utf, f) \
|
||||||
(void)pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
(void)pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
||||||
|
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||||
|
a = pcre2_callout_enumerate_32(compiled_code32, \
|
||||||
|
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32))
|
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32))
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -3381,7 +3411,7 @@ Returns: nothing
|
||||||
static void
|
static void
|
||||||
show_controls(uint32_t controls, const char *before)
|
show_controls(uint32_t controls, const char *before)
|
||||||
{
|
{
|
||||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
before,
|
before,
|
||||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||||
|
@ -3390,6 +3420,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
||||||
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
||||||
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
||||||
|
((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "",
|
||||||
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
||||||
((controls & CTL_DFA) != 0)? " dfa" : "",
|
((controls & CTL_DFA) != 0)? " dfa" : "",
|
||||||
((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "",
|
((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "",
|
||||||
|
@ -3517,6 +3548,56 @@ if (pat_patctl.jit != 0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Callback function for callout enumeration *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* The only differences in the callout emumeration block for different code
|
||||||
|
unit widths are that the pointers to the subject, the most recent MARK, and a
|
||||||
|
callout argument string point to strings of the appropriate width. Casts can be
|
||||||
|
used to deal with this.
|
||||||
|
|
||||||
|
Argument:
|
||||||
|
cb pointer to enumerate block
|
||||||
|
callout_data user data
|
||||||
|
|
||||||
|
Returns: 0
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int callout_callback(pcre2_callout_enumerate_block_8 *cb,
|
||||||
|
void *callout_data)
|
||||||
|
{
|
||||||
|
uint32_t i;
|
||||||
|
BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
|
||||||
|
|
||||||
|
(void)callout_data; /* Not currently displayed */
|
||||||
|
|
||||||
|
fprintf(outfile, "Callout ");
|
||||||
|
if (cb->callout_string != NULL)
|
||||||
|
{
|
||||||
|
uint32_t delimiter = CODE_UNIT(cb->callout_string, -1);
|
||||||
|
fprintf(outfile, "%c", delimiter);
|
||||||
|
PCHARSV(cb->callout_string, 0,
|
||||||
|
cb->callout_string_length, utf, outfile);
|
||||||
|
for (i = 0; callout_start_delims[i] != 0; i++)
|
||||||
|
if (delimiter == callout_start_delims[i])
|
||||||
|
{
|
||||||
|
delimiter = callout_end_delims[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
fprintf(outfile, "%c ", delimiter);
|
||||||
|
}
|
||||||
|
else fprintf(outfile, "%d ", cb->callout_number);
|
||||||
|
|
||||||
|
fprintf(outfile, "%.*s\n",
|
||||||
|
(int)((cb->next_item_length == 0)? 1 : cb->next_item_length),
|
||||||
|
pbuffer8 + cb->pattern_position);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Show information about a pattern *
|
* Show information about a pattern *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
@ -3789,6 +3870,24 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((pat_patctl.control & CTL_CALLOUT_INFO) != 0)
|
||||||
|
{
|
||||||
|
int errorcode;
|
||||||
|
PCRE2_CALLOUT_ENUMERATE(errorcode, callout_callback, 0);
|
||||||
|
if (errorcode != 0)
|
||||||
|
{
|
||||||
|
int len;
|
||||||
|
fprintf(outfile, "Callout enumerate failed: error %d: ", errorcode);
|
||||||
|
if (errorcode < 0)
|
||||||
|
{
|
||||||
|
PCRE2_GET_ERROR_MESSAGE(len, errorcode, pbuffer);
|
||||||
|
PCHARSV(CASTVAR(void *, pbuffer), 0, len, FALSE, outfile);
|
||||||
|
}
|
||||||
|
fprintf(outfile, "\n");
|
||||||
|
return PR_SKIP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return PR_OK;
|
return PR_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4206,11 +4206,11 @@ a random value. /Ix
|
||||||
/^a(b)c(?C{AB})def/B
|
/^a(b)c(?C{AB})def/B
|
||||||
abcdef\=callout_capture
|
abcdef\=callout_capture
|
||||||
|
|
||||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B
|
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B,callout_info
|
||||||
|
|
||||||
/(?:a(?C`code`)){3}/B
|
/(?:a(?C`code`)){3}/B
|
||||||
|
|
||||||
/^(?(?C25)(?=abc)abcd|xyz)/B
|
/^(?(?C25)(?=abc)abcd|xyz)/B,callout_info
|
||||||
abcdefg
|
abcdefg
|
||||||
xyz123
|
xyz123
|
||||||
|
|
||||||
|
@ -4226,7 +4226,7 @@ a random value. /Ix
|
||||||
|
|
||||||
# Binary zero in callout string
|
# Binary zero in callout string
|
||||||
# a ( ? C ' x z ' ) b
|
# a ( ? C ' x z ' ) b
|
||||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex,callout_info
|
||||||
abcdefgh
|
abcdefgh
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -14060,7 +14060,7 @@ Callout (10): {AB} last capture = 1
|
||||||
0: abcdef
|
0: abcdef
|
||||||
1: b
|
1: b
|
||||||
|
|
||||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B
|
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B,callout_info
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
CalloutStr `a`b` 4 10 0
|
CalloutStr `a`b` 4 10 0
|
||||||
|
@ -14074,6 +14074,14 @@ Callout (10): {AB} last capture = 1
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
Callout `a`b` (
|
||||||
|
Callout 'a'b' (
|
||||||
|
Callout "a"b" (
|
||||||
|
Callout ^a^b^ (
|
||||||
|
Callout %a%b% (
|
||||||
|
Callout #a#b# (
|
||||||
|
Callout $a$b$ (
|
||||||
|
Callout {a}b}
|
||||||
|
|
||||||
/(?:a(?C`code`)){3}/B
|
/(?:a(?C`code`)){3}/B
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
@ -14094,7 +14102,7 @@ Callout (10): {AB} last capture = 1
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
/^(?(?C25)(?=abc)abcd|xyz)/B
|
/^(?(?C25)(?=abc)abcd|xyz)/B,callout_info
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
Bra
|
Bra
|
||||||
^
|
^
|
||||||
|
@ -14110,6 +14118,7 @@ Callout (10): {AB} last capture = 1
|
||||||
Ket
|
Ket
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
Callout 25 (?=abc)
|
||||||
abcdefg
|
abcdefg
|
||||||
--->abcdefg
|
--->abcdefg
|
||||||
25 ^ (?=abc)
|
25 ^ (?=abc)
|
||||||
|
@ -14171,7 +14180,8 @@ Callout (8): `code`
|
||||||
|
|
||||||
# Binary zero in callout string
|
# Binary zero in callout string
|
||||||
# a ( ? C ' x z ' ) b
|
# a ( ? C ' x z ' ) b
|
||||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex,callout_info
|
||||||
|
Callout 'x\x00z' b
|
||||||
abcdefgh
|
abcdefgh
|
||||||
Callout (5): 'x\x00z'
|
Callout (5): 'x\x00z'
|
||||||
--->abcdefgh
|
--->abcdefgh
|
||||||
|
|
Loading…
Reference in New Issue