Implement pcre2_callout_enumerate().
This commit is contained in:
parent
b15698b077
commit
4e61019ffe
|
@ -10,7 +10,9 @@ Version 10.20 xx-xx-2015
|
|||
|
||||
3. The invalid pattern (?(?C) has a missing assertion condition at the end. The
|
||||
pcre2_compile() function read past the end of the input before diagnosing an
|
||||
error.
|
||||
error. This bug was discovered by the LLVM fuzzer.
|
||||
|
||||
4. Implemented pcre2_callout_enumerate().
|
||||
|
||||
|
||||
Version 10.10 06-March-2015
|
||||
|
|
|
@ -24,6 +24,7 @@ dist_html_DATA = \
|
|||
doc/html/index.html \
|
||||
doc/html/pcre2-config.html \
|
||||
doc/html/pcre2.html \
|
||||
doc/html/pcre2_callout_enumerate.html \
|
||||
doc/html/pcre2_code_free.html \
|
||||
doc/html/pcre2_compile.html \
|
||||
doc/html/pcre2_compile_context_copy.html \
|
||||
|
@ -102,6 +103,7 @@ dist_html_DATA = \
|
|||
dist_man_MANS = \
|
||||
doc/pcre2-config.1 \
|
||||
doc/pcre2.3 \
|
||||
doc/pcre2_callout_enumerate.3 \
|
||||
doc/pcre2_code_free.3 \
|
||||
doc/pcre2_compile.3 \
|
||||
doc/pcre2_compile_context_copy.3 \
|
||||
|
|
|
@ -88,6 +88,9 @@ in the library.
|
|||
|
||||
<table>
|
||||
|
||||
<tr><td><a href="pcre2_callout_enumerate.html">pcre2_callout_enumerate</a></td>
|
||||
<td> Enumerate callouts in a compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||
<td> Free a compiled pattern</td></tr>
|
||||
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_callout_enumerate specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_callout_enumerate man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include &lt;pcre2.h&gt;</b>
|
||||
</P>
|
||||
<P>
|
||||
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function scans a compiled regular expression and calls the <i>callback()</i>
|
||||
function for each callout within the pattern. The yield of the function is zero
|
||||
for success and non-zero otherwise. The arguments are:
|
||||
<pre>
|
||||
<i>code</i> Points to the compiled pattern
|
||||
<i>callback</i> The callback function
|
||||
<i>callout_data</i> User data that is passed to the callback
|
||||
</pre>
|
||||
The <i>callback()</i> function is passed a pointer to a data block containing
|
||||
the following fields:
|
||||
<pre>
|
||||
<i>version</i> Block version number
|
||||
<i>pattern_position</i> Offset to next item in pattern
|
||||
<i>next_item_length</i> Length of next item in pattern
|
||||
<i>callout_number</i> Number for numbered callouts
|
||||
<i>callout_string_offset</i> Offset to string within pattern
|
||||
<i>callout_string_length</i> Length of callout string
|
||||
<i>callout_string</i> Points to callout string or is NULL
|
||||
</pre>
|
||||
The second argument is the callout data that was passed to
|
||||
<b>pcre2_callout_enumerate()</b>. The <b>callback()</b> function must return zero
|
||||
for success. Any other value causes the pattern scan to stop, with the value
|
||||
being passed back as the result of <b>pcre2_callout_enumerate()</b>.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -35,23 +35,24 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC20" href="#SEC20">JUST-IN-TIME (JIT) COMPILATION</a>
|
||||
<li><a name="TOC21" href="#SEC21">LOCALE SUPPORT</a>
|
||||
<li><a name="TOC22" href="#SEC22">INFORMATION ABOUT A COMPILED PATTERN</a>
|
||||
<li><a name="TOC23" href="#SEC23">SERIALIZATION AND PRECOMPILING</a>
|
||||
<li><a name="TOC24" href="#SEC24">THE MATCH DATA BLOCK</a>
|
||||
<li><a name="TOC25" href="#SEC25">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
||||
<li><a name="TOC26" href="#SEC26">NEWLINE HANDLING WHEN MATCHING</a>
|
||||
<li><a name="TOC27" href="#SEC27">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC28" href="#SEC28">OTHER INFORMATION ABOUT A MATCH</a>
|
||||
<li><a name="TOC29" href="#SEC29">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
||||
<li><a name="TOC30" href="#SEC30">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
||||
<li><a name="TOC31" href="#SEC31">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC32" href="#SEC32">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
||||
<li><a name="TOC33" href="#SEC33">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
||||
<li><a name="TOC34" href="#SEC34">DUPLICATE SUBPATTERN NAMES</a>
|
||||
<li><a name="TOC35" href="#SEC35">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
||||
<li><a name="TOC36" href="#SEC36">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||
<li><a name="TOC37" href="#SEC37">SEE ALSO</a>
|
||||
<li><a name="TOC38" href="#SEC38">AUTHOR</a>
|
||||
<li><a name="TOC39" href="#SEC39">REVISION</a>
|
||||
<li><a name="TOC23" href="#SEC23">INFORMATION ABOUT A PATTERN'S CALLOUTS</a>
|
||||
<li><a name="TOC24" href="#SEC24">SERIALIZATION AND PRECOMPILING</a>
|
||||
<li><a name="TOC25" href="#SEC25">THE MATCH DATA BLOCK</a>
|
||||
<li><a name="TOC26" href="#SEC26">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a>
|
||||
<li><a name="TOC27" href="#SEC27">NEWLINE HANDLING WHEN MATCHING</a>
|
||||
<li><a name="TOC28" href="#SEC28">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC29" href="#SEC29">OTHER INFORMATION ABOUT A MATCH</a>
|
||||
<li><a name="TOC30" href="#SEC30">ERROR RETURNS FROM <b>pcre2_match()</b></a>
|
||||
<li><a name="TOC31" href="#SEC31">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a>
|
||||
<li><a name="TOC32" href="#SEC32">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a>
|
||||
<li><a name="TOC33" href="#SEC33">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
|
||||
<li><a name="TOC34" href="#SEC34">CREATING A NEW STRING WITH SUBSTITUTIONS</a>
|
||||
<li><a name="TOC35" href="#SEC35">DUPLICATE SUBPATTERN NAMES</a>
|
||||
<li><a name="TOC36" href="#SEC36">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a>
|
||||
<li><a name="TOC37" href="#SEC37">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
|
||||
<li><a name="TOC38" href="#SEC38">SEE ALSO</a>
|
||||
<li><a name="TOC39" href="#SEC39">AUTHOR</a>
|
||||
<li><a name="TOC40" href="#SEC40">REVISION</a>
|
||||
</ul>
|
||||
<P>
|
||||
<b>#include &lt;pcre2.h&gt;</b>
|
||||
|
@ -291,6 +292,11 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||
<b> void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_config(uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
</P>
|
||||
<br><a name="SEC11" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
|
||||
|
@ -1433,14 +1439,16 @@ can be processed in different locales.
|
|||
<b>int pcre2_pattern_info(const pcre2 *<i>code</i>, uint32_t <i>what</i>, void *<i>where</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_pattern_info()</b> function returns information about a compiled
|
||||
pattern. The first argument is a pointer to the compiled pattern. The second
|
||||
argument specifies which piece of information is required, and the third
|
||||
argument is a pointer to a variable to receive the data. If the third argument
|
||||
is NULL, the first argument is ignored, and the function returns the size in
|
||||
bytes of the variable that is required for the information requested.
|
||||
Otherwise, The yield of the function is zero for success, or one of the
|
||||
following negative numbers:
|
||||
The <b>pcre2_pattern_info()</b> function returns general information about a
|
||||
compiled pattern. For information about callouts, see the
|
||||
<a href="#infoaboutcallouts">next section.</a>
|
||||
The first argument for <b>pcre2_pattern_info()</b> is a pointer to the compiled
|
||||
pattern. The second argument specifies which piece of information is required,
|
||||
and the third argument is a pointer to a variable to receive the data. If the
|
||||
third argument is NULL, the first argument is ignored, and the function returns
|
||||
the size in bytes of the variable that is required for the information
|
||||
requested. Otherwise, the yield of the function is zero for success, or one of
|
||||
the following negative numbers:
|
||||
<pre>
|
||||
PCRE2_ERROR_NULL the argument <i>code</i> was NULL
|
||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||
|
@ -1719,8 +1727,27 @@ memory in which to place the compiled pattern may be slightly larger than the
|
|||
value returned by this option, because there are cases where the code that
|
||||
calculates the size has to over-estimate. Processing a pattern with the JIT
|
||||
compiler does not alter the value returned by this option.
|
||||
<a name="infoaboutcallouts"></a></P>
|
||||
<br><a name="SEC23" href="#TOC1">INFORMATION ABOUT A PATTERN'S CALLOUTS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||
<b> void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
A script language that supports the use of string arguments in callouts might
|
||||
like to scan all the callouts in a pattern before running the match. This can
|
||||
be done by calling <b>pcre2_callout_enumerate()</b>. The first argument is a
|
||||
pointer to a compiled pattern, the second points to a callback function, and
|
||||
the third is arbitrary user data. The callback function is called for every
|
||||
callout in the pattern in the order in which they appear. Its first argument is
|
||||
a pointer to a callout enumeration block, and its second argument is the
|
||||
<i>user_data</i> value that was passed to <b>pcre2_callout_enumerate()</b>. The
|
||||
contents of the callout enumeration block are described in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation, which also gives further details about callouts.
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SERIALIZATION AND PRECOMPILING</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">SERIALIZATION AND PRECOMPILING</a><br>
|
||||
<P>
|
||||
It is possible to save compiled patterns on disc or elsewhere, and reload them
|
||||
later, subject to a number of restrictions. The functions whose names begin
|
||||
|
@ -1729,7 +1756,7 @@ the
|
|||
<a href="pcre2serialize.html"><b>pcre2serialize</b></a>
|
||||
documentation.
|
||||
<a name="matchdatablock"></a></P>
|
||||
<br><a name="SEC24" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
|
@ -1800,7 +1827,7 @@ match data block (for that match) have taken place.
|
|||
When a match data block itself is no longer needed, it should be freed by
|
||||
calling <b>pcre2_match_data_free()</b>.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
||||
<br><a name="SEC26" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -2014,7 +2041,7 @@ examples, in the
|
|||
<a href="pcre2partial.html"><b>pcre2partial</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC26" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
||||
<br><a name="SEC27" href="#TOC1">NEWLINE HANDLING WHEN MATCHING</a><br>
|
||||
<P>
|
||||
When PCRE2 is built, a default newline convention is set; this is usually the
|
||||
standard convention for the operating system. The default can be overridden in
|
||||
|
@ -2049,7 +2076,7 @@ LF in the characters that it matches.
|
|||
Notwithstanding the above, anomalous effects may still occur when CRLF is a
|
||||
valid newline sequence and explicit \r or \n escapes appear in the pattern.
|
||||
<a name="matchedstrings"></a></P>
|
||||
<br><a name="SEC27" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
||||
<br><a name="SEC28" href="#TOC1">HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS</a><br>
|
||||
<P>
|
||||
<b>uint32_t pcre2_get_ovector_count(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
|
@ -2151,7 +2178,7 @@ parentheses, no more than <i>ovector[0]</i> to <i>ovector[2n+1]</i> are set by
|
|||
<b>pcre2_match()</b>. The other elements retain whatever values they previously
|
||||
had.
|
||||
<a name="matchotherdata"></a></P>
|
||||
<br><a name="SEC28" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
||||
<br><a name="SEC29" href="#TOC1">OTHER INFORMATION ABOUT A MATCH</a><br>
|
||||
<P>
|
||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
|
@ -2195,7 +2222,7 @@ the code unit offset of the invalid UTF character. Details are given in the
|
|||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
<a name="errorlist"></a></P>
|
||||
<br><a name="SEC29" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||
<br><a name="SEC30" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||
<P>
|
||||
If <b>pcre2_match()</b> fails, it returns a negative number. This can be
|
||||
converted to a text string by calling <b>pcre2_get_error_message()</b>. Negative
|
||||
|
@ -2246,8 +2273,8 @@ of the subject.
|
|||
PCRE2_ERROR_CALLOUT
|
||||
</pre>
|
||||
This error is never generated by <b>pcre2_match()</b> itself. It is provided for
|
||||
use by callout functions that want to cause <b>pcre2_match()</b> to return a
|
||||
distinctive error code. See the
|
||||
use by callout functions that want to cause <b>pcre2_match()</b> or
|
||||
<b>pcre2_callout_enumerate()</b> to return a distinctive error code. See the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation for details.
|
||||
<pre>
|
||||
|
@ -2304,7 +2331,7 @@ is attempted.
|
|||
</pre>
|
||||
The internal recursion limit was reached.
|
||||
<a name="extractbynumber"></a></P>
|
||||
<br><a name="SEC30" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||
<br><a name="SEC31" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NUMBER</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_length_bynumber(pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> uint32_t <i>number</i>, PCRE2_SIZE *<i>length</i>);</b>
|
||||
|
@ -2401,7 +2428,7 @@ The substring did not participate in the match. For example, if the pattern is
|
|||
(abc)|(def) and the subject is "def", and the ovector contains at least two
|
||||
capturing slots, substring number 1 is unset.
|
||||
</P>
|
||||
<br><a name="SEC31" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
||||
<br><a name="SEC32" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_list_get(pcre2_match_data *<i>match_data</i>,</b>
|
||||
<b> PCRE2_UCHAR ***<i>listptr</i>, PCRE2_SIZE **<i>lengthsptr</i>);</b>
|
||||
|
@ -2440,7 +2467,7 @@ can be distinguished from a genuine zero-length substring by inspecting the
|
|||
appropriate offset in the ovector, which contain PCRE2_UNSET for unset
|
||||
substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
|
||||
<a name="extractbyname"></a></P>
|
||||
<br><a name="SEC32" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
||||
<br><a name="SEC33" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_number_from_name(const pcre2_code *<i>code</i>,</b>
|
||||
<b> PCRE2_SPTR <i>name</i>);</b>
|
||||
|
@ -2500,7 +2527,7 @@ names are not included in the compiled code. The matching process uses only
|
|||
numbers. For this reason, the use of different names for subpatterns of the
|
||||
same number causes an error at compile time.
|
||||
</P>
|
||||
<br><a name="SEC33" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||
<br><a name="SEC34" href="#TOC1">CREATING A NEW STRING WITH SUBSTITUTIONS</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substitute(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -2561,7 +2588,7 @@ straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid
|
|||
replacement string (unrecognized sequence following a dollar sign), and
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.
|
||||
</P>
|
||||
<br><a name="SEC34" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
<b>int pcre2_substring_nametable_scan(const pcre2_code *<i>code</i>,</b>
|
||||
<b> PCRE2_SPTR <i>name</i>, PCRE2_SPTR *<i>first</i>, PCRE2_SPTR *<i>last</i>);</b>
|
||||
|
@ -2606,7 +2633,7 @@ The format of the name table is described above in the section entitled
|
|||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data.
|
||||
</P>
|
||||
<br><a name="SEC35" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||
<br><a name="SEC36" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||
<P>
|
||||
The traditional matching function uses a similar algorithm to Perl, which stops
|
||||
when it finds the first match at a given point in the subject. If you want to
|
||||
|
@ -2624,7 +2651,7 @@ substring. Then return 1, which forces <b>pcre2_match()</b> to backtrack and try
|
|||
other alternatives. Ultimately, when it runs out of matches,
|
||||
<b>pcre2_match()</b> will yield PCRE2_ERROR_NOMATCH.
|
||||
<a name="dfamatch"></a></P>
|
||||
<br><a name="SEC36" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
||||
<br><a name="SEC37" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_dfa_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
<b> PCRE2_SIZE <i>length</i>, PCRE2_SIZE <i>startoffset</i>,</b>
|
||||
|
@ -2819,13 +2846,13 @@ some plausibility checks are made on the contents of the workspace, which
|
|||
should contain data about the previous partial match. If any of these checks
|
||||
fail, this error is given.
|
||||
</P>
|
||||
<br><a name="SEC37" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC38" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2build</b>(3), <b>pcre2callout</b>(3), <b>pcre2demo</b>(3),
|
||||
<b>pcre2matching</b>(3), <b>pcre2partial</b>(3), <b>pcre2posix</b>(3),
|
||||
<b>pcre2sample</b>(3), <b>pcre2stack</b>(3), <b>pcre2unicode</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC38" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC39" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -2834,9 +2861,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC39" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 January 2015
|
||||
Last updated: 23 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -17,9 +17,10 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||
<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a>
|
||||
<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a>
|
||||
<li><a name="TOC5" href="#SEC5">RETURN VALUES</a>
|
||||
<li><a name="TOC6" href="#SEC6">AUTHOR</a>
|
||||
<li><a name="TOC7" href="#SEC7">REVISION</a>
|
||||
<li><a name="TOC5" href="#SEC5">RETURN VALUES FROM CALLOUTS</a>
|
||||
<li><a name="TOC6" href="#SEC6">CALLOUT ENUMERATION</a>
|
||||
<li><a name="TOC7" href="#SEC7">AUTHOR</a>
|
||||
<li><a name="TOC8" href="#SEC8">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||
<P>
|
||||
|
@ -27,23 +28,32 @@ please consult the man page, in case the conversion went wrong.
|
|||
</P>
|
||||
<P>
|
||||
<b>int (*pcre2_callout)(pcre2_callout_block *, void *);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||
<b> void *<i>user_data</i>);</b>
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||
<P>
|
||||
PCRE2 provides a feature called "callout", which is a means of temporarily
|
||||
passing control to the caller of PCRE2 in the middle of pattern matching. The
|
||||
caller of PCRE2 provides an external function by putting its entry point in
|
||||
a match context (see <b>pcre2_set_callout()</b>) in the
|
||||
a match context (see <b>pcre2_set_callout()</b> in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation).
|
||||
</P>
|
||||
<P>
|
||||
Within a regular expression, (?C) indicates the points at which the external
|
||||
Within a regular expression, (?C&lt;arg&gt;) indicates a point at which the external
|
||||
function is to be called. Different callout points can be identified by putting
|
||||
a number less than 256 after the letter C. The default value is zero.
|
||||
For example, this pattern has two callout points:
|
||||
Alternatively, the argument may be a delimited string. The starting delimiter
|
||||
must be one of ` ' " ^ % # $ { and the ending delimiter is the same as the
|
||||
start, except for {, where the ending delimiter is }. If the ending delimiter
|
||||
is needed within the string, it must be doubled. For example, this pattern has
|
||||
two callout points:
|
||||
<pre>
|
||||
(?C1)abc(?C2)def
|
||||
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||
</pre>
|
||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
||||
automatically inserts callouts, all with number 255, before each item in the
|
||||
|
@ -62,19 +72,18 @@ alternation bar. If the pattern contains a conditional group whose condition is
|
|||
an assertion, an automatic callout is inserted immediately before the
|
||||
condition. Such a callout may also be inserted explicitly, for example:
|
||||
<pre>
|
||||
(?(?C9)(?=a)ab|de)
|
||||
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||
</pre>
|
||||
This applies only to assertion conditions (because they are themselves
|
||||
independent groups).
|
||||
</P>
|
||||
<P>
|
||||
Automatic callouts can be used for tracking the progress of pattern matching.
|
||||
The
|
||||
Callouts can be useful for tracking the progress of pattern matching. The
|
||||
<a href="pcre2test.html"><b>pcre2test</b></a>
|
||||
program has a pattern qualifier (/auto_callout) that sets automatic callouts;
|
||||
when it is used, the output indicates how the pattern is being matched. This is
|
||||
useful information when you are trying to optimize the performance of a
|
||||
particular pattern.
|
||||
program has a pattern qualifier (/auto_callout) that sets automatic callouts.
|
||||
When any callouts are present, the output from <b>pcre2test</b> indicates how
|
||||
the pattern is being matched. This is useful information when you are trying to
|
||||
optimize the performance of a particular pattern.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
|
||||
<P>
|
||||
|
@ -185,7 +194,7 @@ You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
|
|||
option to <b>pcre2_compile()</b>, or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
||||
callouts such as the example above are obeyed.
|
||||
</P>
|
||||
<a name="calloutinterface"></a></P>
|
||||
<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br>
|
||||
<P>
|
||||
During matching, when PCRE2 reaches a callout point, if an external function is
|
||||
|
@ -209,16 +218,53 @@ documentation). The callout block structure contains the following fields:
|
|||
PCRE2_SIZE <i>current_position</i>;
|
||||
PCRE2_SIZE <i>pattern_position</i>;
|
||||
PCRE2_SIZE <i>next_item_length</i>;
|
||||
PCRE2_SIZE <i>callout_string_offset</i>;
|
||||
PCRE2_SIZE <i>callout_string_length</i>;
|
||||
PCRE2_SPTR <i>callout_string</i>;
|
||||
</pre>
|
||||
The <i>version</i> field contains the version number of the block format. The
|
||||
current version is 0. The version number will change in future if additional
|
||||
fields are added, but the intention is never to remove any of the existing
|
||||
fields.
|
||||
current version is 1; the three callout string fields were added for this
|
||||
version. If you are writing an application that might use an earlier release of
|
||||
PCRE2, you should check the version number before accessing any of these
|
||||
fields. The version number will increase in future if more fields are added,
|
||||
but the intention is never to remove any of the existing fields.
|
||||
</P>
|
||||
<br><b>
|
||||
Fields for numerical callouts
|
||||
</b><br>
|
||||
<P>
|
||||
For a numerical callout, <i>callout_string</i> is NULL, and <i>callout_number</i>
|
||||
contains the number of the callout, in the range 0-255. This is the number
|
||||
that follows (?C for manual callouts; it is 255 for automatically generated
|
||||
callouts.
|
||||
</P>
|
||||
<br><b>
|
||||
Fields for string callouts
|
||||
</b><br>
|
||||
<P>
|
||||
For callouts with string arguments, <i>callout_number</i> is always zero, and
|
||||
<i>callout_string</i> points to the string that is contained within the compiled
|
||||
pattern. Its length is given by <i>callout_string_length</i>. Duplicated ending
|
||||
delimiters that were present in the original pattern string have been turned
|
||||
into single characters, but there is no other processing of the callout string
|
||||
argument. An additional code unit containing binary zero is present after the
|
||||
string, but is not included in the length. The delimiter that was used to start
|
||||
the string is also stored within the pattern, immediately before the string
|
||||
itself. You can access this delimiter as <i>callout_string</i>[-1] if you need
|
||||
it.
|
||||
</P>
|
||||
<P>
|
||||
The <i>callout_number</i> field contains the number of the callout, as compiled
|
||||
into the pattern (that is, the number after ?C for manual callouts, and 255 for
|
||||
automatically generated callouts).
|
||||
The <i>callout_string_offset</i> field is the code unit offset to the start of
|
||||
the callout argument string within the original pattern string. This is
|
||||
provided for the benefit of applications such as script languages that might
|
||||
need to report errors in the callout string within the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
Fields for all callouts
|
||||
</b><br>
|
||||
<P>
|
||||
The remaining fields in the callout block are the same for both kinds of
|
||||
callout.
|
||||
</P>
|
||||
<P>
|
||||
The <i>offset_vector</i> field is a pointer to the vector of capturing offsets
|
||||
|
@ -259,8 +305,8 @@ substrings have been captured, the value of <i>capture_last</i> is 0. This is
|
|||
always the case for the DFA matching functions.
|
||||
</P>
|
||||
<P>
|
||||
The <i>pattern_position</i> field contains the offset to the next item to be
|
||||
matched in the pattern string.
|
||||
The <i>pattern_position</i> field contains the offset in the pattern string to
|
||||
the next item to be matched.
|
||||
</P>
|
||||
<P>
|
||||
The <i>next_item_length</i> field contains the length of the next item to be
|
||||
|
@ -272,7 +318,9 @@ of the entire subpattern.
|
|||
<P>
|
||||
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
||||
help in distinguishing between different automatic callouts, which all have the
|
||||
same callout number. However, they are set for all callouts.
|
||||
same callout number. However, they are set for all callouts, and are used by
|
||||
<b>pcre2test</b> to show the next item to be matched when displaying callout
|
||||
information.
|
||||
</P>
|
||||
<P>
|
||||
In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to
|
||||
|
@ -281,7 +329,7 @@ the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
|||
of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
|
||||
callouts from the DFA matching function this field always contains NULL.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">RETURN VALUES</a><br>
|
||||
<br><a name="SEC5" href="#TOC1">RETURN VALUES FROM CALLOUTS</a><br>
|
||||
<P>
|
||||
The external callout function returns an integer to PCRE2. If the value is
|
||||
zero, matching proceeds as normal. If the value is greater than zero, matching
|
||||
|
@ -296,7 +344,51 @@ values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match"
|
|||
failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
|
||||
functions; it will never be used by PCRE2 itself.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC6" href="#TOC1">CALLOUT ENUMERATION</a><br>
|
||||
<P>
|
||||
<b>int pcre2_callout_enumerate(const pcre2_code *<i>code</i>,</b>
|
||||
<b> int (*<i>callback</i>)(pcre2_callout_enumerate_block *, void *),</b>
|
||||
<b> void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
A script language that supports the use of string arguments in callouts might
|
||||
like to scan all the callouts in a pattern before running the match. This can
|
||||
be done by calling <b>pcre2_callout_enumerate()</b>. The first argument is a
|
||||
pointer to a compiled pattern, the second points to a callback function, and
|
||||
the third is arbitrary user data. The callback function is called for every
|
||||
callout in the pattern in the order in which they appear. Its first argument is
|
||||
a pointer to a callout enumeration block, and its second argument is the
|
||||
<i>user_data</i> value that was passed to <b>pcre2_callout_enumerate()</b>. The
|
||||
data block contains the following fields:
|
||||
<pre>
|
||||
<i>version</i> Block version number
|
||||
<i>pattern_position</i> Offset to next item in pattern
|
||||
<i>next_item_length</i> Length of next item in pattern
|
||||
<i>callout_number</i> Number for numbered callouts
|
||||
<i>callout_string_offset</i> Offset to string within pattern
|
||||
<i>callout_string_length</i> Length of callout string
|
||||
<i>callout_string</i> Points to callout string or is NULL
|
||||
</pre>
|
||||
The version number is currently 0. It will increase if new fields are ever
|
||||
added to the block. The remaining fields are the same as their namesakes in the
|
||||
<b>pcre2_callout</b> block that is used for callouts during matching, as
|
||||
described
|
||||
<a href="#calloutinterface">above.</a>
|
||||
</P>
|
||||
<P>
|
||||
Note that the value of <i>pattern_position</i> is unique for each callout.
|
||||
However, if a callout occurs inside a group that is quantified with a non-zero
|
||||
minimum or a fixed maximum, the group is replicated inside the compiled
|
||||
pattern. For example, a pattern such as /(a){2}/ is compiled as if it were
|
||||
/(a)(a)/. This means that the callout will be enumerated more than once, but
|
||||
with the same value for <i>pattern_position</i> in each case.
|
||||
</P>
|
||||
<P>
|
||||
The callback function should normally return zero. If it returns a non-zero
|
||||
value, scanning the pattern stops, and that value is returned from
|
||||
<b>pcre2_callout_enumerate()</b>.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -305,9 +397,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 02 January 2015
|
||||
Last updated: 23 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -83,11 +83,11 @@ the
|
|||
documentation for details.
|
||||
</P>
|
||||
<P>
|
||||
8. Subpatterns that are called as subroutines (whether or not recursively) are
|
||||
always treated as atomic groups in PCRE2. This is like Python, but unlike Perl.
|
||||
Captured values that are set outside a subroutine call can be reference from
|
||||
inside in PCRE2, but not in Perl. There is a discussion that explains these
|
||||
differences in more detail in the
|
||||
8. Subroutine calls (whether recursive or not) are treated as atomic groups.
|
||||
Atomic recursion is like Python, but unlike Perl. Captured values that are set
|
||||
outside a subroutine call can be referenced from inside in PCRE2, but not in
|
||||
Perl. There is a discussion that explains these differences in more detail in
|
||||
the
|
||||
<a href="pcre2pattern.html#recursiondifference">section on recursion differences from Perl</a>
|
||||
in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
|
@ -214,9 +214,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 28 September 2014
|
||||
Last updated: 15 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -2786,43 +2786,70 @@ same pair of parentheses when there is a repetition.
|
|||
PCRE2 provides a similar feature, but of course it cannot obey arbitrary Perl
|
||||
code. The feature is called "callout". The caller of PCRE2 provides an external
|
||||
function by putting its entry point in a match context using the function
|
||||
<b>pcre2_set_callout()</b> and passing the context to <b>pcre2_match()</b> or
|
||||
<b>pcre2_dfa_match()</b>. If no match context is passed, or if the callout entry
|
||||
point is set to NULL, callouts are disabled.
|
||||
<b>pcre2_set_callout()</b>, and then passing that context to <b>pcre2_match()</b>
|
||||
or <b>pcre2_dfa_match()</b>. If no match context is passed, or if the callout
|
||||
entry point is set to NULL, callouts are disabled.
|
||||
</P>
|
||||
<P>
|
||||
Within a regular expression, (?C) indicates the points at which the external
|
||||
function is to be called. If you want to identify different callout points, you
|
||||
can put a number less than 256 after the letter C. The default value is zero.
|
||||
For example, this pattern has two callout points:
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the external
|
||||
function is to be called. There are two kinds of callout: those with a
|
||||
numerical argument and those with a string argument. (?C) on its own with no
|
||||
argument is treated as (?C0). A numerical argument allows the application to
|
||||
distinguish between different callouts. String arguments were added for release
|
||||
10.20 to make it possible for script languages that use PCRE2 to embed short
|
||||
scripts within patterns in a similar way to Perl.
|
||||
</P>
|
||||
<P>
|
||||
During matching, when PCRE2 reaches a callout point, the external function is
|
||||
called. It is provided with the number or string argument of the callout, the
|
||||
position in the pattern, and one item of data that is also set in the match
|
||||
block. The callout function may cause matching to proceed, to backtrack, or to
|
||||
fail.
|
||||
</P>
|
||||
<P>
|
||||
By default, PCRE2 implements a number of optimizations at matching time, and
|
||||
one side-effect is that sometimes callouts are skipped. If you need all
|
||||
possible callouts to happen, you need to set options that disable the relevant
|
||||
optimizations. More details, including a complete description of the
|
||||
programming interface to the callout function, are given in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<br><b>
|
||||
Callouts with numerical arguments
|
||||
</b><br>
|
||||
<P>
|
||||
If you just want to have a means of identifying different callout points, put a
|
||||
number less than 256 after the letter C. For example, this pattern has two
|
||||
callout points:
|
||||
<pre>
|
||||
(?C1)abc(?C2)def
|
||||
</pre>
|
||||
If the PCRE2_AUTO_CALLOUT flag is passed to <b>pcre2_compile()</b>, callouts are
|
||||
automatically installed before each item in the pattern. They are all numbered
|
||||
255. If there is a conditional group in the pattern whose condition is an
|
||||
assertion, an additional callout is inserted just before the condition. An
|
||||
explicit callout may also be set at this position, as in this example:
|
||||
If the PCRE2_AUTO_CALLOUT flag is passed to <b>pcre2_compile()</b>, numerical
|
||||
callouts are automatically installed before each item in the pattern. They are
|
||||
all numbered 255. If there is a conditional group in the pattern whose
|
||||
condition is an assertion, an additional callout is inserted just before the
|
||||
condition. An explicit callout may also be set at this position, as in this
|
||||
example:
|
||||
<pre>
|
||||
(?(?C9)(?=a)abc|def)
|
||||
</pre>
|
||||
Note that this applies only to assertion conditions, not to other types of
|
||||
condition.
|
||||
</P>
|
||||
<br><b>
|
||||
Callouts with string arguments
|
||||
</b><br>
|
||||
<P>
|
||||
During matching, when PCRE2 reaches a callout point, the external function is
|
||||
called. It is provided with the number of the callout, the position in the
|
||||
pattern, and one item of data that is also set in the match block. The callout
|
||||
function may cause matching to proceed, to backtrack, or to fail.
|
||||
</P>
|
||||
<P>
|
||||
By default, PCRE2 implements a number of optimizations at matching time, and
|
||||
one side-effect is that sometimes callouts are skipped. If you need all
|
||||
possible callouts to happen, you need to set options that disable the relevant
|
||||
optimizations. More details, and a complete description of the interface to the
|
||||
callout function, are given in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
A delimited string may be used instead of a number as a callout argument. The
|
||||
starting delimiter must be one of ` ' " ^ % # $ { and the ending delimiter is
|
||||
the same as the start, except for {, where the ending delimiter is }. If the
|
||||
ending delimiter is needed within the string, it must be doubled. For
|
||||
example:
|
||||
<pre>
|
||||
(?C'ab ''c'' d')xyz(?C{any text})pqr
|
||||
</pre>
|
||||
The doubling is removed before the string is passed to the callout function.
|
||||
<a name="backtrackcontrol"></a></P>
|
||||
<br><a name="SEC27" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
|
@ -3258,7 +3285,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 January 2015
|
||||
Last updated: 15 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -535,9 +535,13 @@ pattern is not anchored.
|
|||
<br><a name="SEC24" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
<pre>
|
||||
(?C) callout
|
||||
(?Cn) callout with data n
|
||||
</PRE>
|
||||
(?C) callout (assumed number 0)
|
||||
(?Cn) callout with numerical data n
|
||||
(?C"text") callout with string data
|
||||
</pre>
|
||||
The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
|
||||
start and the end), and the starting delimiter { matched with the ending
|
||||
delimiter }. To encode the ending delimiter within the string, double it.
|
||||
</P>
|
||||
<br><a name="SEC25" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
|
@ -555,7 +559,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 26 January 2015
|
||||
Last updated: 15 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -90,11 +90,18 @@ names used in the libraries have a suffix _8, _16, or _32, as appropriate.
|
|||
<P>
|
||||
Input to <b>pcre2test</b> is processed line by line, either by calling the C
|
||||
library's <b>fgets()</b> function, or via the <b>libreadline</b> library (see
|
||||
below). In Unix-like environments, <b>fgets()</b> treats any bytes other than
|
||||
newline as data characters. However, in some Windows environments character 26
|
||||
(hex 1A) causes an immediate end of file, and no further data is read. For
|
||||
maximum portability, therefore, it is safest to avoid non-printing characters
|
||||
in <b>pcre2test</b> input files.
|
||||
below). The input is processed using C's string functions, so must not
|
||||
contain binary zeroes, even though in Unix-like environments, <b>fgets()</b>
|
||||
treats any bytes other than newline as data characters. In some Windows
|
||||
environments character 26 (hex 1A) causes an immediate end of file, and no
|
||||
further data is read.
|
||||
</P>
|
||||
<P>
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
||||
for backslash escapes, which makes it possible to include any data value.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||
<P>
|
||||
|
@ -499,6 +506,7 @@ about the pattern:
|
|||
<pre>
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
/B bincode show binary code without lengths
|
||||
callout_info show callout information
|
||||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
|
@ -580,6 +588,12 @@ unit" is the last literal code unit that must be present in any match. This is
|
|||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
</P>
|
||||
<P>
|
||||
The <b>callout_info</b> modifier requests information about all the callouts in
|
||||
the pattern. A list of them is output at the end of any other information that
|
||||
is requested. For each callout, either its number or string is given, followed
|
||||
by the item that follows it in the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
</b><br>
|
||||
|
@ -907,12 +921,15 @@ set, the current captured groups are output when a callout occurs.
|
|||
The <b>callout_fail</b> modifier can be given one or two numbers. If there is
|
||||
only one number, 1 is returned instead of 0 when a callout of that number is
|
||||
reached. If two numbers are given, 1 is returned when callout <n> is reached
|
||||
for the <m>th time.
|
||||
for the <m>th time. Note that callouts with string arguments are always given
|
||||
the number zero. See "Callouts" below for a description of the output when a
|
||||
callout is taken.
|
||||
</P>
|
||||
<P>
|
||||
The <b>callout_data</b> modifier can be given an unsigned or a negative number.
|
||||
Any value other than zero is used as a return from <b>pcre2test</b>'s callout
|
||||
function.
|
||||
This is set as the "user data" that is passed to the matching function, and
|
||||
passed back when the callout function is invoked. Any value other than zero is
|
||||
used as a return from <b>pcre2test</b>'s callout function.
|
||||
</P>
|
||||
<br><b>
|
||||
Finding all matches in a string
|
||||
|
@ -1262,10 +1279,32 @@ documentation.
|
|||
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
If the pattern contains any callout requests, <b>pcre2test</b>'s callout
|
||||
function is called during matching. This works with both matching functions. By
|
||||
default, the called function displays the callout number, the start and current
|
||||
positions in the text at the callout time, and the next pattern item to be
|
||||
tested. For example:
|
||||
function is called during matching unless <b>callout_none</b> is specified.
|
||||
This works with both matching functions.
|
||||
</P>
|
||||
<P>
|
||||
The callout function in <b>pcre2test</b> returns zero (carry on matching) by
|
||||
default, but you can use a <b>callout_fail</b> modifier in a subject line (as
|
||||
described above) to change this and other parameters of the callout.
|
||||
</P>
|
||||
<P>
|
||||
Inserting callouts can be helpful when using <b>pcre2test</b> to check
|
||||
complicated regular expressions. For further information about callouts, see
|
||||
the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
</P>
|
||||
<P>
|
||||
The output for callouts with numerical arguments and those with string
|
||||
arguments is slightly different.
|
||||
</P>
|
||||
<br><b>
|
||||
Callouts with numerical arguments
|
||||
</b><br>
|
||||
<P>
|
||||
By default, the callout function displays the callout number, the start and
|
||||
current positions in the subject text at the callout time, and the next pattern
|
||||
item to be tested. For example:
|
||||
<pre>
|
||||
--->pqrabcdef
|
||||
0 ^ ^ \d
|
||||
|
@ -1308,17 +1347,27 @@ The mark changes between matching "a" and "b", but stays the same for the rest
|
|||
of the match, so nothing more is output. If, as a result of backtracking, the
|
||||
mark reverts to being unset, the text "<unset>" is output.
|
||||
</P>
|
||||
<br><b>
|
||||
Callouts with string arguments
|
||||
</b><br>
|
||||
<P>
|
||||
The callout function in <b>pcre2test</b> returns zero (carry on matching) by
|
||||
default, but you can use a <b>callout_fail</b> modifier in a subject line (as
|
||||
described above) to change this and other parameters of the callout.
|
||||
</P>
|
||||
<P>
|
||||
Inserting callouts can be helpful when using <b>pcre2test</b> to check
|
||||
complicated regular expressions. For further information about callouts, see
|
||||
the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
The output for a callout with a string argument is similar, except that instead
|
||||
of outputting a callout number before the position indicators, the callout
|
||||
string and its offset in the pattern string are output before the reflection of
|
||||
the subject string, and the subject string is reflected for each callout. For
|
||||
example:
|
||||
<pre>
|
||||
re> /^ab(?C'first')cd(?C"second")ef/
|
||||
data> abcdefg
|
||||
Callout (7): 'first'
|
||||
--->abcdefg
|
||||
^ ^ c
|
||||
Callout (20): "second"
|
||||
--->abcdefg
|
||||
^ ^ e
|
||||
0: abcdef
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<br><a name="SEC17" href="#TOC1">NON-PRINTING CHARACTERS</a><br>
|
||||
<P>
|
||||
|
@ -1411,7 +1460,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 January 2015
|
||||
Last updated: 22 March 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -88,6 +88,9 @@ in the library.
|
|||
|
||||
<table>
|
||||
|
||||
<tr><td><a href="pcre2_callout_enumerate.html">pcre2_callout_enumerate</a></td>
|
||||
<td> Enumerate callouts in a compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||
<td> Free a compiled pattern</td></tr>
|
||||
|
||||
|
|
352
doc/pcre2.txt
352
doc/pcre2.txt
|
@ -367,6 +367,10 @@ PCRE2 NATIVE API AUXILIARY FUNCTIONS
|
|||
|
||||
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
||||
|
||||
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||
void *user_data);
|
||||
|
||||
int pcre2_config(uint32_t what, void *where);
|
||||
|
||||
|
||||
|
@ -1452,14 +1456,16 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
|
||||
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
|
||||
|
||||
The pcre2_pattern_info() function returns information about a compiled
|
||||
pattern. The first argument is a pointer to the compiled pattern. The
|
||||
second argument specifies which piece of information is required, and
|
||||
the third argument is a pointer to a variable to receive the data. If
|
||||
the third argument is NULL, the first argument is ignored, and the
|
||||
function returns the size in bytes of the variable that is required for
|
||||
the information requested. Otherwise, The yield of the function is
|
||||
zero for success, or one of the following negative numbers:
|
||||
The pcre2_pattern_info() function returns general information about a
|
||||
compiled pattern. For information about callouts, see the next section.
|
||||
The first argument for pcre2_pattern_info() is a pointer to the com-
|
||||
piled pattern. The second argument specifies which piece of information
|
||||
is required, and the third argument is a pointer to a variable to
|
||||
receive the data. If the third argument is NULL, the first argument is
|
||||
ignored, and the function returns the size in bytes of the variable
|
||||
that is required for the information requested. Otherwise, the yield of
|
||||
the function is zero for success, or one of the following negative num-
|
||||
bers:
|
||||
|
||||
PCRE2_ERROR_NULL the argument code was NULL
|
||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||
|
@ -1744,6 +1750,25 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
alter the value returned by this option.
|
||||
|
||||
|
||||
INFORMATION ABOUT A PATTERN'S CALLOUTS
|
||||
|
||||
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||
void *user_data);
|
||||
|
||||
A script language that supports the use of string arguments in callouts
|
||||
might like to scan all the callouts in a pattern before running the
|
||||
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||
argument is a pointer to a compiled pattern, the second points to a
|
||||
callback function, and the third is arbitrary user data. The callback
|
||||
function is called for every callout in the pattern in the order in
|
||||
which they appear. Its first argument is a pointer to a callout enumer-
|
||||
ation block, and its second argument is the user_data value that was
|
||||
passed to pcre2_callout_enumerate(). The contents of the callout enu-
|
||||
meration block are described in the pcre2callout documentation, which
|
||||
also gives further details about callouts.
|
||||
|
||||
|
||||
SERIALIZATION AND PRECOMPILING
|
||||
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
|
@ -2221,9 +2246,9 @@ ERROR RETURNS FROM pcre2_match()
|
|||
PCRE2_ERROR_CALLOUT
|
||||
|
||||
This error is never generated by pcre2_match() itself. It is provided
|
||||
for use by callout functions that want to cause pcre2_match() to return
|
||||
a distinctive error code. See the pcre2callout documentation for
|
||||
details.
|
||||
for use by callout functions that want to cause pcre2_match() or
|
||||
pcre2_callout_enumerate() to return a distinctive error code. See the
|
||||
pcre2callout documentation for details.
|
||||
|
||||
PCRE2_ERROR_INTERNAL
|
||||
|
||||
|
@ -2771,7 +2796,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 January 2015
|
||||
Last updated: 23 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3250,22 +3275,30 @@ SYNOPSIS
|
|||
|
||||
int (*pcre2_callout)(pcre2_callout_block *, void *);
|
||||
|
||||
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||
void *user_data);
|
||||
|
||||
|
||||
DESCRIPTION
|
||||
|
||||
PCRE2 provides a feature called "callout", which is a means of tempo-
|
||||
rarily passing control to the caller of PCRE2 in the middle of pattern
|
||||
matching. The caller of PCRE2 provides an external function by putting
|
||||
its entry point in a match context (see pcre2_set_callout()) in the
|
||||
its entry point in a match context (see pcre2_set_callout() in the
|
||||
pcre2api documentation).
|
||||
|
||||
Within a regular expression, (?C) indicates the points at which the
|
||||
Within a regular expression, (?C<arg>) indicates a point at which the
|
||||
external function is to be called. Different callout points can be
|
||||
identified by putting a number less than 256 after the letter C. The
|
||||
default value is zero. For example, this pattern has two callout
|
||||
default value is zero. Alternatively, the argument may be a delimited
|
||||
string. The starting delimiter must be one of ` ' " ^ % # $ { and the
|
||||
ending delimiter is the same as the start, except for {, where the end-
|
||||
ing delimiter is }. If the ending delimiter is needed within the
|
||||
string, it must be doubled. For example, this pattern has two callout
|
||||
points:
|
||||
|
||||
(?C1)abc(?C2)def
|
||||
(?C1)abc(?C"some ""arbitrary"" text")def
|
||||
|
||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||
|
@ -3284,29 +3317,30 @@ DESCRIPTION
|
|||
before the condition. Such a callout may also be inserted explicitly,
|
||||
for example:
|
||||
|
||||
(?(?C9)(?=a)ab|de)
|
||||
(?(?C9)(?=a)ab|de) (?(?C%text%)(?!=d)ab|de)
|
||||
|
||||
This applies only to assertion conditions (because they are themselves
|
||||
independent groups).
|
||||
|
||||
Automatic callouts can be used for tracking the progress of pattern
|
||||
matching. The pcre2test program has a pattern qualifier (/auto_call-
|
||||
out) that sets automatic callouts; when it is used, the output indi-
|
||||
cates how the pattern is being matched. This is useful information when
|
||||
you are trying to optimize the performance of a particular pattern.
|
||||
Callouts can be useful for tracking the progress of pattern matching.
|
||||
The pcre2test program has a pattern qualifier (/auto_callout) that sets
|
||||
automatic callouts. When any callouts are present, the output from
|
||||
pcre2test indicates how the pattern is being matched. This is useful
|
||||
information when you are trying to optimize the performance of a par-
|
||||
ticular pattern.
|
||||
|
||||
|
||||
MISSING CALLOUTS
|
||||
|
||||
You should be aware that, because of optimizations in the way PCRE2
|
||||
You should be aware that, because of optimizations in the way PCRE2
|
||||
compiles and matches patterns, callouts sometimes do not happen exactly
|
||||
as you might expect.
|
||||
|
||||
Auto-possessification
|
||||
|
||||
At compile time, PCRE2 "auto-possessifies" repeated items when it knows
|
||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||
that what follows cannot be part of the repeat. For example, a+[bc] is
|
||||
compiled as if it were a++[bc]. The pcre2test output when this pattern
|
||||
is compiled with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied
|
||||
to the string "aaaa" is:
|
||||
|
||||
|
@ -3315,10 +3349,10 @@ MISSING CALLOUTS
|
|||
+2 ^ ^ [bc]
|
||||
No match
|
||||
|
||||
This indicates that when matching [bc] fails, there is no backtracking
|
||||
into a+ and therefore the callouts that would be taken for the back-
|
||||
tracks do not occur. You can disable the auto-possessify feature by
|
||||
passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
|
||||
This indicates that when matching [bc] fails, there is no backtracking
|
||||
into a+ and therefore the callouts that would be taken for the back-
|
||||
tracks do not occur. You can disable the auto-possessify feature by
|
||||
passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
|
||||
tern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
||||
|
||||
--->aaaa
|
||||
|
@ -3335,16 +3369,16 @@ MISSING CALLOUTS
|
|||
Automatic .* anchoring
|
||||
|
||||
By default, an optimization is applied when .* is the first significant
|
||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||
is not set, a match can start only after an internal newline or at the
|
||||
beginning of the subject, and pcre2_compile() remembers this. This
|
||||
optimization is disabled, however, if .* is in an atomic group or if
|
||||
there is a back reference to the capturing group in which it appears.
|
||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||
item in a pattern. If PCRE2_DOTALL is set, so that the dot can match
|
||||
any character, the pattern is automatically anchored. If PCRE2_DOTALL
|
||||
is not set, a match can start only after an internal newline or at the
|
||||
beginning of the subject, and pcre2_compile() remembers this. This
|
||||
optimization is disabled, however, if .* is in an atomic group or if
|
||||
there is a back reference to the capturing group in which it appears.
|
||||
It is also disabled if the pattern contains (*PRUNE) or (*SKIP). How-
|
||||
ever, the presence of callouts does not affect it.
|
||||
|
||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||
For example, if the pattern .*\d is compiled with PCRE2_AUTO_CALLOUT
|
||||
and applied to the string "aa", the pcre2test output is:
|
||||
|
||||
--->aa
|
||||
|
@ -3354,10 +3388,10 @@ MISSING CALLOUTS
|
|||
+2 ^ \d
|
||||
No match
|
||||
|
||||
This shows that all match attempts start at the beginning of the sub-
|
||||
ject. In other words, the pattern is anchored. You can disable this
|
||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||
This shows that all match attempts start at the beginning of the sub-
|
||||
ject. In other words, the pattern is anchored. You can disable this
|
||||
optimization by passing PCRE2_NO_DOTSTAR_ANCHOR to pcre2_compile(), or
|
||||
starting the pattern with (*NO_DOTSTAR_ANCHOR). In this case, the out-
|
||||
put changes to:
|
||||
|
||||
--->aa
|
||||
|
@ -3370,43 +3404,43 @@ MISSING CALLOUTS
|
|||
+2 ^ \d
|
||||
No match
|
||||
|
||||
This shows more match attempts, starting at the second subject charac-
|
||||
ter. Another optimization, described in the next section, means that
|
||||
This shows more match attempts, starting at the second subject charac-
|
||||
ter. Another optimization, described in the next section, means that
|
||||
there is no subsequent attempt to match with an empty subject.
|
||||
|
||||
If a pattern has more than one top-level branch, automatic anchoring
|
||||
If a pattern has more than one top-level branch, automatic anchoring
|
||||
occurs if all branches are anchorable.
|
||||
|
||||
Other optimizations
|
||||
|
||||
Other optimizations that provide fast "no match" results also affect
|
||||
Other optimizations that provide fast "no match" results also affect
|
||||
callouts. For example, if the pattern is
|
||||
|
||||
ab(?C4)cd
|
||||
|
||||
PCRE2 knows that any matching string must contain the letter "d". If
|
||||
the subject string is "abyz", the lack of "d" means that matching
|
||||
doesn't ever start, and the callout is never reached. However, with
|
||||
PCRE2 knows that any matching string must contain the letter "d". If
|
||||
the subject string is "abyz", the lack of "d" means that matching
|
||||
doesn't ever start, and the callout is never reached. However, with
|
||||
"abyd", though the result is still no match, the callout is obeyed.
|
||||
|
||||
PCRE2 also knows the minimum length of a matching string, and will
|
||||
immediately give a "no match" return without actually running a match
|
||||
if the subject is not long enough, or, for unanchored patterns, if it
|
||||
PCRE2 also knows the minimum length of a matching string, and will
|
||||
immediately give a "no match" return without actually running a match
|
||||
if the subject is not long enough, or, for unanchored patterns, if it
|
||||
has been scanned far enough.
|
||||
|
||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||
that callouts such as the example above are obeyed.
|
||||
|
||||
|
||||
THE CALLOUT INTERFACE
|
||||
|
||||
During matching, when PCRE2 reaches a callout point, if an external
|
||||
function is set in the match context, it is called. This applies to
|
||||
both normal and DFA matching. The first argument to the callout func-
|
||||
tion is a pointer to a pcre2_callout block. The second argument is the
|
||||
void * callout data that was supplied when the callout was set up by
|
||||
During matching, when PCRE2 reaches a callout point, if an external
|
||||
function is set in the match context, it is called. This applies to
|
||||
both normal and DFA matching. The first argument to the callout func-
|
||||
tion is a pointer to a pcre2_callout block. The second argument is the
|
||||
void * callout data that was supplied when the callout was set up by
|
||||
calling pcre2_set_callout() (see the pcre2api documentation). The call-
|
||||
out block structure contains the following fields:
|
||||
|
||||
|
@ -3422,15 +3456,47 @@ THE CALLOUT INTERFACE
|
|||
PCRE2_SIZE current_position;
|
||||
PCRE2_SIZE pattern_position;
|
||||
PCRE2_SIZE next_item_length;
|
||||
PCRE2_SIZE callout_string_offset;
|
||||
PCRE2_SIZE callout_string_length;
|
||||
PCRE2_SPTR callout_string;
|
||||
|
||||
The version field contains the version number of the block format. The
|
||||
current version is 0. The version number will change in future if addi-
|
||||
tional fields are added, but the intention is never to remove any of
|
||||
the existing fields.
|
||||
The version field contains the version number of the block format. The
|
||||
current version is 1; the three callout string fields were added for
|
||||
this version. If you are writing an application that might use an ear-
|
||||
lier release of PCRE2, you should check the version number before
|
||||
accessing any of these fields. The version number will increase in
|
||||
future if more fields are added, but the intention is never to remove
|
||||
any of the existing fields.
|
||||
|
||||
The callout_number field contains the number of the callout, as com-
|
||||
piled into the pattern (that is, the number after ?C for manual call-
|
||||
outs, and 255 for automatically generated callouts).
|
||||
Fields for numerical callouts
|
||||
|
||||
For a numerical callout, callout_string is NULL, and callout_number
|
||||
contains the number of the callout, in the range 0-255. This is the
|
||||
number that follows (?C for manual callouts; it is 255 for automati-
|
||||
cally generated callouts.
|
||||
|
||||
Fields for string callouts
|
||||
|
||||
For callouts with string arguments, callout_number is always zero, and
|
||||
callout_string points to the string that is contained within the com-
|
||||
piled pattern. Its length is given by callout_string_length. Duplicated
|
||||
ending delimiters that were present in the original pattern string have
|
||||
been turned into single characters, but there is no other processing of
|
||||
the callout string argument. An additional code unit containing binary
|
||||
zero is present after the string, but is not included in the length.
|
||||
The delimiter that was used to start the string is also stored within
|
||||
the pattern, immediately before the string itself. You can access this
|
||||
delimiter as callout_string[-1] if you need it.
|
||||
|
||||
The callout_string_offset field is the code unit offset to the start of
|
||||
the callout argument string within the original pattern string. This is
|
||||
provided for the benefit of applications such as script languages that
|
||||
might need to report errors in the callout string within the pattern.
|
||||
|
||||
Fields for all callouts
|
||||
|
||||
The remaining fields in the callout block are the same for both kinds
|
||||
of callout.
|
||||
|
||||
The offset_vector field is a pointer to the vector of capturing offsets
|
||||
(the "ovector") that was passed to the matching function in the match
|
||||
|
@ -3464,8 +3530,8 @@ THE CALLOUT INTERFACE
|
|||
substrings. If no substrings have been captured, the value of cap-
|
||||
ture_last is 0. This is always the case for the DFA matching functions.
|
||||
|
||||
The pattern_position field contains the offset to the next item to be
|
||||
matched in the pattern string.
|
||||
The pattern_position field contains the offset in the pattern string to
|
||||
the next item to be matched.
|
||||
|
||||
The next_item_length field contains the length of the next item to be
|
||||
matched in the pattern string. When the callout immediately precedes an
|
||||
|
@ -3475,7 +3541,9 @@ THE CALLOUT INTERFACE
|
|||
|
||||
The pattern_position and next_item_length fields are intended to help
|
||||
in distinguishing between different automatic callouts, which all have
|
||||
the same callout number. However, they are set for all callouts.
|
||||
the same callout number. However, they are set for all callouts, and
|
||||
are used by pcre2test to show the next item to be matched when display-
|
||||
ing callout information.
|
||||
|
||||
In callouts from pcre2_match() the mark field contains a pointer to the
|
||||
zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
|
||||
|
@ -3485,7 +3553,7 @@ THE CALLOUT INTERFACE
|
|||
always contains NULL.
|
||||
|
||||
|
||||
RETURN VALUES
|
||||
RETURN VALUES FROM CALLOUTS
|
||||
|
||||
The external callout function returns an integer to PCRE2. If the value
|
||||
is zero, matching proceeds as normal. If the value is greater than
|
||||
|
@ -3501,6 +3569,49 @@ RETURN VALUES
|
|||
itself.
|
||||
|
||||
|
||||
CALLOUT ENUMERATION
|
||||
|
||||
int pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *),
|
||||
void *user_data);
|
||||
|
||||
A script language that supports the use of string arguments in callouts
|
||||
might like to scan all the callouts in a pattern before running the
|
||||
match. This can be done by calling pcre2_callout_enumerate(). The first
|
||||
argument is a pointer to a compiled pattern, the second points to a
|
||||
callback function, and the third is arbitrary user data. The callback
|
||||
function is called for every callout in the pattern in the order in
|
||||
which they appear. Its first argument is a pointer to a callout enumer-
|
||||
ation block, and its second argument is the user_data value that was
|
||||
passed to pcre2_callout_enumerate(). The data block contains the fol-
|
||||
lowing fields:
|
||||
|
||||
version Block version number
|
||||
pattern_position Offset to next item in pattern
|
||||
next_item_length Length of next item in pattern
|
||||
callout_number Number for numbered callouts
|
||||
callout_string_offset Offset to string within pattern
|
||||
callout_string_length Length of callout string
|
||||
callout_string Points to callout string or is NULL
|
||||
|
||||
The version number is currently 0. It will increase if new fields are
|
||||
ever added to the block. The remaining fields are the same as their
|
||||
namesakes in the pcre2_callout block that is used for callouts during
|
||||
matching, as described above.
|
||||
|
||||
Note that the value of pattern_position is unique for each callout.
|
||||
However, if a callout occurs inside a group that is quantified with a
|
||||
non-zero minimum or a fixed maximum, the group is replicated inside the
|
||||
compiled pattern. For example, a pattern such as /(a){2}/ is compiled
|
||||
as if it were /(a)(a)/. This means that the callout will be enumerated
|
||||
more than once, but with the same value for pattern_position in each
|
||||
case.
|
||||
|
||||
The callback function should normally return zero. If it returns a non-
|
||||
zero value, scanning the pattern stops, and that value is returned from
|
||||
pcre2_callout_enumerate().
|
||||
|
||||
|
||||
AUTHOR
|
||||
|
||||
Philip Hazel
|
||||
|
@ -3510,7 +3621,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 02 January 2015
|
||||
Last updated: 23 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -3585,104 +3696,103 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
|||
during pattern matching. See the pcre2callout documentation for
|
||||
details.
|
||||
|
||||
8. Subpatterns that are called as subroutines (whether or not recur-
|
||||
sively) are always treated as atomic groups in PCRE2. This is like
|
||||
Python, but unlike Perl. Captured values that are set outside a sub-
|
||||
routine call can be reference from inside in PCRE2, but not in Perl.
|
||||
There is a discussion that explains these differences in more detail in
|
||||
the section on recursion differences from Perl in the pcre2pattern
|
||||
page.
|
||||
8. Subroutine calls (whether recursive or not) are treated as atomic
|
||||
groups. Atomic recursion is like Python, but unlike Perl. Captured
|
||||
values that are set outside a subroutine call can be referenced from
|
||||
inside in PCRE2, but not in Perl. There is a discussion that explains
|
||||
these differences in more detail in the section on recursion differ-
|
||||
ences from Perl in the pcre2pattern page.
|
||||
|
||||
9. If any of the backtracking control verbs are used in a subpattern
|
||||
that is called as a subroutine (whether or not recursively), their
|
||||
effect is confined to that subpattern; it does not extend to the sur-
|
||||
rounding pattern. This is not always the case in Perl. In particular,
|
||||
if (*THEN) is present in a group that is called as a subroutine, its
|
||||
9. If any of the backtracking control verbs are used in a subpattern
|
||||
that is called as a subroutine (whether or not recursively), their
|
||||
effect is confined to that subpattern; it does not extend to the sur-
|
||||
rounding pattern. This is not always the case in Perl. In particular,
|
||||
if (*THEN) is present in a group that is called as a subroutine, its
|
||||
action is limited to that group, even if the group does not contain any
|
||||
| characters. Note that such subpatterns are processed as anchored at
|
||||
| characters. Note that such subpatterns are processed as anchored at
|
||||
the point where they are tested.
|
||||
|
||||
10. If a pattern contains more than one backtracking control verb, the
|
||||
first one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
||||
10. If a pattern contains more than one backtracking control verb, the
|
||||
first one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
||||
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
||||
it is the same as PCRE2, but there are examples where it differs.
|
||||
|
||||
11. Most backtracking verbs in assertions have their normal actions.
|
||||
11. Most backtracking verbs in assertions have their normal actions.
|
||||
They are not confined to the assertion.
|
||||
|
||||
12. There are some differences that are concerned with the settings of
|
||||
captured strings when part of a pattern is repeated. For example,
|
||||
matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
|
||||
12. There are some differences that are concerned with the settings of
|
||||
captured strings when part of a pattern is repeated. For example,
|
||||
matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2
|
||||
unset, but in PCRE2 it is set to "b".
|
||||
|
||||
13. PCRE2's handling of duplicate subpattern numbers and duplicate sub-
|
||||
pattern names is not as general as Perl's. This is a consequence of the
|
||||
fact the PCRE2 works internally just with numbers, using an external
|
||||
table to translate between numbers and names. In particular, a pattern
|
||||
such as (?|(?<a>A)|(?<b)B), where the two capturing parentheses have
|
||||
the same number but different names, is not supported, and causes an
|
||||
error at compile time. If it were allowed, it would not be possible to
|
||||
distinguish which parentheses matched, because both names map to cap-
|
||||
fact that PCRE2 works internally just with numbers, using an external
|
||||
table to translate between numbers and names. In particular, a pattern
|
||||
such as (?|(?<a>A)|(?<b>B), where the two capturing parentheses have
|
||||
the same number but different names, is not supported, and causes an
|
||||
error at compile time. If it were allowed, it would not be possible to
|
||||
distinguish which parentheses matched, because both names map to cap-
|
||||
turing subpattern number 1. To avoid this confusing situation, an error
|
||||
is given at compile time.
|
||||
|
||||
14. Perl recognizes comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a subpattern. If the /x
|
||||
modifier is set, Perl allows white space between ( and ? (though cur-
|
||||
rent Perls warn that this is deprecated) but PCRE2 never does, even if
|
||||
14. Perl recognizes comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a subpattern. If the /x
|
||||
modifier is set, Perl allows white space between ( and ? (though cur-
|
||||
rent Perls warn that this is deprecated) but PCRE2 never does, even if
|
||||
the PCRE2_EXTENDED option is set.
|
||||
|
||||
15. Perl, when in warning mode, gives warnings for character classes
|
||||
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
||||
15. Perl, when in warning mode, gives warnings for character classes
|
||||
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
||||
als. PCRE2 has no warning features, so it gives an error in these cases
|
||||
because they are almost certainly user mistakes.
|
||||
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are
|
||||
not affected when case-independent matching is specified. For example,
|
||||
16. In PCRE2, the upper/lower case character properties Lu and Ll are
|
||||
not affected when case-independent matching is specified. For example,
|
||||
\p{Lu} always matches an upper case letter. I think Perl has changed in
|
||||
this respect; in the release at the time of writing (5.16), \p{Lu} and
|
||||
this respect; in the release at the time of writing (5.16), \p{Lu} and
|
||||
\p{Ll} match all letters, regardless of case, when case independence is
|
||||
specified.
|
||||
|
||||
17. PCRE2 provides some extensions to the Perl regular expression
|
||||
facilities. Perl 5.10 includes new features that are not in earlier
|
||||
versions of Perl, some of which (such as named parentheses) have been
|
||||
17. PCRE2 provides some extensions to the Perl regular expression
|
||||
facilities. Perl 5.10 includes new features that are not in earlier
|
||||
versions of Perl, some of which (such as named parentheses) have been
|
||||
in PCRE2 for some time. This list is with respect to Perl 5.10:
|
||||
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length
|
||||
strings, each alternative branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same
|
||||
(a) Although lookbehind assertions in PCRE2 must match fixed length
|
||||
strings, each alternative branch of a lookbehind assertion can match a
|
||||
different length of string. Perl requires them all to have the same
|
||||
length.
|
||||
|
||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
||||
$ meta-character matches only at the very end of the string.
|
||||
|
||||
(c) A backslash followed by a letter with no special meaning is
|
||||
(c) A backslash followed by a letter with no special meaning is
|
||||
faulted. (Perl can be made to issue a warning.)
|
||||
|
||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
||||
fiers is inverted, that is, by default they are not greedy, but if fol-
|
||||
lowed by a question mark they are.
|
||||
|
||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
||||
be tried only at the first matching position in the subject string.
|
||||
|
||||
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
||||
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
||||
equivalents.
|
||||
|
||||
(g) The \R escape sequence can be restricted to match only CR, LF, or
|
||||
(g) The \R escape sequence can be restricted to match only CR, LF, or
|
||||
CRLF by the PCRE2_BSR_ANYCRLF option.
|
||||
|
||||
(h) The callout facility is PCRE2-specific.
|
||||
|
||||
(i) The partial matching facility is PCRE2-specific.
|
||||
|
||||
(j) The alternative matching function (pcre2_dfa_match() matches in a
|
||||
(j) The alternative matching function (pcre2_dfa_match()) matches in a
|
||||
different way and is not Perl-compatible.
|
||||
|
||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start
|
||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start
|
||||
of a pattern that set overall options that cannot be changed within the
|
||||
pattern.
|
||||
|
||||
|
@ -3696,8 +3806,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 28 September 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 15 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
.TH PCRE2_CALLOUT_ENUMERATE 3 "23 March 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||
.B " void *\fIcallout_data\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function scans a compiled regular expression and calls the \fIcallback()\fP
|
||||
function for each callout within the pattern. The yield of the function is zero
|
||||
for success and non-zero otherwise. The arguments are:
|
||||
.sp
|
||||
\fIcode\fP Points to the compiled pattern
|
||||
\fIcallback\fP The callback function
|
||||
\fIcallout_data\fP User data that is passed to the callback
|
||||
.sp
|
||||
The \fIcallback()\fP function is passed a pointer to a data block containing
|
||||
the following fields:
|
||||
.sp
|
||||
\fIversion\fP Block version number
|
||||
\fIpattern_position\fP Offset to next item in pattern
|
||||
\fInext_item_length\fP Length of next item in pattern
|
||||
\fIcallout_number\fP Number for numbered callouts
|
||||
\fIcallout_string_offset\fP Offset to string within pattern
|
||||
\fIcallout_string_length\fP Length of callout string
|
||||
\fIcallout_string\fP Points to callout string or is NULL
|
||||
.sp
|
||||
The second argument is the callout data that was passed to
|
||||
\fBpcre2_callout_enumerate()\fP. The \fBcallback()\fP function must return zero
|
||||
for success. Any other value causes the pattern scan to stop, with the value
|
||||
being passed back as the result of \fBpcre2_callout_enumerate()\fP.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "23 January 2015" "PCRE2 10.10"
|
||||
.TH PCRE2API 3 "23 March 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -234,6 +234,10 @@ document for an overview of all the PCRE2 documentation.
|
|||
.sp
|
||||
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||
.sp
|
||||
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||
.B " void *\fIuser_data\fP);"
|
||||
.sp
|
||||
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||
.fi
|
||||
.
|
||||
|
@ -1427,14 +1431,19 @@ can be processed in different locales.
|
|||
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
|
||||
.fi
|
||||
.P
|
||||
The \fBpcre2_pattern_info()\fP function returns information about a compiled
|
||||
pattern. The first argument is a pointer to the compiled pattern. The second
|
||||
argument specifies which piece of information is required, and the third
|
||||
argument is a pointer to a variable to receive the data. If the third argument
|
||||
is NULL, the first argument is ignored, and the function returns the size in
|
||||
bytes of the variable that is required for the information requested.
|
||||
Otherwise, The yield of the function is zero for success, or one of the
|
||||
following negative numbers:
|
||||
The \fBpcre2_pattern_info()\fP function returns general information about a
|
||||
compiled pattern. For information about callouts, see the
|
||||
.\" HTML <a href="#infoaboutcallouts">
|
||||
.\" </a>
|
||||
next section.
|
||||
.\"
|
||||
The first argument for \fBpcre2_pattern_info()\fP is a pointer to the compiled
|
||||
pattern. The second argument specifies which piece of information is required,
|
||||
and the third argument is a pointer to a variable to receive the data. If the
|
||||
third argument is NULL, the first argument is ignored, and the function returns
|
||||
the size in bytes of the variable that is required for the information
|
||||
requested. Otherwise, the yield of the function is zero for success, or one of
|
||||
the following negative numbers:
|
||||
.sp
|
||||
PCRE2_ERROR_NULL the argument \fIcode\fP was NULL
|
||||
PCRE2_ERROR_BADMAGIC the "magic number" was not found
|
||||
|
@ -1716,6 +1725,31 @@ calculates the size has to over-estimate. Processing a pattern with the JIT
|
|||
compiler does not alter the value returned by this option.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="infoaboutcallouts"></a>
|
||||
.SH "INFORMATION ABOUT A PATTERN'S CALLOUTS"
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||
.B " void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
A script language that supports the use of string arguments in callouts might
|
||||
like to scan all the callouts in a pattern before running the match. This can
|
||||
be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a
|
||||
pointer to a compiled pattern, the second points to a callback function, and
|
||||
the third is arbitrary user data. The callback function is called for every
|
||||
callout in the pattern in the order in which they appear. Its first argument is
|
||||
a pointer to a callout enumeration block, and its second argument is the
|
||||
\fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The
|
||||
contents of the callout enumeration block are described in the
|
||||
.\" HREF
|
||||
\fBpcre2callout\fP
|
||||
.\"
|
||||
documentation, which also gives further details about callouts.
|
||||
.
|
||||
.
|
||||
.SH "SERIALIZATION AND PRECOMPILING"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -2275,8 +2309,8 @@ of the subject.
|
|||
PCRE2_ERROR_CALLOUT
|
||||
.sp
|
||||
This error is never generated by \fBpcre2_match()\fP itself. It is provided for
|
||||
use by callout functions that want to cause \fBpcre2_match()\fP to return a
|
||||
distinctive error code. See the
|
||||
use by callout functions that want to cause \fBpcre2_match()\fP or
|
||||
\fBpcre2_callout_enumerate()\fP to return a distinctive error code. See the
|
||||
.\" HREF
|
||||
\fBpcre2callout\fP
|
||||
.\"
|
||||
|
@ -2885,6 +2919,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 January 2015
|
||||
Last updated: 23 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2CALLOUT 3 "16 March 2015" "PCRE2 10.20"
|
||||
.TH PCRE2CALLOUT 3 "23 March 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -7,7 +7,13 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.SM
|
||||
.nf
|
||||
.B int (*pcre2_callout)(pcre2_callout_block *, void *);
|
||||
.sp
|
||||
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||
.B " void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
|
@ -170,6 +176,7 @@ option to \fBpcre2_compile()\fP, or by starting the pattern with
|
|||
callouts such as the example above are obeyed.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="calloutinterface"></a>
|
||||
.SH "THE CALLOUT INTERFACE"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -199,7 +206,6 @@ documentation). The callout block structure contains the following fields:
|
|||
PCRE2_SIZE \fIcallout_string_offset\fP;
|
||||
PCRE2_SIZE \fIcallout_string_length\fP;
|
||||
PCRE2_SPTR \fIcallout_string\fP;
|
||||
|
||||
.sp
|
||||
The \fIversion\fP field contains the version number of the block format. The
|
||||
current version is 1; the three callout string fields were added for this
|
||||
|
@ -276,8 +282,8 @@ outside the recursion, as do the values of all captured substrings. If no
|
|||
substrings have been captured, the value of \fIcapture_last\fP is 0. This is
|
||||
always the case for the DFA matching functions.
|
||||
.P
|
||||
The \fIpattern_position\fP field contains the offset to the next item to be
|
||||
matched in the pattern string.
|
||||
The \fIpattern_position\fP field contains the offset in the pattern string to
|
||||
the next item to be matched.
|
||||
.P
|
||||
The \fInext_item_length\fP field contains the length of the next item to be
|
||||
matched in the pattern string. When the callout immediately precedes an
|
||||
|
@ -298,7 +304,7 @@ of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
|
|||
callouts from the DFA matching function this field always contains NULL.
|
||||
.
|
||||
.
|
||||
.SH "RETURN VALUES"
|
||||
.SH "RETURN VALUES FROM CALLOUTS"
|
||||
.rs
|
||||
.sp
|
||||
The external callout function returns an integer to PCRE2. If the value is
|
||||
|
@ -314,6 +320,54 @@ failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
|
|||
functions; it will never be used by PCRE2 itself.
|
||||
.
|
||||
.
|
||||
.SH "CALLOUT ENUMERATION"
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
|
||||
.B " int (*\fIcallback\fP)(pcre2_callout_enumerate_block *, void *),"
|
||||
.B " void *\fIuser_data\fP);"
|
||||
.fi
|
||||
.sp
|
||||
A script language that supports the use of string arguments in callouts might
|
||||
like to scan all the callouts in a pattern before running the match. This can
|
||||
be done by calling \fBpcre2_callout_enumerate()\fP. The first argument is a
|
||||
pointer to a compiled pattern, the second points to a callback function, and
|
||||
the third is arbitrary user data. The callback function is called for every
|
||||
callout in the pattern in the order in which they appear. Its first argument is
|
||||
a pointer to a callout enumeration block, and its second argument is the
|
||||
\fIuser_data\fP value that was passed to \fBpcre2_callout_enumerate()\fP. The
|
||||
data block contains the following fields:
|
||||
.sp
|
||||
\fIversion\fP Block version number
|
||||
\fIpattern_position\fP Offset to next item in pattern
|
||||
\fInext_item_length\fP Length of next item in pattern
|
||||
\fIcallout_number\fP Number for numbered callouts
|
||||
\fIcallout_string_offset\fP Offset to string within pattern
|
||||
\fIcallout_string_length\fP Length of callout string
|
||||
\fIcallout_string\fP Points to callout string or is NULL
|
||||
.sp
|
||||
The version number is currently 0. It will increase if new fields are ever
|
||||
added to the block. The remaining fields are the same as their namesakes in the
|
||||
\fBpcre2_callout\fP block that is used for callouts during matching, as
|
||||
described
|
||||
.\" HTML <a href="#calloutinterface">
|
||||
.\" </a>
|
||||
above.
|
||||
.\"
|
||||
.P
|
||||
Note that the value of \fIpattern_position\fP is unique for each callout.
|
||||
However, if a callout occurs inside a group that is quantified with a non-zero
|
||||
minimum or a fixed maximum, the group is replicated inside the compiled
|
||||
pattern. For example, a pattern such as /(a){2}/ is compiled as if it were
|
||||
/(a)(a)/. This means that the callout will be enumerated more than once, but
|
||||
with the same value for \fIpattern_position\fP in each case.
|
||||
.P
|
||||
The callback function should normally return zero. If it returns a non-zero
|
||||
value, scanning the pattern stops, and that value is returned from
|
||||
\fBpcre2_callout_enumerate()\fP.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
|
@ -328,6 +382,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 16 March 2015
|
||||
Last updated: 23 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "16 March 2015" "PCRE 10.20"
|
||||
.TH PCRE2TEST 1 "22 March 2015" "PCRE 10.20"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -473,6 +473,7 @@ about the pattern:
|
|||
.sp
|
||||
bsr=[anycrlf|unicode] specify \eR handling
|
||||
/B bincode show binary code without lengths
|
||||
callout_info show callout information
|
||||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
|
@ -549,6 +550,11 @@ if there is more than one they are listed as "starting code units". "Last code
|
|||
unit" is the last literal code unit that must be present in any match. This is
|
||||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
.P
|
||||
The \fBcallout_info\fP modifier requests information about all the callouts in
|
||||
the pattern. A list of them is output at the end of any other information that
|
||||
is requested. For each callout, either its number or string is given, followed
|
||||
by the item that follows it in the pattern.
|
||||
.
|
||||
.
|
||||
.SS "Specifying a pattern in hex"
|
||||
|
@ -1437,6 +1443,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 16 March 2015
|
||||
Last updated: 22 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -59,41 +59,48 @@ INPUT ENCODING
|
|||
|
||||
Input to pcre2test is processed line by line, either by calling the C
|
||||
library's fgets() function, or via the libreadline library (see below).
|
||||
In Unix-like environments, fgets() treats any bytes other than newline
|
||||
as data characters. However, in some Windows environments character 26
|
||||
(hex 1A) causes an immediate end of file, and no further data is read.
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in pcre2test input files.
|
||||
The input is processed using C's string functions, so must not
|
||||
contain binary zeroes, even though in Unix-like environments, fgets()
|
||||
treats any bytes other than newline as data characters. In some Windows
|
||||
environments character 26 (hex 1A) causes an immediate end of file, and
|
||||
no further data is read.
|
||||
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in pcre2test input files. There is a facility for specifying
|
||||
a pattern's characters as hexadecimal pairs, thus making it possible to
|
||||
include binary zeroes in a pattern for testing purposes. Subject lines
|
||||
are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
|
||||
|
||||
COMMAND LINE OPTIONS
|
||||
|
||||
-8 If the 8-bit library has been built, this option causes it to
|
||||
be used (this is the default). If the 8-bit library has not
|
||||
be used (this is the default). If the 8-bit library has not
|
||||
been built, this option causes an error.
|
||||
|
||||
-16 If the 16-bit library has been built, this option causes it
|
||||
to be used. If only the 16-bit library has been built, this
|
||||
is the default. If the 16-bit library has not been built,
|
||||
-16 If the 16-bit library has been built, this option causes it
|
||||
to be used. If only the 16-bit library has been built, this
|
||||
is the default. If the 16-bit library has not been built,
|
||||
this option causes an error.
|
||||
|
||||
-32 If the 32-bit library has been built, this option causes it
|
||||
to be used. If only the 32-bit library has been built, this
|
||||
is the default. If the 32-bit library has not been built,
|
||||
-32 If the 32-bit library has been built, this option causes it
|
||||
to be used. If only the 32-bit library has been built, this
|
||||
is the default. If the 32-bit library has not been built,
|
||||
this option causes an error.
|
||||
|
||||
-b Behave as if each pattern has the /fullbincode modifier; the
|
||||
-b Behave as if each pattern has the /fullbincode modifier; the
|
||||
full internal binary form of the pattern is output after com-
|
||||
pilation.
|
||||
|
||||
-C Output the version number of the PCRE2 library, and all
|
||||
available information about the optional features that are
|
||||
included, and then exit with zero exit code. All other
|
||||
-C Output the version number of the PCRE2 library, and all
|
||||
available information about the optional features that are
|
||||
included, and then exit with zero exit code. All other
|
||||
options are ignored.
|
||||
|
||||
-C option Output information about a specific build-time option, then
|
||||
exit. This functionality is intended for use in scripts such
|
||||
as RunTest. The following options output the value and set
|
||||
-C option Output information about a specific build-time option, then
|
||||
exit. This functionality is intended for use in scripts such
|
||||
as RunTest. The following options output the value and set
|
||||
the exit code as indicated:
|
||||
|
||||
ebcdic-nl the code for LF (= NL) in an EBCDIC environment:
|
||||
|
@ -109,7 +116,7 @@ COMMAND LINE OPTIONS
|
|||
ANYCRLF or ANY
|
||||
exit code is always 0
|
||||
|
||||
The following options output 1 for true or 0 for false, and
|
||||
The following options output 1 for true or 0 for false, and
|
||||
set the exit code to the same value:
|
||||
|
||||
ebcdic compiled for an EBCDIC environment
|
||||
|
@ -119,15 +126,15 @@ COMMAND LINE OPTIONS
|
|||
pcre2-8 the 8-bit library was built
|
||||
unicode Unicode support is available
|
||||
|
||||
If an unknown option is given, an error message is output;
|
||||
If an unknown option is given, an error message is output;
|
||||
the exit code is 0.
|
||||
|
||||
-d Behave as if each pattern has the debug modifier; the inter-
|
||||
-d Behave as if each pattern has the debug modifier; the inter-
|
||||
nal form and information about the compiled pattern is output
|
||||
after compilation; -d is equivalent to -b -i.
|
||||
|
||||
-dfa Behave as if each subject line has the dfa modifier; matching
|
||||
is done using the pcre2_dfa_match() function instead of the
|
||||
is done using the pcre2_dfa_match() function instead of the
|
||||
default pcre2_match().
|
||||
|
||||
-help Output a brief summary of these options and then exit.
|
||||
|
@ -135,8 +142,8 @@ COMMAND LINE OPTIONS
|
|||
-i Behave as if each pattern has the /info modifier; information
|
||||
about the compiled pattern is given after compilation.
|
||||
|
||||
-jit Behave as if each pattern line has the jit modifier; after
|
||||
successful compilation, each pattern is passed to the just-
|
||||
-jit Behave as if each pattern line has the jit modifier; after
|
||||
successful compilation, each pattern is passed to the just-
|
||||
in-time compiler, if available.
|
||||
|
||||
-pattern modifier-list
|
||||
|
@ -145,25 +152,25 @@ COMMAND LINE OPTIONS
|
|||
-q Do not output the version number of pcre2test at the start of
|
||||
execution.
|
||||
|
||||
-S size On Unix-like systems, set the size of the run-time stack to
|
||||
-S size On Unix-like systems, set the size of the run-time stack to
|
||||
size megabytes.
|
||||
|
||||
-subject modifier-list
|
||||
Behave as if each subject line contains the given modifiers.
|
||||
|
||||
-t Run each compile and match many times with a timer, and out-
|
||||
put the resulting times per compile or match. When JIT is
|
||||
used, separate times are given for the initial compile and
|
||||
the JIT compile. You can control the number of iterations
|
||||
that are used for timing by following -t with a number (as a
|
||||
separate item on the command line). For example, "-t 1000"
|
||||
-t Run each compile and match many times with a timer, and out-
|
||||
put the resulting times per compile or match. When JIT is
|
||||
used, separate times are given for the initial compile and
|
||||
the JIT compile. You can control the number of iterations
|
||||
that are used for timing by following -t with a number (as a
|
||||
separate item on the command line). For example, "-t 1000"
|
||||
iterates 1000 times. The default is to iterate 500,000 times.
|
||||
|
||||
-tm This is like -t except that it times only the matching phase,
|
||||
not the compile phase.
|
||||
|
||||
-T -TM These behave like -t and -tm, but in addition, at the end of
|
||||
a run, the total times for all compiles and matches are out-
|
||||
-T -TM These behave like -t and -tm, but in addition, at the end of
|
||||
a run, the total times for all compiles and matches are out-
|
||||
put.
|
||||
|
||||
-version Output the PCRE2 version number and then exit.
|
||||
|
@ -171,158 +178,158 @@ COMMAND LINE OPTIONS
|
|||
|
||||
DESCRIPTION
|
||||
|
||||
If pcre2test is given two filename arguments, it reads from the first
|
||||
If pcre2test is given two filename arguments, it reads from the first
|
||||
and writes to the second. If the first name is "-", input is taken from
|
||||
the standard input. If pcre2test is given only one argument, it reads
|
||||
the standard input. If pcre2test is given only one argument, it reads
|
||||
from that file and writes to stdout. Otherwise, it reads from stdin and
|
||||
writes to stdout.
|
||||
|
||||
When pcre2test is built, a configuration option can specify that it
|
||||
should be linked with the libreadline or libedit library. When this is
|
||||
done, if the input is from a terminal, it is read using the readline()
|
||||
When pcre2test is built, a configuration option can specify that it
|
||||
should be linked with the libreadline or libedit library. When this is
|
||||
done, if the input is from a terminal, it is read using the readline()
|
||||
function. This provides line-editing and history facilities. The output
|
||||
from the -help option states whether or not readline() will be used.
|
||||
|
||||
The program handles any number of tests, each of which consists of a
|
||||
set of input lines. Each set starts with a regular expression pattern,
|
||||
The program handles any number of tests, each of which consists of a
|
||||
set of input lines. Each set starts with a regular expression pattern,
|
||||
followed by any number of subject lines to be matched against that pat-
|
||||
tern. In between sets of test data, command lines that begin with # may
|
||||
appear. This file format, with some restrictions, can also be processed
|
||||
by the perltest.sh script that is distributed with PCRE2 as a means of
|
||||
by the perltest.sh script that is distributed with PCRE2 as a means of
|
||||
checking that the behaviour of PCRE2 and Perl is the same.
|
||||
|
||||
When the input is a terminal, pcre2test prompts for each line of input,
|
||||
using "re>" to prompt for regular expression patterns, and "data>" to
|
||||
prompt for subject lines. Command lines starting with # can be entered
|
||||
using "re>" to prompt for regular expression patterns, and "data>" to
|
||||
prompt for subject lines. Command lines starting with # can be entered
|
||||
only in response to the "re>" prompt.
|
||||
|
||||
Each subject line is matched separately and independently. If you want
|
||||
Each subject line is matched separately and independently. If you want
|
||||
to do multi-line matches, you have to use the \n escape sequence (or \r
|
||||
or \r\n, etc., depending on the newline setting) in a single line of
|
||||
input to encode the newline sequences. There is no limit on the length
|
||||
of subject lines; the input buffer is automatically extended if it is
|
||||
too small. There is a replication feature that makes it possible to
|
||||
or \r\n, etc., depending on the newline setting) in a single line of
|
||||
input to encode the newline sequences. There is no limit on the length
|
||||
of subject lines; the input buffer is automatically extended if it is
|
||||
too small. There is a replication feature that makes it possible to
|
||||
generate long subject lines without having to supply them explicitly.
|
||||
|
||||
An empty line or the end of the file signals the end of the subject
|
||||
lines for a test, at which point a new pattern or command line is
|
||||
An empty line or the end of the file signals the end of the subject
|
||||
lines for a test, at which point a new pattern or command line is
|
||||
expected if there is still input to be read.
|
||||
|
||||
|
||||
COMMAND LINES
|
||||
|
||||
In between sets of test data, a line that begins with # is interpreted
|
||||
In between sets of test data, a line that begins with # is interpreted
|
||||
as a command line. If the first character is followed by white space or
|
||||
an exclamation mark, the line is treated as a comment, and ignored.
|
||||
an exclamation mark, the line is treated as a comment, and ignored.
|
||||
Otherwise, the following commands are recognized:
|
||||
|
||||
#forbid_utf
|
||||
|
||||
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
||||
Subsequent patterns automatically have the PCRE2_NEVER_UTF and
|
||||
PCRE2_NEVER_UCP options set, which locks out the use of UTF and Unicode
|
||||
property features. This is a trigger guard that is used in test files
|
||||
property features. This is a trigger guard that is used in test files
|
||||
to ensure that UTF or Unicode property tests are not accidentally added
|
||||
to files that are used when Unicode support is not included in the
|
||||
library. This effect can also be obtained by the use of #pattern; the
|
||||
difference is that #forbid_utf cannot be unset, and the automatic
|
||||
options are not displayed in pattern information, to avoid cluttering
|
||||
to files that are used when Unicode support is not included in the
|
||||
library. This effect can also be obtained by the use of #pattern; the
|
||||
difference is that #forbid_utf cannot be unset, and the automatic
|
||||
options are not displayed in pattern information, to avoid cluttering
|
||||
up test output.
|
||||
|
||||
#load <filename>
|
||||
|
||||
This command is used to load a set of precompiled patterns from a file,
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
as described in the section entitled "Saving and restoring compiled
|
||||
patterns" below.
|
||||
|
||||
#pattern <modifier-list>
|
||||
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent patterns. Modifiers on a pattern can change these settings.
|
||||
|
||||
#perltest
|
||||
|
||||
The appearance of this line causes all subsequent modifier settings to
|
||||
The appearance of this line causes all subsequent modifier settings to
|
||||
be checked for compatibility with the perltest.sh script, which is used
|
||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||
comment lines, none of the other command lines are permitted, because
|
||||
they and many of the modifiers are specific to pcre2test, and should
|
||||
not be used in test files that are also processed by perltest.sh. The
|
||||
#perltest command helps detect tests that are accidentally put in the
|
||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||
comment lines, none of the other command lines are permitted, because
|
||||
they and many of the modifiers are specific to pcre2test, and should
|
||||
not be used in test files that are also processed by perltest.sh. The
|
||||
#perltest command helps detect tests that are accidentally put in the
|
||||
wrong file.
|
||||
|
||||
#pop [<modifiers>]
|
||||
|
||||
This command is used to manipulate the stack of compiled patterns, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
This command is used to manipulate the stack of compiled patterns, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below.
|
||||
|
||||
#save <filename>
|
||||
|
||||
This command is used to save a set of compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
This command is used to save a set of compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below.
|
||||
|
||||
#subject <modifier-list>
|
||||
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent subject lines. Modifiers on a subject line can change these set-
|
||||
This command sets a default modifier list that applies to all subse-
|
||||
quent subject lines. Modifiers on a subject line can change these set-
|
||||
tings.
|
||||
|
||||
|
||||
MODIFIER SYNTAX
|
||||
|
||||
Modifier lists are used with both pattern and subject lines. Items in a
|
||||
list are separated by commas and optional white space. Some modifiers
|
||||
may be given for both patterns and subject lines, whereas others are
|
||||
valid for one or the other only. Each modifier has a long name, for
|
||||
list are separated by commas and optional white space. Some modifiers
|
||||
may be given for both patterns and subject lines, whereas others are
|
||||
valid for one or the other only. Each modifier has a long name, for
|
||||
example "anchored", and some of them must be followed by an equals sign
|
||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||
ues may be preceded by a minus sign to turn off a previous setting.
|
||||
|
||||
A few of the more common modifiers can also be specified as single let-
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
Perl convention, these are written with a slash ("the /i modifier") for
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
For example:
|
||||
|
||||
/abc/ig,newline=cr,jit=3
|
||||
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
same as used in Perl.
|
||||
|
||||
|
||||
PATTERN SYNTAX
|
||||
|
||||
A pattern line must start with one of the following characters (common
|
||||
A pattern line must start with one of the following characters (common
|
||||
symbols, excluding pattern meta-characters):
|
||||
|
||||
/ ! " ' ` - = _ : ; , % & @ ~
|
||||
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
characters are included within it. It is possible to include the delim-
|
||||
iter within the pattern by escaping it with a backslash, for example
|
||||
|
||||
/abc\/def/
|
||||
|
||||
If you do this, the escape and the delimiter form part of the pattern,
|
||||
If you do this, the escape and the delimiter form part of the pattern,
|
||||
but since the delimiters are all non-alphanumeric, this does not affect
|
||||
its interpretation. If the terminating delimiter is immediately fol-
|
||||
its interpretation. If the terminating delimiter is immediately fol-
|
||||
lowed by a backslash, for example,
|
||||
|
||||
/abc/\
|
||||
|
||||
then a backslash is added to the end of the pattern. This is done to
|
||||
provide a way of testing the error condition that arises if a pattern
|
||||
then a backslash is added to the end of the pattern. This is done to
|
||||
provide a way of testing the error condition that arises if a pattern
|
||||
finishes with a backslash, because
|
||||
|
||||
/abc\/
|
||||
|
||||
is interpreted as the first line of a pattern that starts with "abc/",
|
||||
causing pcre2test to read the next line as a continuation of the regu-
|
||||
is interpreted as the first line of a pattern that starts with "abc/",
|
||||
causing pcre2test to read the next line as a continuation of the regu-
|
||||
lar expression.
|
||||
|
||||
A pattern can be followed by a modifier list (details below).
|
||||
|
@ -330,7 +337,7 @@ PATTERN SYNTAX
|
|||
|
||||
SUBJECT LINE SYNTAX
|
||||
|
||||
Before each subject line is passed to pcre2_match() or
|
||||
Before each subject line is passed to pcre2_match() or
|
||||
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes. The following provide a means of
|
||||
encoding non-printing characters in a visible way:
|
||||
|
@ -350,23 +357,23 @@ SUBJECT LINE SYNTAX
|
|||
\x{hh...} hexadecimal character (any number of hex digits)
|
||||
|
||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
||||
the pattern. It is recognized always. There may be any number of hexa-
|
||||
decimal digits inside the braces; invalid values provoke error mes-
|
||||
the pattern. It is recognized always. There may be any number of hexa-
|
||||
decimal digits inside the braces; invalid values provoke error mes-
|
||||
sages.
|
||||
|
||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||
character in UTF-8 mode, generating more than one byte if the value is
|
||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||
character in UTF-8 mode, generating more than one byte if the value is
|
||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||
\x{hh} generates one byte for values less than 256, and causes an error
|
||||
for greater values.
|
||||
|
||||
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
||||
possible to construct invalid UTF-16 sequences for testing purposes.
|
||||
|
||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||
makes it possible to construct invalid UTF-32 sequences for testing
|
||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||
makes it possible to construct invalid UTF-32 sequences for testing
|
||||
purposes.
|
||||
|
||||
There is a special backslash sequence that specifies replication of one
|
||||
|
@ -374,38 +381,38 @@ SUBJECT LINE SYNTAX
|
|||
|
||||
\[<characters>]{<count>}
|
||||
|
||||
This makes it possible to test long strings without having to provide
|
||||
This makes it possible to test long strings without having to provide
|
||||
them as part of the file. For example:
|
||||
|
||||
\[abc]{4}
|
||||
|
||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||
To include a closing square bracket in the characters, code it as \x5D.
|
||||
|
||||
A backslash followed by an equals sign marks the end of the subject
|
||||
A backslash followed by an equals sign marks the end of the subject
|
||||
string and the start of a modifier list. For example:
|
||||
|
||||
abc\=notbol,notempty
|
||||
|
||||
A backslash followed by any other non-alphanumeric character just
|
||||
A backslash followed by any other non-alphanumeric character just
|
||||
escapes that character. A backslash followed by anything else causes an
|
||||
error. However, if the very last character in the line is a backslash
|
||||
(and there is no modifier list), it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the
|
||||
error. However, if the very last character in the line is a backslash
|
||||
(and there is no modifier list), it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the
|
||||
data input.
|
||||
|
||||
|
||||
PATTERN MODIFIERS
|
||||
|
||||
There are three types of modifier that can appear in pattern lines, two
|
||||
of which may also be used in a #pattern command. A pattern's modifier
|
||||
of which may also be used in a #pattern command. A pattern's modifier
|
||||
list can add to or override default modifiers that were set by a previ-
|
||||
ous #pattern command.
|
||||
|
||||
Setting compilation options
|
||||
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||
tion of their effects.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -432,17 +439,18 @@ PATTERN MODIFIERS
|
|||
utf set PCRE2_UTF
|
||||
|
||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||
non-printing characters in output strings to be printed using the
|
||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||
non-printing characters in output strings to be printed using the
|
||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||
without the curly brackets.
|
||||
|
||||
Setting compilation controls
|
||||
|
||||
The following modifiers affect the compilation process or request
|
||||
The following modifiers affect the compilation process or request
|
||||
information about the pattern:
|
||||
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
/B bincode show binary code without lengths
|
||||
callout_info show callout information
|
||||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
|
@ -463,34 +471,34 @@ PATTERN MODIFIERS
|
|||
|
||||
Newline and \R handling
|
||||
|
||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||
"unicode", \R matches any Unicode newline sequence. The default is
|
||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||
"unicode", \R matches any Unicode newline sequence. The default is
|
||||
specified when PCRE2 is built, with the default default being Unicode.
|
||||
|
||||
The newline modifier specifies which characters are to be interpreted
|
||||
The newline modifier specifies which characters are to be interpreted
|
||||
as newlines, both in the pattern and in subject lines. The type must be
|
||||
one of CR, LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||
|
||||
Information about a pattern
|
||||
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
available information.
|
||||
|
||||
The bincode modifier causes a representation of the compiled code to be
|
||||
output after compilation. This information does not contain length and
|
||||
output after compilation. This information does not contain length and
|
||||
offset values, which ensures that the same output is generated for dif-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ments.
|
||||
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
values. This is used in a few special tests that run only for specific
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
values. This is used in a few special tests that run only for specific
|
||||
code unit widths and link sizes, and is also useful for one-off tests.
|
||||
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
information is obtained from the pcre2_pattern_info() function. Here
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
information is obtained from the pcre2_pattern_info() function. Here
|
||||
are some typical examples:
|
||||
|
||||
re> /(?i)(^a|^b)/m,info
|
||||
|
@ -508,16 +516,21 @@ PATTERN MODIFIERS
|
|||
Last code unit = 'c' (caseless)
|
||||
Subject length lower bound = 3
|
||||
|
||||
"Compile options" are those specified by modifiers; "overall options"
|
||||
have added options that are taken or deduced from the pattern. If both
|
||||
sets of options are the same, just a single "options" line is output;
|
||||
if there are no options, the line is omitted. "First code unit" is
|
||||
where any match must start; if there is more than one they are listed
|
||||
as "starting code units". "Last code unit" is the last literal code
|
||||
unit that must be present in any match. This is not necessarily the
|
||||
last character. These lines are omitted if no starting or ending code
|
||||
"Compile options" are those specified by modifiers; "overall options"
|
||||
have added options that are taken or deduced from the pattern. If both
|
||||
sets of options are the same, just a single "options" line is output;
|
||||
if there are no options, the line is omitted. "First code unit" is
|
||||
where any match must start; if there is more than one they are listed
|
||||
as "starting code units". "Last code unit" is the last literal code
|
||||
unit that must be present in any match. This is not necessarily the
|
||||
last character. These lines are omitted if no starting or ending code
|
||||
units are recorded.
|
||||
|
||||
The callout_info modifier requests information about all the callouts
|
||||
in the pattern. A list of them is output at the end of any other infor-
|
||||
mation that is requested. For each callout, either its number or string
|
||||
is given, followed by the item that follows it in the pattern.
|
||||
|
||||
Specifying a pattern in hex
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
|
@ -808,11 +821,15 @@ SUBJECT MODIFIERS
|
|||
The callout_fail modifier can be given one or two numbers. If there is
|
||||
only one number, 1 is returned instead of 0 when a callout of that num-
|
||||
ber is reached. If two numbers are given, 1 is returned when callout
|
||||
<n> is reached for the <m>th time.
|
||||
<n> is reached for the <m>th time. Note that callouts with string argu-
|
||||
ments are always given the number zero. See "Callouts" below for a
|
||||
description of the output when a callout is taken.
|
||||
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. Any value other than zero is used as a return from pcre2test's
|
||||
callout function.
|
||||
ber. This is set as the "user data" that is passed to the matching
|
||||
function, and passed back when the callout function is invoked. Any
|
||||
value other than zero is used as a return from pcre2test's callout
|
||||
function.
|
||||
|
||||
Finding all matches in a string
|
||||
|
||||
|
@ -1136,22 +1153,37 @@ RESTARTING AFTER A PARTIAL MATCH
|
|||
CALLOUTS
|
||||
|
||||
If the pattern contains any callout requests, pcre2test's callout func-
|
||||
tion is called during matching. This works with both matching func-
|
||||
tions. By default, the called function displays the callout number, the
|
||||
start and current positions in the text at the callout time, and the
|
||||
tion is called during matching unless callout_none is specified. This
|
||||
works with both matching functions.
|
||||
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line (as
|
||||
described above) to change this and other parameters of the callout.
|
||||
|
||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||
cated regular expressions. For further information about callouts, see
|
||||
the pcre2callout documentation.
|
||||
|
||||
The output for callouts with numerical arguments and those with string
|
||||
arguments is slightly different.
|
||||
|
||||
Callouts with numerical arguments
|
||||
|
||||
By default, the callout function displays the callout number, the start
|
||||
and current positions in the subject text at the callout time, and the
|
||||
next pattern item to be tested. For example:
|
||||
|
||||
--->pqrabcdef
|
||||
0 ^ ^ \d
|
||||
|
||||
This output indicates that callout number 0 occurred for a match
|
||||
attempt starting at the fourth character of the subject string, when
|
||||
the pointer was at the seventh character, and when the next pattern
|
||||
item was \d. Just one circumflex is output if the start and current
|
||||
This output indicates that callout number 0 occurred for a match
|
||||
attempt starting at the fourth character of the subject string, when
|
||||
the pointer was at the seventh character, and when the next pattern
|
||||
item was \d. Just one circumflex is output if the start and current
|
||||
positions are the same.
|
||||
|
||||
Callouts numbered 255 are assumed to be automatic callouts, inserted as
|
||||
a result of the /auto_callout pattern modifier. In this case, instead
|
||||
a result of the /auto_callout pattern modifier. In this case, instead
|
||||
of showing the callout number, the offset in the pattern, preceded by a
|
||||
plus, is output. For example:
|
||||
|
||||
|
@ -1165,7 +1197,7 @@ CALLOUTS
|
|||
0: E*
|
||||
|
||||
If a pattern contains (*MARK) items, an additional line is output when-
|
||||
ever a change of latest mark is passed to the callout function. For
|
||||
ever a change of latest mark is passed to the callout function. For
|
||||
example:
|
||||
|
||||
re> /a(*MARK:X)bc/auto_callout
|
||||
|
@ -1179,76 +1211,86 @@ CALLOUTS
|
|||
+12 ^ ^
|
||||
0: abc
|
||||
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
The mark changes between matching "a" and "b", but stays the same for
|
||||
the rest of the match, so nothing more is output. If, as a result of
|
||||
backtracking, the mark reverts to being unset, the text "<unset>" is
|
||||
output.
|
||||
|
||||
The callout function in pcre2test returns zero (carry on matching) by
|
||||
default, but you can use a callout_fail modifier in a subject line (as
|
||||
described above) to change this and other parameters of the callout.
|
||||
Callouts with string arguments
|
||||
|
||||
Inserting callouts can be helpful when using pcre2test to check compli-
|
||||
cated regular expressions. For further information about callouts, see
|
||||
the pcre2callout documentation.
|
||||
The output for a callout with a string argument is similar, except that
|
||||
instead of outputting a callout number before the position indicators,
|
||||
the callout string and its offset in the pattern string are output
|
||||
before the reflection of the subject string, and the subject string is
|
||||
reflected for each callout. For example:
|
||||
|
||||
re> /^ab(?C'first')cd(?C"second")ef/
|
||||
data> abcdefg
|
||||
Callout (7): 'first'
|
||||
--->abcdefg
|
||||
^ ^ c
|
||||
Callout (20): "second"
|
||||
--->abcdefg
|
||||
^ ^ e
|
||||
0: abcdef
|
||||
|
||||
|
||||
NON-PRINTING CHARACTERS
|
||||
|
||||
When pcre2test is outputting text in the compiled version of a pattern,
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
bytes other than 32-126 are always treated as non-printing characters
|
||||
and are therefore shown as hex escapes.
|
||||
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the /locale modifier). In this case, the
|
||||
isprint() function is used to distinguish printing and non-printing
|
||||
When pcre2test is outputting text that is a matched part of a subject
|
||||
string, it behaves in the same way, unless a different locale has been
|
||||
set for the pattern (using the /locale modifier). In this case, the
|
||||
isprint() function is used to distinguish printing and non-printing
|
||||
characters.
|
||||
|
||||
|
||||
SAVING AND RESTORING COMPILED PATTERNS
|
||||
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
It is possible to save compiled patterns on disc or elsewhere, and
|
||||
reload them later, subject to a number of restrictions. JIT data cannot
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
be saved. The host on which the patterns are reloaded must be running
|
||||
the same version of PCRE2, with the same code unit width, and must also
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character
|
||||
have the same endianness, pointer width and PCRE2_SIZE type. Before
|
||||
compiled patterns can be saved they must be serialized, that is, con-
|
||||
verted to a stream of bytes. A single byte stream may contain any num-
|
||||
ber of compiled patterns, but they must all use the same character
|
||||
tables. A single copy of the tables is included in the byte stream (its
|
||||
size is 1088 bytes).
|
||||
|
||||
The functions whose names begin with pcre2_serialize_ are used for
|
||||
serializing and de-serializing. They are described in the pcre2serial-
|
||||
The functions whose names begin with pcre2_serialize_ are used for
|
||||
serializing and de-serializing. They are described in the pcre2serial-
|
||||
ize documentation. In this section we describe the features of
|
||||
pcre2test that can be used to test these functions.
|
||||
|
||||
When a pattern with push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or command) instead of a subject
|
||||
When a pattern with push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or command) instead of a subject
|
||||
line. By this means, a number of patterns can be compiled and retained.
|
||||
The push modifier is incompatible with posix, and control modifiers
|
||||
The push modifier is incompatible with posix, and control modifiers
|
||||
that act at match time are ignored (with a message). The jitverify mod-
|
||||
ifier applies only at compile time. The command
|
||||
|
||||
#save <filename>
|
||||
|
||||
causes all the stacked patterns to be serialized and the result written
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
to the named file. Afterwards, all the stacked patterns are freed. The
|
||||
command
|
||||
|
||||
#load <filename>
|
||||
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, and push are not allowed, nor are any option-
|
||||
setting modifiers. The JIT modifiers are, however, permitted. Here is
|
||||
reads the data in the file, and then arranges for it to be de-serial-
|
||||
ized, with the resulting compiled patterns added to the pattern stack.
|
||||
The pattern on the top of the stack can be retrieved by the #pop com-
|
||||
mand, which must be followed by lines of subjects that are to be
|
||||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, and push are not allowed, nor are any option-
|
||||
setting modifiers. The JIT modifiers are, however, permitted. Here is
|
||||
an example that saves and reloads two patterns.
|
||||
|
||||
/abc/push
|
||||
|
@ -1261,7 +1303,7 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
#pop jit,bincode
|
||||
abc
|
||||
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
If jitverify is used with #pop, it does not automatically imply jit,
|
||||
which is different behaviour from when it is used on a pattern.
|
||||
|
||||
|
||||
|
@ -1280,5 +1322,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 January 2015
|
||||
Last updated: 22 March 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -342,7 +342,19 @@ typedef struct pcre2_callout_block { \
|
|||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_callout_block;
|
||||
} pcre2_callout_block; \
|
||||
\
|
||||
typedef struct pcre2_callout_enumerate_block { \
|
||||
uint32_t version; /* Identifies version of block */ \
|
||||
/* ------------------------ Version 0 ------------------------------- */ \
|
||||
PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \
|
||||
PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \
|
||||
uint32_t callout_number; /* Number compiled into pattern */ \
|
||||
PCRE2_SIZE callout_string_offset; /* Offset to string within pattern */ \
|
||||
PCRE2_SIZE callout_string_length; /* Length of string compiled into pattern */ \
|
||||
PCRE2_SPTR callout_string; /* String compiled into pattern */ \
|
||||
/* ------------------------------------------------------------------ */ \
|
||||
} pcre2_callout_enumerate_block;
|
||||
|
||||
|
||||
/* List the generic forms of all other functions in macros, which will be
|
||||
|
@ -410,6 +422,9 @@ PCRE2_EXP_DECL void pcre2_code_free(pcre2_code *);
|
|||
|
||||
#define PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int pcre2_pattern_info(const pcre2_code *, uint32_t, \
|
||||
void *); \
|
||||
PCRE2_EXP_DECL int pcre2_callout_enumerate(const pcre2_code *, \
|
||||
int (*)(pcre2_callout_enumerate_block *, void *), \
|
||||
void *);
|
||||
|
||||
|
||||
|
@ -538,15 +553,17 @@ pcre2_compile are called by application code. */
|
|||
|
||||
/* Data blocks */
|
||||
|
||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||
#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_)
|
||||
#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_)
|
||||
#define pcre2_callout_block PCRE2_SUFFIX(pcre2_callout_block_)
|
||||
#define pcre2_callout_enumerate_block PCRE2_SUFFIX(pcre2_callout_enumerate_block_)
|
||||
#define pcre2_general_context PCRE2_SUFFIX(pcre2_general_context_)
|
||||
#define pcre2_compile_context PCRE2_SUFFIX(pcre2_compile_context_)
|
||||
#define pcre2_match_context PCRE2_SUFFIX(pcre2_match_context_)
|
||||
#define pcre2_match_data PCRE2_SUFFIX(pcre2_match_data_)
|
||||
|
||||
|
||||
/* Functions: the complete list in alphabetical order */
|
||||
|
||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||
|
@ -554,7 +571,6 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_compile_context_free PCRE2_SUFFIX(pcre2_compile_context_free_)
|
||||
#define pcre2_config PCRE2_SUFFIX(pcre2_config_)
|
||||
#define pcre2_dfa_match PCRE2_SUFFIX(pcre2_dfa_match_)
|
||||
#define pcre2_match PCRE2_SUFFIX(pcre2_match_)
|
||||
#define pcre2_general_context_copy PCRE2_SUFFIX(pcre2_general_context_copy_)
|
||||
#define pcre2_general_context_create PCRE2_SUFFIX(pcre2_general_context_create_)
|
||||
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
|
||||
|
@ -570,6 +586,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_jit_stack_create PCRE2_SUFFIX(pcre2_jit_stack_create_)
|
||||
#define pcre2_jit_stack_free PCRE2_SUFFIX(pcre2_jit_stack_free_)
|
||||
#define pcre2_maketables PCRE2_SUFFIX(pcre2_maketables_)
|
||||
#define pcre2_match PCRE2_SUFFIX(pcre2_match_)
|
||||
#define pcre2_match_context_copy PCRE2_SUFFIX(pcre2_match_context_copy_)
|
||||
#define pcre2_match_context_create PCRE2_SUFFIX(pcre2_match_context_create_)
|
||||
#define pcre2_match_context_free PCRE2_SUFFIX(pcre2_match_context_free_)
|
||||
|
|
|
@ -225,4 +225,181 @@ switch(what)
|
|||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Callout enumerator *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
code points to compiled code
|
||||
callback function called for each callout block
|
||||
callout_data user data passed to the callback
|
||||
|
||||
Returns: 0 when successfully completed
|
||||
< 0 on local error
|
||||
!= 0 for callback error
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_callout_enumerate(const pcre2_code *code,
|
||||
int (*callback)(pcre2_callout_enumerate_block *, void *), void *callout_data)
|
||||
{
|
||||
pcre2_real_code *re = (pcre2_real_code *)code;
|
||||
pcre2_callout_enumerate_block cb;
|
||||
PCRE2_SPTR cc;
|
||||
#ifdef SUPPORT_UNICODE
|
||||
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
|
||||
#endif
|
||||
|
||||
if (re == NULL) return PCRE2_ERROR_NULL;
|
||||
|
||||
/* Check that the first field in the block is the magic number. If it is not,
|
||||
return with PCRE2_ERROR_BADMAGIC. */
|
||||
|
||||
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
|
||||
|
||||
/* Check that this pattern was compiled in the correct bit mode */
|
||||
|
||||
if ((re->flags & (PCRE2_CODE_UNIT_WIDTH/8)) == 0) return PCRE2_ERROR_BADMODE;
|
||||
|
||||
cb.version = 0;
|
||||
cc = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code))
|
||||
+ re->name_count * re->name_entry_size;
|
||||
|
||||
while (TRUE)
|
||||
{
|
||||
int rc;
|
||||
switch (*cc)
|
||||
{
|
||||
case OP_END:
|
||||
return 0;
|
||||
|
||||
case OP_CHAR:
|
||||
case OP_CHARI:
|
||||
case OP_NOT:
|
||||
case OP_NOTI:
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_EXACT:
|
||||
case OP_POSSTAR:
|
||||
case OP_POSPLUS:
|
||||
case OP_POSQUERY:
|
||||
case OP_POSUPTO:
|
||||
case OP_STARI:
|
||||
case OP_MINSTARI:
|
||||
case OP_PLUSI:
|
||||
case OP_MINPLUSI:
|
||||
case OP_QUERYI:
|
||||
case OP_MINQUERYI:
|
||||
case OP_UPTOI:
|
||||
case OP_MINUPTOI:
|
||||
case OP_EXACTI:
|
||||
case OP_POSSTARI:
|
||||
case OP_POSPLUSI:
|
||||
case OP_POSQUERYI:
|
||||
case OP_POSUPTOI:
|
||||
case OP_NOTSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_NOTUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_NOTPOSPLUS:
|
||||
case OP_NOTPOSQUERY:
|
||||
case OP_NOTPOSUPTO:
|
||||
case OP_NOTSTARI:
|
||||
case OP_NOTMINSTARI:
|
||||
case OP_NOTPLUSI:
|
||||
case OP_NOTMINPLUSI:
|
||||
case OP_NOTQUERYI:
|
||||
case OP_NOTMINQUERYI:
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOTMINUPTOI:
|
||||
case OP_NOTEXACTI:
|
||||
case OP_NOTPOSSTARI:
|
||||
case OP_NOTPOSPLUSI:
|
||||
case OP_NOTPOSQUERYI:
|
||||
case OP_NOTPOSUPTOI:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
|
||||
#endif
|
||||
break;
|
||||
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEEXACT:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPOSPLUS:
|
||||
case OP_TYPEPOSQUERY:
|
||||
case OP_TYPEPOSUPTO:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (cc[-1] == OP_PROP || cc[-1] == OP_NOTPROP) cc += 2;
|
||||
#endif
|
||||
break;
|
||||
|
||||
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
|
||||
case OP_XCLASS:
|
||||
cc += GET(cc, 1);
|
||||
break;
|
||||
#endif
|
||||
|
||||
case OP_MARK:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
case OP_THEN_ARG:
|
||||
cc += PRIV(OP_lengths)[*cc] + cc[1];
|
||||
break;
|
||||
|
||||
case OP_CALLOUT:
|
||||
cb.pattern_position = GET(cc, 1);
|
||||
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||
cb.callout_number = cc[1 + 2*LINK_SIZE];
|
||||
cb.callout_string_offset = 0;
|
||||
cb.callout_string_length = 0;
|
||||
cb.callout_string = NULL;
|
||||
rc = callback(&cb, callout_data);
|
||||
if (rc != 0) return rc;
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
break;
|
||||
|
||||
case OP_CALLOUT_STR:
|
||||
cb.pattern_position = GET(cc, 1);
|
||||
cb.next_item_length = GET(cc, 1 + LINK_SIZE);
|
||||
cb.callout_number = 0;
|
||||
cb.callout_string_offset = GET(cc, 1 + 3*LINK_SIZE);
|
||||
cb.callout_string_length =
|
||||
GET(cc, 1 + 2*LINK_SIZE) - (1 + 4*LINK_SIZE) - 2;
|
||||
cb.callout_string = cc + (1 + 4*LINK_SIZE) + 1;
|
||||
rc = callback(&cb, callout_data);
|
||||
if (rc != 0) return rc;
|
||||
cc += GET(cc, 1 + 2*LINK_SIZE);
|
||||
break;
|
||||
|
||||
default:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* End of pcre2_pattern_info.c */
|
||||
|
|
141
src/pcre2test.c
141
src/pcre2test.c
|
@ -382,28 +382,29 @@ either on a pattern or a data line, so they must all be distinct. */
|
|||
#define CTL_ALTGLOBAL 0x00000010u
|
||||
#define CTL_BINCODE 0x00000020u
|
||||
#define CTL_CALLOUT_CAPTURE 0x00000040u
|
||||
#define CTL_CALLOUT_NONE 0x00000080u
|
||||
#define CTL_DFA 0x00000100u
|
||||
#define CTL_FINDLIMITS 0x00000200u
|
||||
#define CTL_FULLBINCODE 0x00000400u
|
||||
#define CTL_GETALL 0x00000800u
|
||||
#define CTL_GLOBAL 0x00001000u
|
||||
#define CTL_HEXPAT 0x00002000u
|
||||
#define CTL_INFO 0x00004000u
|
||||
#define CTL_JITFAST 0x00008000u
|
||||
#define CTL_JITVERIFY 0x00010000u
|
||||
#define CTL_MARK 0x00020000u
|
||||
#define CTL_MEMORY 0x00040000u
|
||||
#define CTL_POSIX 0x00080000u
|
||||
#define CTL_PUSH 0x00100000u
|
||||
#define CTL_STARTCHAR 0x00200000u
|
||||
#define CTL_ZERO_TERMINATE 0x00400000u
|
||||
#define CTL_CALLOUT_INFO 0x00000080u
|
||||
#define CTL_CALLOUT_NONE 0x00000100u
|
||||
#define CTL_DFA 0x00000200u
|
||||
#define CTL_FINDLIMITS 0x00000400u
|
||||
#define CTL_FULLBINCODE 0x00000800u
|
||||
#define CTL_GETALL 0x00001000u
|
||||
#define CTL_GLOBAL 0x00002000u
|
||||
#define CTL_HEXPAT 0x00004000u
|
||||
#define CTL_INFO 0x00008000u
|
||||
#define CTL_JITFAST 0x00010000u
|
||||
#define CTL_JITVERIFY 0x00020000u
|
||||
#define CTL_MARK 0x00040000u
|
||||
#define CTL_MEMORY 0x00080000u
|
||||
#define CTL_POSIX 0x00100000u
|
||||
#define CTL_PUSH 0x00200000u
|
||||
#define CTL_STARTCHAR 0x00400000u
|
||||
#define CTL_ZERO_TERMINATE 0x00800000u
|
||||
|
||||
#define CTL_BSR_SET 0x80000000u /* This is informational */
|
||||
#define CTL_NL_SET 0x40000000u /* This is informational */
|
||||
|
||||
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
||||
#define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE) /* For testing */
|
||||
#define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE|CTL_CALLOUT_INFO)
|
||||
#define CTL_ANYGLOB (CTL_ALTGLOBAL|CTL_GLOBAL)
|
||||
|
||||
/* These are all the controls that may be set either on a pattern or on a
|
||||
|
@ -431,7 +432,7 @@ typedef struct patctl { /* Structure for pattern modifiers. */
|
|||
uint32_t jit;
|
||||
uint32_t stackguard_test;
|
||||
uint32_t tables_id;
|
||||
uint8_t locale[LOCALESIZE];
|
||||
uint8_t locale[LOCALESIZE];
|
||||
} patctl;
|
||||
|
||||
#define MAXCPYGET 10
|
||||
|
@ -494,6 +495,7 @@ static modstruct modlist[] = {
|
|||
{ "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) },
|
||||
{ "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) },
|
||||
{ "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) },
|
||||
{ "callout_info", MOD_PAT, MOD_CTL, CTL_CALLOUT_INFO, PO(control) },
|
||||
{ "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) },
|
||||
{ "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) },
|
||||
{ "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) },
|
||||
|
@ -578,8 +580,8 @@ static modstruct modlist[] = {
|
|||
/* Control bits that are not ignored with 'push'. */
|
||||
|
||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
||||
CTL_BINCODE|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO|CTL_JITVERIFY| \
|
||||
CTL_MEMORY|CTL_PUSH|CTL_BSR_SET|CTL_NL_SET)
|
||||
CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
|
||||
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_BSR_SET|CTL_NL_SET)
|
||||
|
||||
/* Controls that apply only at compile time with 'push'. */
|
||||
|
||||
|
@ -841,6 +843,17 @@ are supported. */
|
|||
else \
|
||||
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||
|
||||
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
a = pcre2_callout_enumerate_8(compiled_code8, \
|
||||
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c); \
|
||||
else if (test_mode == PCRE16_MODE) \
|
||||
a = pcre2_callout_enumerate_16(compiled_code16, \
|
||||
(int(*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c); \
|
||||
else \
|
||||
a = pcre2_callout_enumerate_32(compiled_code32, \
|
||||
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8)); \
|
||||
|
@ -1268,6 +1281,14 @@ the three different cases. */
|
|||
else \
|
||||
(void)G(pchars,BITTWO)((G(PCRE2_SPTR,BITTWO))(p)+offset, len, utf, f)
|
||||
|
||||
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
a = G(pcre2_callout_enumerate,BITONE)(G(compiled_code,BITONE), \
|
||||
(int (*)(struct G(pcre2_callout_enumerate_block_,BITONE) *, void *))b,c); \
|
||||
else \
|
||||
a = G(pcre2_callout_enumerate,BITTWO)(G(compiled_code,BITTWO), \
|
||||
(int (*)(struct G(pcre2_callout_enumerate_block_,BITTWO) *, void *))b,c)
|
||||
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,G(g,BITONE)); \
|
||||
|
@ -1588,6 +1609,9 @@ the three different cases. */
|
|||
lv = pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||
#define PCHARSV(p, offset, len, utf, f) \
|
||||
(void)pchars8((PCRE2_SPTR8)(p)+offset, len, utf, f)
|
||||
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||
a = pcre2_callout_enumerate_8(compiled_code8, \
|
||||
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c)
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8))
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -1676,6 +1700,9 @@ the three different cases. */
|
|||
lv = pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
||||
#define PCHARSV(p, offset, len, utf, f) \
|
||||
(void)pchars16((PCRE2_SPTR16)(p)+offset, len, utf, f)
|
||||
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||
a = pcre2_callout_enumerate_16(compiled_code16, \
|
||||
(int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c)
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16))
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -1764,6 +1791,9 @@ the three different cases. */
|
|||
lv = pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
||||
#define PCHARSV(p, offset, len, utf, f) \
|
||||
(void)pchars32((PCRE2_SPTR32)(p)+offset, len, utf, f)
|
||||
#define PCRE2_CALLOUT_ENUMERATE(a,b,c) \
|
||||
a = pcre2_callout_enumerate_32(compiled_code32, \
|
||||
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32))
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -3381,7 +3411,7 @@ Returns: nothing
|
|||
static void
|
||||
show_controls(uint32_t controls, const char *before)
|
||||
{
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||
|
@ -3390,6 +3420,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
|||
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
||||
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
||||
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
||||
((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "",
|
||||
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
||||
((controls & CTL_DFA) != 0)? " dfa" : "",
|
||||
((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "",
|
||||
|
@ -3517,6 +3548,56 @@ if (pat_patctl.jit != 0)
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Callback function for callout enumeration *
|
||||
*************************************************/
|
||||
|
||||
/* The only differences in the callout emumeration block for different code
|
||||
unit widths are that the pointers to the subject, the most recent MARK, and a
|
||||
callout argument string point to strings of the appropriate width. Casts can be
|
||||
used to deal with this.
|
||||
|
||||
Argument:
|
||||
cb pointer to enumerate block
|
||||
callout_data user data
|
||||
|
||||
Returns: 0
|
||||
*/
|
||||
|
||||
static int callout_callback(pcre2_callout_enumerate_block_8 *cb,
|
||||
void *callout_data)
|
||||
{
|
||||
uint32_t i;
|
||||
BOOL utf = (FLD(compiled_code, overall_options) & PCRE2_UTF) != 0;
|
||||
|
||||
(void)callout_data; /* Not currently displayed */
|
||||
|
||||
fprintf(outfile, "Callout ");
|
||||
if (cb->callout_string != NULL)
|
||||
{
|
||||
uint32_t delimiter = CODE_UNIT(cb->callout_string, -1);
|
||||
fprintf(outfile, "%c", delimiter);
|
||||
PCHARSV(cb->callout_string, 0,
|
||||
cb->callout_string_length, utf, outfile);
|
||||
for (i = 0; callout_start_delims[i] != 0; i++)
|
||||
if (delimiter == callout_start_delims[i])
|
||||
{
|
||||
delimiter = callout_end_delims[i];
|
||||
break;
|
||||
}
|
||||
fprintf(outfile, "%c ", delimiter);
|
||||
}
|
||||
else fprintf(outfile, "%d ", cb->callout_number);
|
||||
|
||||
fprintf(outfile, "%.*s\n",
|
||||
(int)((cb->next_item_length == 0)? 1 : cb->next_item_length),
|
||||
pbuffer8 + cb->pattern_position);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Show information about a pattern *
|
||||
*************************************************/
|
||||
|
@ -3789,6 +3870,24 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
|||
}
|
||||
}
|
||||
|
||||
if ((pat_patctl.control & CTL_CALLOUT_INFO) != 0)
|
||||
{
|
||||
int errorcode;
|
||||
PCRE2_CALLOUT_ENUMERATE(errorcode, callout_callback, 0);
|
||||
if (errorcode != 0)
|
||||
{
|
||||
int len;
|
||||
fprintf(outfile, "Callout enumerate failed: error %d: ", errorcode);
|
||||
if (errorcode < 0)
|
||||
{
|
||||
PCRE2_GET_ERROR_MESSAGE(len, errorcode, pbuffer);
|
||||
PCHARSV(CASTVAR(void *, pbuffer), 0, len, FALSE, outfile);
|
||||
}
|
||||
fprintf(outfile, "\n");
|
||||
return PR_SKIP;
|
||||
}
|
||||
}
|
||||
|
||||
return PR_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -4206,11 +4206,11 @@ a random value. /Ix
|
|||
/^a(b)c(?C{AB})def/B
|
||||
abcdef\=callout_capture
|
||||
|
||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B
|
||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B,callout_info
|
||||
|
||||
/(?:a(?C`code`)){3}/B
|
||||
|
||||
/^(?(?C25)(?=abc)abcd|xyz)/B
|
||||
/^(?(?C25)(?=abc)abcd|xyz)/B,callout_info
|
||||
abcdefg
|
||||
xyz123
|
||||
|
||||
|
@ -4226,7 +4226,7 @@ a random value. /Ix
|
|||
|
||||
# Binary zero in callout string
|
||||
# a ( ? C ' x z ' ) b
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex,callout_info
|
||||
abcdefgh
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -14060,7 +14060,7 @@ Callout (10): {AB} last capture = 1
|
|||
0: abcdef
|
||||
1: b
|
||||
|
||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B
|
||||
/(?C`a``b`)(?C'a''b')(?C"a""b")(?C^a^^b^)(?C%a%%b%)(?C#a##b#)(?C$a$$b$)(?C{a}}b})/B,callout_info
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
CalloutStr `a`b` 4 10 0
|
||||
|
@ -14074,6 +14074,14 @@ Callout (10): {AB} last capture = 1
|
|||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
Callout `a`b` (
|
||||
Callout 'a'b' (
|
||||
Callout "a"b" (
|
||||
Callout ^a^b^ (
|
||||
Callout %a%b% (
|
||||
Callout #a#b# (
|
||||
Callout $a$b$ (
|
||||
Callout {a}b}
|
||||
|
||||
/(?:a(?C`code`)){3}/B
|
||||
------------------------------------------------------------------
|
||||
|
@ -14094,7 +14102,7 @@ Callout (10): {AB} last capture = 1
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/^(?(?C25)(?=abc)abcd|xyz)/B
|
||||
/^(?(?C25)(?=abc)abcd|xyz)/B,callout_info
|
||||
------------------------------------------------------------------
|
||||
Bra
|
||||
^
|
||||
|
@ -14110,6 +14118,7 @@ Callout (10): {AB} last capture = 1
|
|||
Ket
|
||||
End
|
||||
------------------------------------------------------------------
|
||||
Callout 25 (?=abc)
|
||||
abcdefg
|
||||
--->abcdefg
|
||||
25 ^ (?=abc)
|
||||
|
@ -14171,7 +14180,8 @@ Callout (8): `code`
|
|||
|
||||
# Binary zero in callout string
|
||||
# a ( ? C ' x z ' ) b
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex
|
||||
/ 61 28 3f 43 27 78 00 7a 27 29 62/hex,callout_info
|
||||
Callout 'x\x00z' b
|
||||
abcdefgh
|
||||
Callout (5): 'x\x00z'
|
||||
--->abcdefgh
|
||||
|
|
Loading…
Reference in New Issue