Implemented PCRE2_ALT_VERBNAMES
This commit is contained in:
parent
fd08e11c1e
commit
d2e87a75af
2
132html
2
132html
|
@ -148,7 +148,7 @@ while (<STDIN>)
|
||||||
printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
|
printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
|
||||||
$ref, $ref);
|
$ref, $ref);
|
||||||
printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
|
printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
|
||||||
$ref, $ref);
|
$ref);
|
||||||
$ref++;
|
$ref++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -167,6 +167,8 @@ test (there are now 20 in total).
|
||||||
47. Modifier lists in pcre2test were splitting at spaces without the required
|
47. Modifier lists in pcre2test were splitting at spaces without the required
|
||||||
commas.
|
commas.
|
||||||
|
|
||||||
|
48. Implemented PCRE2_ALT_VERBNAMES.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -97,6 +97,7 @@ can skip ahead to the CMake section.
|
||||||
pcre2_context.c
|
pcre2_context.c
|
||||||
pcre2_dfa_match.c
|
pcre2_dfa_match.c
|
||||||
pcre2_error.c
|
pcre2_error.c
|
||||||
|
pcre2_find_bracket.c
|
||||||
pcre2_jit_compile.c
|
pcre2_jit_compile.c
|
||||||
pcre2_maketables.c
|
pcre2_maketables.c
|
||||||
pcre2_match.c
|
pcre2_match.c
|
||||||
|
@ -388,4 +389,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the
|
||||||
recommended download site.
|
recommended download site.
|
||||||
|
|
||||||
=============================
|
=============================
|
||||||
Last Updated: 15 June 2015
|
Last Updated: 16 July 2015
|
||||||
|
|
|
@ -724,6 +724,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_context.c )
|
src/pcre2_context.c )
|
||||||
src/pcre2_dfa_match.c )
|
src/pcre2_dfa_match.c )
|
||||||
src/pcre2_error.c )
|
src/pcre2_error.c )
|
||||||
|
src/pcre2_find_bracket.c )
|
||||||
src/pcre2_jit_compile.c )
|
src/pcre2_jit_compile.c )
|
||||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
@ -832,4 +833,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 24 April 2015
|
Last updated: 16 July 2015
|
||||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
||||||
<b>#include <pcre2.h></b>
|
<b>#include <pcre2.h></b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
||||||
<b>#include <pcre2.h></b>
|
<b>#include <pcre2.h></b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
|
|
@ -19,8 +19,8 @@ SYNOPSIS
|
||||||
<b>#include <pcre2.h></b>
|
<b>#include <pcre2.h></b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
|
|
|
@ -70,15 +70,15 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||||
|
@ -936,7 +936,7 @@ The <i>where</i> argument should point to a buffer that is at least 24 code
|
||||||
units long. (The exact length required can be found by calling
|
units long. (The exact length required can be found by calling
|
||||||
<b>pcre2_config()</b> with <b>where</b> set to NULL.) If PCRE2 has been compiled
|
<b>pcre2_config()</b> with <b>where</b> set to NULL.) If PCRE2 has been compiled
|
||||||
without Unicode support, the buffer is filled with the text "Unicode not
|
without Unicode support, the buffer is filled with the text "Unicode not
|
||||||
supported". Otherwise, the Unicode version string (for example, "7.0.0") is
|
supported". Otherwise, the Unicode version string (for example, "8.0.0") is
|
||||||
inserted. The number of code units used is returned. This is the length of the
|
inserted. The number of code units used is returned. This is the length of the
|
||||||
string plus one unit for the terminating zero.
|
string plus one unit for the terminating zero.
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -961,7 +961,7 @@ zero.
|
||||||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
||||||
|
@ -1083,6 +1083,15 @@ after any internal newline. However, it does not match after a newline at the
|
||||||
end of the subject, for compatibility with Perl. If you want a multiline
|
end of the subject, for compatibility with Perl. If you want a multiline
|
||||||
circumflex also to match after a terminating newline, you must set
|
circumflex also to match after a terminating newline, you must set
|
||||||
PCRE2_ALT_CIRCUMFLEX.
|
PCRE2_ALT_CIRCUMFLEX.
|
||||||
|
<pre>
|
||||||
|
PCRE2_ALT_VERBNAMES
|
||||||
|
</pre>
|
||||||
|
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||||
|
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||||
|
parenthesis. The name is not processed in any way, and it is not possible to
|
||||||
|
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||||
|
option is set, normal backslash processing is applied to verb names and only an
|
||||||
|
unescaped closing parenthesis terminates the name.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_AUTO_CALLOUT
|
PCRE2_AUTO_CALLOUT
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -1778,12 +1787,12 @@ documentation.
|
||||||
<a name="matchdatablock"></a></P>
|
<a name="matchdatablock"></a></P>
|
||||||
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
|
<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
|
@ -2010,12 +2019,20 @@ If the pattern is anchored, such a match can occur only if the pattern contains
|
||||||
</pre>
|
</pre>
|
||||||
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
|
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
|
||||||
string is checked by default when <b>pcre2_match()</b> is subsequently called.
|
string is checked by default when <b>pcre2_match()</b> is subsequently called.
|
||||||
The entire string is checked before any other processing takes place, and a
|
If a non-zero starting offset is given, the check is applied only to that part
|
||||||
|
of the subject that could be inspected during matching, and there is a check
|
||||||
|
that the starting offset points to the first code unit of a character or to the
|
||||||
|
end of the subject. If there are no lookbehind assertions in the pattern, the
|
||||||
|
check starts at the starting offset. Otherwise, it starts at the length of the
|
||||||
|
longest lookbehind before the starting offset, or at the start of the subject
|
||||||
|
if there are not that many characters before the starting offset. Note that the
|
||||||
|
sequences \b and \B are one-character lookbehinds.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The check is carried out before any other processing takes place, and a
|
||||||
negative error code is returned if the check fails. There are several UTF error
|
negative error code is returned if the check fails. There are several UTF error
|
||||||
codes for each code unit width, corresponding to different problems with the
|
codes for each code unit width, corresponding to different problems with the
|
||||||
code unit sequence. The value of <i>startoffset</i> is also checked, to ensure
|
code unit sequence. There are discussions about the validity of
|
||||||
that it points to the start of a character or to the end of the subject. There
|
|
||||||
are discussions about the validity of
|
|
||||||
<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
|
<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
|
||||||
<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
|
<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
|
||||||
and
|
and
|
||||||
|
@ -2564,12 +2581,12 @@ be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||||
dollar character is an escape character that can specify the insertion of
|
dollar character is an escape character that can specify the insertion of
|
||||||
characters from capturing groups in the pattern. The following forms are
|
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||||
recognized:
|
forms are recognized:
|
||||||
<pre>
|
<pre>
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
${<n>} insert the contents of group <n>
|
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||||
</pre>
|
</pre>
|
||||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||||
required only if the following character would be interpreted as part of the
|
required only if the following character would be interpreted as part of the
|
||||||
|
@ -2580,6 +2597,15 @@ calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
||||||
appropriate.
|
appropriate.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The facility for inserting a (*MARK) name can be used to perform simple
|
||||||
|
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||||
|
<pre>
|
||||||
|
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||||
|
apple lemon
|
||||||
|
2: pear orange
|
||||||
|
</PRE>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||||
|
@ -2883,7 +2909,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 22 April 2015
|
Last updated: 30 August 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -224,8 +224,14 @@ whether a match operation was executed by JIT or by the interpreter.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
You may safely use the same JIT stack for more than one pattern (either by
|
You may safely use the same JIT stack for more than one pattern (either by
|
||||||
assigning directly or by callback), as long as the patterns are all matched
|
assigning directly or by callback), as long as the patterns are matched
|
||||||
sequentially in the same thread. In a multithread application, if you do not
|
sequentially in the same thread. Currently, the only way to set up
|
||||||
|
non-sequential matches in one thread is to use callouts: if a callout function
|
||||||
|
starts another match, that match must use a different JIT stack to the one used
|
||||||
|
for the currently suspended match(es).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
In a multithread application, if you do not
|
||||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||||
is thread-safe, because each thread has its own machine stack. However, if you
|
is thread-safe, because each thread has its own machine stack. However, if you
|
||||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||||
|
@ -419,9 +425,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 27 November 2014
|
Last updated: 28 July 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -736,6 +736,8 @@ Those that are not part of an identified script are lumped together as
|
||||||
"Common". The current list of scripts is:
|
"Common". The current list of scripts is:
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
Ahom,
|
||||||
|
Anatolian_Hieroglyphs,
|
||||||
Arabic,
|
Arabic,
|
||||||
Armenian,
|
Armenian,
|
||||||
Avestan,
|
Avestan,
|
||||||
|
@ -776,6 +778,7 @@ Gurmukhi,
|
||||||
Han,
|
Han,
|
||||||
Hangul,
|
Hangul,
|
||||||
Hanunoo,
|
Hanunoo,
|
||||||
|
Hatran,
|
||||||
Hebrew,
|
Hebrew,
|
||||||
Hiragana,
|
Hiragana,
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic,
|
||||||
|
@ -812,12 +815,14 @@ Miao,
|
||||||
Modi,
|
Modi,
|
||||||
Mongolian,
|
Mongolian,
|
||||||
Mro,
|
Mro,
|
||||||
|
Multani,
|
||||||
Myanmar,
|
Myanmar,
|
||||||
Nabataean,
|
Nabataean,
|
||||||
New_Tai_Lue,
|
New_Tai_Lue,
|
||||||
Nko,
|
Nko,
|
||||||
Ogham,
|
Ogham,
|
||||||
Ol_Chiki,
|
Ol_Chiki,
|
||||||
|
Old_Hungarian,
|
||||||
Old_Italic,
|
Old_Italic,
|
||||||
Old_North_Arabian,
|
Old_North_Arabian,
|
||||||
Old_Permic,
|
Old_Permic,
|
||||||
|
@ -839,6 +844,7 @@ Saurashtra,
|
||||||
Sharada,
|
Sharada,
|
||||||
Shavian,
|
Shavian,
|
||||||
Siddham,
|
Siddham,
|
||||||
|
SignWriting,
|
||||||
Sinhala,
|
Sinhala,
|
||||||
Sora_Sompeng,
|
Sora_Sompeng,
|
||||||
Sundanese,
|
Sundanese,
|
||||||
|
@ -1322,9 +1328,19 @@ where a range ending character is expected. For example, [z-\xff] is valid,
|
||||||
but [A-\d] and [A-[:digit:]] are not.
|
but [A-\d] and [A-[:digit:]] are not.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Ranges operate in the collating sequence of character values. They can also be
|
Ranges normally include all code points between the start and end characters,
|
||||||
used for characters specified numerically, for example [\000-\037]. Ranges
|
inclusive. They can also be used for code points specified numerically, for
|
||||||
can include any characters that are valid for the current mode.
|
example [\000-\037]. Ranges can include any characters that are valid for the
|
||||||
|
current mode.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
There is a special case in EBCDIC environments for ranges whose end points are
|
||||||
|
both specified as literal letters in the same case. For compatibility with
|
||||||
|
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||||
|
example, [h-k] matches only four characters, even though the codes for h and k
|
||||||
|
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||||
|
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
||||||
|
are included.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If a range that includes letters is used when caseless matching is set, it
|
If a range that includes letters is used when caseless matching is set, it
|
||||||
|
@ -2899,14 +2915,23 @@ remarks apply to the PCRE2 features described in this section.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The new verbs make use of what was previously invalid syntax: an opening
|
The new verbs make use of what was previously invalid syntax: an opening
|
||||||
parenthesis followed by an asterisk. They are generally of the form
|
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||||
differently depending on whether or not a name is present. A name is any
|
depending on whether or not a name is present.
|
||||||
sequence of characters that does not include a closing parenthesis. The maximum
|
</P>
|
||||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
<P>
|
||||||
libraries. If the name is empty, that is, if the closing parenthesis
|
By default, for compatibility with Perl, a name is any sequence of characters
|
||||||
immediately follows the colon, the effect is as if the colon were not there.
|
that does not include a closing parenthesis. The name is not processed in
|
||||||
Any number of these verbs may occur in a pattern.
|
any way, and it is not possible to include a closing parenthesis in the name.
|
||||||
|
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||||
|
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||||
|
the name.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||||
|
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||||
|
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||||
|
not there. Any number of these verbs may occur in a pattern.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Since these verbs are specifically related to backtracking, most of them can be
|
Since these verbs are specifically related to backtracking, most of them can be
|
||||||
|
@ -3323,7 +3348,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 13 June 2015
|
Last updated: 30 August 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -187,6 +187,8 @@ at release 5.18.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
Ahom,
|
||||||
|
Anatolian_Hieroglyphs,
|
||||||
Arabic,
|
Arabic,
|
||||||
Armenian,
|
Armenian,
|
||||||
Avestan,
|
Avestan,
|
||||||
|
@ -227,6 +229,7 @@ Gurmukhi,
|
||||||
Han,
|
Han,
|
||||||
Hangul,
|
Hangul,
|
||||||
Hanunoo,
|
Hanunoo,
|
||||||
|
Hatran,
|
||||||
Hebrew,
|
Hebrew,
|
||||||
Hiragana,
|
Hiragana,
|
||||||
Imperial_Aramaic,
|
Imperial_Aramaic,
|
||||||
|
@ -263,12 +266,14 @@ Miao,
|
||||||
Modi,
|
Modi,
|
||||||
Mongolian,
|
Mongolian,
|
||||||
Mro,
|
Mro,
|
||||||
|
Multani,
|
||||||
Myanmar,
|
Myanmar,
|
||||||
Nabataean,
|
Nabataean,
|
||||||
New_Tai_Lue,
|
New_Tai_Lue,
|
||||||
Nko,
|
Nko,
|
||||||
Ogham,
|
Ogham,
|
||||||
Ol_Chiki,
|
Ol_Chiki,
|
||||||
|
Old_Hungarian,
|
||||||
Old_Italic,
|
Old_Italic,
|
||||||
Old_North_Arabian,
|
Old_North_Arabian,
|
||||||
Old_Permic,
|
Old_Permic,
|
||||||
|
@ -290,6 +295,7 @@ Saurashtra,
|
||||||
Sharada,
|
Sharada,
|
||||||
Shavian,
|
Shavian,
|
||||||
Siddham,
|
Siddham,
|
||||||
|
SignWriting,
|
||||||
Sinhala,
|
Sinhala,
|
||||||
Sora_Sompeng,
|
Sora_Sompeng,
|
||||||
Sundanese,
|
Sundanese,
|
||||||
|
@ -582,7 +588,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 13 June 2015
|
Last updated: 17 July 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -340,12 +340,13 @@ subject lines. Modifiers on a subject line can change these settings.
|
||||||
<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br>
|
<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br>
|
||||||
<P>
|
<P>
|
||||||
Modifier lists are used with both pattern and subject lines. Items in a list
|
Modifier lists are used with both pattern and subject lines. Items in a list
|
||||||
are separated by commas and optional white space. Some modifiers may be given
|
are separated by commas followed by optional white space. Trailing white space
|
||||||
for both patterns and subject lines, whereas others are valid for one or the
|
in a modifier list is ignored. Some modifiers may be given for both patterns
|
||||||
other only. Each modifier has a long name, for example "anchored", and some of
|
and subject lines, whereas others are valid only for one or the other. Each
|
||||||
them must be followed by an equals sign and a value, for example, "offset=12".
|
modifier has a long name, for example "anchored", and some of them must be
|
||||||
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
followed by an equals sign and a value, for example, "offset=12". Values cannot
|
||||||
previous setting.
|
contain comma characters, but may contain spaces. Modifiers that do not take
|
||||||
|
values may be preceded by a minus sign to turn off a previous setting.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A few of the more common modifiers can also be specified as single letters, for
|
A few of the more common modifiers can also be specified as single letters, for
|
||||||
|
@ -479,6 +480,7 @@ for a description of their effects.
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
alt_bsux set PCRE2_ALT_BSUX
|
alt_bsux set PCRE2_ALT_BSUX
|
||||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||||
|
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
auto_callout set PCRE2_AUTO_CALLOUT
|
auto_callout set PCRE2_AUTO_CALLOUT
|
||||||
/i caseless set PCRE2_CASELESS
|
/i caseless set PCRE2_CASELESS
|
||||||
|
@ -1469,7 +1471,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 20 May 2015
|
Last updated: 30 August 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -126,11 +126,22 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
||||||
strings to be in host byte order.
|
strings to be in host byte order.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The entire string is checked before any other processing takes place. In
|
A UTF string is checked before any other processing takes place. In the case of
|
||||||
addition to checking the format of the string, there is a check to ensure that
|
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||||
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
|
offset, the check is applied only to that part of the subject that could be
|
||||||
The so-called "non-character" code points are not excluded because Unicode
|
inspected during matching, and there is a check that the starting offset points
|
||||||
corrigendum #9 makes it clear that they should not be.
|
to the first code unit of a character or to the end of the subject. If there
|
||||||
|
are no lookbehind assertions in the pattern, the check starts at the starting
|
||||||
|
offset. Otherwise, it starts at the length of the longest lookbehind before the
|
||||||
|
starting offset, or at the start of the subject if there are not that many
|
||||||
|
characters before the starting offset. Note that the sequences \b and \B are
|
||||||
|
one-character lookbehinds.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
In addition to checking the format of the string, there is a check to ensure
|
||||||
|
that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
|
||||||
|
area. The so-called "non-character" code points are not excluded because
|
||||||
|
Unicode corrigendum #9 makes it clear that they should not be.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
|
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
|
||||||
|
@ -264,9 +275,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 23 November 2014
|
Last updated: 18 August 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
400
doc/pcre2.txt
400
doc/pcre2.txt
|
@ -168,8 +168,8 @@ REVISION
|
||||||
Last updated: 13 April 2015
|
Last updated: 13 April 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2API(3) Library Functions Manual PCRE2API(3)
|
PCRE2API(3) Library Functions Manual PCRE2API(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -190,13 +190,13 @@ PCRE2 NATIVE API BASIC FUNCTIONS
|
||||||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||||
pcre2_compile_context *ccontext);
|
pcre2_compile_context *ccontext);
|
||||||
|
|
||||||
pcre2_code_free(pcre2_code *code);
|
void pcre2_code_free(pcre2_code *code);
|
||||||
|
|
||||||
pcre2_match_data_create(uint32_t ovecsize,
|
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||||
pcre2_general_context *gcontext);
|
pcre2_general_context *gcontext);
|
||||||
|
|
||||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||||
pcre2_general_context *gcontext);
|
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||||
|
|
||||||
int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
|
int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
|
||||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||||
|
@ -989,7 +989,7 @@ CHECKING BUILD-TIME OPTIONS
|
||||||
pcre2_config() with where set to NULL.) If PCRE2 has been compiled
|
pcre2_config() with where set to NULL.) If PCRE2 has been compiled
|
||||||
without Unicode support, the buffer is filled with the text "Unicode
|
without Unicode support, the buffer is filled with the text "Unicode
|
||||||
not supported". Otherwise, the Unicode version string (for example,
|
not supported". Otherwise, the Unicode version string (for example,
|
||||||
"7.0.0") is inserted. The number of code units used is returned. This
|
"8.0.0") is inserted. The number of code units used is returned. This
|
||||||
is the length of the string plus one unit for the terminating zero.
|
is the length of the string plus one unit for the terminating zero.
|
||||||
|
|
||||||
PCRE2_CONFIG_UNICODE
|
PCRE2_CONFIG_UNICODE
|
||||||
|
@ -1014,7 +1014,7 @@ COMPILING A PATTERN
|
||||||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||||
pcre2_compile_context *ccontext);
|
pcre2_compile_context *ccontext);
|
||||||
|
|
||||||
pcre2_code_free(pcre2_code *code);
|
void pcre2_code_free(pcre2_code *code);
|
||||||
|
|
||||||
The pcre2_compile() function compiles a pattern into an internal form.
|
The pcre2_compile() function compiles a pattern into an internal form.
|
||||||
The pattern is defined by a pointer to a string of code units and a
|
The pattern is defined by a pointer to a string of code units and a
|
||||||
|
@ -1128,6 +1128,16 @@ COMPILING A PATTERN
|
||||||
Perl. If you want a multiline circumflex also to match after a termi-
|
Perl. If you want a multiline circumflex also to match after a termi-
|
||||||
nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
|
nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
|
||||||
|
|
||||||
|
PCRE2_ALT_VERBNAMES
|
||||||
|
|
||||||
|
By default, for compatibility with Perl, the name in any verb sequence
|
||||||
|
such as (*MARK:NAME) is any sequence of characters that does not
|
||||||
|
include a closing parenthesis. The name is not processed in any way,
|
||||||
|
and it is not possible to include a closing parenthesis in the name.
|
||||||
|
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
|
||||||
|
processing is applied to verb names and only an unescaped closing
|
||||||
|
parenthesis terminates the name.
|
||||||
|
|
||||||
PCRE2_AUTO_CALLOUT
|
PCRE2_AUTO_CALLOUT
|
||||||
|
|
||||||
If this bit is set, pcre2_compile() automatically inserts callout
|
If this bit is set, pcre2_compile() automatically inserts callout
|
||||||
|
@ -1809,11 +1819,11 @@ SERIALIZATION AND PRECOMPILING
|
||||||
|
|
||||||
THE MATCH DATA BLOCK
|
THE MATCH DATA BLOCK
|
||||||
|
|
||||||
pcre2_match_data_create(uint32_t ovecsize,
|
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||||
pcre2_general_context *gcontext);
|
pcre2_general_context *gcontext);
|
||||||
|
|
||||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||||
pcre2_general_context *gcontext);
|
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||||
|
|
||||||
void pcre2_match_data_free(pcre2_match_data *match_data);
|
void pcre2_match_data_free(pcre2_match_data *match_data);
|
||||||
|
|
||||||
|
@ -2022,12 +2032,20 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
||||||
|
|
||||||
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
||||||
UTF string is checked by default when pcre2_match() is subsequently
|
UTF string is checked by default when pcre2_match() is subsequently
|
||||||
called. The entire string is checked before any other processing takes
|
called. If a non-zero starting offset is given, the check is applied
|
||||||
place, and a negative error code is returned if the check fails. There
|
only to that part of the subject that could be inspected during match-
|
||||||
are several UTF error codes for each code unit width, corresponding to
|
ing, and there is a check that the starting offset points to the first
|
||||||
different problems with the code unit sequence. The value of startoff-
|
code unit of a character or to the end of the subject. If there are no
|
||||||
set is also checked, to ensure that it points to the start of a charac-
|
lookbehind assertions in the pattern, the check starts at the starting
|
||||||
ter or to the end of the subject. There are discussions about the
|
offset. Otherwise, it starts at the length of the longest lookbehind
|
||||||
|
before the starting offset, or at the start of the subject if there are
|
||||||
|
not that many characters before the starting offset. Note that the
|
||||||
|
sequences \b and \B are one-character lookbehinds.
|
||||||
|
|
||||||
|
The check is carried out before any other processing takes place, and a
|
||||||
|
negative error code is returned if the check fails. There are several
|
||||||
|
UTF error codes for each code unit width, corresponding to different
|
||||||
|
problems with the code unit sequence. There are discussions about the
|
||||||
validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
|
validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
|
||||||
pcre2unicode page.
|
pcre2unicode page.
|
||||||
|
|
||||||
|
@ -2525,12 +2543,12 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF
|
In the replacement string, which is interpreted as a UTF string in UTF
|
||||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||||
option is set, a dollar character is an escape character that can spec-
|
option is set, a dollar character is an escape character that can spec-
|
||||||
ify the insertion of characters from capturing groups in the pattern.
|
ify the insertion of characters from capturing groups or (*MARK) items
|
||||||
The following forms are recognized:
|
in the pattern. The following forms are recognized:
|
||||||
|
|
||||||
$$ insert a dollar character
|
$$ insert a dollar character
|
||||||
$<n> insert the contents of group <n>
|
$<n> or ${<n>} insert the contents of group <n>
|
||||||
${<n>} insert the contents of group <n>
|
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||||
|
|
||||||
Either a group number or a group name can be given for <n>. Curly
|
Either a group number or a group name can be given for <n>. Curly
|
||||||
brackets are required only if the following character would be inter-
|
brackets are required only if the following character would be inter-
|
||||||
|
@ -2540,30 +2558,37 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
||||||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
||||||
or pcre2_copy_bynumber() as appropriate.
|
or pcre2_copy_bynumber() as appropriate.
|
||||||
|
|
||||||
The first seven arguments of pcre2_substitute() are the same as for
|
The facility for inserting a (*MARK) name can be used to perform simple
|
||||||
|
simultaneous substitutions, as this pcre2test example shows:
|
||||||
|
|
||||||
|
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||||
|
apple lemon
|
||||||
|
2: pear orange
|
||||||
|
|
||||||
|
The first seven arguments of pcre2_substitute() are the same as for
|
||||||
pcre2_match(), except that the partial matching options are not permit-
|
pcre2_match(), except that the partial matching options are not permit-
|
||||||
ted, and match_data may be passed as NULL, in which case a match data
|
ted, and match_data may be passed as NULL, in which case a match data
|
||||||
block is obtained and freed within this function, using memory manage-
|
block is obtained and freed within this function, using memory manage-
|
||||||
ment functions from the match context, if provided, or else those that
|
ment functions from the match context, if provided, or else those that
|
||||||
were used to allocate memory for the compiled code.
|
were used to allocate memory for the compiled code.
|
||||||
|
|
||||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||||
the function to iterate over the subject string, replacing every match-
|
the function to iterate over the subject string, replacing every match-
|
||||||
ing substring. If this is not set, only the first matching substring is
|
ing substring. If this is not set, only the first matching substring is
|
||||||
replaced.
|
replaced.
|
||||||
|
|
||||||
The outlengthptr argument must point to a variable that contains the
|
The outlengthptr argument must point to a variable that contains the
|
||||||
length, in code units, of the output buffer. It is updated to contain
|
length, in code units, of the output buffer. It is updated to contain
|
||||||
the length of the new string, excluding the trailing zero that is auto-
|
the length of the new string, excluding the trailing zero that is auto-
|
||||||
matically added.
|
matically added.
|
||||||
|
|
||||||
The function returns the number of replacements that were made. This
|
The function returns the number of replacements that were made. This
|
||||||
may be zero if no matches were found, and is never greater than 1
|
may be zero if no matches were found, and is never greater than 1
|
||||||
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
||||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
||||||
never returned), any errors from pcre2_match() or the substring copying
|
never returned), any errors from pcre2_match() or the substring copying
|
||||||
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
||||||
returned for an invalid replacement string (unrecognized sequence fol-
|
returned for an invalid replacement string (unrecognized sequence fol-
|
||||||
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
||||||
put buffer is not big enough.
|
put buffer is not big enough.
|
||||||
|
|
||||||
|
@ -2573,56 +2598,56 @@ DUPLICATE SUBPATTERN NAMES
|
||||||
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
||||||
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
||||||
|
|
||||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||||
subpatterns are not required to be unique. Duplicate names are always
|
subpatterns are not required to be unique. Duplicate names are always
|
||||||
allowed for subpatterns with the same number, created by using the (?|
|
allowed for subpatterns with the same number, created by using the (?|
|
||||||
feature. Indeed, if such subpatterns are named, they are required to
|
feature. Indeed, if such subpatterns are named, they are required to
|
||||||
use the same names.
|
use the same names.
|
||||||
|
|
||||||
Normally, patterns with duplicate names are such that in any one match,
|
Normally, patterns with duplicate names are such that in any one match,
|
||||||
only one of the named subpatterns participates. An example is shown in
|
only one of the named subpatterns participates. An example is shown in
|
||||||
the pcre2pattern documentation.
|
the pcre2pattern documentation.
|
||||||
|
|
||||||
When duplicates are present, pcre2_substring_copy_byname() and
|
When duplicates are present, pcre2_substring_copy_byname() and
|
||||||
pcre2_substring_get_byname() return the first substring corresponding
|
pcre2_substring_get_byname() return the first substring corresponding
|
||||||
to the given name that is set. Only if none are set is
|
to the given name that is set. Only if none are set is
|
||||||
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
||||||
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
||||||
duplicate names.
|
duplicate names.
|
||||||
|
|
||||||
If you want to get full details of all captured substrings for a given
|
If you want to get full details of all captured substrings for a given
|
||||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||||
first argument is the compiled pattern, and the second is the name. If
|
first argument is the compiled pattern, and the second is the name. If
|
||||||
the third and fourth arguments are NULL, the function returns a group
|
the third and fourth arguments are NULL, the function returns a group
|
||||||
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
||||||
|
|
||||||
When the third and fourth arguments are not NULL, they must be pointers
|
When the third and fourth arguments are not NULL, they must be pointers
|
||||||
to variables that are updated by the function. After it has run, they
|
to variables that are updated by the function. After it has run, they
|
||||||
point to the first and last entries in the name-to-number table for the
|
point to the first and last entries in the name-to-number table for the
|
||||||
given name, and the function returns the length of each entry in code
|
given name, and the function returns the length of each entry in code
|
||||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||||
no entries for the given name.
|
no entries for the given name.
|
||||||
|
|
||||||
The format of the name table is described above in the section entitled
|
The format of the name table is described above in the section entitled
|
||||||
Information about a pattern. Given all the relevant entries for
|
Information about a pattern. Given all the relevant entries for
|
||||||
the name, you can extract each of their numbers, and hence the captured
|
the name, you can extract each of their numbers, and hence the captured
|
||||||
data.
|
data.
|
||||||
|
|
||||||
|
|
||||||
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
||||||
|
|
||||||
The traditional matching function uses a similar algorithm to Perl,
|
The traditional matching function uses a similar algorithm to Perl,
|
||||||
which stops when it finds the first match at a given point in the sub-
|
which stops when it finds the first match at a given point in the sub-
|
||||||
ject. If you want to find all possible matches, or the longest possible
|
ject. If you want to find all possible matches, or the longest possible
|
||||||
match at a given position, consider using the alternative matching
|
match at a given position, consider using the alternative matching
|
||||||
function (see below) instead. If you cannot use the alternative func-
|
function (see below) instead. If you cannot use the alternative func-
|
||||||
tion, you can kludge it up by making use of the callout facility, which
|
tion, you can kludge it up by making use of the callout facility, which
|
||||||
is described in the pcre2callout documentation.
|
is described in the pcre2callout documentation.
|
||||||
|
|
||||||
What you have to do is to insert a callout right at the end of the pat-
|
What you have to do is to insert a callout right at the end of the pat-
|
||||||
tern. When your callout function is called, extract and save the cur-
|
tern. When your callout function is called, extract and save the cur-
|
||||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||||
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
||||||
|
|
||||||
|
|
||||||
|
@ -2634,26 +2659,26 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
pcre2_match_context *mcontext,
|
pcre2_match_context *mcontext,
|
||||||
int *workspace, PCRE2_SIZE wscount);
|
int *workspace, PCRE2_SIZE wscount);
|
||||||
|
|
||||||
The function pcre2_dfa_match() is called to match a subject string
|
The function pcre2_dfa_match() is called to match a subject string
|
||||||
against a compiled pattern, using a matching algorithm that scans the
|
against a compiled pattern, using a matching algorithm that scans the
|
||||||
subject string just once, and does not backtrack. This has different
|
subject string just once, and does not backtrack. This has different
|
||||||
characteristics to the normal algorithm, and is not compatible with
|
characteristics to the normal algorithm, and is not compatible with
|
||||||
Perl. Some of the features of PCRE2 patterns are not supported. Never-
|
Perl. Some of the features of PCRE2 patterns are not supported. Never-
|
||||||
theless, there are times when this kind of matching can be useful. For
|
theless, there are times when this kind of matching can be useful. For
|
||||||
a discussion of the two matching algorithms, and a list of features
|
a discussion of the two matching algorithms, and a list of features
|
||||||
that pcre2_dfa_match() does not support, see the pcre2matching documen-
|
that pcre2_dfa_match() does not support, see the pcre2matching documen-
|
||||||
tation.
|
tation.
|
||||||
|
|
||||||
The arguments for the pcre2_dfa_match() function are the same as for
|
The arguments for the pcre2_dfa_match() function are the same as for
|
||||||
pcre2_match(), plus two extras. The ovector within the match data block
|
pcre2_match(), plus two extras. The ovector within the match data block
|
||||||
is used in a different way, and this is described below. The other com-
|
is used in a different way, and this is described below. The other com-
|
||||||
mon arguments are used in the same way as for pcre2_match(), so their
|
mon arguments are used in the same way as for pcre2_match(), so their
|
||||||
description is not repeated here.
|
description is not repeated here.
|
||||||
|
|
||||||
The two additional arguments provide workspace for the function. The
|
The two additional arguments provide workspace for the function. The
|
||||||
workspace vector should contain at least 20 elements. It is used for
|
workspace vector should contain at least 20 elements. It is used for
|
||||||
keeping track of multiple paths through the pattern tree. More
|
keeping track of multiple paths through the pattern tree. More
|
||||||
workspace is needed for patterns and subjects where there are a lot of
|
workspace is needed for patterns and subjects where there are a lot of
|
||||||
potential matches.
|
potential matches.
|
||||||
|
|
||||||
Here is an example of a simple call to pcre2_dfa_match():
|
Here is an example of a simple call to pcre2_dfa_match():
|
||||||
|
@ -2673,45 +2698,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
|
|
||||||
Option bits for pcre2_dfa_match()
|
Option bits for pcre2_dfa_match()
|
||||||
|
|
||||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||||
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
||||||
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
|
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
|
||||||
these are exactly the same as for pcre2_match(), so their description
|
these are exactly the same as for pcre2_match(), so their description
|
||||||
is not repeated here.
|
is not repeated here.
|
||||||
|
|
||||||
PCRE2_PARTIAL_HARD
|
PCRE2_PARTIAL_HARD
|
||||||
PCRE2_PARTIAL_SOFT
|
PCRE2_PARTIAL_SOFT
|
||||||
|
|
||||||
These have the same general effect as they do for pcre2_match(), but
|
These have the same general effect as they do for pcre2_match(), but
|
||||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||||
subject is reached and there is still at least one matching possibility
|
subject is reached and there is still at least one matching possibility
|
||||||
that requires additional characters. This happens even if some complete
|
that requires additional characters. This happens even if some complete
|
||||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||||
if the end of the subject is reached, there have been no complete
|
if the end of the subject is reached, there have been no complete
|
||||||
matches, but there is still at least one matching possibility. The por-
|
matches, but there is still at least one matching possibility. The por-
|
||||||
tion of the string that was inspected when the longest partial match
|
tion of the string that was inspected when the longest partial match
|
||||||
was found is set as the first matching string in both cases. There is a
|
was found is set as the first matching string in both cases. There is a
|
||||||
more detailed discussion of partial and multi-segment matching, with
|
more detailed discussion of partial and multi-segment matching, with
|
||||||
examples, in the pcre2partial documentation.
|
examples, in the pcre2partial documentation.
|
||||||
|
|
||||||
PCRE2_DFA_SHORTEST
|
PCRE2_DFA_SHORTEST
|
||||||
|
|
||||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||||
stop as soon as it has found one match. Because of the way the alterna-
|
stop as soon as it has found one match. Because of the way the alterna-
|
||||||
tive algorithm works, this is necessarily the shortest possible match
|
tive algorithm works, this is necessarily the shortest possible match
|
||||||
at the first possible matching point in the subject string.
|
at the first possible matching point in the subject string.
|
||||||
|
|
||||||
PCRE2_DFA_RESTART
|
PCRE2_DFA_RESTART
|
||||||
|
|
||||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||||
it again, with additional subject characters, and have it continue with
|
it again, with additional subject characters, and have it continue with
|
||||||
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
||||||
it is set, the workspace and wscount arguments must reference the same
|
it is set, the workspace and wscount arguments must reference the same
|
||||||
vector as before because data about the match so far is left in them
|
vector as before because data about the match so far is left in them
|
||||||
after a partial match. There is more discussion of this facility in the
|
after a partial match. There is more discussion of this facility in the
|
||||||
pcre2partial documentation.
|
pcre2partial documentation.
|
||||||
|
|
||||||
|
@ -2719,8 +2744,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
|
|
||||||
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
||||||
string in the subject. Note, however, that all the matches from one run
|
string in the subject. Note, however, that all the matches from one run
|
||||||
of the function start at the same point in the subject. The shorter
|
of the function start at the same point in the subject. The shorter
|
||||||
matches are all initial substrings of the longer matches. For example,
|
matches are all initial substrings of the longer matches. For example,
|
||||||
if the pattern
|
if the pattern
|
||||||
|
|
||||||
<.*>
|
<.*>
|
||||||
|
@ -2735,17 +2760,17 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
<something> <something else>
|
<something> <something else>
|
||||||
<something>
|
<something>
|
||||||
|
|
||||||
On success, the yield of the function is a number greater than zero,
|
On success, the yield of the function is a number greater than zero,
|
||||||
which is the number of matched substrings. The offsets of the sub-
|
which is the number of matched substrings. The offsets of the sub-
|
||||||
strings are returned in the ovector, and can be extracted by number in
|
strings are returned in the ovector, and can be extracted by number in
|
||||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||||
any capturing groups that may exist in the pattern, because DFA match-
|
any capturing groups that may exist in the pattern, because DFA match-
|
||||||
ing does not support group capture.
|
ing does not support group capture.
|
||||||
|
|
||||||
Calls to the convenience functions that extract substrings by name
|
Calls to the convenience functions that extract substrings by name
|
||||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||||
after a DFA match. The convenience functions that extract substrings by
|
after a DFA match. The convenience functions that extract substrings by
|
||||||
number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
|
number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
|
||||||
other errors are slightly different:
|
other errors are slightly different:
|
||||||
|
|
||||||
PCRE2_ERROR_UNAVAILABLE
|
PCRE2_ERROR_UNAVAILABLE
|
||||||
|
@ -2755,64 +2780,64 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
|
|
||||||
PCRE2_ERROR_UNSET
|
PCRE2_ERROR_UNSET
|
||||||
|
|
||||||
There is a slot in the ovector for this substring, but there were
|
There is a slot in the ovector for this substring, but there were
|
||||||
insufficient matches to fill it.
|
insufficient matches to fill it.
|
||||||
|
|
||||||
The matched strings are stored in the ovector in reverse order of
|
The matched strings are stored in the ovector in reverse order of
|
||||||
length; that is, the longest matching string is first. If there were
|
length; that is, the longest matching string is first. If there were
|
||||||
too many matches to fit into the ovector, the yield of the function is
|
too many matches to fit into the ovector, the yield of the function is
|
||||||
zero, and the vector is filled with the longest matches.
|
zero, and the vector is filled with the longest matches.
|
||||||
|
|
||||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||||
character repeats at the end of a pattern (as well as internally). For
|
character repeats at the end of a pattern (as well as internally). For
|
||||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||||
matching, this means that only one possible match is found. If you
|
matching, this means that only one possible match is found. If you
|
||||||
really do want multiple matches in such cases, either use an ungreedy
|
really do want multiple matches in such cases, either use an ungreedy
|
||||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||||
compiling.
|
compiling.
|
||||||
|
|
||||||
Error returns from pcre2_dfa_match()
|
Error returns from pcre2_dfa_match()
|
||||||
|
|
||||||
The pcre2_dfa_match() function returns a negative number when it fails.
|
The pcre2_dfa_match() function returns a negative number when it fails.
|
||||||
Many of the errors are the same as for pcre2_match(), as described
|
Many of the errors are the same as for pcre2_match(), as described
|
||||||
above. There are in addition the following errors that are specific to
|
above. There are in addition the following errors that are specific to
|
||||||
pcre2_dfa_match():
|
pcre2_dfa_match():
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_UITEM
|
PCRE2_ERROR_DFA_UITEM
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() encounters an item in the
|
This return is given if pcre2_dfa_match() encounters an item in the
|
||||||
pattern that it does not support, for instance, the use of \C or a back
|
pattern that it does not support, for instance, the use of \C or a back
|
||||||
reference.
|
reference.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_UCOND
|
PCRE2_ERROR_DFA_UCOND
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() encounters a condition item
|
This return is given if pcre2_dfa_match() encounters a condition item
|
||||||
that uses a back reference for the condition, or a test for recursion
|
that uses a back reference for the condition, or a test for recursion
|
||||||
in a specific group. These are not supported.
|
in a specific group. These are not supported.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_WSSIZE
|
PCRE2_ERROR_DFA_WSSIZE
|
||||||
|
|
||||||
This return is given if pcre2_dfa_match() runs out of space in the
|
This return is given if pcre2_dfa_match() runs out of space in the
|
||||||
workspace vector.
|
workspace vector.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_RECURSE
|
PCRE2_ERROR_DFA_RECURSE
|
||||||
|
|
||||||
When a recursive subpattern is processed, the matching function calls
|
When a recursive subpattern is processed, the matching function calls
|
||||||
itself recursively, using private memory for the ovector and workspace.
|
itself recursively, using private memory for the ovector and workspace.
|
||||||
This error is given if the internal ovector is not large enough. This
|
This error is given if the internal ovector is not large enough. This
|
||||||
should be extremely rare, as a vector of size 1000 is used.
|
should be extremely rare, as a vector of size 1000 is used.
|
||||||
|
|
||||||
PCRE2_ERROR_DFA_BADRESTART
|
PCRE2_ERROR_DFA_BADRESTART
|
||||||
|
|
||||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||||
some plausibility checks are made on the contents of the workspace,
|
some plausibility checks are made on the contents of the workspace,
|
||||||
which should contain data about the previous partial match. If any of
|
which should contain data about the previous partial match. If any of
|
||||||
these checks fail, this error is given.
|
these checks fail, this error is given.
|
||||||
|
|
||||||
|
|
||||||
SEE ALSO
|
SEE ALSO
|
||||||
|
|
||||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||||
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3),
|
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3),
|
||||||
pcre2unicode(3).
|
pcre2unicode(3).
|
||||||
|
|
||||||
|
@ -2826,11 +2851,11 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 22 April 2015
|
Last updated: 30 August 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3)
|
PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -3305,8 +3330,8 @@ REVISION
|
||||||
Last updated: 24 April 2015
|
Last updated: 24 April 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
|
PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -3669,8 +3694,8 @@ REVISION
|
||||||
Last updated: 23 March 2015
|
Last updated: 23 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3)
|
PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -3854,8 +3879,8 @@ REVISION
|
||||||
Last updated: 15 March 2015
|
Last updated: 15 March 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2JIT(3) Library Functions Manual PCRE2JIT(3)
|
PCRE2JIT(3) Library Functions Manual PCRE2JIT(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -4051,13 +4076,17 @@ CONTROLLING THE JIT STACK
|
||||||
interpreter.
|
interpreter.
|
||||||
|
|
||||||
You may safely use the same JIT stack for more than one pattern (either
|
You may safely use the same JIT stack for more than one pattern (either
|
||||||
by assigning directly or by callback), as long as the patterns are all
|
by assigning directly or by callback), as long as the patterns are
|
||||||
matched sequentially in the same thread. In a multithread application,
|
matched sequentially in the same thread. Currently, the only way to set
|
||||||
if you do not specify a JIT stack, or if you assign or pass back NULL
|
up non-sequential matches in one thread is to use callouts: if a call-
|
||||||
from a callback, that is thread-safe, because each thread has its own
|
out function starts another match, that match must use a different JIT
|
||||||
machine stack. However, if you assign or pass back a non-NULL JIT
|
stack to the one used for currently suspended match(es).
|
||||||
stack, this must be a different stack for each thread so that the
|
|
||||||
application is thread-safe.
|
In a multithread application, if you do not specify a JIT stack, or if
|
||||||
|
you assign or pass back NULL from a callback, that is thread-safe,
|
||||||
|
because each thread has its own machine stack. However, if you assign
|
||||||
|
or pass back a non-NULL JIT stack, this must be a different stack for
|
||||||
|
each thread so that the application is thread-safe.
|
||||||
|
|
||||||
Strictly speaking, even more is allowed. You can assign the same non-
|
Strictly speaking, even more is allowed. You can assign the same non-
|
||||||
NULL stack to a match context that is used by any number of patterns,
|
NULL stack to a match context that is used by any number of patterns,
|
||||||
|
@ -4234,11 +4263,11 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 27 November 2014
|
Last updated: 28 July 2015
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3)
|
PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -4309,8 +4338,8 @@ REVISION
|
||||||
Last updated: 25 November 2014
|
Last updated: 25 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3)
|
PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -4528,8 +4557,8 @@ REVISION
|
||||||
Last updated: 29 September 2014
|
Last updated: 29 September 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3)
|
PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -4968,8 +4997,8 @@ REVISION
|
||||||
Last updated: 22 December 2014
|
Last updated: 22 December 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
|
PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
|
||||||
|
|
||||||
|
|
||||||
|
@ -5069,33 +5098,44 @@ VALIDITY OF UTF STRINGS
|
||||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||||
this, expecting strings to be in host byte order.
|
this, expecting strings to be in host byte order.
|
||||||
|
|
||||||
The entire string is checked before any other processing takes place.
|
A UTF string is checked before any other processing takes place. In the
|
||||||
In addition to checking the format of the string, there is a check to
|
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||||
|
starting offset, the check is applied only to that part of the subject
|
||||||
|
that could be inspected during matching, and there is a check that the
|
||||||
|
starting offset points to the first code unit of a character or to the
|
||||||
|
end of the subject. If there are no lookbehind assertions in the pat-
|
||||||
|
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||||
|
the length of the longest lookbehind before the starting offset, or at
|
||||||
|
the start of the subject if there are not that many characters before
|
||||||
|
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||||
|
ter lookbehinds.
|
||||||
|
|
||||||
|
In addition to checking the format of the string, there is a check to
|
||||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||||
the surrogate area. The so-called "non-character" code points are not
|
the surrogate area. The so-called "non-character" code points are not
|
||||||
excluded because Unicode corrigendum #9 makes it clear that they should
|
excluded because Unicode corrigendum #9 makes it clear that they should
|
||||||
not be.
|
not be.
|
||||||
|
|
||||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||||
UTF-16, where they are used in pairs to encode code points with values
|
UTF-16, where they are used in pairs to encode code points with values
|
||||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||||
unfortunately messes up UTF-8 and UTF-32.)
|
unfortunately messes up UTF-8 and UTF-32.)
|
||||||
|
|
||||||
In some situations, you may already know that your strings are valid,
|
In some situations, you may already know that your strings are valid,
|
||||||
and therefore want to skip these checks in order to improve perfor-
|
and therefore want to skip these checks in order to improve perfor-
|
||||||
mance, for example in the case of a long subject string that is being
|
mance, for example in the case of a long subject string that is being
|
||||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||||
it is given (respectively) contains only valid UTF code unit sequences.
|
it is given (respectively) contains only valid UTF code unit sequences.
|
||||||
|
|
||||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||||
for the pattern; it does not also apply to subject strings. If you want
|
for the pattern; it does not also apply to subject strings. If you want
|
||||||
to disable the check for a subject string you must pass this option to
|
to disable the check for a subject string you must pass this option to
|
||||||
pcre2_match() or pcre2_dfa_match().
|
pcre2_match() or pcre2_dfa_match().
|
||||||
|
|
||||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||||
result is undefined and your program may crash or loop indefinitely.
|
result is undefined and your program may crash or loop indefinitely.
|
||||||
|
|
||||||
Errors in UTF-8 strings
|
Errors in UTF-8 strings
|
||||||
|
@ -5108,10 +5148,10 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR4
|
PCRE2_ERROR_UTF8_ERR4
|
||||||
PCRE2_ERROR_UTF8_ERR5
|
PCRE2_ERROR_UTF8_ERR5
|
||||||
|
|
||||||
The string ends with a truncated UTF-8 character; the code specifies
|
The string ends with a truncated UTF-8 character; the code specifies
|
||||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||||
checked first; hence the possibility of 4 or 5 missing bytes.
|
checked first; hence the possibility of 4 or 5 missing bytes.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR6
|
PCRE2_ERROR_UTF8_ERR6
|
||||||
|
@ -5121,24 +5161,24 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR10
|
PCRE2_ERROR_UTF8_ERR10
|
||||||
|
|
||||||
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
||||||
the character do not have the binary value 0b10 (that is, either the
|
the character do not have the binary value 0b10 (that is, either the
|
||||||
most significant bit is 0, or the next bit is 1).
|
most significant bit is 0, or the next bit is 1).
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR11
|
PCRE2_ERROR_UTF8_ERR11
|
||||||
PCRE2_ERROR_UTF8_ERR12
|
PCRE2_ERROR_UTF8_ERR12
|
||||||
|
|
||||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||||
long; these code points are excluded by RFC 3629.
|
long; these code points are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR13
|
PCRE2_ERROR_UTF8_ERR13
|
||||||
|
|
||||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||||
are excluded by RFC 3629.
|
are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR14
|
PCRE2_ERROR_UTF8_ERR14
|
||||||
|
|
||||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||||
so are excluded from UTF-8.
|
so are excluded from UTF-8.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR15
|
PCRE2_ERROR_UTF8_ERR15
|
||||||
|
@ -5147,26 +5187,26 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR18
|
PCRE2_ERROR_UTF8_ERR18
|
||||||
PCRE2_ERROR_UTF8_ERR19
|
PCRE2_ERROR_UTF8_ERR19
|
||||||
|
|
||||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||||
for a value that can be represented by fewer bytes, which is invalid.
|
for a value that can be represented by fewer bytes, which is invalid.
|
||||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||||
rect coding uses just one byte.
|
rect coding uses just one byte.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR20
|
PCRE2_ERROR_UTF8_ERR20
|
||||||
|
|
||||||
The two most significant bits of the first byte of a character have the
|
The two most significant bits of the first byte of a character have the
|
||||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||||
quent byte of a multi-byte character.
|
quent byte of a multi-byte character.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR21
|
PCRE2_ERROR_UTF8_ERR21
|
||||||
|
|
||||||
The first byte of a character has the value 0xfe or 0xff. These values
|
The first byte of a character has the value 0xfe or 0xff. These values
|
||||||
can never occur in a valid UTF-8 string.
|
can never occur in a valid UTF-8 string.
|
||||||
|
|
||||||
Errors in UTF-16 strings
|
Errors in UTF-16 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-16
|
The following negative error codes are given for invalid UTF-16
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
||||||
|
@ -5176,7 +5216,7 @@ VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
Errors in UTF-32 strings
|
Errors in UTF-32 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-32
|
The following negative error codes are given for invalid UTF-32
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
||||||
|
@ -5192,8 +5232,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 23 November 2014
|
Last updated: 18 August 2015
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "29 August 2015" "PCRE2 10.21"
|
.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -1052,6 +1052,15 @@ after any internal newline. However, it does not match after a newline at the
|
||||||
end of the subject, for compatibility with Perl. If you want a multiline
|
end of the subject, for compatibility with Perl. If you want a multiline
|
||||||
circumflex also to match after a terminating newline, you must set
|
circumflex also to match after a terminating newline, you must set
|
||||||
PCRE2_ALT_CIRCUMFLEX.
|
PCRE2_ALT_CIRCUMFLEX.
|
||||||
|
.sp
|
||||||
|
PCRE2_ALT_VERBNAMES
|
||||||
|
.sp
|
||||||
|
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||||
|
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||||
|
parenthesis. The name is not processed in any way, and it is not possible to
|
||||||
|
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||||
|
option is set, normal backslash processing is applied to verb names and only an
|
||||||
|
unescaped closing parenthesis terminates the name.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_AUTO_CALLOUT
|
PCRE2_AUTO_CALLOUT
|
||||||
.sp
|
.sp
|
||||||
|
@ -2953,6 +2962,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 29 August 2015
|
Last updated: 30 August 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
|
.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -1334,7 +1334,7 @@ both specified as literal letters in the same case. For compatibility with
|
||||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||||
example, [h-k] matches only four characters, even though the codes for h and k
|
example, [h-k] matches only four characters, even though the codes for h and k
|
||||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||||
specified numerically, for example, [\ex88-\ex92] or [h-\x92], all code points
|
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||||
are included.
|
are included.
|
||||||
.P
|
.P
|
||||||
If a range that includes letters is used when caseless matching is set, it
|
If a range that includes letters is used when caseless matching is set, it
|
||||||
|
@ -2944,14 +2944,21 @@ in production code should be noted to avoid problems during upgrades." The same
|
||||||
remarks apply to the PCRE2 features described in this section.
|
remarks apply to the PCRE2 features described in this section.
|
||||||
.P
|
.P
|
||||||
The new verbs make use of what was previously invalid syntax: an opening
|
The new verbs make use of what was previously invalid syntax: an opening
|
||||||
parenthesis followed by an asterisk. They are generally of the form
|
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||||
differently depending on whether or not a name is present. A name is any
|
depending on whether or not a name is present.
|
||||||
sequence of characters that does not include a closing parenthesis. The maximum
|
.P
|
||||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
By default, for compatibility with Perl, a name is any sequence of characters
|
||||||
libraries. If the name is empty, that is, if the closing parenthesis
|
that does not include a closing parenthesis. The name is not processed in
|
||||||
immediately follows the colon, the effect is as if the colon were not there.
|
any way, and it is not possible to include a closing parenthesis in the name.
|
||||||
Any number of these verbs may occur in a pattern.
|
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||||
|
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||||
|
the name.
|
||||||
|
.P
|
||||||
|
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||||
|
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||||
|
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||||
|
not there. Any number of these verbs may occur in a pattern.
|
||||||
.P
|
.P
|
||||||
Since these verbs are specifically related to backtracking, most of them can be
|
Since these verbs are specifically related to backtracking, most of them can be
|
||||||
used only when the pattern is to be matched using the traditional matching
|
used only when the pattern is to be matched using the traditional matching
|
||||||
|
@ -3376,6 +3383,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 24 July 2015
|
Last updated: 30 August 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -445,6 +445,7 @@ for a description of their effects.
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
alt_bsux set PCRE2_ALT_BSUX
|
alt_bsux set PCRE2_ALT_BSUX
|
||||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||||
|
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
auto_callout set PCRE2_AUTO_CALLOUT
|
auto_callout set PCRE2_AUTO_CALLOUT
|
||||||
/i caseless set PCRE2_CASELESS
|
/i caseless set PCRE2_CASELESS
|
||||||
|
|
|
@ -285,12 +285,14 @@ COMMAND LINES
|
||||||
MODIFIER SYNTAX
|
MODIFIER SYNTAX
|
||||||
|
|
||||||
Modifier lists are used with both pattern and subject lines. Items in a
|
Modifier lists are used with both pattern and subject lines. Items in a
|
||||||
list are separated by commas and optional white space. Some modifiers
|
list are separated by commas followed by optional white space. Trailing
|
||||||
may be given for both patterns and subject lines, whereas others are
|
white space in a modifier list is ignored. Some modifiers may be given
|
||||||
valid for one or the other only. Each modifier has a long name, for
|
for both patterns and subject lines, whereas others are valid only for
|
||||||
example "anchored", and some of them must be followed by an equals sign
|
one or the other. Each modifier has a long name, for example
|
||||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
"anchored", and some of them must be followed by an equals sign and a
|
||||||
ues may be preceded by a minus sign to turn off a previous setting.
|
value, for example, "offset=12". Values cannot contain comma charac-
|
||||||
|
ters, but may contain spaces. Modifiers that do not take values may be
|
||||||
|
preceded by a minus sign to turn off a previous setting.
|
||||||
|
|
||||||
A few of the more common modifiers can also be specified as single let-
|
A few of the more common modifiers can also be specified as single let-
|
||||||
ters, for example "i" for "caseless". In documentation, following the
|
ters, for example "i" for "caseless". In documentation, following the
|
||||||
|
@ -424,6 +426,7 @@ PATTERN MODIFIERS
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
alt_bsux set PCRE2_ALT_BSUX
|
alt_bsux set PCRE2_ALT_BSUX
|
||||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||||
|
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
auto_callout set PCRE2_AUTO_CALLOUT
|
auto_callout set PCRE2_AUTO_CALLOUT
|
||||||
/i caseless set PCRE2_CASELESS
|
/i caseless set PCRE2_CASELESS
|
||||||
|
@ -1330,5 +1333,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 20 May 2015
|
Last updated: 30 August 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
|
|
|
@ -120,6 +120,7 @@ D is inspected during pcre2_dfa_match() execution
|
||||||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||||
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
||||||
|
#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */
|
||||||
|
|
||||||
/* These are for pcre2_jit_compile(). */
|
/* These are for pcre2_jit_compile(). */
|
||||||
|
|
||||||
|
|
|
@ -561,12 +561,12 @@ static PCRE2_SPTR posix_substitutes[] = {
|
||||||
|
|
||||||
#define PUBLIC_COMPILE_OPTIONS \
|
#define PUBLIC_COMPILE_OPTIONS \
|
||||||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
|
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
|
||||||
PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \
|
PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
|
||||||
PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
|
||||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
|
||||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
|
||||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \
|
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
|
||||||
PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||||
|
|
||||||
/* Compile time error code numbers. They are given names so that they can more
|
/* Compile time error code numbers. They are given names so that they can more
|
||||||
easily be tracked. When a new number is added, the tables called eint1 and
|
easily be tracked. When a new number is added, the tables called eint1 and
|
||||||
|
@ -5382,13 +5382,52 @@ for (;; ptr++)
|
||||||
|
|
||||||
/* It appears that Perl allows any characters whatsoever, other than
|
/* It appears that Perl allows any characters whatsoever, other than
|
||||||
a closing parenthesis, to appear in arguments, so we no longer insist on
|
a closing parenthesis, to appear in arguments, so we no longer insist on
|
||||||
letters, digits, and underscores. */
|
letters, digits, and underscores. Perl does not, however, do any
|
||||||
|
interpretation within arguments, and has no means of including a closing
|
||||||
|
parenthesis. PCRE2 supports escape processing but only when it is
|
||||||
|
requested by an option. Note that check_escape() will not return values
|
||||||
|
greater than the code unit maximum when not in UTF mode. */
|
||||||
|
|
||||||
if (*ptr == CHAR_COLON)
|
if (*ptr == CHAR_COLON)
|
||||||
{
|
{
|
||||||
arg = ++ptr;
|
arg = ++ptr;
|
||||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
|
|
||||||
arglen = (int)(ptr - arg);
|
if ((options & PCRE2_ALT_VERBNAMES) == 0)
|
||||||
|
{
|
||||||
|
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
|
||||||
|
arglen = (int)(ptr - arg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
arglen = 0;
|
||||||
|
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
|
||||||
|
{
|
||||||
|
if (*ptr == '\\')
|
||||||
|
{
|
||||||
|
uint32_t x;
|
||||||
|
*errorcodeptr = 0;
|
||||||
|
i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
|
||||||
|
if (*errorcodeptr != 0) goto FAILED;
|
||||||
|
if (i != 0)
|
||||||
|
{
|
||||||
|
*errorcodeptr = ERR40;
|
||||||
|
goto FAILED;
|
||||||
|
}
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||||
|
for (i = 0; i < PRIV(utf8_table1_size); i++)
|
||||||
|
if ((int)x <= PRIV(utf8_table1)[i]) break;
|
||||||
|
arglen += i;
|
||||||
|
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||||
|
if (x > 0xffff) arglen++;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
arglen++;
|
||||||
|
ptr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if ((unsigned int)arglen > MAX_MARK)
|
if ((unsigned int)arglen > MAX_MARK)
|
||||||
{
|
{
|
||||||
*errorcodeptr = ERR76;
|
*errorcodeptr = ERR76;
|
||||||
|
@ -5456,8 +5495,42 @@ for (;; ptr++)
|
||||||
}
|
}
|
||||||
setverb = *code++ = verbs[i].op_arg;
|
setverb = *code++ = verbs[i].op_arg;
|
||||||
*code++ = arglen;
|
*code++ = arglen;
|
||||||
memcpy(code, arg, CU2BYTES(arglen));
|
|
||||||
code += arglen;
|
/* If we are processing the argument for escapes, we don't need
|
||||||
|
to apply checks here because it was all checked above when
|
||||||
|
computing the length. */
|
||||||
|
|
||||||
|
if ((options & PCRE2_ALT_VERBNAMES) != 0)
|
||||||
|
{
|
||||||
|
for (; arg != ptr; arg++)
|
||||||
|
{
|
||||||
|
if (*arg == '\\')
|
||||||
|
{
|
||||||
|
uint32_t x;
|
||||||
|
*errorcodeptr = 0;
|
||||||
|
(void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
|
||||||
|
cb);
|
||||||
|
#ifdef SUPPORT_UNICODE
|
||||||
|
if (utf)
|
||||||
|
{
|
||||||
|
PCRE2_UCHAR cbuff[8];
|
||||||
|
x = PRIV(ord2utf)(x, cbuff);
|
||||||
|
memcpy(code, cbuff, CU2BYTES(x));
|
||||||
|
code += x;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
*code++ = x;
|
||||||
|
}
|
||||||
|
else *code++ = *arg;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else /* No argument processing */
|
||||||
|
{
|
||||||
|
memcpy(code, arg, CU2BYTES(arglen));
|
||||||
|
code += arglen;
|
||||||
|
}
|
||||||
|
|
||||||
*code++ = 0;
|
*code++ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6322,12 +6395,12 @@ for (;; ptr++)
|
||||||
}
|
}
|
||||||
recno += cb->bracount;
|
recno += cb->bracount;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((uint32_t)recno > cb->final_bracount)
|
if ((uint32_t)recno > cb->final_bracount)
|
||||||
{
|
{
|
||||||
*errorcodeptr = ERR15;
|
*errorcodeptr = ERR15;
|
||||||
goto FAILED;
|
goto FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Come here from code above that handles a named recursion.
|
/* Come here from code above that handles a named recursion.
|
||||||
We insert the number of the called group after OP_RECURSE. At the
|
We insert the number of the called group after OP_RECURSE. At the
|
||||||
|
@ -7944,9 +8017,9 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
|
||||||
if (!IS_DIGIT(ptr[pp]))
|
if (!IS_DIGIT(ptr[pp]))
|
||||||
{
|
{
|
||||||
errorcode = ERR60;
|
errorcode = ERR60;
|
||||||
ptr += pp;
|
ptr += pp;
|
||||||
goto HAD_ERROR;
|
goto HAD_ERROR;
|
||||||
}
|
}
|
||||||
while (IS_DIGIT(ptr[pp]))
|
while (IS_DIGIT(ptr[pp]))
|
||||||
{
|
{
|
||||||
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
|
if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */
|
||||||
|
@ -7955,7 +8028,7 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
|
||||||
if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
|
if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
|
||||||
{
|
{
|
||||||
errorcode = ERR60;
|
errorcode = ERR60;
|
||||||
ptr += pp;
|
ptr += pp;
|
||||||
goto HAD_ERROR;
|
goto HAD_ERROR;
|
||||||
}
|
}
|
||||||
if (p->type == PSO_LIMM) limit_match = c;
|
if (p->type == PSO_LIMM) limit_match = c;
|
||||||
|
@ -8237,7 +8310,7 @@ if (errorcode == 0 && cb.had_recurse)
|
||||||
recno = (int)GET(rcode, 1);
|
recno = (int)GET(rcode, 1);
|
||||||
if (recno == 0) rgroup = codestart; else
|
if (recno == 0) rgroup = codestart; else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR search_from = codestart;
|
PCRE2_SPTR search_from = codestart;
|
||||||
rgroup = NULL;
|
rgroup = NULL;
|
||||||
for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
|
for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
|
||||||
{
|
{
|
||||||
|
@ -8246,11 +8319,11 @@ if (errorcode == 0 && cb.had_recurse)
|
||||||
rgroup = rc[p].group;
|
rgroup = rc[p].group;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Group n+1 must always start to the right of group n, so we can save
|
/* Group n+1 must always start to the right of group n, so we can save
|
||||||
search time below when the new group number is greater than any of the
|
search time below when the new group number is greater than any of the
|
||||||
previously found groups. */
|
previously found groups. */
|
||||||
|
|
||||||
if (recno > rc[p].recno) search_from = rc[p].group;
|
if (recno > rc[p].recno) search_from = rc[p].group;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -111,7 +111,7 @@ static const char compile_error_texts[] =
|
||||||
"number after (?C is greater than 255\0"
|
"number after (?C is greater than 255\0"
|
||||||
"closing parenthesis for (?C expected\0"
|
"closing parenthesis for (?C expected\0"
|
||||||
/* 40 */
|
/* 40 */
|
||||||
"SPARE ERROR\0"
|
"invalid escape sequence in (*VERB) name\0"
|
||||||
"unrecognized character after (?P\0"
|
"unrecognized character after (?P\0"
|
||||||
"syntax error in subpattern name (missing terminator)\0"
|
"syntax error in subpattern name (missing terminator)\0"
|
||||||
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
|
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
|
||||||
|
|
|
@ -496,6 +496,7 @@ static modstruct modlist[] = {
|
||||||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||||
|
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
|
||||||
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
|
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
|
||||||
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
|
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
|
||||||
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||||
|
@ -3467,10 +3468,11 @@ static void
|
||||||
show_compile_options(uint32_t options, const char *before, const char *after)
|
show_compile_options(uint32_t options, const char *before, const char *after)
|
||||||
{
|
{
|
||||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
before,
|
before,
|
||||||
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||||
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
|
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
|
||||||
|
((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "",
|
||||||
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
||||||
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
|
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
|
||||||
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
|
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
|
||||||
|
|
|
@ -4442,4 +4442,11 @@ a random value. /Ix
|
||||||
/((*MARK:A))++a(*SKIP:B)b/
|
/((*MARK:A))++a(*SKIP:B)b/
|
||||||
aacb
|
aacb
|
||||||
|
|
||||||
|
/(*MARK:a\zb)z/alt_verbnames
|
||||||
|
|
||||||
|
/(*:ab\t(d\)c)xxx/
|
||||||
|
|
||||||
|
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||||
|
cxxxz
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -1662,4 +1662,9 @@
|
||||||
/[\pS#moq]/
|
/[\pS#moq]/
|
||||||
=
|
=
|
||||||
|
|
||||||
|
# UTF tests
|
||||||
|
|
||||||
|
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||||
|
cxxxz
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -251,4 +251,6 @@
|
||||||
|
|
||||||
/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B
|
/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B
|
||||||
|
|
||||||
|
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||||
|
|
||||||
# End of testinput9
|
# End of testinput9
|
||||||
|
|
|
@ -14713,4 +14713,15 @@ No match
|
||||||
aacb
|
aacb
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/(*MARK:a\zb)z/alt_verbnames
|
||||||
|
Failed: error 140 at offset 9: invalid escape sequence in (*VERB) name
|
||||||
|
|
||||||
|
/(*:ab\t(d\)c)xxx/
|
||||||
|
Failed: error 122 at offset 12: unmatched closing parenthesis
|
||||||
|
|
||||||
|
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||||
|
cxxxz
|
||||||
|
0: xxx
|
||||||
|
MK: ab\x09(d)c
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -4064,4 +4064,11 @@ No match
|
||||||
=
|
=
|
||||||
0: =
|
0: =
|
||||||
|
|
||||||
|
# UTF tests
|
||||||
|
|
||||||
|
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||||
|
cxxxz
|
||||||
|
0: xxx
|
||||||
|
MK: a\x{12345}b\x{09}(d)c
|
||||||
|
|
||||||
# End of testinput5
|
# End of testinput5
|
||||||
|
|
|
@ -356,4 +356,7 @@ Failed: error 177 at offset 6: character code point value in \u.... sequence is
|
||||||
End
|
End
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||||
|
Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large
|
||||||
|
|
||||||
# End of testinput9
|
# End of testinput9
|
||||||
|
|
Loading…
Reference in New Issue