Implemented PCRE2_ALT_VERBNAMES
This commit is contained in:
parent
fd08e11c1e
commit
d2e87a75af
2
132html
2
132html
|
@ -148,7 +148,7 @@ while (<STDIN>)
|
|||
printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
|
||||
$ref, $ref);
|
||||
printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
|
||||
$ref, $ref);
|
||||
$ref);
|
||||
$ref++;
|
||||
}
|
||||
else
|
||||
|
|
|
@ -167,6 +167,8 @@ test (there are now 20 in total).
|
|||
47. Modifier lists in pcre2test were splitting at spaces without the required
|
||||
commas.
|
||||
|
||||
48. Implemented PCRE2_ALT_VERBNAMES.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
|
@ -97,6 +97,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_context.c
|
||||
pcre2_dfa_match.c
|
||||
pcre2_error.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_jit_compile.c
|
||||
pcre2_maketables.c
|
||||
pcre2_match.c
|
||||
|
@ -388,4 +389,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the
|
|||
recommended download site.
|
||||
|
||||
=============================
|
||||
Last Updated: 15 June 2015
|
||||
Last Updated: 16 July 2015
|
||||
|
|
|
@ -724,6 +724,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
@ -832,4 +833,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 July 2015
|
||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
|
|
|
@ -19,8 +19,8 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
|
|
@ -70,15 +70,15 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
|
@ -936,7 +936,7 @@ The <i>where</i> argument should point to a buffer that is at least 24 code
|
|||
units long. (The exact length required can be found by calling
|
||||
<b>pcre2_config()</b> with <b>where</b> set to NULL.) If PCRE2 has been compiled
|
||||
without Unicode support, the buffer is filled with the text "Unicode not
|
||||
supported". Otherwise, the Unicode version string (for example, "7.0.0") is
|
||||
supported". Otherwise, the Unicode version string (for example, "8.0.0") is
|
||||
inserted. The number of code units used is returned. This is the length of the
|
||||
string plus one unit for the terminating zero.
|
||||
<pre>
|
||||
|
@ -961,7 +961,7 @@ zero.
|
|||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
||||
|
@ -1083,6 +1083,15 @@ after any internal newline. However, it does not match after a newline at the
|
|||
end of the subject, for compatibility with Perl. If you want a multiline
|
||||
circumflex also to match after a terminating newline, you must set
|
||||
PCRE2_ALT_CIRCUMFLEX.
|
||||
<pre>
|
||||
PCRE2_ALT_VERBNAMES
|
||||
</pre>
|
||||
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||
parenthesis. The name is not processed in any way, and it is not possible to
|
||||
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name.
|
||||
<pre>
|
||||
PCRE2_AUTO_CALLOUT
|
||||
</pre>
|
||||
|
@ -1778,12 +1787,12 @@ documentation.
|
|||
<a name="matchdatablock"></a></P>
|
||||
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
|
||||
|
@ -2010,12 +2019,20 @@ If the pattern is anchored, such a match can occur only if the pattern contains
|
|||
</pre>
|
||||
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
|
||||
string is checked by default when <b>pcre2_match()</b> is subsequently called.
|
||||
The entire string is checked before any other processing takes place, and a
|
||||
If a non-zero starting offset is given, the check is applied only to that part
|
||||
of the subject that could be inspected during matching, and there is a check
|
||||
that the starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pattern, the
|
||||
check starts at the starting offset. Otherwise, it starts at the length of the
|
||||
longest lookbehind before the starting offset, or at the start of the subject
|
||||
if there are not that many characters before the starting offset. Note that the
|
||||
sequences \b and \B are one-character lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
The check is carried out before any other processing takes place, and a
|
||||
negative error code is returned if the check fails. There are several UTF error
|
||||
codes for each code unit width, corresponding to different problems with the
|
||||
code unit sequence. The value of <i>startoffset</i> is also checked, to ensure
|
||||
that it points to the start of a character or to the end of the subject. There
|
||||
are discussions about the validity of
|
||||
code unit sequence. There are discussions about the validity of
|
||||
<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
|
||||
<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
|
||||
and
|
||||
|
@ -2564,12 +2581,12 @@ be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
|||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups in the pattern. The following forms are
|
||||
recognized:
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> insert the contents of group <n>
|
||||
${<n>} insert the contents of group <n>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
|
@ -2580,6 +2597,15 @@ calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
|||
appropriate.
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||
|
@ -2883,7 +2909,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 April 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -224,8 +224,14 @@ whether a match operation was executed by JIT or by the interpreter.
|
|||
</P>
|
||||
<P>
|
||||
You may safely use the same JIT stack for more than one pattern (either by
|
||||
assigning directly or by callback), as long as the patterns are all matched
|
||||
sequentially in the same thread. In a multithread application, if you do not
|
||||
assigning directly or by callback), as long as the patterns are matched
|
||||
sequentially in the same thread. Currently, the only way to set up
|
||||
non-sequential matches in one thread is to use callouts: if a callout function
|
||||
starts another match, that match must use a different JIT stack to the one used
|
||||
for the currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
|
@ -419,9 +425,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 27 November 2014
|
||||
Last updated: 28 July 2015
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -736,6 +736,8 @@ Those that are not part of an identified script are lumped together as
|
|||
"Common". The current list of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
|
@ -776,6 +778,7 @@ Gurmukhi,
|
|||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
|
@ -812,12 +815,14 @@ Miao,
|
|||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
|
@ -839,6 +844,7 @@ Saurashtra,
|
|||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sora_Sompeng,
|
||||
Sundanese,
|
||||
|
@ -1322,9 +1328,19 @@ where a range ending character is expected. For example, [z-\xff] is valid,
|
|||
but [A-\d] and [A-[:digit:]] are not.
|
||||
</P>
|
||||
<P>
|
||||
Ranges operate in the collating sequence of character values. They can also be
|
||||
used for characters specified numerically, for example [\000-\037]. Ranges
|
||||
can include any characters that are valid for the current mode.
|
||||
Ranges normally include all code points between the start and end characters,
|
||||
inclusive. They can also be used for code points specified numerically, for
|
||||
example [\000-\037]. Ranges can include any characters that are valid for the
|
||||
current mode.
|
||||
</P>
|
||||
<P>
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
||||
are included.
|
||||
</P>
|
||||
<P>
|
||||
If a range that includes letters is used when caseless matching is set, it
|
||||
|
@ -2899,14 +2915,23 @@ remarks apply to the PCRE2 features described in this section.
|
|||
</P>
|
||||
<P>
|
||||
The new verbs make use of what was previously invalid syntax: an opening
|
||||
parenthesis followed by an asterisk. They are generally of the form
|
||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
||||
differently depending on whether or not a name is present. A name is any
|
||||
sequence of characters that does not include a closing parenthesis. The maximum
|
||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
||||
libraries. If the name is empty, that is, if the closing parenthesis
|
||||
immediately follows the colon, the effect is as if the colon were not there.
|
||||
Any number of these verbs may occur in a pattern.
|
||||
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||
depending on whether or not a name is present.
|
||||
</P>
|
||||
<P>
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
</P>
|
||||
<P>
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
|
@ -3323,7 +3348,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 June 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -187,6 +187,8 @@ at release 5.18.
|
|||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
|
@ -227,6 +229,7 @@ Gurmukhi,
|
|||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
|
@ -263,12 +266,14 @@ Miao,
|
|||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
|
@ -290,6 +295,7 @@ Saurashtra,
|
|||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sora_Sompeng,
|
||||
Sundanese,
|
||||
|
@ -582,7 +588,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 June 2015
|
||||
Last updated: 17 July 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -340,12 +340,13 @@ subject lines. Modifiers on a subject line can change these settings.
|
|||
<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br>
|
||||
<P>
|
||||
Modifier lists are used with both pattern and subject lines. Items in a list
|
||||
are separated by commas and optional white space. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid for one or the
|
||||
other only. Each modifier has a long name, for example "anchored", and some of
|
||||
them must be followed by an equals sign and a value, for example, "offset=12".
|
||||
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
||||
previous setting.
|
||||
are separated by commas followed by optional white space. Trailing white space
|
||||
in a modifier list is ignored. Some modifiers may be given for both patterns
|
||||
and subject lines, whereas others are valid only for one or the other. Each
|
||||
modifier has a long name, for example "anchored", and some of them must be
|
||||
followed by an equals sign and a value, for example, "offset=12". Values cannot
|
||||
contain comma characters, but may contain spaces. Modifiers that do not take
|
||||
values may be preceded by a minus sign to turn off a previous setting.
|
||||
</P>
|
||||
<P>
|
||||
A few of the more common modifiers can also be specified as single letters, for
|
||||
|
@ -479,6 +480,7 @@ for a description of their effects.
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
@ -1469,7 +1471,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 May 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -126,11 +126,22 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
|||
strings to be in host byte order.
|
||||
</P>
|
||||
<P>
|
||||
The entire string is checked before any other processing takes place. In
|
||||
addition to checking the format of the string, there is a check to ensure that
|
||||
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
|
||||
The so-called "non-character" code points are not excluded because Unicode
|
||||
corrigendum #9 makes it clear that they should not be.
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||
offset, the check is applied only to that part of the subject that could be
|
||||
inspected during matching, and there is a check that the starting offset points
|
||||
to the first code unit of a character or to the end of the subject. If there
|
||||
are no lookbehind assertions in the pattern, the check starts at the starting
|
||||
offset. Otherwise, it starts at the length of the longest lookbehind before the
|
||||
starting offset, or at the start of the subject if there are not that many
|
||||
characters before the starting offset. Note that the sequences \b and \B are
|
||||
one-character lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
In addition to checking the format of the string, there is a check to ensure
|
||||
that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
|
||||
area. The so-called "non-character" code points are not excluded because
|
||||
Unicode corrigendum #9 makes it clear that they should not be.
|
||||
</P>
|
||||
<P>
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
|
||||
|
@ -264,9 +275,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 18 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
360
doc/pcre2.txt
360
doc/pcre2.txt
|
@ -190,13 +190,13 @@ PCRE2 NATIVE API BASIC FUNCTIONS
|
|||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||
pcre2_compile_context *ccontext);
|
||||
|
||||
pcre2_code_free(pcre2_code *code);
|
||||
void pcre2_code_free(pcre2_code *code);
|
||||
|
||||
pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
||||
pcre2_general_context *gcontext);
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||
|
||||
int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
|
||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||
|
@ -989,7 +989,7 @@ CHECKING BUILD-TIME OPTIONS
|
|||
pcre2_config() with where set to NULL.) If PCRE2 has been compiled
|
||||
without Unicode support, the buffer is filled with the text "Unicode
|
||||
not supported". Otherwise, the Unicode version string (for example,
|
||||
"7.0.0") is inserted. The number of code units used is returned. This
|
||||
"8.0.0") is inserted. The number of code units used is returned. This
|
||||
is the length of the string plus one unit for the terminating zero.
|
||||
|
||||
PCRE2_CONFIG_UNICODE
|
||||
|
@ -1014,7 +1014,7 @@ COMPILING A PATTERN
|
|||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||
pcre2_compile_context *ccontext);
|
||||
|
||||
pcre2_code_free(pcre2_code *code);
|
||||
void pcre2_code_free(pcre2_code *code);
|
||||
|
||||
The pcre2_compile() function compiles a pattern into an internal form.
|
||||
The pattern is defined by a pointer to a string of code units and a
|
||||
|
@ -1128,6 +1128,16 @@ COMPILING A PATTERN
|
|||
Perl. If you want a multiline circumflex also to match after a termi-
|
||||
nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
|
||||
|
||||
PCRE2_ALT_VERBNAMES
|
||||
|
||||
By default, for compatibility with Perl, the name in any verb sequence
|
||||
such as (*MARK:NAME) is any sequence of characters that does not
|
||||
include a closing parenthesis. The name is not processed in any way,
|
||||
and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
|
||||
processing is applied to verb names and only an unescaped closing
|
||||
parenthesis terminates the name.
|
||||
|
||||
PCRE2_AUTO_CALLOUT
|
||||
|
||||
If this bit is set, pcre2_compile() automatically inserts callout
|
||||
|
@ -1809,11 +1819,11 @@ SERIALIZATION AND PRECOMPILING
|
|||
|
||||
THE MATCH DATA BLOCK
|
||||
|
||||
pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
||||
pcre2_general_context *gcontext);
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||
|
||||
void pcre2_match_data_free(pcre2_match_data *match_data);
|
||||
|
||||
|
@ -2022,12 +2032,20 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
|||
|
||||
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
||||
UTF string is checked by default when pcre2_match() is subsequently
|
||||
called. The entire string is checked before any other processing takes
|
||||
place, and a negative error code is returned if the check fails. There
|
||||
are several UTF error codes for each code unit width, corresponding to
|
||||
different problems with the code unit sequence. The value of startoff-
|
||||
set is also checked, to ensure that it points to the start of a charac-
|
||||
ter or to the end of the subject. There are discussions about the
|
||||
called. If a non-zero starting offset is given, the check is applied
|
||||
only to that part of the subject that could be inspected during match-
|
||||
ing, and there is a check that the starting offset points to the first
|
||||
code unit of a character or to the end of the subject. If there are no
|
||||
lookbehind assertions in the pattern, the check starts at the starting
|
||||
offset. Otherwise, it starts at the length of the longest lookbehind
|
||||
before the starting offset, or at the start of the subject if there are
|
||||
not that many characters before the starting offset. Note that the
|
||||
sequences \b and \B are one-character lookbehinds.
|
||||
|
||||
The check is carried out before any other processing takes place, and a
|
||||
negative error code is returned if the check fails. There are several
|
||||
UTF error codes for each code unit width, corresponding to different
|
||||
problems with the code unit sequence. There are discussions about the
|
||||
validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
|
||||
pcre2unicode page.
|
||||
|
||||
|
@ -2525,12 +2543,12 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
In the replacement string, which is interpreted as a UTF string in UTF
|
||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||
option is set, a dollar character is an escape character that can spec-
|
||||
ify the insertion of characters from capturing groups in the pattern.
|
||||
The following forms are recognized:
|
||||
ify the insertion of characters from capturing groups or (*MARK) items
|
||||
in the pattern. The following forms are recognized:
|
||||
|
||||
$$ insert a dollar character
|
||||
$<n> insert the contents of group <n>
|
||||
${<n>} insert the contents of group <n>
|
||||
$$ insert a dollar character
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
|
||||
Either a group number or a group name can be given for <n>. Curly
|
||||
brackets are required only if the following character would be inter-
|
||||
|
@ -2540,30 +2558,37 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
||||
or pcre2_copy_bynumber() as appropriate.
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this pcre2test example shows:
|
||||
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
block is obtained and freed within this function, using memory manage-
|
||||
ment functions from the match context, if provided, or else those that
|
||||
were used to allocate memory for the compiled code.
|
||||
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
There is one additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes
|
||||
the function to iterate over the subject string, replacing every match-
|
||||
ing substring. If this is not set, only the first matching substring is
|
||||
replaced.
|
||||
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain
|
||||
The outlengthptr argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. It is updated to contain
|
||||
the length of the new string, excluding the trailing zero that is auto-
|
||||
matically added.
|
||||
|
||||
The function returns the number of replacements that were made. This
|
||||
may be zero if no matches were found, and is never greater than 1
|
||||
The function returns the number of replacements that were made. This
|
||||
may be zero if no matches were found, and is never greater than 1
|
||||
unless PCRE2_SUBSTITUTE_GLOBAL is set. In the event of an error, a neg-
|
||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
||||
ative error code is returned. Except for PCRE2_ERROR_NOMATCH (which is
|
||||
never returned), any errors from pcre2_match() or the substring copying
|
||||
functions are passed straight back. PCRE2_ERROR_BADREPLACEMENT is
|
||||
returned for an invalid replacement string (unrecognized sequence fol-
|
||||
returned for an invalid replacement string (unrecognized sequence fol-
|
||||
lowing a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the out-
|
||||
put buffer is not big enough.
|
||||
|
||||
|
@ -2573,56 +2598,56 @@ DUPLICATE SUBPATTERN NAMES
|
|||
int pcre2_substring_nametable_scan(const pcre2_code *code,
|
||||
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last);
|
||||
|
||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||
subpatterns are not required to be unique. Duplicate names are always
|
||||
allowed for subpatterns with the same number, created by using the (?|
|
||||
feature. Indeed, if such subpatterns are named, they are required to
|
||||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for
|
||||
subpatterns are not required to be unique. Duplicate names are always
|
||||
allowed for subpatterns with the same number, created by using the (?|
|
||||
feature. Indeed, if such subpatterns are named, they are required to
|
||||
use the same names.
|
||||
|
||||
Normally, patterns with duplicate names are such that in any one match,
|
||||
only one of the named subpatterns participates. An example is shown in
|
||||
only one of the named subpatterns participates. An example is shown in
|
||||
the pcre2pattern documentation.
|
||||
|
||||
When duplicates are present, pcre2_substring_copy_byname() and
|
||||
pcre2_substring_get_byname() return the first substring corresponding
|
||||
to the given name that is set. Only if none are set is
|
||||
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
||||
When duplicates are present, pcre2_substring_copy_byname() and
|
||||
pcre2_substring_get_byname() return the first substring corresponding
|
||||
to the given name that is set. Only if none are set is
|
||||
PCRE2_ERROR_UNSET is returned. The pcre2_substring_number_from_name()
|
||||
function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
|
||||
duplicate names.
|
||||
|
||||
If you want to get full details of all captured substrings for a given
|
||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||
first argument is the compiled pattern, and the second is the name. If
|
||||
the third and fourth arguments are NULL, the function returns a group
|
||||
If you want to get full details of all captured substrings for a given
|
||||
name, you must use the pcre2_substring_nametable_scan() function. The
|
||||
first argument is the compiled pattern, and the second is the name. If
|
||||
the third and fourth arguments are NULL, the function returns a group
|
||||
number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise.
|
||||
|
||||
When the third and fourth arguments are not NULL, they must be pointers
|
||||
to variables that are updated by the function. After it has run, they
|
||||
to variables that are updated by the function. After it has run, they
|
||||
point to the first and last entries in the name-to-number table for the
|
||||
given name, and the function returns the length of each entry in code
|
||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||
given name, and the function returns the length of each entry in code
|
||||
units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are
|
||||
no entries for the given name.
|
||||
|
||||
The format of the name table is described in the section entitled
|
||||
Information about a pattern above. Given all the relevant entries for
|
||||
Information about a pattern above. Given all the relevant entries for
|
||||
the name, you can extract each of their numbers, and hence the captured
|
||||
data.
|
||||
|
||||
|
||||
FINDING ALL POSSIBLE MATCHES AT ONE POSITION
|
||||
|
||||
The traditional matching function uses a similar algorithm to Perl,
|
||||
which stops when it finds the first match at a given point in the sub-
|
||||
The traditional matching function uses a similar algorithm to Perl,
|
||||
which stops when it finds the first match at a given point in the sub-
|
||||
ject. If you want to find all possible matches, or the longest possible
|
||||
match at a given position, consider using the alternative matching
|
||||
function (see below) instead. If you cannot use the alternative func-
|
||||
match at a given position, consider using the alternative matching
|
||||
function (see below) instead. If you cannot use the alternative func-
|
||||
tion, you can kludge it up by making use of the callout facility, which
|
||||
is described in the pcre2callout documentation.
|
||||
|
||||
What you have to do is to insert a callout right at the end of the pat-
|
||||
tern. When your callout function is called, extract and save the cur-
|
||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||
tern. When your callout function is called, extract and save the cur-
|
||||
rent matched substring. Then return 1, which forces pcre2_match() to
|
||||
backtrack and try other alternatives. Ultimately, when it runs out of
|
||||
matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.
|
||||
|
||||
|
||||
|
@ -2634,26 +2659,26 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
pcre2_match_context *mcontext,
|
||||
int *workspace, PCRE2_SIZE wscount);
|
||||
|
||||
The function pcre2_dfa_match() is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the
|
||||
subject string just once, and does not backtrack. This has different
|
||||
characteristics to the normal algorithm, and is not compatible with
|
||||
Perl. Some of the features of PCRE2 patterns are not supported. Never-
|
||||
theless, there are times when this kind of matching can be useful. For
|
||||
a discussion of the two matching algorithms, and a list of features
|
||||
The function pcre2_dfa_match() is called to match a subject string
|
||||
against a compiled pattern, using a matching algorithm that scans the
|
||||
subject string just once, and does not backtrack. This has different
|
||||
characteristics to the normal algorithm, and is not compatible with
|
||||
Perl. Some of the features of PCRE2 patterns are not supported. Never-
|
||||
theless, there are times when this kind of matching can be useful. For
|
||||
a discussion of the two matching algorithms, and a list of features
|
||||
that pcre2_dfa_match() does not support, see the pcre2matching documen-
|
||||
tation.
|
||||
|
||||
The arguments for the pcre2_dfa_match() function are the same as for
|
||||
The arguments for the pcre2_dfa_match() function are the same as for
|
||||
pcre2_match(), plus two extras. The ovector within the match data block
|
||||
is used in a different way, and this is described below. The other com-
|
||||
mon arguments are used in the same way as for pcre2_match(), so their
|
||||
mon arguments are used in the same way as for pcre2_match(), so their
|
||||
description is not repeated here.
|
||||
|
||||
The two additional arguments provide workspace for the function. The
|
||||
workspace vector should contain at least 20 elements. It is used for
|
||||
The two additional arguments provide workspace for the function. The
|
||||
workspace vector should contain at least 20 elements. It is used for
|
||||
keeping track of multiple paths through the pattern tree. More
|
||||
workspace is needed for patterns and subjects where there are a lot of
|
||||
workspace is needed for patterns and subjects where there are a lot of
|
||||
potential matches.
|
||||
|
||||
Here is an example of a simple call to pcre2_dfa_match():
|
||||
|
@ -2673,45 +2698,45 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
|
||||
Option bits for pcre2_dfa_match()
|
||||
|
||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
||||
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
|
||||
these are exactly the same as for pcre2_match(), so their description
|
||||
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
|
||||
these are exactly the same as for pcre2_match(), so their description
|
||||
is not repeated here.
|
||||
|
||||
PCRE2_PARTIAL_HARD
|
||||
PCRE2_PARTIAL_SOFT
|
||||
|
||||
These have the same general effect as they do for pcre2_match(), but
|
||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||
These have the same general effect as they do for pcre2_match(), but
|
||||
the details are slightly different. When PCRE2_PARTIAL_HARD is set for
|
||||
pcre2_dfa_match(), it returns PCRE2_ERROR_PARTIAL if the end of the
|
||||
subject is reached and there is still at least one matching possibility
|
||||
that requires additional characters. This happens even if some complete
|
||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||
if the end of the subject is reached, there have been no complete
|
||||
matches have already been found. When PCRE2_PARTIAL_SOFT is set, the
|
||||
return code PCRE2_ERROR_NOMATCH is converted into PCRE2_ERROR_PARTIAL
|
||||
if the end of the subject is reached, there have been no complete
|
||||
matches, but there is still at least one matching possibility. The por-
|
||||
tion of the string that was inspected when the longest partial match
|
||||
tion of the string that was inspected when the longest partial match
|
||||
was found is set as the first matching string in both cases. There is a
|
||||
more detailed discussion of partial and multi-segment matching, with
|
||||
more detailed discussion of partial and multi-segment matching, with
|
||||
examples, in the pcre2partial documentation.
|
||||
|
||||
PCRE2_DFA_SHORTEST
|
||||
|
||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||
Setting the PCRE2_DFA_SHORTEST option causes the matching algorithm to
|
||||
stop as soon as it has found one match. Because of the way the alterna-
|
||||
tive algorithm works, this is necessarily the shortest possible match
|
||||
tive algorithm works, this is necessarily the shortest possible match
|
||||
at the first possible matching point in the subject string.
|
||||
|
||||
PCRE2_DFA_RESTART
|
||||
|
||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||
When pcre2_dfa_match() returns a partial match, it is possible to call
|
||||
it again, with additional subject characters, and have it continue with
|
||||
the same match. The PCRE2_DFA_RESTART option requests this action; when
|
||||
it is set, the workspace and wscount options must reference the same
|
||||
vector as before because data about the match so far is left in them
|
||||
it is set, the workspace and wscount options must reference the same
|
||||
vector as before because data about the match so far is left in them
|
||||
after a partial match. There is more discussion of this facility in the
|
||||
pcre2partial documentation.
|
||||
|
||||
|
@ -2719,8 +2744,8 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
|
||||
When pcre2_dfa_match() succeeds, it may have matched more than one sub-
|
||||
string in the subject. Note, however, that all the matches from one run
|
||||
of the function start at the same point in the subject. The shorter
|
||||
matches are all initial substrings of the longer matches. For example,
|
||||
of the function start at the same point in the subject. The shorter
|
||||
matches are all initial substrings of the longer matches. For example,
|
||||
if the pattern
|
||||
|
||||
<.*>
|
||||
|
@ -2735,17 +2760,17 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
<something> <something else>
|
||||
<something>
|
||||
|
||||
On success, the yield of the function is a number greater than zero,
|
||||
which is the number of matched substrings. The offsets of the sub-
|
||||
strings are returned in the ovector, and can be extracted by number in
|
||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||
any capturing groups that may exist in the pattern, because DFA match-
|
||||
On success, the yield of the function is a number greater than zero,
|
||||
which is the number of matched substrings. The offsets of the sub-
|
||||
strings are returned in the ovector, and can be extracted by number in
|
||||
the same way as for pcre2_match(), but the numbers bear no relation to
|
||||
any capturing groups that may exist in the pattern, because DFA match-
|
||||
ing does not support group capture.
|
||||
|
||||
Calls to the convenience functions that extract substrings by name
|
||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||
Calls to the convenience functions that extract substrings by name
|
||||
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
|
||||
after a DFA match. The convenience functions that extract substrings by
|
||||
number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
|
||||
number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
|
||||
other errors are slightly different:
|
||||
|
||||
PCRE2_ERROR_UNAVAILABLE
|
||||
|
@ -2755,64 +2780,64 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
|
||||
PCRE2_ERROR_UNSET
|
||||
|
||||
There is a slot in the ovector for this substring, but there were
|
||||
There is a slot in the ovector for this substring, but there were
|
||||
insufficient matches to fill it.
|
||||
|
||||
The matched strings are stored in the ovector in reverse order of
|
||||
length; that is, the longest matching string is first. If there were
|
||||
too many matches to fit into the ovector, the yield of the function is
|
||||
The matched strings are stored in the ovector in reverse order of
|
||||
length; that is, the longest matching string is first. If there were
|
||||
too many matches to fit into the ovector, the yield of the function is
|
||||
zero, and the vector is filled with the longest matches.
|
||||
|
||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||
character repeats at the end of a pattern (as well as internally). For
|
||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||
matching, this means that only one possible match is found. If you
|
||||
really do want multiple matches in such cases, either use an ungreedy
|
||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
NOTE: PCRE2's "auto-possessification" optimization usually applies to
|
||||
character repeats at the end of a pattern (as well as internally). For
|
||||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||
matching, this means that only one possible match is found. If you
|
||||
really do want multiple matches in such cases, either use an ungreedy
|
||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
compiling.
|
||||
|
||||
Error returns from pcre2_dfa_match()
|
||||
|
||||
The pcre2_dfa_match() function returns a negative number when it fails.
|
||||
Many of the errors are the same as for pcre2_match(), as described
|
||||
Many of the errors are the same as for pcre2_match(), as described
|
||||
above. There are in addition the following errors that are specific to
|
||||
pcre2_dfa_match():
|
||||
|
||||
PCRE2_ERROR_DFA_UITEM
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
This return is given if pcre2_dfa_match() encounters an item in the
|
||||
pattern that it does not support, for instance, the use of \C or a back
|
||||
reference.
|
||||
|
||||
PCRE2_ERROR_DFA_UCOND
|
||||
|
||||
This return is given if pcre2_dfa_match() encounters a condition item
|
||||
that uses a back reference for the condition, or a test for recursion
|
||||
This return is given if pcre2_dfa_match() encounters a condition item
|
||||
that uses a back reference for the condition, or a test for recursion
|
||||
in a specific group. These are not supported.
|
||||
|
||||
PCRE2_ERROR_DFA_WSSIZE
|
||||
|
||||
This return is given if pcre2_dfa_match() runs out of space in the
|
||||
This return is given if pcre2_dfa_match() runs out of space in the
|
||||
workspace vector.
|
||||
|
||||
PCRE2_ERROR_DFA_RECURSE
|
||||
|
||||
When a recursive subpattern is processed, the matching function calls
|
||||
When a recursive subpattern is processed, the matching function calls
|
||||
itself recursively, using private memory for the ovector and workspace.
|
||||
This error is given if the internal ovector is not large enough. This
|
||||
This error is given if the internal ovector is not large enough. This
|
||||
should be extremely rare, as a vector of size 1000 is used.
|
||||
|
||||
PCRE2_ERROR_DFA_BADRESTART
|
||||
|
||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||
some plausibility checks are made on the contents of the workspace,
|
||||
which should contain data about the previous partial match. If any of
|
||||
When pcre2_dfa_match() is called with the PCRE2_DFA_RESTART option,
|
||||
some plausibility checks are made on the contents of the workspace,
|
||||
which should contain data about the previous partial match. If any of
|
||||
these checks fail, this error is given.
|
||||
|
||||
|
||||
SEE ALSO
|
||||
|
||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||
pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3),
|
||||
pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3),
|
||||
pcre2unicode(3).
|
||||
|
||||
|
@ -2826,7 +2851,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 April 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -4051,13 +4076,17 @@ CONTROLLING THE JIT STACK
|
|||
interpreter.
|
||||
|
||||
You may safely use the same JIT stack for more than one pattern (either
|
||||
by assigning directly or by callback), as long as the patterns are all
|
||||
matched sequentially in the same thread. In a multithread application,
|
||||
if you do not specify a JIT stack, or if you assign or pass back NULL
|
||||
from a callback, that is thread-safe, because each thread has its own
|
||||
machine stack. However, if you assign or pass back a non-NULL JIT
|
||||
stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
by assigning directly or by callback), as long as the patterns are
|
||||
matched sequentially in the same thread. Currently, the only way to set
|
||||
up non-sequential matches in one thread is to use callouts: if a call-
|
||||
out function starts another match, that match must use a different JIT
|
||||
stack to the one used for currently suspended match(es).
|
||||
|
||||
In a multithread application, if you do not specify a JIT stack, or if
|
||||
you assign or pass back NULL from a callback, that is thread-safe,
|
||||
because each thread has its own machine stack. However, if you assign
|
||||
or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
|
||||
Strictly speaking, even more is allowed. You can assign the same non-
|
||||
NULL stack to a match context that is used by any number of patterns,
|
||||
|
@ -4234,8 +4263,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 27 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 28 July 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -5069,33 +5098,44 @@ VALIDITY OF UTF STRINGS
|
|||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||
this, expecting strings to be in host byte order.
|
||||
|
||||
The entire string is checked before any other processing takes place.
|
||||
In addition to checking the format of the string, there is a check to
|
||||
A UTF string is checked before any other processing takes place. In the
|
||||
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||
starting offset, the check is applied only to that part of the subject
|
||||
that could be inspected during matching, and there is a check that the
|
||||
starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pat-
|
||||
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||
the length of the longest lookbehind before the starting offset, or at
|
||||
the start of the subject if there are not that many characters before
|
||||
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||
ter lookbehinds.
|
||||
|
||||
In addition to checking the format of the string, there is a check to
|
||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||
the surrogate area. The so-called "non-character" code points are not
|
||||
the surrogate area. The so-called "non-character" code points are not
|
||||
excluded because Unicode corrigendum #9 makes it clear that they should
|
||||
not be.
|
||||
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||
UTF-16, where they are used in pairs to encode code points with values
|
||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||
UTF-16, where they are used in pairs to encode code points with values
|
||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||
unfortunately messes up UTF-8 and UTF-32.)
|
||||
|
||||
In some situations, you may already know that your strings are valid,
|
||||
and therefore want to skip these checks in order to improve perfor-
|
||||
mance, for example in the case of a long subject string that is being
|
||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||
In some situations, you may already know that your strings are valid,
|
||||
and therefore want to skip these checks in order to improve perfor-
|
||||
mance, for example in the case of a long subject string that is being
|
||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK option at com-
|
||||
pile time or at match time, PCRE2 assumes that the pattern or subject
|
||||
it is given (respectively) contains only valid UTF code unit sequences.
|
||||
|
||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||
for the pattern; it does not also apply to subject strings. If you want
|
||||
to disable the check for a subject string you must pass this option to
|
||||
to disable the check for a subject string you must pass this option to
|
||||
pcre2_match() or pcre2_dfa_match().
|
||||
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||
result is undefined and your program may crash or loop indefinitely.
|
||||
|
||||
Errors in UTF-8 strings
|
||||
|
@ -5108,10 +5148,10 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR4
|
||||
PCRE2_ERROR_UTF8_ERR5
|
||||
|
||||
The string ends with a truncated UTF-8 character; the code specifies
|
||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||
The string ends with a truncated UTF-8 character; the code specifies
|
||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||
checked first; hence the possibility of 4 or 5 missing bytes.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR6
|
||||
|
@ -5121,24 +5161,24 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR10
|
||||
|
||||
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
||||
the character do not have the binary value 0b10 (that is, either the
|
||||
the character do not have the binary value 0b10 (that is, either the
|
||||
most significant bit is 0, or the next bit is 1).
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR11
|
||||
PCRE2_ERROR_UTF8_ERR12
|
||||
|
||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||
long; these code points are excluded by RFC 3629.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR13
|
||||
|
||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||
are excluded by RFC 3629.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR14
|
||||
|
||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||
so are excluded from UTF-8.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR15
|
||||
|
@ -5147,26 +5187,26 @@ VALIDITY OF UTF STRINGS
|
|||
PCRE2_ERROR_UTF8_ERR18
|
||||
PCRE2_ERROR_UTF8_ERR19
|
||||
|
||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||
for a value that can be represented by fewer bytes, which is invalid.
|
||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||
for a value that can be represented by fewer bytes, which is invalid.
|
||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||
rect coding uses just one byte.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR20
|
||||
|
||||
The two most significant bits of the first byte of a character have the
|
||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||
quent byte of a multi-byte character.
|
||||
|
||||
PCRE2_ERROR_UTF8_ERR21
|
||||
|
||||
The first byte of a character has the value 0xfe or 0xff. These values
|
||||
The first byte of a character has the value 0xfe or 0xff. These values
|
||||
can never occur in a valid UTF-8 string.
|
||||
|
||||
Errors in UTF-16 strings
|
||||
|
||||
The following negative error codes are given for invalid UTF-16
|
||||
The following negative error codes are given for invalid UTF-16
|
||||
strings:
|
||||
|
||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
||||
|
@ -5176,7 +5216,7 @@ VALIDITY OF UTF STRINGS
|
|||
|
||||
Errors in UTF-32 strings
|
||||
|
||||
The following negative error codes are given for invalid UTF-32
|
||||
The following negative error codes are given for invalid UTF-32
|
||||
strings:
|
||||
|
||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
||||
|
@ -5192,8 +5232,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 18 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "29 August 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1052,6 +1052,15 @@ after any internal newline. However, it does not match after a newline at the
|
|||
end of the subject, for compatibility with Perl. If you want a multiline
|
||||
circumflex also to match after a terminating newline, you must set
|
||||
PCRE2_ALT_CIRCUMFLEX.
|
||||
.sp
|
||||
PCRE2_ALT_VERBNAMES
|
||||
.sp
|
||||
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||
parenthesis. The name is not processed in any way, and it is not possible to
|
||||
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name.
|
||||
.sp
|
||||
PCRE2_AUTO_CALLOUT
|
||||
.sp
|
||||
|
@ -2953,6 +2962,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 29 August 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -1334,7 +1334,7 @@ both specified as literal letters in the same case. For compatibility with
|
|||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\x92], all code points
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||
are included.
|
||||
.P
|
||||
If a range that includes letters is used when caseless matching is set, it
|
||||
|
@ -2944,14 +2944,21 @@ in production code should be noted to avoid problems during upgrades." The same
|
|||
remarks apply to the PCRE2 features described in this section.
|
||||
.P
|
||||
The new verbs make use of what was previously invalid syntax: an opening
|
||||
parenthesis followed by an asterisk. They are generally of the form
|
||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
||||
differently depending on whether or not a name is present. A name is any
|
||||
sequence of characters that does not include a closing parenthesis. The maximum
|
||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
||||
libraries. If the name is empty, that is, if the closing parenthesis
|
||||
immediately follows the colon, the effect is as if the colon were not there.
|
||||
Any number of these verbs may occur in a pattern.
|
||||
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||
depending on whether or not a name is present.
|
||||
.P
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name.
|
||||
.P
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
.P
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
used only when the pattern is to be matched using the traditional matching
|
||||
|
@ -3376,6 +3383,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 July 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -445,6 +445,7 @@ for a description of their effects.
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
|
|
@ -285,12 +285,14 @@ COMMAND LINES
|
|||
MODIFIER SYNTAX
|
||||
|
||||
Modifier lists are used with both pattern and subject lines. Items in a
|
||||
list are separated by commas and optional white space. Some modifiers
|
||||
may be given for both patterns and subject lines, whereas others are
|
||||
valid for one or the other only. Each modifier has a long name, for
|
||||
example "anchored", and some of them must be followed by an equals sign
|
||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||
ues may be preceded by a minus sign to turn off a previous setting.
|
||||
list are separated by commas followed by optional white space. Trailing
|
||||
white space in a modifier list is ignored. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid only for
|
||||
one or the other. Each modifier has a long name, for example
|
||||
"anchored", and some of them must be followed by an equals sign and a
|
||||
value, for example, "offset=12". Values cannot contain comma charac-
|
||||
ters, but may contain spaces. Modifiers that do not take values may be
|
||||
preceded by a minus sign to turn off a previous setting.
|
||||
|
||||
A few of the more common modifiers can also be specified as single let-
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
|
@ -424,6 +426,7 @@ PATTERN MODIFIERS
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
@ -1330,5 +1333,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 20 May 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -120,6 +120,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
||||
#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
|
|
|
@ -561,12 +561,12 @@ static PCRE2_SPTR posix_substitutes[] = {
|
|||
|
||||
#define PUBLIC_COMPILE_OPTIONS \
|
||||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
|
||||
PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \
|
||||
PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \
|
||||
PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
|
||||
PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
|
||||
PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
|
||||
PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
|
||||
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
|
||||
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the tables called eint1 and
|
||||
|
@ -5382,13 +5382,52 @@ for (;; ptr++)
|
|||
|
||||
/* It appears that Perl allows any characters whatsoever, other than
|
||||
a closing parenthesis, to appear in arguments, so we no longer insist on
|
||||
letters, digits, and underscores. */
|
||||
letters, digits, and underscores. Perl does not, however, do any
|
||||
interpretation within arguments, and has no means of including a closing
|
||||
parenthesis. PCRE supports escape processing but only when it is
|
||||
requested by an option. Note that check_escape() will not return values
|
||||
greater than the code unit maximum when not in UTF mode. */
|
||||
|
||||
if (*ptr == CHAR_COLON)
|
||||
{
|
||||
arg = ++ptr;
|
||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
|
||||
arglen = (int)(ptr - arg);
|
||||
|
||||
if ((options & PCRE2_ALT_VERBNAMES) == 0)
|
||||
{
|
||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
|
||||
arglen = (int)(ptr - arg);
|
||||
}
|
||||
else
|
||||
{
|
||||
arglen = 0;
|
||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
|
||||
{
|
||||
if (*ptr == '\\')
|
||||
{
|
||||
uint32_t x;
|
||||
*errorcodeptr = 0;
|
||||
i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
|
||||
if (*errorcodeptr != 0) goto FAILED;
|
||||
if (i != 0)
|
||||
{
|
||||
*errorcodeptr = ERR40;
|
||||
goto FAILED;
|
||||
}
|
||||
#ifdef SUPPORT_UNICODE
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
for (i = 0; i < PRIV(utf8_table1_size); i++)
|
||||
if ((int)x <= PRIV(utf8_table1)[i]) break;
|
||||
arglen += i;
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if (x > 0xffff) arglen++;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
arglen++;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
if ((unsigned int)arglen > MAX_MARK)
|
||||
{
|
||||
*errorcodeptr = ERR76;
|
||||
|
@ -5456,8 +5495,42 @@ for (;; ptr++)
|
|||
}
|
||||
setverb = *code++ = verbs[i].op_arg;
|
||||
*code++ = arglen;
|
||||
memcpy(code, arg, CU2BYTES(arglen));
|
||||
code += arglen;
|
||||
|
||||
/* If we are processing the argument for escapes, we don't need
|
||||
to apply checks here because it was all checked above when
|
||||
computing the length. */
|
||||
|
||||
if ((options & PCRE2_ALT_VERBNAMES) != 0)
|
||||
{
|
||||
for (; arg != ptr; arg++)
|
||||
{
|
||||
if (*arg == '\\')
|
||||
{
|
||||
uint32_t x;
|
||||
*errorcodeptr = 0;
|
||||
(void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
|
||||
cb);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
PCRE2_UCHAR cbuff[8];
|
||||
x = PRIV(ord2utf)(x, cbuff);
|
||||
memcpy(code, cbuff, CU2BYTES(x));
|
||||
code += x;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
*code++ = x;
|
||||
}
|
||||
else *code++ = *arg;
|
||||
}
|
||||
}
|
||||
else /* No argument processing */
|
||||
{
|
||||
memcpy(code, arg, CU2BYTES(arglen));
|
||||
code += arglen;
|
||||
}
|
||||
|
||||
*code++ = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -111,7 +111,7 @@ static const char compile_error_texts[] =
|
|||
"number after (?C is greater than 255\0"
|
||||
"closing parenthesis for (?C expected\0"
|
||||
/* 40 */
|
||||
"SPARE ERROR\0"
|
||||
"invalid escape sequence in (*VERB) name\0"
|
||||
"unrecognized character after (?P\0"
|
||||
"syntax error in subpattern name (missing terminator)\0"
|
||||
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
|
||||
|
|
|
@ -496,6 +496,7 @@ static modstruct modlist[] = {
|
|||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
|
||||
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
|
||||
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
|
||||
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||
|
@ -3467,10 +3468,11 @@ static void
|
|||
show_compile_options(uint32_t options, const char *before, const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
|
||||
((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "",
|
||||
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
||||
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
|
||||
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
|
||||
|
|
|
@ -4442,4 +4442,11 @@ a random value. /Ix
|
|||
/((*MARK:A))++a(*SKIP:B)b/
|
||||
aacb
|
||||
|
||||
/(*MARK:a\zb)z/alt_verbnames
|
||||
|
||||
/(*:ab\t(d\)c)xxx/
|
||||
|
||||
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||
cxxxz
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -1662,4 +1662,9 @@
|
|||
/[\pS#moq]/
|
||||
=
|
||||
|
||||
# UTF tests
|
||||
|
||||
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||
cxxxz
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -251,4 +251,6 @@
|
|||
|
||||
/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B
|
||||
|
||||
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||
|
||||
# End of testinput9
|
||||
|
|
|
@ -14713,4 +14713,15 @@ No match
|
|||
aacb
|
||||
No match
|
||||
|
||||
/(*MARK:a\zb)z/alt_verbnames
|
||||
Failed: error 140 at offset 9: invalid escape sequence in (*VERB) name
|
||||
|
||||
/(*:ab\t(d\)c)xxx/
|
||||
Failed: error 122 at offset 12: unmatched closing parenthesis
|
||||
|
||||
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||
cxxxz
|
||||
0: xxx
|
||||
MK: ab\x09(d)c
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -4064,4 +4064,11 @@ No match
|
|||
=
|
||||
0: =
|
||||
|
||||
# UTF tests
|
||||
|
||||
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||
cxxxz
|
||||
0: xxx
|
||||
MK: a\x{12345}b\x{09}(d)c
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -356,4 +356,7 @@ Failed: error 177 at offset 6: character code point value in \u.... sequence is
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||
Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large
|
||||
|
||||
# End of testinput9
|
||||
|
|
Loading…
Reference in New Issue