Implemented PCRE2_ALT_VERBNAMES
This commit is contained in:
parent
fd08e11c1e
commit
d2e87a75af
2
132html
2
132html
|
@ -148,7 +148,7 @@ while (<STDIN>)
|
|||
printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">$title</a>\n",
|
||||
$ref, $ref);
|
||||
printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">$title</a><br>\n",
|
||||
$ref, $ref);
|
||||
$ref);
|
||||
$ref++;
|
||||
}
|
||||
else
|
||||
|
|
|
@ -167,6 +167,8 @@ test (there are now 20 in total).
|
|||
47. Modifier lists in pcre2test were splitting at spaces without the required
|
||||
commas.
|
||||
|
||||
48. Implemented PCRE2_ALT_VERBNAMES.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
|
|
@ -97,6 +97,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_context.c
|
||||
pcre2_dfa_match.c
|
||||
pcre2_error.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_jit_compile.c
|
||||
pcre2_maketables.c
|
||||
pcre2_match.c
|
||||
|
@ -388,4 +389,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the
|
|||
recommended download site.
|
||||
|
||||
=============================
|
||||
Last Updated: 15 June 2015
|
||||
Last Updated: 16 July 2015
|
||||
|
|
|
@ -724,6 +724,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
@ -832,4 +833,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 24 April 2015
|
||||
Last updated: 16 July 2015
|
||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
|
|
@ -19,7 +19,7 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
|
|
|
@ -19,8 +19,8 @@ SYNOPSIS
|
|||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
|
|
|
@ -70,15 +70,15 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
|
||||
|
@ -936,7 +936,7 @@ The <i>where</i> argument should point to a buffer that is at least 24 code
|
|||
units long. (The exact length required can be found by calling
|
||||
<b>pcre2_config()</b> with <b>where</b> set to NULL.) If PCRE2 has been compiled
|
||||
without Unicode support, the buffer is filled with the text "Unicode not
|
||||
supported". Otherwise, the Unicode version string (for example, "7.0.0") is
|
||||
supported". Otherwise, the Unicode version string (for example, "8.0.0") is
|
||||
inserted. The number of code units used is returned. This is the length of the
|
||||
string plus one unit for the terminating zero.
|
||||
<pre>
|
||||
|
@ -961,7 +961,7 @@ zero.
|
|||
<b> pcre2_compile_context *<i>ccontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
<b>void pcre2_code_free(pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
||||
|
@ -1083,6 +1083,15 @@ after any internal newline. However, it does not match after a newline at the
|
|||
end of the subject, for compatibility with Perl. If you want a multiline
|
||||
circumflex also to match after a terminating newline, you must set
|
||||
PCRE2_ALT_CIRCUMFLEX.
|
||||
<pre>
|
||||
PCRE2_ALT_VERBNAMES
|
||||
</pre>
|
||||
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||
parenthesis. The name is not processed in any way, and it is not possible to
|
||||
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name.
|
||||
<pre>
|
||||
PCRE2_AUTO_CALLOUT
|
||||
</pre>
|
||||
|
@ -1778,12 +1787,12 @@ documentation.
|
|||
<a name="matchdatablock"></a></P>
|
||||
<br><a name="SEC25" href="#TOC1">THE MATCH DATA BLOCK</a><br>
|
||||
<P>
|
||||
<b>pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create(uint32_t <i>ovecsize</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_data_create_from_pattern(const pcre2_code *<i>code</i>,</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<b>pcre2_match_data *pcre2_match_data_create_from_pattern(</b>
|
||||
<b> const pcre2_code *<i>code</i>, pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
|
||||
|
@ -2010,12 +2019,20 @@ If the pattern is anchored, such a match can occur only if the pattern contains
|
|||
</pre>
|
||||
When PCRE2_UTF is set at compile time, the validity of the subject as a UTF
|
||||
string is checked by default when <b>pcre2_match()</b> is subsequently called.
|
||||
The entire string is checked before any other processing takes place, and a
|
||||
If a non-zero starting offset is given, the check is applied only to that part
|
||||
of the subject that could be inspected during matching, and there is a check
|
||||
that the starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pattern, the
|
||||
check starts at the starting offset. Otherwise, it starts at the length of the
|
||||
longest lookbehind before the starting offset, or at the start of the subject
|
||||
if there are not that many characters before the starting offset. Note that the
|
||||
sequences \b and \B are one-character lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
The check is carried out before any other processing takes place, and a
|
||||
negative error code is returned if the check fails. There are several UTF error
|
||||
codes for each code unit width, corresponding to different problems with the
|
||||
code unit sequence. The value of <i>startoffset</i> is also checked, to ensure
|
||||
that it points to the start of a character or to the end of the subject. There
|
||||
are discussions about the validity of
|
||||
code unit sequence. There are discussions about the validity of
|
||||
<a href="pcre2unicode.html#utf8strings">UTF-8 strings,</a>
|
||||
<a href="pcre2unicode.html#utf16strings">UTF-16 strings,</a>
|
||||
and
|
||||
|
@ -2564,12 +2581,12 @@ be given as PCRE2_ZERO_TERMINATED for a zero-terminated string.
|
|||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK option is set, a
|
||||
dollar character is an escape character that can specify the insertion of
|
||||
characters from capturing groups in the pattern. The following forms are
|
||||
recognized:
|
||||
characters from capturing groups or (*MARK) items in the pattern. The following
|
||||
forms are recognized:
|
||||
<pre>
|
||||
$$ insert a dollar character
|
||||
$<n> insert the contents of group <n>
|
||||
${<n>} insert the contents of group <n>
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
</pre>
|
||||
Either a group number or a group name can be given for <n>. Curly brackets are
|
||||
required only if the following character would be interpreted as part of the
|
||||
|
@ -2580,6 +2597,15 @@ calling <b>pcre2_copy_byname()</b> or <b>pcre2_copy_bynumber()</b> as
|
|||
appropriate.
|
||||
</P>
|
||||
<P>
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||
<pre>
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
</pre>
|
||||
</P>
|
||||
<P>
|
||||
The first seven arguments of <b>pcre2_substitute()</b> are the same as for
|
||||
<b>pcre2_match()</b>, except that the partial matching options are not
|
||||
permitted, and <i>match_data</i> may be passed as NULL, in which case a match
|
||||
|
@ -2883,7 +2909,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 22 April 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -224,8 +224,14 @@ whether a match operation was executed by JIT or by the interpreter.
|
|||
</P>
|
||||
<P>
|
||||
You may safely use the same JIT stack for more than one pattern (either by
|
||||
assigning directly or by callback), as long as the patterns are all matched
|
||||
sequentially in the same thread. In a multithread application, if you do not
|
||||
assigning directly or by callback), as long as the patterns are matched
|
||||
sequentially in the same thread. Currently, the only way to set up
|
||||
non-sequential matches in one thread is to use callouts: if a callout function
|
||||
starts another match, that match must use a different JIT stack to the one used
|
||||
for currently suspended match(es).
|
||||
</P>
|
||||
<P>
|
||||
In a multithread application, if you do not
|
||||
specify a JIT stack, or if you assign or pass back NULL from a callback, that
|
||||
is thread-safe, because each thread has its own machine stack. However, if you
|
||||
assign or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
|
@ -419,9 +425,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 27 November 2014
|
||||
Last updated: 28 July 2015
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -736,6 +736,8 @@ Those that are not part of an identified script are lumped together as
|
|||
"Common". The current list of scripts is:
|
||||
</P>
|
||||
<P>
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
|
@ -776,6 +778,7 @@ Gurmukhi,
|
|||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
|
@ -812,12 +815,14 @@ Miao,
|
|||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
|
@ -839,6 +844,7 @@ Saurashtra,
|
|||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sora_Sompeng,
|
||||
Sundanese,
|
||||
|
@ -1322,9 +1328,19 @@ where a range ending character is expected. For example, [z-\xff] is valid,
|
|||
but [A-\d] and [A-[:digit:]] are not.
|
||||
</P>
|
||||
<P>
|
||||
Ranges operate in the collating sequence of character values. They can also be
|
||||
used for characters specified numerically, for example [\000-\037]. Ranges
|
||||
can include any characters that are valid for the current mode.
|
||||
Ranges normally include all code points between the start and end characters,
|
||||
inclusive. They can also be used for code points specified numerically, for
|
||||
example [\000-\037]. Ranges can include any characters that are valid for the
|
||||
current mode.
|
||||
</P>
|
||||
<P>
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
||||
are included.
|
||||
</P>
|
||||
<P>
|
||||
If a range that includes letters is used when caseless matching is set, it
|
||||
|
@ -2899,14 +2915,23 @@ remarks apply to the PCRE2 features described in this section.
|
|||
</P>
|
||||
<P>
|
||||
The new verbs make use of what was previously invalid syntax: an opening
|
||||
parenthesis followed by an asterisk. They are generally of the form
|
||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
||||
differently depending on whether or not a name is present. A name is any
|
||||
sequence of characters that does not include a closing parenthesis. The maximum
|
||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
||||
libraries. If the name is empty, that is, if the closing parenthesis
|
||||
immediately follows the colon, the effect is as if the colon were not there.
|
||||
Any number of these verbs may occur in a pattern.
|
||||
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||
depending on whether or not a name is present.
|
||||
</P>
|
||||
<P>
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
</P>
|
||||
<P>
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
|
@ -3323,7 +3348,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 June 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -187,6 +187,8 @@ at release 5.18.
|
|||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">SCRIPT NAMES FOR \p AND \P</a><br>
|
||||
<P>
|
||||
Ahom,
|
||||
Anatolian_Hieroglyphs,
|
||||
Arabic,
|
||||
Armenian,
|
||||
Avestan,
|
||||
|
@ -227,6 +229,7 @@ Gurmukhi,
|
|||
Han,
|
||||
Hangul,
|
||||
Hanunoo,
|
||||
Hatran,
|
||||
Hebrew,
|
||||
Hiragana,
|
||||
Imperial_Aramaic,
|
||||
|
@ -263,12 +266,14 @@ Miao,
|
|||
Modi,
|
||||
Mongolian,
|
||||
Mro,
|
||||
Multani,
|
||||
Myanmar,
|
||||
Nabataean,
|
||||
New_Tai_Lue,
|
||||
Nko,
|
||||
Ogham,
|
||||
Ol_Chiki,
|
||||
Old_Hungarian,
|
||||
Old_Italic,
|
||||
Old_North_Arabian,
|
||||
Old_Permic,
|
||||
|
@ -290,6 +295,7 @@ Saurashtra,
|
|||
Sharada,
|
||||
Shavian,
|
||||
Siddham,
|
||||
SignWriting,
|
||||
Sinhala,
|
||||
Sora_Sompeng,
|
||||
Sundanese,
|
||||
|
@ -582,7 +588,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 13 June 2015
|
||||
Last updated: 17 July 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -340,12 +340,13 @@ subject lines. Modifiers on a subject line can change these settings.
|
|||
<br><a name="SEC7" href="#TOC1">MODIFIER SYNTAX</a><br>
|
||||
<P>
|
||||
Modifier lists are used with both pattern and subject lines. Items in a list
|
||||
are separated by commas and optional white space. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid for one or the
|
||||
other only. Each modifier has a long name, for example "anchored", and some of
|
||||
them must be followed by an equals sign and a value, for example, "offset=12".
|
||||
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
||||
previous setting.
|
||||
are separated by commas followed by optional white space. Trailing white space
|
||||
in a modifier list is ignored. Some modifiers may be given for both patterns
|
||||
and subject lines, whereas others are valid only for one or the other. Each
|
||||
modifier has a long name, for example "anchored", and some of them must be
|
||||
followed by an equals sign and a value, for example, "offset=12". Values cannot
|
||||
contain comma characters, but may contain spaces. Modifiers that do not take
|
||||
values may be preceded by a minus sign to turn off a previous setting.
|
||||
</P>
|
||||
<P>
|
||||
A few of the more common modifiers can also be specified as single letters, for
|
||||
|
@ -479,6 +480,7 @@ for a description of their effects.
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
@ -1469,7 +1471,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 May 2015
|
||||
Last updated: 30 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -126,11 +126,22 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
|||
strings to be in host byte order.
|
||||
</P>
|
||||
<P>
|
||||
The entire string is checked before any other processing takes place. In
|
||||
addition to checking the format of the string, there is a check to ensure that
|
||||
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
|
||||
The so-called "non-character" code points are not excluded because Unicode
|
||||
corrigendum #9 makes it clear that they should not be.
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||
offset, the check is applied only to that part of the subject that could be
|
||||
inspected during matching, and there is a check that the starting offset points
|
||||
to the first code unit of a character or to the end of the subject. If there
|
||||
are no lookbehind assertions in the pattern, the check starts at the starting
|
||||
offset. Otherwise, it starts at the length of the longest lookbehind before the
|
||||
starting offset, or at the start of the subject if there are not that many
|
||||
characters before the starting offset. Note that the sequences \b and \B are
|
||||
one-character lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
In addition to checking the format of the string, there is a check to ensure
|
||||
that all code points lie in the range U+0 to U+10FFFF, excluding the surrogate
|
||||
area. The so-called "non-character" code points are not excluded because
|
||||
Unicode corrigendum #9 makes it clear that they should not be.
|
||||
</P>
|
||||
<P>
|
||||
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
|
||||
|
@ -264,9 +275,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 23 November 2014
|
||||
Last updated: 18 August 2015
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
104
doc/pcre2.txt
104
doc/pcre2.txt
|
@ -190,13 +190,13 @@ PCRE2 NATIVE API BASIC FUNCTIONS
|
|||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||
pcre2_compile_context *ccontext);
|
||||
|
||||
pcre2_code_free(pcre2_code *code);
|
||||
void pcre2_code_free(pcre2_code *code);
|
||||
|
||||
pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
||||
pcre2_general_context *gcontext);
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||
|
||||
int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
|
||||
PCRE2_SIZE length, PCRE2_SIZE startoffset,
|
||||
|
@ -989,7 +989,7 @@ CHECKING BUILD-TIME OPTIONS
|
|||
pcre2_config() with where set to NULL.) If PCRE2 has been compiled
|
||||
without Unicode support, the buffer is filled with the text "Unicode
|
||||
not supported". Otherwise, the Unicode version string (for example,
|
||||
"7.0.0") is inserted. The number of code units used is returned. This
|
||||
"8.0.0") is inserted. The number of code units used is returned. This
|
||||
is the length of the string plus one unit for the terminating zero.
|
||||
|
||||
PCRE2_CONFIG_UNICODE
|
||||
|
@ -1014,7 +1014,7 @@ COMPILING A PATTERN
|
|||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
|
||||
pcre2_compile_context *ccontext);
|
||||
|
||||
pcre2_code_free(pcre2_code *code);
|
||||
void pcre2_code_free(pcre2_code *code);
|
||||
|
||||
The pcre2_compile() function compiles a pattern into an internal form.
|
||||
The pattern is defined by a pointer to a string of code units and a
|
||||
|
@ -1128,6 +1128,16 @@ COMPILING A PATTERN
|
|||
Perl. If you want a multiline circumflex also to match after a termi-
|
||||
nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
|
||||
|
||||
PCRE2_ALT_VERBNAMES
|
||||
|
||||
By default, for compatibility with Perl, the name in any verb sequence
|
||||
such as (*MARK:NAME) is any sequence of characters that does not
|
||||
include a closing parenthesis. The name is not processed in any way,
|
||||
and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
|
||||
processing is applied to verb names and only an unescaped closing
|
||||
parenthesis terminates the name.
|
||||
|
||||
PCRE2_AUTO_CALLOUT
|
||||
|
||||
If this bit is set, pcre2_compile() automatically inserts callout
|
||||
|
@ -1809,11 +1819,11 @@ SERIALIZATION AND PRECOMPILING
|
|||
|
||||
THE MATCH DATA BLOCK
|
||||
|
||||
pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize,
|
||||
pcre2_general_context *gcontext);
|
||||
|
||||
pcre2_match_data_create_from_pattern(const pcre2_code *code,
|
||||
pcre2_general_context *gcontext);
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(
|
||||
const pcre2_code *code, pcre2_general_context *gcontext);
|
||||
|
||||
void pcre2_match_data_free(pcre2_match_data *match_data);
|
||||
|
||||
|
@ -2022,12 +2032,20 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
|||
|
||||
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
||||
UTF string is checked by default when pcre2_match() is subsequently
|
||||
called. The entire string is checked before any other processing takes
|
||||
place, and a negative error code is returned if the check fails. There
|
||||
are several UTF error codes for each code unit width, corresponding to
|
||||
different problems with the code unit sequence. The value of startoff-
|
||||
set is also checked, to ensure that it points to the start of a charac-
|
||||
ter or to the end of the subject. There are discussions about the
|
||||
called. If a non-zero starting offset is given, the check is applied
|
||||
only to that part of the subject that could be inspected during match-
|
||||
ing, and there is a check that the starting offset points to the first
|
||||
code unit of a character or to the end of the subject. If there are no
|
||||
lookbehind assertions in the pattern, the check starts at the starting
|
||||
offset. Otherwise, it starts at the length of the longest lookbehind
|
||||
before the starting offset, or at the start of the subject if there are
|
||||
not that many characters before the starting offset. Note that the
|
||||
sequences \b and \B are one-character lookbehinds.
|
||||
|
||||
The check is carried out before any other processing takes place, and a
|
||||
negative error code is returned if the check fails. There are several
|
||||
UTF error codes for each code unit width, corresponding to different
|
||||
problems with the code unit sequence. There are discussions about the
|
||||
validity of UTF-8 strings, UTF-16 strings, and UTF-32 strings in the
|
||||
pcre2unicode page.
|
||||
|
||||
|
@ -2525,12 +2543,12 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
In the replacement string, which is interpreted as a UTF string in UTF
|
||||
mode, and is checked for UTF validity unless the PCRE2_NO_UTF_CHECK
|
||||
option is set, a dollar character is an escape character that can spec-
|
||||
ify the insertion of characters from capturing groups in the pattern.
|
||||
The following forms are recognized:
|
||||
ify the insertion of characters from capturing groups or (*MARK) items
|
||||
in the pattern. The following forms are recognized:
|
||||
|
||||
$$ insert a dollar character
|
||||
$<n> insert the contents of group <n>
|
||||
${<n>} insert the contents of group <n>
|
||||
$<n> or ${<n>} insert the contents of group <n>
|
||||
$*MARK or ${*MARK} insert the name of the last (*MARK) encountered
|
||||
|
||||
Either a group number or a group name can be given for <n>. Curly
|
||||
brackets are required only if the following character would be inter-
|
||||
|
@ -2540,6 +2558,13 @@ CREATING A NEW STRING WITH SUBSTITUTIONS
|
|||
is "=+babcb+=". Group insertion is done by calling pcre2_copy_byname()
|
||||
or pcre2_copy_bynumber() as appropriate.
|
||||
|
||||
The facility for inserting a (*MARK) name can be used to perform simple
|
||||
simultaneous substitutions, as this pcre2test example shows:
|
||||
|
||||
/(*:pear)apple|(*:orange)lemon/g,replace=${*MARK}
|
||||
apple lemon
|
||||
2: pear orange
|
||||
|
||||
The first seven arguments of pcre2_substitute() are the same as for
|
||||
pcre2_match(), except that the partial matching options are not permit-
|
||||
ted, and match_data may be passed as NULL, in which case a match data
|
||||
|
@ -2826,7 +2851,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 22 April 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -4051,13 +4076,17 @@ CONTROLLING THE JIT STACK
|
|||
interpreter.
|
||||
|
||||
You may safely use the same JIT stack for more than one pattern (either
|
||||
by assigning directly or by callback), as long as the patterns are all
|
||||
matched sequentially in the same thread. In a multithread application,
|
||||
if you do not specify a JIT stack, or if you assign or pass back NULL
|
||||
from a callback, that is thread-safe, because each thread has its own
|
||||
machine stack. However, if you assign or pass back a non-NULL JIT
|
||||
stack, this must be a different stack for each thread so that the
|
||||
application is thread-safe.
|
||||
by assigning directly or by callback), as long as the patterns are
|
||||
matched sequentially in the same thread. Currently, the only way to set
|
||||
up non-sequential matches in one thread is to use callouts: if a call-
|
||||
out function starts another match, that match must use a different JIT
|
||||
stack to the one used for currently suspended match(es).
|
||||
|
||||
In a multithread application, if you do not specify a JIT stack, or if
|
||||
you assign or pass back NULL from a callback, that is thread-safe,
|
||||
because each thread has its own machine stack. However, if you assign
|
||||
or pass back a non-NULL JIT stack, this must be a different stack for
|
||||
each thread so that the application is thread-safe.
|
||||
|
||||
Strictly speaking, even more is allowed. You can assign the same non-
|
||||
NULL stack to a match context that is used by any number of patterns,
|
||||
|
@ -4234,8 +4263,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 27 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 28 July 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -5069,7 +5098,18 @@ VALIDITY OF UTF STRINGS
|
|||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||
this, expecting strings to be in host byte order.
|
||||
|
||||
The entire string is checked before any other processing takes place.
|
||||
A UTF string is checked before any other processing takes place. In the
|
||||
case of pcre2_match() and pcre2_dfa_match() calls with a non-zero
|
||||
starting offset, the check is applied only to that part of the subject
|
||||
that could be inspected during matching, and there is a check that the
|
||||
starting offset points to the first code unit of a character or to the
|
||||
end of the subject. If there are no lookbehind assertions in the pat-
|
||||
tern, the check starts at the starting offset. Otherwise, it starts at
|
||||
the length of the longest lookbehind before the starting offset, or at
|
||||
the start of the subject if there are not that many characters before
|
||||
the starting offset. Note that the sequences \b and \B are one-charac-
|
||||
ter lookbehinds.
|
||||
|
||||
In addition to checking the format of the string, there is a check to
|
||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||
the surrogate area. The so-called "non-character" code points are not
|
||||
|
@ -5192,8 +5232,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 23 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 18 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "29 August 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1052,6 +1052,15 @@ after any internal newline. However, it does not match after a newline at the
|
|||
end of the subject, for compatibility with Perl. If you want a multiline
|
||||
circumflex also to match after a terminating newline, you must set
|
||||
PCRE2_ALT_CIRCUMFLEX.
|
||||
.sp
|
||||
PCRE2_ALT_VERBNAMES
|
||||
.sp
|
||||
By default, for compatibility with Perl, the name in any verb sequence such as
|
||||
(*MARK:NAME) is any sequence of characters that does not include a closing
|
||||
parenthesis. The name is not processed in any way, and it is not possible to
|
||||
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name.
|
||||
.sp
|
||||
PCRE2_AUTO_CALLOUT
|
||||
.sp
|
||||
|
@ -2953,6 +2962,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 29 August 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
|
||||
.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -1334,7 +1334,7 @@ both specified as literal letters in the same case. For compatibility with
|
|||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\x92], all code points
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||
are included.
|
||||
.P
|
||||
If a range that includes letters is used when caseless matching is set, it
|
||||
|
@ -2944,14 +2944,21 @@ in production code should be noted to avoid problems during upgrades." The same
|
|||
remarks apply to the PCRE2 features described in this section.
|
||||
.P
|
||||
The new verbs make use of what was previously invalid syntax: an opening
|
||||
parenthesis followed by an asterisk. They are generally of the form
|
||||
(*VERB) or (*VERB:NAME). Some may take either form, possibly behaving
|
||||
differently depending on whether or not a name is present. A name is any
|
||||
sequence of characters that does not include a closing parenthesis. The maximum
|
||||
length of name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit
|
||||
libraries. If the name is empty, that is, if the closing parenthesis
|
||||
immediately follows the colon, the effect is as if the colon were not there.
|
||||
Any number of these verbs may occur in a pattern.
|
||||
parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||
(*VERB:NAME). Some verbs take either form, possibly behaving differently
|
||||
depending on whether or not a name is present.
|
||||
.P
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name.
|
||||
.P
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
16-bit and 32-bit libraries. If the name is empty, that is, if the closing
|
||||
parenthesis immediately follows the colon, the effect is as if the colon were
|
||||
not there. Any number of these verbs may occur in a pattern.
|
||||
.P
|
||||
Since these verbs are specifically related to backtracking, most of them can be
|
||||
used only when the pattern is to be matched using the traditional matching
|
||||
|
@ -3376,6 +3383,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 24 July 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -445,6 +445,7 @@ for a description of their effects.
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
|
|
@ -285,12 +285,14 @@ COMMAND LINES
|
|||
MODIFIER SYNTAX
|
||||
|
||||
Modifier lists are used with both pattern and subject lines. Items in a
|
||||
list are separated by commas and optional white space. Some modifiers
|
||||
may be given for both patterns and subject lines, whereas others are
|
||||
valid for one or the other only. Each modifier has a long name, for
|
||||
example "anchored", and some of them must be followed by an equals sign
|
||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||
ues may be preceded by a minus sign to turn off a previous setting.
|
||||
list are separated by commas followed by optional white space. Trailing
|
||||
white space in a modifier list is ignored. Some modifiers may be given
|
||||
for both patterns and subject lines, whereas others are valid only for
|
||||
one or the other. Each modifier has a long name, for example
|
||||
"anchored", and some of them must be followed by an equals sign and a
|
||||
value, for example, "offset=12". Values cannot contain comma charac-
|
||||
ters, but may contain spaces. Modifiers that do not take values may be
|
||||
preceded by a minus sign to turn off a previous setting.
|
||||
|
||||
A few of the more common modifiers can also be specified as single let-
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
|
@ -424,6 +426,7 @@ PATTERN MODIFIERS
|
|||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
@ -1330,5 +1333,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 20 May 2015
|
||||
Last updated: 30 August 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -120,6 +120,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
||||
#define PCRE2_ALT_VERBNAMES 0x00400000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
|
|
|
@ -561,12 +561,12 @@ static PCRE2_SPTR posix_substitutes[] = {
|
|||
|
||||
#define PUBLIC_COMPILE_OPTIONS \
|
||||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
|
||||
PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \
|
||||
PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \
|
||||
PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
PCRE2_ALT_VERBNAMES|PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY| \
|
||||
PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE| \
|
||||
PCRE2_MATCH_UNSET_BACKREF|PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C| \
|
||||
PCRE2_NEVER_UCP|PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE| \
|
||||
PCRE2_NO_AUTO_POSSESS|PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE| \
|
||||
PCRE2_NO_UTF_CHECK|PCRE2_UCP|PCRE2_UNGREEDY|PCRE2_UTF)
|
||||
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the tables called eint1 and
|
||||
|
@ -5382,13 +5382,52 @@ for (;; ptr++)
|
|||
|
||||
/* It appears that Perl allows any characters whatsoever, other than
|
||||
a closing parenthesis, to appear in arguments, so we no longer insist on
|
||||
letters, digits, and underscores. */
|
||||
letters, digits, and underscores. Perl does not, however, do any
|
||||
interpretation within arguments, and has no means of including a closing
|
||||
parenthesis. PCRE supports escape processing but only when it is
|
||||
requested by an option. Note that check_escape() will not return values
|
||||
greater than the code unit maximum when not in UTF mode. */
|
||||
|
||||
if (*ptr == CHAR_COLON)
|
||||
{
|
||||
arg = ++ptr;
|
||||
|
||||
if ((options & PCRE2_ALT_VERBNAMES) == 0)
|
||||
{
|
||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
|
||||
arglen = (int)(ptr - arg);
|
||||
}
|
||||
else
|
||||
{
|
||||
arglen = 0;
|
||||
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
|
||||
{
|
||||
if (*ptr == '\\')
|
||||
{
|
||||
uint32_t x;
|
||||
*errorcodeptr = 0;
|
||||
i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
|
||||
if (*errorcodeptr != 0) goto FAILED;
|
||||
if (i != 0)
|
||||
{
|
||||
*errorcodeptr = ERR40;
|
||||
goto FAILED;
|
||||
}
|
||||
#ifdef SUPPORT_UNICODE
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 8
|
||||
for (i = 0; i < PRIV(utf8_table1_size); i++)
|
||||
if ((int)x <= PRIV(utf8_table1)[i]) break;
|
||||
arglen += i;
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if (x > 0xffff) arglen++;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
arglen++;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
if ((unsigned int)arglen > MAX_MARK)
|
||||
{
|
||||
*errorcodeptr = ERR76;
|
||||
|
@ -5456,8 +5495,42 @@ for (;; ptr++)
|
|||
}
|
||||
setverb = *code++ = verbs[i].op_arg;
|
||||
*code++ = arglen;
|
||||
|
||||
/* If we are processing the argument for escapes, we don't need
|
||||
to apply checks here because it was all checked above when
|
||||
computing the length. */
|
||||
|
||||
if ((options & PCRE2_ALT_VERBNAMES) != 0)
|
||||
{
|
||||
for (; arg != ptr; arg++)
|
||||
{
|
||||
if (*arg == '\\')
|
||||
{
|
||||
uint32_t x;
|
||||
*errorcodeptr = 0;
|
||||
(void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
|
||||
cb);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
PCRE2_UCHAR cbuff[8];
|
||||
x = PRIV(ord2utf)(x, cbuff);
|
||||
memcpy(code, cbuff, CU2BYTES(x));
|
||||
code += x;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
*code++ = x;
|
||||
}
|
||||
else *code++ = *arg;
|
||||
}
|
||||
}
|
||||
else /* No argument processing */
|
||||
{
|
||||
memcpy(code, arg, CU2BYTES(arglen));
|
||||
code += arglen;
|
||||
}
|
||||
|
||||
*code++ = 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -111,7 +111,7 @@ static const char compile_error_texts[] =
|
|||
"number after (?C is greater than 255\0"
|
||||
"closing parenthesis for (?C expected\0"
|
||||
/* 40 */
|
||||
"SPARE ERROR\0"
|
||||
"invalid escape sequence in (*VERB) name\0"
|
||||
"unrecognized character after (?P\0"
|
||||
"syntax error in subpattern name (missing terminator)\0"
|
||||
"two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
|
||||
|
|
|
@ -496,6 +496,7 @@ static modstruct modlist[] = {
|
|||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
|
||||
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
|
||||
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
|
||||
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||
|
@ -3467,10 +3468,11 @@ static void
|
|||
show_compile_options(uint32_t options, const char *before, const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
|
||||
((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "",
|
||||
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
||||
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
|
||||
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
|
||||
|
|
|
@ -4442,4 +4442,11 @@ a random value. /Ix
|
|||
/((*MARK:A))++a(*SKIP:B)b/
|
||||
aacb
|
||||
|
||||
/(*MARK:a\zb)z/alt_verbnames
|
||||
|
||||
/(*:ab\t(d\)c)xxx/
|
||||
|
||||
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||
cxxxz
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -1662,4 +1662,9 @@
|
|||
/[\pS#moq]/
|
||||
=
|
||||
|
||||
# UTF tests
|
||||
|
||||
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||
cxxxz
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -251,4 +251,6 @@
|
|||
|
||||
/[^\s]*\s* [^\W]+\W+ [^\d]*?\d0 [^\d\w]{4,6}?\w*A/B
|
||||
|
||||
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||
|
||||
# End of testinput9
|
||||
|
|
|
@ -14713,4 +14713,15 @@ No match
|
|||
aacb
|
||||
No match
|
||||
|
||||
/(*MARK:a\zb)z/alt_verbnames
|
||||
Failed: error 140 at offset 9: invalid escape sequence in (*VERB) name
|
||||
|
||||
/(*:ab\t(d\)c)xxx/
|
||||
Failed: error 122 at offset 12: unmatched closing parenthesis
|
||||
|
||||
/(*:ab\t(d\)c)xxx/alt_verbnames,mark
|
||||
cxxxz
|
||||
0: xxx
|
||||
MK: ab\x09(d)c
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -4064,4 +4064,11 @@ No match
|
|||
=
|
||||
0: =
|
||||
|
||||
# UTF tests
|
||||
|
||||
/(*:a\x{12345}b\t(d\)c)xxx/utf,alt_verbnames,mark
|
||||
cxxxz
|
||||
0: xxx
|
||||
MK: a\x{12345}b\x{09}(d)c
|
||||
|
||||
# End of testinput5
|
||||
|
|
|
@ -356,4 +356,7 @@ Failed: error 177 at offset 6: character code point value in \u.... sequence is
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
/(*MARK:a\x{100}b)z/alt_verbnames
|
||||
Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large
|
||||
|
||||
# End of testinput9
|
||||
|
|
Loading…
Reference in New Issue