Add additional compile options and PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES.
This commit is contained in:
parent
d9c33d0708
commit
dfc9712bcd
7
132html
7
132html
|
@ -109,8 +109,9 @@ while (<STDIN>)
|
|||
# Handling .sp is subtle. If it is inside a literal section, do nothing if
|
||||
# the next line is a non literal text line; similarly, if not inside a
|
||||
# literal section, do nothing if a literal follows, unless we are inside
|
||||
# a .nf/.ne section. The point being that the <pre> and </pre> that delimit
|
||||
# literal sections will do the spacing. Always skip if no previous output.
|
||||
# a .nf/.fi section or about to enter one. The point being that the <pre>
|
||||
# and </pre> that delimit literal sections will do the spacing. Always skip
|
||||
# if no previous output.
|
||||
|
||||
elsif (/^\.sp/)
|
||||
{
|
||||
|
@ -123,7 +124,7 @@ while (<STDIN>)
|
|||
}
|
||||
else
|
||||
{
|
||||
print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
|
||||
print TEMP "<br>\n<br>\n" if ($innf || /^\.nf/ || !/^[\s.]/);
|
||||
}
|
||||
redo; # Now process the lookahead line we just read
|
||||
}
|
||||
|
|
|
@ -166,6 +166,9 @@ pcre2test, a crash could occur.
|
|||
32. Make -bigstack in RunTest allocate a 64 MB stack (instead of 16 MB) so that
|
||||
all the tests can run with clang's sanitizing options.
|
||||
|
||||
33. Implement extra compile options in the compile context and add the first
|
||||
one: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES.
|
||||
|
||||
|
||||
|
||||
Version 10.23 14-February-2017
|
||||
|
|
|
@ -67,6 +67,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2_set_bsr.html \
|
||||
doc/html/pcre2_set_callout.html \
|
||||
doc/html/pcre2_set_character_tables.html \
|
||||
doc/html/pcre2_set_compile_extra_options.html \
|
||||
doc/html/pcre2_set_compile_recursion_guard.html \
|
||||
doc/html/pcre2_set_depth_limit.html \
|
||||
doc/html/pcre2_set_heap_limit.html \
|
||||
|
@ -151,6 +152,7 @@ dist_man_MANS = \
|
|||
doc/pcre2_set_bsr.3 \
|
||||
doc/pcre2_set_callout.3 \
|
||||
doc/pcre2_set_character_tables.3 \
|
||||
doc/pcre2_set_compile_extra_options.3 \
|
||||
doc/pcre2_set_compile_recursion_guard.3 \
|
||||
doc/pcre2_set_depth_limit.3 \
|
||||
doc/pcre2_set_heap_limit.3 \
|
||||
|
|
2
RunTest
2
RunTest
|
@ -500,7 +500,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
|
||||
if [ $? = 0 ] ; then
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,188,189,190,191 >>testtry
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,192 >>testtry
|
||||
checkresult $? 2 "$opt"
|
||||
fi
|
||||
done
|
||||
|
|
|
@ -207,6 +207,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_set_character_tables.html">pcre2_set_character_tables</a></td>
|
||||
<td> Set character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_extra_options.html">pcre2_set_compile_extra_options</a></td>
|
||||
<td> Set compile time extra options</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_recursion_guard.html">pcre2_set_compile_recursion_guard</a></td>
|
||||
<td> Set up a compile recursion guard function</td></tr>
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@ system stack size checking, or to change one or more of these parameters:
|
|||
The newline character sequence;
|
||||
The compile time nested parentheses limit;
|
||||
The maximum pattern length (in code units) that is allowed.
|
||||
The additional options bits
|
||||
</pre>
|
||||
The option bits are:
|
||||
<pre>
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_set_compile_extra_options specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_set_compile_extra_options man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets additional option bits for <b>pcre2_compile()</b> that are
|
||||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
</pre>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -60,8 +60,8 @@ please consult the man page, in case the conversion went wrong.
|
|||
<b>#include <pcre2.h></b>
|
||||
<br>
|
||||
<br>
|
||||
PCRE2 is a new API for PCRE. This document contains a description of all its
|
||||
functions. See the
|
||||
PCRE2 is a new API for PCRE, starting at release 10.0. This document contains a
|
||||
description of all its native functions. See the
|
||||
<a href="pcre2.html"><b>pcre2</b></a>
|
||||
document for an overview of all the PCRE2 documentation.
|
||||
</P>
|
||||
|
@ -145,6 +145,10 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_max_pattern_length(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -328,7 +332,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
These functions became obsolete at release 10.30 and are retained only for
|
||||
backward compatibility. They should not be used in new code. The first is
|
||||
replaced by <b>pcre2_set_depth_limit()</b>; the second is no longer needed and
|
||||
no longer has any effect (it always returns zero).
|
||||
has no effect (it always returns zero).
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
|
||||
<P>
|
||||
|
@ -389,23 +393,23 @@ For example, if you want to run a match using a pattern that was compiled with
|
|||
<P>
|
||||
In the function summaries above, and in the rest of this document and other
|
||||
PCRE2 documents, functions and data types are described using their generic
|
||||
names, without the 8, 16, or 32 suffix.
|
||||
names, without the _8, _16, or _32 suffix.
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">PCRE2 API OVERVIEW</a><br>
|
||||
<P>
|
||||
PCRE2 has its own native API, which is described in this document. There are
|
||||
also some wrapper functions for the 8-bit library that correspond to the
|
||||
POSIX regular expression API, but they do not give access to all the
|
||||
functionality. They are described in the
|
||||
functionality of PCRE2. They are described in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
documentation. Both these APIs define a set of C function calls.
|
||||
</P>
|
||||
<P>
|
||||
The native API C data types, function prototypes, option values, and error
|
||||
codes are defined in the header file <b>pcre2.h</b>, which contains definitions
|
||||
of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the
|
||||
library. Applications can use these to include support for different releases
|
||||
of PCRE2.
|
||||
codes are defined in the header file <b>pcre2.h</b>, which also contains
|
||||
definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers
|
||||
for the library. Applications can use these to include support for different
|
||||
releases of PCRE2.
|
||||
</P>
|
||||
<P>
|
||||
In a Windows environment, if you want to statically link an application program
|
||||
|
@ -478,7 +482,7 @@ been matched by <b>pcre2_match()</b>. They are:
|
|||
<b>pcre2_substring_number_from_name()</b>
|
||||
</pre>
|
||||
<b>pcre2_substring_free()</b> and <b>pcre2_substring_list_free()</b> are also
|
||||
provided, to free the memory used for extracted strings.
|
||||
provided, to free memory used for extracted strings.
|
||||
</P>
|
||||
<P>
|
||||
The function <b>pcre2_substitute()</b> can be called to match a pattern and
|
||||
|
@ -595,7 +599,7 @@ required. JIT compilation updates a pointer within the compiled code block, so
|
|||
a thread must gain unique write access to the pointer before calling
|
||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> or
|
||||
<b>pcre2_code_copy_with_tables()</b> can be used to obtain a private copy of the
|
||||
compiled code.
|
||||
compiled code before calling the JIT compiler.
|
||||
</P>
|
||||
<br><b>
|
||||
Context blocks
|
||||
|
@ -649,6 +653,8 @@ library. The context is named `general' rather than specifically `memory'
|
|||
because in future other fields may be added. If you do not want to supply your
|
||||
own custom memory management functions, you do not need to bother with a
|
||||
general context. A general context is created by:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_general_context *pcre2_general_context_create(</b>
|
||||
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
||||
<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
|
||||
|
@ -675,11 +681,15 @@ used. When the time comes to free the block, this function is called.
|
|||
</P>
|
||||
<P>
|
||||
A general context can be copied by calling:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_general_context *pcre2_general_context_copy(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
The memory used for a general context should be freed by calling:
|
||||
<br>
|
||||
<br>
|
||||
<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<a name="compilecontext"></a></P>
|
||||
<br><b>
|
||||
|
@ -695,6 +705,7 @@ following compile-time parameters:
|
|||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The extra options bits (none set by default)
|
||||
</pre>
|
||||
A compile context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -702,6 +713,8 @@ If none of these apply, just pass NULL as the context argument of
|
|||
</P>
|
||||
<P>
|
||||
A compile context is created, copied, and freed by the following functions:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_compile_context *pcre2_compile_context_create(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
|
@ -716,6 +729,8 @@ A compile context is created, copied, and freed by the following functions:
|
|||
A compile context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -725,6 +740,8 @@ or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
|||
ending sequence. The value is used by the JIT compiler and by the two
|
||||
interpreted matching functions, <i>pcre2_match()</i> and
|
||||
<i>pcre2_dfa_match()</i>.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
|
@ -732,6 +749,22 @@ interpreted matching functions, <i>pcre2_match()</i> and
|
|||
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
|
||||
argument is a general context. This function builds a set of character tables
|
||||
in the current locale.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
As PCRE2 has developed, almost all the 32 option bits that are available in
|
||||
the <i>options</i> argument of <b>pcre2_compile()</b> have been used up. To avoid
|
||||
running out, the compile context contains a set of extra option bits which are
|
||||
used for some newer, assumed rarer, options. This function sets those bits. It
|
||||
always sets all the bits (either on or off). It replaces, rather than modifies, any existing
|
||||
setting. The available options are defined in the section entitled "Extra
|
||||
compile options"
|
||||
<a href="#extracompileoptions">below.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_max_pattern_length(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -741,6 +774,8 @@ compiled with this context. If the pattern is longer, an error is generated.
|
|||
This facility is provided so that applications that accept patterns from
|
||||
external sources can limit their size. The default is the largest number that a
|
||||
PCRE2_SIZE variable can hold, which is effectively unlimited.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -758,11 +793,13 @@ sequence such as (*CRLF). See the
|
|||
page for details.
|
||||
</P>
|
||||
<P>
|
||||
When a pattern is compiled with the PCRE2_EXTENDED option, the newline
|
||||
convention affects the recognition of white space and the end of internal
|
||||
comments starting with #. The value is saved with the compiled pattern for
|
||||
subsequent use by the JIT compiler and by the two interpreted matching
|
||||
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||
When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE
|
||||
option, the newline convention affects the recognition of white space and the
|
||||
end of internal comments starting with #. The value is saved with the compiled
|
||||
pattern for subsequent use by the JIT compiler and by the two interpreted
|
||||
matching functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -771,6 +808,8 @@ This parameter ajusts the limit, set when PCRE2 is built (default 250), on the
|
|||
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
||||
using up too much system stack when being compiled. The limit applies to
|
||||
parentheses of all kinds, not just capturing parentheses.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
|
@ -778,10 +817,10 @@ parentheses of all kinds, not just capturing parentheses.
|
|||
There is at least one application that runs PCRE2 in threads with very limited
|
||||
system stack, where running out of stack is to be avoided at all costs. The
|
||||
parenthesis limit above cannot take account of how much stack is actually
|
||||
available. For a finer control, you can supply a function that is called
|
||||
whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a
|
||||
pattern. This function can check the actual stack size (or anything else that
|
||||
it wants to, of course).
|
||||
available during compilation. For a finer control, you can supply a function
|
||||
that is called whenever <b>pcre2_compile()</b> starts to compile a parenthesized
|
||||
part of a pattern. This function can check the actual stack size (or anything
|
||||
else that it wants to, of course).
|
||||
</P>
|
||||
<P>
|
||||
The first argument to the callout function gives the current depth of
|
||||
|
@ -807,6 +846,8 @@ If none of these apply, just pass NULL as the context argument of
|
|||
</P>
|
||||
<P>
|
||||
A match context is created, copied, and freed by the following functions:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_context *pcre2_match_context_create(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
|
@ -821,6 +862,8 @@ A match context is created, copied, and freed by the following functions:
|
|||
A match context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
|
@ -830,6 +873,8 @@ This sets up a "callout" function for PCRE2 to call at specified points
|
|||
during a matching operation. Details are given in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -856,6 +901,8 @@ subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
|||
start within the first line of the subject. If this is set with an offset
|
||||
limit, a match must occur in the first line and also within the offset limit.
|
||||
In other words, whichever limit comes first is used.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_heap_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -889,6 +936,8 @@ Heap memory is used only if the initial vector is too small. If the heap limit
|
|||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -926,6 +975,8 @@ of the form
|
|||
where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
||||
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
||||
limit is set, less than the default.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_depth_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -1282,8 +1333,9 @@ include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
|||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name. A closing parenthesis can be
|
||||
included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED
|
||||
option is set, unescaped whitespace in verb names is skipped and #-comments are
|
||||
recognized in this mode, exactly as in the rest of the pattern.
|
||||
or PCRE2_EXTENDED_MORE option is set, unescaped whitespace in verb names is
|
||||
skipped and #-comments are recognized in this mode, exactly as in the rest of
|
||||
the pattern.
|
||||
<pre>
|
||||
PCRE2_AUTO_CALLOUT
|
||||
</pre>
|
||||
|
@ -1298,7 +1350,13 @@ documentation.
|
|||
</pre>
|
||||
If this bit is set, letters in the pattern match both upper and lower case
|
||||
letters in the subject. It is equivalent to Perl's /i option, and it can be
|
||||
changed within a pattern by a (?i) option setting.
|
||||
changed within a pattern by a (?i) option setting. If PCRE2_UTF is set, Unicode
|
||||
properties are used for all characters with more than one other case, and for
|
||||
all characters whose code points are greater than U+007f. For lower valued
|
||||
characters with only one other case, a lookup table is used for speed. When
|
||||
PCRE2_UTF is not set, a lookup table is used for all code points less than 256,
|
||||
and higher code points (available only in 16-bit or 32-bit mode) are treated as
|
||||
not having another case.
|
||||
<pre>
|
||||
PCRE2_DOLLAR_ENDONLY
|
||||
</pre>
|
||||
|
@ -1380,18 +1438,18 @@ built.
|
|||
<pre>
|
||||
PCRE2_EXTENDED_MORE
|
||||
</pre>
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition, space and
|
||||
horizontal tab characters are also ignored inside a character class.
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition, unescaped space
|
||||
and horizontal tab characters are ignored inside a character class.
|
||||
PCRE2_EXTENDED_MORE is equivalent to Perl 5.26's /xx option, and it can be
|
||||
changed within a pattern by a (?xx) option setting.
|
||||
<pre>
|
||||
PCRE2_FIRSTLINE
|
||||
</pre>
|
||||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||
general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a
|
||||
match must occur in the first line and also within the offset limit. In other
|
||||
If this option is set, the start of an unanchored pattern match must be before
|
||||
or at the first newline in the subject string, though the matched text may
|
||||
continue over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a
|
||||
more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit,
|
||||
a match must occur in the first line and also within the offset limit. In other
|
||||
words, whichever limit comes first is used.
|
||||
<pre>
|
||||
PCRE2_MATCH_UNSET_BACKREF
|
||||
|
@ -1457,8 +1515,8 @@ PCRE2_NEVER_UTF causes an error.
|
|||
If this option is set, it disables the use of numbered capturing parentheses in
|
||||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||
were followed by ?: but named parentheses can still be used for capturing (and
|
||||
they acquire numbers in the usual way). There is no equivalent of this option
|
||||
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||
they acquire numbers in the usual way). This is the same as Perl's /n option.
|
||||
Note that, when this option is set, references to capturing groups (back
|
||||
references or recursion/subroutine calls) may only refer to named groups,
|
||||
though the reference can be by name or by number.
|
||||
<pre>
|
||||
|
@ -1494,8 +1552,8 @@ compiler.
|
|||
<P>
|
||||
There are a number of optimizations that may occur at the start of a match, in
|
||||
order to speed up the process. For example, if it is known that an unanchored
|
||||
match must start with a specific character, the matching code searches the
|
||||
subject for that character, and fails immediately if it cannot find it, without
|
||||
match must start with a specific code unit value, the matching code searches
|
||||
the subject for that value, and fails immediately if it cannot find it, without
|
||||
actually running the main matching function. This means that a special item
|
||||
such as (*COMMIT) at the start of a pattern is not considered until after a
|
||||
suitable starting point for the match has been found. Also, when callouts or
|
||||
|
@ -1524,9 +1582,11 @@ current starting position, which in this case, it does. However, if the same
|
|||
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
||||
subject string does not happen. The first match attempt is run starting from
|
||||
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
||||
the overall result is "no match". There are also other start-up optimizations.
|
||||
For example, a minimum length for the subject may be recorded. Consider the
|
||||
pattern
|
||||
the overall result is "no match".
|
||||
</P>
|
||||
<P>
|
||||
There are also other start-up optimizations. For example, a minimum length for
|
||||
the subject may be recorded. Consider the pattern
|
||||
<pre>
|
||||
(*MARK:A)(X|Y)
|
||||
</pre>
|
||||
|
@ -1551,12 +1611,26 @@ document. If an invalid UTF sequence is found, <b>pcre2_compile()</b> returns a
|
|||
negative error code.
|
||||
</P>
|
||||
<P>
|
||||
If you know that your pattern is valid, and you want to skip this check for
|
||||
performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
|
||||
the effect of passing an invalid UTF string as a pattern is undefined. It may
|
||||
cause your program to crash or loop. Note that this option can also be passed
|
||||
to <b>pcre2_match()</b> and <b>pcre_dfa_match()</b>, to suppress validity
|
||||
checking of the subject string.
|
||||
If you know that your pattern is a valid UTF string, and you want to skip this
|
||||
check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When
|
||||
it is set, the effect of passing an invalid UTF string as a pattern is
|
||||
undefined. It may cause your program to crash or loop.
|
||||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
error that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. In particular, the so-called "surrogate" code
|
||||
points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences
|
||||
such as \x{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
|
||||
option, as described in the section entitled "Extra compile options"
|
||||
<a href="#extracompileoptions">below.</a>
|
||||
However, this is possible only in UTF-8 and UTF-32 modes, because these values
|
||||
are not representable in UTF-16.
|
||||
<pre>
|
||||
PCRE2_UCP
|
||||
</pre>
|
||||
|
@ -1594,10 +1668,42 @@ This option causes PCRE2 to regard both the pattern and the subject strings
|
|||
that are subsequently processed as strings of UTF characters instead of
|
||||
single-code-unit strings. It is available when PCRE2 is built to include
|
||||
Unicode support (which is the default). If Unicode support is not available,
|
||||
the use of this option provokes an error. Details of how this option changes
|
||||
the behaviour of PCRE2 are given in the
|
||||
the use of this option provokes an error. Details of how PCRE2_UTF changes the
|
||||
behaviour of PCRE2 are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
<a name="extracompileoptions"></a></P>
|
||||
<br><b>
|
||||
Extra compile options
|
||||
</b><br>
|
||||
<P>
|
||||
Unlike the main compile-time options, the extra options are not saved with the
|
||||
compiled pattern. The option bits that can be set in a compile context by
|
||||
calling the <b>pcre2_set_compile_extra_options()</b> function are as follows:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
</pre>
|
||||
This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is
|
||||
forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate"
|
||||
code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode
|
||||
code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot
|
||||
therefore be represented in UTF-16. They can be represented in UTF-8 and
|
||||
UTF-32, but are defined as invalid code points, and cause errors if encountered
|
||||
in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2.
|
||||
</P>
|
||||
<P>
|
||||
These values also cause errors if encountered in escape sequences such as
|
||||
\x{d912} within a pattern. However, it seems that some applications, when
|
||||
using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test
|
||||
for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does
|
||||
not disable the error that occurs, because it applies only to the testing of
|
||||
input strings for UTF validity.
|
||||
</P>
|
||||
<P>
|
||||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code
|
||||
point values in UTF-8 and UTF-32 patterns no longer provoke errors and are
|
||||
incorporated in the compiled pattern. However, they can only match subject
|
||||
characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">COMPILATION ERROR CODES</a><br>
|
||||
<P>
|
||||
|
@ -1806,7 +1912,9 @@ The third argument should point to an <b>uint32_t</b> variable.
|
|||
If the pattern set a backtracking depth limit by including an item of the form
|
||||
(*LIMIT_DEPTH=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
</pre>
|
||||
|
@ -1824,15 +1932,15 @@ returned. Otherwise NULL is returned. The third argument should point to an
|
|||
Return information about the first code unit of any matched string, for a
|
||||
non-anchored pattern. The third argument should point to an <b>uint32_t</b>
|
||||
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||
it is known that a match can occur only at the start of the subject or
|
||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||
patterns, 0 is returned.
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved
|
||||
using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is
|
||||
known that a match can occur only at the start of the subject or following a
|
||||
newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0
|
||||
is returned.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTCODEUNIT
|
||||
</pre>
|
||||
Return the value of the first code unit of any matched string in the situation
|
||||
Return the value of the first code unit of any matched string for a pattern
|
||||
where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third
|
||||
argument should point to an <b>uint32_t</b> variable. In the 8-bit library, the
|
||||
value is always less than 256. In the 16-bit library the value can be up to
|
||||
|
@ -1864,7 +1972,9 @@ the equivalent hexadecimal or octal escape sequences.
|
|||
If the pattern set a heap memory limit by including an item of the form
|
||||
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_JCHANGED
|
||||
</pre>
|
||||
|
@ -1891,10 +2001,10 @@ PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
|
|||
<pre>
|
||||
PCRE2_INFO_LASTCODEUNIT
|
||||
</pre>
|
||||
Return the value of the rightmost literal data unit that must exist in any
|
||||
matched string, other than at its start, if such a value has been recorded. The
|
||||
third argument should point to an <b>uint32_t</b> variable. If there is no such
|
||||
value, 0 is returned.
|
||||
Return the value of the rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start, for a pattern where
|
||||
PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argument
|
||||
should point to a <b>uint32_t</b> variable.
|
||||
<pre>
|
||||
PCRE2_INFO_MATCHEMPTY
|
||||
</pre>
|
||||
|
@ -1909,7 +2019,9 @@ in such cases.
|
|||
If the pattern set a match limit by including an item of the form
|
||||
(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_MAXLOOKBEHIND
|
||||
</pre>
|
||||
|
@ -1921,7 +2033,8 @@ require a one-character lookbehind. \A also registers a one-character
|
|||
lookbehind, though it does not actually inspect the previous character. This is
|
||||
to ensure that at least one character from the old segment is retained when a
|
||||
new segment is processed. Otherwise, if there are no lookbehinds in the
|
||||
pattern, \A might match incorrectly at the start of a new segment.
|
||||
pattern, \A might match incorrectly at the start of a second or subsequent
|
||||
segment.
|
||||
<pre>
|
||||
PCRE2_INFO_MINLENGTH
|
||||
</pre>
|
||||
|
@ -2216,7 +2329,7 @@ character is CR followed by LF, advance the starting offset by two characters
|
|||
instead of one.
|
||||
</P>
|
||||
<P>
|
||||
If a non-zero starting offset is passed when the pattern is anchored, an single
|
||||
If a non-zero starting offset is passed when the pattern is anchored, a single
|
||||
attempt to match at the given offset is made. This can only succeed if the
|
||||
pattern does not require the match to be at the start of the subject. In other
|
||||
words, the anchoring must be the result of setting the PCRE2_ANCHORED option or
|
||||
|
@ -2611,6 +2724,10 @@ documentation for details.
|
|||
PCRE2_ERROR_DEPTHLIMIT
|
||||
</pre>
|
||||
The nested backtracking depth limit was reached.
|
||||
<pre>
|
||||
PCRE2_ERROR_HEAPLIMIT
|
||||
</pre>
|
||||
The heap limit was reached.
|
||||
<pre>
|
||||
PCRE2_ERROR_INTERNAL
|
||||
</pre>
|
||||
|
@ -3290,7 +3407,7 @@ NOTE: PCRE2's "auto-possessification" optimization usually applies to character
|
|||
repeats at the end of a pattern (as well as internally). For example, the
|
||||
pattern "a\d+" is compiled as if it were "a\d++". For DFA matching, this
|
||||
means that only one possible match is found. If you really do want multiple
|
||||
matches in such cases, either use an ungreedy repeat auch as "a\d+?" or set
|
||||
matches in such cases, either use an ungreedy repeat such as "a\d+?" or set
|
||||
the PCRE2_NO_AUTO_POSSESS option when compiling.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -3351,7 +3468,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1545,12 +1545,13 @@ alternative in the subpattern.
|
|||
<br><a name="SEC13" href="#TOC1">INTERNAL OPTION SETTING</a><br>
|
||||
<P>
|
||||
The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
|
||||
PCRE2_EXTENDED, and PCRE2_EXTENDED_MORE options (which are Perl-compatible) can
|
||||
be changed from within the pattern by a sequence of Perl option letters
|
||||
enclosed between "(?" and ")". The option letters are
|
||||
PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE options (which
|
||||
are Perl-compatible) can be changed from within the pattern by a sequence of
|
||||
Perl option letters enclosed between "(?" and ")". The option letters are
|
||||
<pre>
|
||||
i for PCRE2_CASELESS
|
||||
m for PCRE2_MULTILINE
|
||||
n for PCRE2_NO_AUTO_CAPTURE
|
||||
s for PCRE2_DOTALL
|
||||
x for PCRE2_EXTENDED
|
||||
xx for PCRE2_EXTENDED_MORE
|
||||
|
|
|
@ -430,6 +430,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
(?U) default ungreedy (lazy)
|
||||
(?x) extended: ignore white space except in classes
|
||||
|
|
|
@ -559,14 +559,19 @@ by a previous <b>#pattern</b> command.
|
|||
Setting compilation options
|
||||
</b><br>
|
||||
<P>
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
||||
ones have single-letter abbreviations, with special handling for /x (to make
|
||||
it like Perl). If a second x is present, PCRE2_EXTENDED is converted into
|
||||
PCRE2_EXTENDED_MORE. A third appearance adds PCRE2_EXTENDED as well. See
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. Most of them set
|
||||
bits in the options argument of that function, but those whose names start with
|
||||
PCRE2_EXTRA are additional options that are set in the compile context. For the
|
||||
main options, there are some single-letter abbreviations that are the same as
|
||||
Perl options. There is special handling for /x: if a second x is present,
|
||||
PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third
|
||||
appearance adds PCRE2_EXTENDED as well, though this makes no difference to the
|
||||
way <b>pcre2_compile()</b> behaves. See
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
for a description of the effects of these options.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
|
@ -585,7 +590,7 @@ for a description of the effects of these options.
|
|||
never_backslash_c set PCRE2_NEVER_BACKSLASH_C
|
||||
never_ucp set PCRE2_NEVER_UCP
|
||||
never_utf set PCRE2_NEVER_UTF
|
||||
no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
/n no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
no_auto_possess set PCRE2_NO_AUTO_POSSESS
|
||||
no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR
|
||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
||||
|
@ -607,7 +612,8 @@ Setting compilation controls
|
|||
</b><br>
|
||||
<P>
|
||||
The following modifiers affect the compilation process or request information
|
||||
about the pattern:
|
||||
about the pattern. There are single-letter abbreviations for some that are
|
||||
heavily used in the test files.
|
||||
<pre>
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
/B bincode show binary code without lengths
|
||||
|
@ -1810,7 +1816,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -47,7 +47,7 @@ and
|
|||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE does not support this.
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -109,10 +109,15 @@ However, the special horizontal and vertical white space matching escapes (\h,
|
|||
\H, \v, and \V) do match all the appropriate Unicode characters, whether or
|
||||
not PCRE2_UCP is set.
|
||||
</P>
|
||||
<br><b>
|
||||
CASE-EQUIVALENCE IN UTF MODES
|
||||
</b><br>
|
||||
<P>
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties. A few
|
||||
Unicode characters such as Greek sigma have more than two codepoints that are
|
||||
case-equivalent, and these are treated as such.
|
||||
Case-insensitive matching in a UTF mode makes use of Unicode properties except
|
||||
for characters whose code points are less than 128 and that have at most two
|
||||
case-equivalent values. For these, a direct table lookup is used for speed. A
|
||||
few Unicode characters such as Greek sigma have more than two codepoints that
|
||||
are case-equivalent, and these are treated as such.
|
||||
</P>
|
||||
<br><b>
|
||||
VALIDITY OF UTF STRINGS
|
||||
|
@ -173,6 +178,15 @@ or <b>pcre2_dfa_match()</b>.
|
|||
<P>
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
|
||||
is undefined and your program may crash or loop indefinitely.
|
||||
</P>
|
||||
<P>
|
||||
Note that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error
|
||||
that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. If you want to allow escape sequences such as
|
||||
\x{d800} (a surrogate code point) you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible
|
||||
only in UTF-8 and UTF-32 modes, because these values are not representable in
|
||||
UTF-16.
|
||||
<a name="utf8strings"></a></P>
|
||||
<br><b>
|
||||
Errors in UTF-8 strings
|
||||
|
@ -280,9 +294,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 03 July 2016
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -207,6 +207,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_set_character_tables.html">pcre2_set_character_tables</a></td>
|
||||
<td> Set character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_extra_options.html">pcre2_set_compile_extra_options</a></td>
|
||||
<td> Set compile time extra options</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_recursion_guard.html">pcre2_set_compile_recursion_guard</a></td>
|
||||
<td> Set up a compile recursion guard function</td></tr>
|
||||
|
||||
|
|
251
doc/pcre2.txt
251
doc/pcre2.txt
|
@ -181,9 +181,9 @@ NAME
|
|||
|
||||
#include <pcre2.h>
|
||||
|
||||
PCRE2 is a new API for PCRE. This document contains a description of
|
||||
all its functions. See the pcre2 document for an overview of all the
|
||||
PCRE2 documentation.
|
||||
PCRE2 is a new API for PCRE, starting at release 10.0. This document
|
||||
contains a description of all its native functions. See the pcre2 docu-
|
||||
ment for an overview of all the PCRE2 documentation.
|
||||
|
||||
|
||||
PCRE2 NATIVE API BASIC FUNCTIONS
|
||||
|
@ -253,6 +253,9 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS
|
|||
int pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
||||
const unsigned char *tables);
|
||||
|
||||
int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
|
||||
uint32_t extra_options);
|
||||
|
||||
int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
|
||||
PCRE2_SIZE value);
|
||||
|
||||
|
@ -407,7 +410,7 @@ PCRE2 NATIVE API OBSOLETE FUNCTIONS
|
|||
These functions became obsolete at release 10.30 and are retained only
|
||||
for backward compatibility. They should not be used in new code. The
|
||||
first is replaced by pcre2_set_depth_limit(); the second is no longer
|
||||
needed and no longer has any effect (it always returns zero).
|
||||
needed and has no effect (it always returns zero).
|
||||
|
||||
|
||||
PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES
|
||||
|
@ -466,7 +469,7 @@ PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES
|
|||
|
||||
In the function summaries above, and in the rest of this document and
|
||||
other PCRE2 documents, functions and data types are described using
|
||||
their generic names, without the 8, 16, or 32 suffix.
|
||||
their generic names, without the _8, _16, or _32 suffix.
|
||||
|
||||
|
||||
PCRE2 API OVERVIEW
|
||||
|
@ -474,12 +477,12 @@ PCRE2 API OVERVIEW
|
|||
PCRE2 has its own native API, which is described in this document.
|
||||
There are also some wrapper functions for the 8-bit library that corre-
|
||||
spond to the POSIX regular expression API, but they do not give access
|
||||
to all the functionality. They are described in the pcre2posix documen-
|
||||
tation. Both these APIs define a set of C function calls.
|
||||
to all the functionality of PCRE2. They are described in the pcre2posix
|
||||
documentation. Both these APIs define a set of C function calls.
|
||||
|
||||
The native API C data types, function prototypes, option values, and
|
||||
error codes are defined in the header file pcre2.h, which contains def-
|
||||
initions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
|
||||
error codes are defined in the header file pcre2.h, which also contains
|
||||
definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release
|
||||
numbers for the library. Applications can use these to include support
|
||||
for different releases of PCRE2.
|
||||
|
||||
|
@ -544,7 +547,7 @@ PCRE2 API OVERVIEW
|
|||
pcre2_substring_number_from_name()
|
||||
|
||||
pcre2_substring_free() and pcre2_substring_list_free() are also pro-
|
||||
vided, to free the memory used for extracted strings.
|
||||
vided, to free memory used for extracted strings.
|
||||
|
||||
The function pcre2_substitute() can be called to match a pattern and
|
||||
return a copy of the subject string with substitutions for parts that
|
||||
|
@ -652,7 +655,8 @@ MULTITHREADING
|
|||
compiled code block, so a thread must gain unique write access to the
|
||||
pointer before calling pcre2_jit_compile(). Alternatively,
|
||||
pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to
|
||||
obtain a private copy of the compiled code.
|
||||
obtain a private copy of the compiled code before calling the JIT com-
|
||||
piler.
|
||||
|
||||
Context blocks
|
||||
|
||||
|
@ -748,6 +752,7 @@ PCRE2 CONTEXTS
|
|||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The extra options bits (none set by default)
|
||||
|
||||
A compile context is also required if you are using custom memory man-
|
||||
agement. If none of these apply, just pass NULL as the context argu-
|
||||
|
@ -784,6 +789,17 @@ PCRE2 CONTEXTS
|
|||
only argument is a general context. This function builds a set of char-
|
||||
acter tables in the current locale.
|
||||
|
||||
int pcre2_set_compile_extra_options(pcre2_compile_context *ccontext,
|
||||
uint32_t extra_options);
|
||||
|
||||
As PCRE2 has developed, almost all the 32 option bits that are avail-
|
||||
able in the options argument of pcre2_compile() have been used up. To
|
||||
avoid running out, the compile context contains a set of extra option
|
||||
bits which are used for some newer, assumed rarer, options. This func-
|
||||
tion sets those bits. It always sets all the bits (either on or off).
|
||||
It replaces, rather than adds to, any existing setting. The available options are
|
||||
defined in the section entitled "Extra compile options" below.
|
||||
|
||||
int pcre2_set_max_pattern_length(pcre2_compile_context *ccontext,
|
||||
PCRE2_SIZE value);
|
||||
|
||||
|
@ -806,11 +822,12 @@ PCRE2 CONTEXTS
|
|||
A pattern can override the value set in the compile context by starting
|
||||
with a sequence such as (*CRLF). See the pcre2pattern page for details.
|
||||
|
||||
When a pattern is compiled with the PCRE2_EXTENDED option, the newline
|
||||
convention affects the recognition of white space and the end of inter-
|
||||
nal comments starting with #. The value is saved with the compiled pat-
|
||||
tern for subsequent use by the JIT compiler and by the two interpreted
|
||||
matching functions, pcre2_match() and pcre2_dfa_match().
|
||||
When a pattern is compiled with the PCRE2_EXTENDED or
|
||||
PCRE2_EXTENDED_MORE option, the newline convention affects the recogni-
|
||||
tion of white space and the end of internal comments starting with #.
|
||||
The value is saved with the compiled pattern for subsequent use by the
|
||||
JIT compiler and by the two interpreted matching functions,
|
||||
pcre2_match() and pcre2_dfa_match().
|
||||
|
||||
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
|
||||
uint32_t value);
|
||||
|
@ -827,10 +844,11 @@ PCRE2 CONTEXTS
|
|||
There is at least one application that runs PCRE2 in threads with very
|
||||
limited system stack, where running out of stack is to be avoided at
|
||||
all costs. The parenthesis limit above cannot take account of how much
|
||||
stack is actually available. For a finer control, you can supply a
|
||||
function that is called whenever pcre2_compile() starts to compile a
|
||||
parenthesized part of a pattern. This function can check the actual
|
||||
stack size (or anything else that it wants to, of course).
|
||||
stack is actually available during compilation. For finer control,
|
||||
you can supply a function that is called whenever pcre2_compile()
|
||||
starts to compile a parenthesized part of a pattern. This function can
|
||||
check the actual stack size (or anything else that it wants to, of
|
||||
course).
|
||||
|
||||
The first argument to the callout function gives the current depth of
|
||||
nesting, and the second is user data that is set up by the last argu-
|
||||
|
@ -1302,10 +1320,10 @@ COMPILING A PATTERN
|
|||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
|
||||
processing is applied to verb names and only an unescaped closing
|
||||
parenthesis terminates the name. A closing parenthesis can be included
|
||||
in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED
|
||||
option is set, unescaped whitespace in verb names is skipped and #-com-
|
||||
ments are recognized in this mode, exactly as in the rest of the pat-
|
||||
tern.
|
||||
in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or
|
||||
PCRE2_EXTENDED_MORE option is set, unescaped whitespace in verb names
|
||||
is skipped and #-comments are recognized in this mode, exactly as in
|
||||
the rest of the pattern.
|
||||
|
||||
PCRE2_AUTO_CALLOUT
|
||||
|
||||
|
@ -1318,7 +1336,14 @@ COMPILING A PATTERN
|
|||
|
||||
If this bit is set, letters in the pattern match both upper and lower
|
||||
case letters in the subject. It is equivalent to Perl's /i option, and
|
||||
it can be changed within a pattern by a (?i) option setting.
|
||||
it can be changed within a pattern by a (?i) option setting. If
|
||||
PCRE2_UTF is set, Unicode properties are used for all characters with
|
||||
more than one other case, and for all characters whose code points are
|
||||
greater than U+007f. For lower valued characters with only one other
|
||||
case, a lookup table is used for speed. When PCRE2_UTF is not set, a
|
||||
lookup table is used for all code points less than 256, and higher code
|
||||
points (available only in 16-bit or 32-bit mode) are treated as not
|
||||
having another case.
|
||||
|
||||
PCRE2_DOLLAR_ENDONLY
|
||||
|
||||
|
@ -1398,14 +1423,15 @@ COMPILING A PATTERN
|
|||
|
||||
PCRE2_EXTENDED_MORE
|
||||
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition, space
|
||||
and horizontal tab characters are also ignored inside a character
|
||||
class. PCRE2_EXTENDED_MORE is equivalent to Perl's 5.26 /xx option,
|
||||
and it can be changed within a pattern by a (?xx) option setting.
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition,
|
||||
unescaped space and horizontal tab characters are ignored inside a
|
||||
character class. PCRE2_EXTENDED_MORE is equivalent to Perl 5.26's /xx
|
||||
option, and it can be changed within a pattern by a (?xx) option set-
|
||||
ting.
|
||||
|
||||
PCRE2_FIRSTLINE
|
||||
|
||||
If this option is set, an unanchored pattern is required to match
|
||||
If this option is set, the start of an unanchored pattern match must be
|
||||
before or at the first newline in the subject string, though the
|
||||
matched text may continue over the newline. See also PCRE2_USE_OFF-
|
||||
SET_LIMIT, which provides a more general limiting facility. If
|
||||
|
@ -1479,11 +1505,11 @@ COMPILING A PATTERN
|
|||
If this option is set, it disables the use of numbered capturing paren-
|
||||
theses in the pattern. Any opening parenthesis that is not followed by
|
||||
? behaves as if it were followed by ?: but named parentheses can still
|
||||
be used for capturing (and they acquire numbers in the usual way).
|
||||
There is no equivalent of this option in Perl. Note that, if this
|
||||
option is set, references to capturing groups (back references or
|
||||
recursion/subroutine calls) may only refer to named groups, though the
|
||||
reference can be by name or by number.
|
||||
be used for capturing (and they acquire numbers in the usual way). This
|
||||
is the same as Perl's /n option. Note that, when this option is set,
|
||||
references to capturing groups (back references or recursion/subroutine
|
||||
calls) may only refer to named groups, though the reference can be by
|
||||
name or by number.
|
||||
|
||||
PCRE2_NO_AUTO_POSSESS
|
||||
|
||||
|
@ -1517,8 +1543,8 @@ COMPILING A PATTERN
|
|||
|
||||
There are a number of optimizations that may occur at the start of a
|
||||
match, in order to speed up the process. For example, if it is known
|
||||
that an unanchored match must start with a specific character, the
|
||||
matching code searches the subject for that character, and fails imme-
|
||||
that an unanchored match must start with a specific code unit value,
|
||||
the matching code searches the subject for that value, and fails imme-
|
||||
diately if it cannot find it, without actually running the main match-
|
||||
ing function. This means that a special item such as (*COMMIT) at the
|
||||
start of a pattern is not considered until after a suitable starting
|
||||
|
@ -1548,9 +1574,10 @@ COMPILING A PATTERN
|
|||
set, the initial scan along the subject string does not happen. The
|
||||
first match attempt is run starting from "D" and when this fails,
|
||||
(*COMMIT) prevents any further matches being tried, so the overall
|
||||
result is "no match". There are also other start-up optimizations. For
|
||||
example, a minimum length for the subject may be recorded. Consider the
|
||||
pattern
|
||||
result is "no match".
|
||||
|
||||
There are also other start-up optimizations. For example, a minimum
|
||||
length for the subject may be recorded. Consider the pattern
|
||||
|
||||
(*MARK:A)(X|Y)
|
||||
|
||||
|
@ -1570,12 +1597,25 @@ COMPILING A PATTERN
|
|||
document. If an invalid UTF sequence is found, pcre2_compile() returns
|
||||
a negative error code.
|
||||
|
||||
If you know that your pattern is valid, and you want to skip this check
|
||||
for performance reasons, you can set the PCRE2_NO_UTF_CHECK option.
|
||||
When it is set, the effect of passing an invalid UTF string as a pat-
|
||||
tern is undefined. It may cause your program to crash or loop. Note
|
||||
that this option can also be passed to pcre2_match() and
|
||||
pcre_dfa_match(), to suppress validity checking of the subject string.
|
||||
If you know that your pattern is a valid UTF string, and you want to
|
||||
skip this check for performance reasons, you can set the
|
||||
PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an
|
||||
invalid UTF string as a pattern is undefined. It may cause your program
|
||||
to crash or loop.
|
||||
|
||||
Note that this option can also be passed to pcre2_match() and
|
||||
pcre2_dfa_match(), to suppress UTF validity checking of the subject
|
||||
string.
|
||||
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis-
|
||||
able the error that is given if an escape sequence for an invalid Uni-
|
||||
code code point is encountered in the pattern. In particular, the so-
|
||||
called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you
|
||||
want to allow escape sequences such as \x{d800} you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the
|
||||
section entitled "Extra compile options" below. However, this is pos-
|
||||
sible only in UTF-8 and UTF-32 modes, because these values are not rep-
|
||||
resentable in UTF-16.
|
||||
|
||||
PCRE2_UCP
|
||||
|
||||
|
@ -1611,9 +1651,41 @@ COMPILING A PATTERN
|
|||
instead of single-code-unit strings. It is available when PCRE2 is
|
||||
built to include Unicode support (which is the default). If Unicode
|
||||
support is not available, the use of this option provokes an error.
|
||||
Details of how this option changes the behaviour of PCRE2 are given in
|
||||
Details of how PCRE2_UTF changes the behaviour of PCRE2 are given in
|
||||
the pcre2unicode page.
|
||||
|
||||
Extra compile options
|
||||
|
||||
Unlike the main compile-time options, the extra options are not saved
|
||||
with the compiled pattern. The option bits that can be set in a compile
|
||||
context by calling the pcre2_set_compile_extra_options() function are
|
||||
as follows:
|
||||
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
|
||||
This option applies when compiling a pattern in UTF-8 or UTF-32 mode.
|
||||
It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode
|
||||
"surrogate" code points in the range 0xd800 to 0xdfff are used in pairs
|
||||
in UTF-16 to encode code points with values in the range 0x10000 to
|
||||
0x10ffff. The surrogates cannot therefore be represented in UTF-16.
|
||||
They can be represented in UTF-8 and UTF-32, but are defined as invalid
|
||||
code points, and cause errors if encountered in a UTF-8 or UTF-32
|
||||
string that is being checked for validity by PCRE2.
|
||||
|
||||
These values also cause errors if encountered in escape sequences such
|
||||
as \x{d912} within a pattern. However, it seems that some applications,
|
||||
when using PCRE2 to check for unwanted characters in UTF-8 strings,
|
||||
explicitly test for the surrogates using escape sequences. The
|
||||
PCRE2_NO_UTF_CHECK option does not disable the error that occurs,
|
||||
because it applies only to the testing of input strings for UTF valid-
|
||||
ity.
|
||||
|
||||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro-
|
||||
gate code point values in UTF-8 and UTF-32 patterns no longer provoke
|
||||
errors and are incorporated in the compiled pattern. However, they can
|
||||
only match subject characters if the matching function is called with
|
||||
PCRE2_NO_UTF_CHECK set.
|
||||
|
||||
|
||||
COMPILATION ERROR CODES
|
||||
|
||||
|
@ -1815,7 +1887,9 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The
|
||||
third argument should point to an unsigned 32-bit integer. If no such
|
||||
value has been set, the call to pcre2_pattern_info() returns the error
|
||||
PCRE2_ERROR_UNSET.
|
||||
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
|
||||
ing if it is less than the limit set or defaulted by the caller of the
|
||||
match function.
|
||||
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
|
||||
|
@ -1833,16 +1907,16 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
Return information about the first code unit of any matched string, for
|
||||
a non-anchored pattern. The third argument should point to a uint32_t
|
||||
variable. If there is a fixed first value, for example, the letter "c"
|
||||
from a pattern such as (cat|cow|coyote), 1 is returned, and the charac-
|
||||
ter value can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is
|
||||
no fixed first value, but it is known that a match can occur only at
|
||||
the start of the subject or following a newline in the subject, 2 is
|
||||
returned. Otherwise, and for anchored patterns, 0 is returned.
|
||||
from a pattern such as (cat|cow|coyote), 1 is returned, and the value
|
||||
can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed
|
||||
first value, but it is known that a match can occur only at the start
|
||||
of the subject or following a newline in the subject, 2 is returned.
|
||||
Otherwise, and for anchored patterns, 0 is returned.
|
||||
|
||||
PCRE2_INFO_FIRSTCODEUNIT
|
||||
|
||||
Return the value of the first code unit of any matched string in the
|
||||
situation where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
|
||||
Return the value of the first code unit of any matched string for a
|
||||
pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
|
||||
The third argument should point to a uint32_t variable. In the 8-bit
|
||||
library, the value is always less than 256. In the 16-bit library the
|
||||
value can be up to 0xffff. In the 32-bit library in UTF-32 mode the
|
||||
|
@ -1877,7 +1951,9 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu-
|
||||
ment should point to an unsigned 32-bit integer. If no such value has
|
||||
been set, the call to pcre2_pattern_info() returns the error
|
||||
PCRE2_ERROR_UNSET.
|
||||
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
|
||||
ing if it is less than the limit set or defaulted by the caller of the
|
||||
match function.
|
||||
|
||||
PCRE2_INFO_JCHANGED
|
||||
|
||||
|
@ -1906,10 +1982,10 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
|
||||
PCRE2_INFO_LASTCODEUNIT
|
||||
|
||||
Return the value of the rightmost literal data unit that must exist in
|
||||
any matched string, other than at its start, if such a value has been
|
||||
recorded. The third argument should point to an uint32_t variable. If
|
||||
there is no such value, 0 is returned.
|
||||
Return the value of the rightmost literal code unit that must exist in
|
||||
any matched string, other than at its start, for a pattern where
|
||||
PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu-
|
||||
ment should point to a uint32_t variable.
|
||||
|
||||
PCRE2_INFO_MATCHEMPTY
|
||||
|
||||
|
@ -1925,7 +2001,9 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third
|
||||
argument should point to an unsigned 32-bit integer. If no such value
|
||||
has been set, the call to pcre2_pattern_info() returns the error
|
||||
PCRE2_ERROR_UNSET.
|
||||
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
|
||||
ing if it is less than the limit set or defaulted by the caller of the
|
||||
match function.
|
||||
|
||||
PCRE2_INFO_MAXLOOKBEHIND
|
||||
|
||||
|
@ -1938,7 +2016,7 @@ INFORMATION ABOUT A COMPILED PATTERN
|
|||
inspect the previous character. This is to ensure that at least one
|
||||
character from the old segment is retained when a new segment is pro-
|
||||
cessed. Otherwise, if there are no lookbehinds in the pattern, \A might
|
||||
match incorrectly at the start of a new segment.
|
||||
match incorrectly at the start of a second or subsequent segment.
|
||||
|
||||
PCRE2_INFO_MINLENGTH
|
||||
|
||||
|
@ -2210,9 +2288,9 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
|||
so, and the current character is CR followed by LF, advance the start-
|
||||
ing offset by two characters instead of one.
|
||||
|
||||
If a non-zero starting offset is passed when the pattern is anchored,
|
||||
an single attempt to match at the given offset is made. This can only
|
||||
succeed if the pattern does not require the match to be at the start of
|
||||
If a non-zero starting offset is passed when the pattern is anchored, a
|
||||
single attempt to match at the given offset is made. This can only suc-
|
||||
ceed if the pattern does not require the match to be at the start of
|
||||
the subject. In other words, the anchoring must be the result of set-
|
||||
ting the PCRE2_ANCHORED option or the use of .* with PCRE2_DOTALL, not
|
||||
by starting the pattern with ^ or \A.
|
||||
|
@ -2573,6 +2651,10 @@ ERROR RETURNS FROM pcre2_match()
|
|||
|
||||
The nested backtracking depth limit was reached.
|
||||
|
||||
PCRE2_ERROR_HEAPLIMIT
|
||||
|
||||
The heap limit was reached.
|
||||
|
||||
PCRE2_ERROR_INTERNAL
|
||||
|
||||
An unexpected internal error has occurred. This error could be caused
|
||||
|
@ -3208,7 +3290,7 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
|||
example, the pattern "a\d+" is compiled as if it were "a\d++". For DFA
|
||||
matching, this means that only one possible match is found. If you
|
||||
really do want multiple matches in such cases, either use an ungreedy
|
||||
repeat auch as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
repeat such as "a\d+?" or set the PCRE2_NO_AUTO_POSSESS option when
|
||||
compiling.
|
||||
|
||||
Error returns from pcre2_dfa_match()
|
||||
|
@ -3265,7 +3347,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -6803,12 +6885,14 @@ VERTICAL BAR
|
|||
INTERNAL OPTION SETTING
|
||||
|
||||
The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
|
||||
PCRE2_EXTENDED, and PCRE2_EXTENDED_MORE options (which are Perl-compat-
|
||||
ible) can be changed from within the pattern by a sequence of Perl
|
||||
option letters enclosed between "(?" and ")". The option letters are
|
||||
PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE options
|
||||
(which are Perl-compatible) can be changed from within the pattern by a
|
||||
sequence of Perl option letters enclosed between "(?" and ")". The
|
||||
option letters are
|
||||
|
||||
i for PCRE2_CASELESS
|
||||
m for PCRE2_MULTILINE
|
||||
n for PCRE2_NO_AUTO_CAPTURE
|
||||
s for PCRE2_DOTALL
|
||||
x for PCRE2_EXTENDED
|
||||
xx for PCRE2_EXTENDED_MORE
|
||||
|
@ -9649,6 +9733,7 @@ OPTION SETTING
|
|||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
(?U) default ungreedy (lazy)
|
||||
(?x) extended: ignore white space except in classes
|
||||
|
@ -9856,7 +9941,7 @@ UNICODE PROPERTY SUPPORT
|
|||
names for properties are supported. For example, \p{L} matches a let-
|
||||
ter. Its Perl synonym, \p{Letter}, is not supported. Furthermore, in
|
||||
Perl, many properties may optionally be prefixed by "Is", for compati-
|
||||
bility with Perl 5.6. PCRE does not support this.
|
||||
bility with Perl 5.6. PCRE2 does not support this.
|
||||
|
||||
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -9907,9 +9992,15 @@ WIDE CHARACTERS AND UTF MODES
|
|||
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
||||
acters, whether or not PCRE2_UCP is set.
|
||||
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
||||
A few Unicode characters such as Greek sigma have more than two code-
|
||||
points that are case-equivalent, and these are treated as such.
|
||||
|
||||
CASE-EQUIVALENCE IN UTF MODES
|
||||
|
||||
Case-insensitive matching in a UTF mode makes use of Unicode properties
|
||||
except for characters whose code points are less than 128 and that have
|
||||
at most two case-equivalent values. For these, a direct table lookup is
|
||||
used for speed. A few Unicode characters such as Greek sigma have more
|
||||
than two codepoints that are case-equivalent, and these are treated as
|
||||
such.
|
||||
|
||||
|
||||
VALIDITY OF UTF STRINGS
|
||||
|
@ -9965,6 +10056,14 @@ VALIDITY OF UTF STRINGS
|
|||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||
result is undefined and your program may crash or loop indefinitely.
|
||||
|
||||
Note that setting PCRE2_NO_UTF_CHECK at compile time does not disable
|
||||
the error that is given if an escape sequence for an invalid Unicode
|
||||
code point is encountered in the pattern. If you want to allow escape
|
||||
sequences such as \x{d800} (a surrogate code point) you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is pos-
|
||||
sible only in UTF-8 and UTF-32 modes, because these values are not rep-
|
||||
resentable in UTF-16.
|
||||
|
||||
Errors in UTF-8 strings
|
||||
|
||||
The following negative error codes are given for invalid UTF-8 strings:
|
||||
|
@ -10059,8 +10158,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 03 July 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "04 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2_COMPILE 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -35,6 +35,7 @@ system stack size checking, or to change one or more of these parameters:
|
|||
The newline character sequence;
|
||||
The compile time nested parentheses limit;
|
||||
The maximum pattern length (in code units) that is allowed;
|
||||
The additional option bits.
|
||||
.sp
|
||||
The option bits are:
|
||||
.sp
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function sets additional option bits for \fBpcre2_compile()\fP that are
|
||||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.sp
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "20 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2API 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -90,6 +90,9 @@ document for an overview of all the PCRE2 documentation.
|
|||
.B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " const unsigned char *\fItables\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.sp
|
||||
|
@ -643,6 +646,7 @@ following compile-time parameters:
|
|||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The extra options bits (none set by default)
|
||||
.sp
|
||||
A compile context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -685,6 +689,23 @@ argument is a general context. This function builds a set of character tables
|
|||
in the current locale.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.fi
|
||||
.sp
|
||||
As PCRE2 has developed, almost all the 32 option bits that are available in
|
||||
the \fIoptions\fP argument of \fBpcre2_compile()\fP have been used up. To avoid
|
||||
running out, the compile context contains a set of extra option bits which are
|
||||
used for some newer, assumed rarer, options. This function sets those bits. It
|
||||
always sets all the bits (either on or off); that is, the new value completely
|
||||
replaces any existing setting instead of being combined with it. The available
|
||||
options are defined in the section entitled "Extra compile options"
|
||||
.\" HTML <a href="#extracompileoptions">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.fi
|
||||
|
@ -1535,12 +1556,27 @@ in the
|
|||
document. If an invalid UTF sequence is found, \fBpcre2_compile()\fP returns a
|
||||
negative error code.
|
||||
.P
|
||||
If you know that your pattern is valid, and you want to skip this check for
|
||||
performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
|
||||
the effect of passing an invalid UTF string as a pattern is undefined. It may
|
||||
cause your program to crash or loop. Note that this option can also be passed
|
||||
to \fBpcre2_match()\fP and \fBpcre_dfa_match()\fP, to suppress validity
|
||||
checking of the subject string.
|
||||
If you know that your pattern is a valid UTF string, and you want to skip this
|
||||
check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When
|
||||
it is set, the effect of passing an invalid UTF string as a pattern is
|
||||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
error that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. In particular, the so-called "surrogate" code
|
||||
points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences
|
||||
such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
|
||||
option, as described in the section entitled "Extra compile options"
|
||||
.\" HTML <a href="#extracompileoptions">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
However, this is possible only in UTF-8 and UTF-32 modes, because these values
|
||||
are not representable in UTF-16.
|
||||
.sp
|
||||
PCRE2_UCP
|
||||
.sp
|
||||
|
@ -1594,6 +1630,37 @@ behaviour of PCRE2 are given in the
|
|||
page.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="extracompileoptions"></a>
|
||||
.SS "Extra compile options"
|
||||
.rs
|
||||
.sp
|
||||
Unlike the main compile-time options, the extra options are not saved with the
|
||||
compiled pattern. The option bits that can be set in a compile context by
|
||||
calling the \fBpcre2_set_compile_extra_options()\fP function are as follows:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
.sp
|
||||
This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is
|
||||
forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate"
|
||||
code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode
|
||||
code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot
|
||||
therefore be represented in UTF-16. They can be represented in UTF-8 and
|
||||
UTF-32, but are defined as invalid code points, and cause errors if encountered
|
||||
in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2.
|
||||
.P
|
||||
These values also cause errors if encountered in escape sequences such as
|
||||
\ex{d912} within a pattern. However, it seems that some applications, when
|
||||
using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test
|
||||
for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does
|
||||
not disable the error that occurs, because it applies only to the testing of
|
||||
input strings for UTF validity.
|
||||
.P
|
||||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code
|
||||
point values in UTF-8 and UTF-32 patterns no longer provoke errors and are
|
||||
incorporated in the compiled pattern. However, they can only match subject
|
||||
characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
|
||||
.
|
||||
.
|
||||
.SH "COMPILATION ERROR CODES"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -3421,6 +3488,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "18 April 2017" "PCRE 10.30"
|
||||
.TH PCRE2TEST 1 "17 May 2017" "PCRE 10.30"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -519,17 +519,21 @@ by a previous \fB#pattern\fP command.
|
|||
.SS "Setting compilation options"
|
||||
.rs
|
||||
.sp
|
||||
The following modifiers set options for \fBpcre2_compile()\fP. There are some
|
||||
single-letter abbreviations that are the same as Perl options. There is special
|
||||
handling for /x: if a second x is present, PCRE2_EXTENDED is converted into
|
||||
PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well,
|
||||
though this makes no difference to the way \fBpcre2_compile()\fP behaves. See
|
||||
The following modifiers set options for \fBpcre2_compile()\fP. Most of them set
|
||||
bits in the options argument of that function, but those whose names start with
|
||||
PCRE2_EXTRA are additional options that are set in the compile context. For the
|
||||
main options, there are some single-letter abbreviations that are the same as
|
||||
Perl options. There is special handling for /x: if a second x is present,
|
||||
PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third
|
||||
appearance adds PCRE2_EXTENDED as well, though this makes no difference to the
|
||||
way \fBpcre2_compile()\fP behaves. See
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
for a description of the effects of these options.
|
||||
.sp
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
|
@ -1788,6 +1792,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -503,14 +503,19 @@ PATTERN MODIFIERS
|
|||
|
||||
Setting compilation options
|
||||
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations, with special handling for /x
|
||||
(to make it like Perl). If a second x is present, PCRE2_EXTENDED is
|
||||
converted into PCRE2_EXTENDED_MORE. A third appearance adds
|
||||
PCRE2_EXTENDED as well. See pcre2api for a description of the effects
|
||||
The following modifiers set options for pcre2_compile(). Most of them
|
||||
set bits in the options argument of that function, but those whose
|
||||
names start with PCRE2_EXTRA are additional options that are set in the
|
||||
compile context. For the main options, there are some single-letter
|
||||
abbreviations that are the same as Perl options. There is special han-
|
||||
dling for /x: if a second x is present, PCRE2_EXTENDED is converted
|
||||
into PCRE2_EXTENDED_MORE as in Perl. A third appearance adds
|
||||
PCRE2_EXTENDED as well, though this makes no difference to the way
|
||||
pcre2_compile() behaves. See pcre2api for a description of the effects
|
||||
of these options.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
|
@ -529,7 +534,7 @@ PATTERN MODIFIERS
|
|||
never_backslash_c set PCRE2_NEVER_BACKSLASH_C
|
||||
never_ucp set PCRE2_NEVER_UCP
|
||||
never_utf set PCRE2_NEVER_UTF
|
||||
no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
/n no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
no_auto_possess set PCRE2_NO_AUTO_POSSESS
|
||||
no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR
|
||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
||||
|
@ -549,7 +554,8 @@ PATTERN MODIFIERS
|
|||
Setting compilation controls
|
||||
|
||||
The following modifiers affect the compilation process or request
|
||||
information about the pattern:
|
||||
information about the pattern. There are single-letter abbreviations
|
||||
for some that are heavily used in the test files.
|
||||
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
/B bincode show binary code without lengths
|
||||
|
@ -1644,5 +1650,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "20 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2UNICODE 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -164,6 +164,14 @@ or \fBpcre2_dfa_match()\fP.
|
|||
.P
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
|
||||
is undefined and your program may crash or loop indefinitely.
|
||||
.P
|
||||
Note that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error
|
||||
that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. If you want to allow escape sequences such as
|
||||
\ex{d800} (a surrogate code point) you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible
|
||||
only in UTF-8 and UTF-32 modes, because these values are not representable in
|
||||
UTF-16.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="utf8strings"></a>
|
||||
|
@ -272,6 +280,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -139,6 +139,10 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */
|
||||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
|
||||
|
@ -448,6 +452,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
|||
pcre2_set_bsr(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
|
@ -721,6 +727,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_)
|
||||
#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_)
|
||||
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
|
||||
#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_)
|
||||
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
|
||||
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
|
|
|
@ -139,6 +139,10 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */
|
||||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
|
||||
|
@ -448,6 +452,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
|||
pcre2_set_bsr(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
|
@ -721,6 +727,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_)
|
||||
#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_)
|
||||
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
|
||||
#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_)
|
||||
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
|
||||
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
|
|
|
@ -717,7 +717,8 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90 };
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
|
||||
ERR91};
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -1474,7 +1475,10 @@ else
|
|||
if (utf)
|
||||
{
|
||||
if (c > 0x10ffffU) *errorcodeptr = ERR77;
|
||||
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
|
||||
else
|
||||
if (c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
*errorcodeptr = ERR73;
|
||||
}
|
||||
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
|
||||
}
|
||||
|
@ -1663,7 +1667,8 @@ else
|
|||
}
|
||||
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
|
||||
{
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff)
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
{
|
||||
ptr--;
|
||||
*errorcodeptr = ERR73;
|
||||
|
@ -1732,7 +1737,8 @@ else
|
|||
}
|
||||
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
|
||||
{
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff)
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
{
|
||||
ptr--;
|
||||
*errorcodeptr = ERR73;
|
||||
|
@ -9100,7 +9106,9 @@ if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
|
|||
#endif
|
||||
|
||||
/* Check UTF. We have the original options in 'options', with that value as
|
||||
modified by (*UTF) etc in cb->external_options. */
|
||||
modified by (*UTF) etc in cb->external_options. The extra option
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
|
||||
surrogate code points cannot be represented in UTF-16. */
|
||||
|
||||
utf = (cb.external_options & PCRE2_UTF) != 0;
|
||||
if (utf)
|
||||
|
@ -9113,6 +9121,14 @@ if (utf)
|
|||
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
|
||||
(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
|
||||
goto HAD_ERROR; /* Offset was set by valid_utf() */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
|
||||
{
|
||||
errorcode = ERR91;
|
||||
goto HAD_EARLY_ERROR;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Check UCP lockout. */
|
||||
|
|
|
@ -138,7 +138,8 @@ const pcre2_compile_context PRIV(default_compile_context) = {
|
|||
PCRE2_UNSET, /* Max pattern length */
|
||||
BSR_DEFAULT, /* Backslash R default */
|
||||
NEWLINE_DEFAULT, /* Newline convention */
|
||||
PARENS_NEST_LIMIT }; /* As it says */
|
||||
PARENS_NEST_LIMIT, /* As it says */
|
||||
0 }; /* Extra options */
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
|
@ -371,6 +372,13 @@ ccontext->parens_nest_limit = limit;
|
|||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t options)
|
||||
{
|
||||
ccontext->extra_options = options;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard)(uint32_t, void *), void *user_data)
|
||||
|
@ -448,3 +456,4 @@ return 0;
|
|||
|
||||
|
||||
/* End of pcre2_context.c */
|
||||
|
||||
|
|
|
@ -176,6 +176,7 @@ static const unsigned char compile_error_texts[] =
|
|||
"internal error: unknown code in parsed pattern\0"
|
||||
/* 90 */
|
||||
"internal error: bad code value in parsed_skip()\0"
|
||||
"PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -572,6 +572,7 @@ typedef struct pcre2_real_compile_context {
|
|||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
uint32_t extra_options;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
|
|
@ -194,6 +194,7 @@ void vms_setsymbol( char *, char *, int );
|
|||
#define LOCALESIZE 32 /* Size of locale name */
|
||||
#define LOOPREPEAT 500000 /* Default loop count for timing */
|
||||
#define MALLOCLISTSIZE 20 /* For remembering mallocs */
|
||||
#define PARENS_NEST_DEFAULT 220 /* Default parentheses nest limit */
|
||||
#define PATSTACKSIZE 20 /* Pattern stack for save/restore testing */
|
||||
#define REPLACE_MODSIZE 100 /* Field for reading 8-bit replacement */
|
||||
#define VERSION_SIZE 64 /* Size of buffer for the version strings */
|
||||
|
@ -577,6 +578,7 @@ static modstruct modlist[] = {
|
|||
{ "allaftertext", MOD_PNDP, MOD_CTL, CTL_ALLAFTERTEXT, PO(control) },
|
||||
{ "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) },
|
||||
{ "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) },
|
||||
{ "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) },
|
||||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||
|
@ -686,6 +688,8 @@ static modstruct modlist[] = {
|
|||
PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_UCP|PCRE2_UTF| \
|
||||
PCRE2_UNGREEDY)
|
||||
|
||||
#define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0)
|
||||
|
||||
#define POSIX_SUPPORTED_COMPILE_CONTROLS ( \
|
||||
CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_EXPAND|CTL_POSIX|CTL_POSIX_NOSUB)
|
||||
|
||||
|
@ -4025,6 +4029,32 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%
|
|||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Show compile extra options *
|
||||
*************************************************/
|
||||
|
||||
/* Called for unsupported POSIX options.
|
||||
|
||||
Arguments:
|
||||
options an options word
|
||||
before text to print before
|
||||
after text to print after
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
show_compile_extra_options(uint32_t options, const char *before,
|
||||
const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
||||
after);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef SUPPORT_PCRE2_8
|
||||
/*************************************************
|
||||
|
@ -5161,6 +5191,16 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
|||
pat_patctl.options & ~POSIX_SUPPORTED_COMPILE_OPTIONS, msg, "");
|
||||
msg = "";
|
||||
}
|
||||
|
||||
if ((FLD(pat_context, extra_options) &
|
||||
~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS) != 0)
|
||||
{
|
||||
show_compile_extra_options(
|
||||
FLD(pat_context, extra_options) & ~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS,
|
||||
msg, "");
|
||||
msg = "";
|
||||
}
|
||||
|
||||
if ((pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS) != 0 ||
|
||||
(pat_patctl.control2 & ~POSIX_SUPPORTED_COMPILE_CONTROLS2) != 0)
|
||||
{
|
||||
|
@ -5170,6 +5210,10 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
|||
}
|
||||
|
||||
if (local_newline_default != 0) prmsg(&msg, "#newline_default");
|
||||
if (FLD(pat_context, max_pattern_length) != PCRE2_UNSET)
|
||||
prmsg(&msg, "max_pattern_length");
|
||||
if (FLD(pat_context, parens_nest_limit) != PARENS_NEST_DEFAULT)
|
||||
prmsg(&msg, "parens_nest_limit");
|
||||
|
||||
if (msg[0] == 0) fprintf(outfile, "\n");
|
||||
|
||||
|
@ -8123,6 +8167,7 @@ max_oveccount = DEFAULT_OVECCOUNT;
|
|||
G(match_data,BITS) = G(pcre2_match_data_create_,BITS)(max_oveccount, G(general_context,BITS))
|
||||
|
||||
#define CONTEXTTESTS \
|
||||
(void)G(pcre2_set_compile_extra_options_,BITS)(G(pat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_max_pattern_length_,BITS)(G(pat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_offset_limit_,BITS)(G(dat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_recursion_memory_management_,BITS)(G(dat_context,BITS), my_malloc, my_free, NULL)
|
||||
|
@ -8163,7 +8208,7 @@ if (test_mode == PCRE32_MODE)
|
|||
/* Set a default parentheses nest limit that is large enough to run the
|
||||
standard tests (this also exercises the function). */
|
||||
|
||||
PCRE2_SET_PARENS_NEST_LIMIT(default_pat_context, 220);
|
||||
PCRE2_SET_PARENS_NEST_LIMIT(default_pat_context, PARENS_NEST_DEFAULT);
|
||||
|
||||
/* Handle command line modifier settings, sending any error messages to
|
||||
stderr. We need to know the mode before modifying the context, and it is tidier
|
||||
|
|
|
@ -458,4 +458,13 @@
|
|||
|
||||
/[\s[:^ascii:]]/B,ucp
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 8-bit mode,
|
||||
# but subjects containing them must not be UTF-checked.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -363,4 +363,14 @@
|
|||
/\pP/ucp
|
||||
\x{7fffffff}
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
# Test invalid options
|
||||
# Test some invalid options
|
||||
|
||||
/abc/auto_callout
|
||||
|
||||
|
@ -15,6 +15,10 @@
|
|||
/abc/
|
||||
abc\=partial_hard
|
||||
|
||||
/a(())bc/parens_nest_limit=1
|
||||
|
||||
/abc/allow_surrogate_escapes,max_pattern_length=2
|
||||
|
||||
# Real tests
|
||||
|
||||
/abc/
|
||||
|
|
|
@ -1575,4 +1575,15 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 8-bit mode,
|
||||
# but subjects containing them must not be UTF-checked.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
0: \x{d800}
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
0: \x{dfff}\x{df01}
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -1421,4 +1421,16 @@ No match
|
|||
** Truncation will probably give the wrong result.
|
||||
No match
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1413,4 +1413,16 @@ No match
|
|||
\x{7fffffff}
|
||||
No match
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
0: \x{d800}
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
0: \x{dfff}\x{df01}
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -15970,7 +15970,6 @@ Error -2: partial match
|
|||
Error -1: no match
|
||||
Error 0: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error 100: no error
|
||||
Error 188: pattern string is longer than the limit set by the application
|
||||
Error 189: internal error: unknown code in parsed pattern
|
||||
Error 190: internal error: bad code value in parsed_skip()
|
||||
Error 191: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error 101: \ at end of pattern
|
||||
Error 191: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
Error 192: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
|
|
Loading…
Reference in New Issue