Add additional compile options and PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES.
This commit is contained in:
parent
d9c33d0708
commit
dfc9712bcd
7
132html
7
132html
|
@ -109,8 +109,9 @@ while (<STDIN>)
|
|||
# Handling .sp is subtle. If it is inside a literal section, do nothing if
|
||||
# the next line is a non literal text line; similarly, if not inside a
|
||||
# literal section, do nothing if a literal follows, unless we are inside
|
||||
# a .nf/.ne section. The point being that the <pre> and </pre> that delimit
|
||||
# literal sections will do the spacing. Always skip if no previous output.
|
||||
# a .nf/.fi section or about to enter one. The point being that the <pre>
|
||||
# and </pre> that delimit literal sections will do the spacing. Always skip
|
||||
# if no previous output.
|
||||
|
||||
elsif (/^\.sp/)
|
||||
{
|
||||
|
@ -123,7 +124,7 @@ while (<STDIN>)
|
|||
}
|
||||
else
|
||||
{
|
||||
print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
|
||||
print TEMP "<br>\n<br>\n" if ($innf || /^\.nf/ || !/^[\s.]/);
|
||||
}
|
||||
redo; # Now process the lookahead line we just read
|
||||
}
|
||||
|
|
|
@ -166,6 +166,9 @@ pcre2test, a crash could occur.
|
|||
32. Make -bigstack in RunTest allocate a 64 MB stack (instead of 16 MB) so that
|
||||
all the tests can run with clang's sanitizing options.
|
||||
|
||||
33. Implement extra compile options in the compile context and add the first
|
||||
one: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES.
|
||||
|
||||
|
||||
|
||||
Version 10.23 14-February-2017
|
||||
|
|
|
@ -67,6 +67,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2_set_bsr.html \
|
||||
doc/html/pcre2_set_callout.html \
|
||||
doc/html/pcre2_set_character_tables.html \
|
||||
doc/html/pcre2_set_compile_extra_options.html \
|
||||
doc/html/pcre2_set_compile_recursion_guard.html \
|
||||
doc/html/pcre2_set_depth_limit.html \
|
||||
doc/html/pcre2_set_heap_limit.html \
|
||||
|
@ -151,6 +152,7 @@ dist_man_MANS = \
|
|||
doc/pcre2_set_bsr.3 \
|
||||
doc/pcre2_set_callout.3 \
|
||||
doc/pcre2_set_character_tables.3 \
|
||||
doc/pcre2_set_compile_extra_options.3 \
|
||||
doc/pcre2_set_compile_recursion_guard.3 \
|
||||
doc/pcre2_set_depth_limit.3 \
|
||||
doc/pcre2_set_heap_limit.3 \
|
||||
|
|
2
RunTest
2
RunTest
|
@ -500,7 +500,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
for opt in "" $jitopt; do
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput2 testtry
|
||||
if [ $? = 0 ] ; then
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,188,189,190,191 >>testtry
|
||||
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $bmode $opt -error -65,-62,-2,-1,0,100,101,191,192 >>testtry
|
||||
checkresult $? 2 "$opt"
|
||||
fi
|
||||
done
|
||||
|
|
|
@ -207,6 +207,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_set_character_tables.html">pcre2_set_character_tables</a></td>
|
||||
<td> Set character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_extra_options.html">pcre2_set_compile_extra_options</a></td>
|
||||
<td> Set compile time extra options</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_recursion_guard.html">pcre2_set_compile_recursion_guard</a></td>
|
||||
<td> Set up a compile recursion guard function</td></tr>
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@ system stack size checking, or to change one or more of these parameters:
|
|||
The newline character sequence;
|
||||
The compile time nested parentheses limit;
|
||||
The maximum pattern length (in code units) that is allowed.
|
||||
The additional option bits
|
||||
</pre>
|
||||
The option bits are:
|
||||
<pre>
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_set_compile_extra_options specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_set_compile_extra_options man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include &#60;pcre2.h&#62;</b>
|
||||
</P>
|
||||
<P>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets additional option bits for <b>pcre2_compile()</b> that are
|
||||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
</pre>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -60,8 +60,8 @@ please consult the man page, in case the conversion went wrong.
|
|||
<b>#include &#60;pcre2.h&#62;</b>
|
||||
<br>
|
||||
<br>
|
||||
PCRE2 is a new API for PCRE. This document contains a description of all its
|
||||
functions. See the
|
||||
PCRE2 is a new API for PCRE, starting at release 10.0. This document contains a
|
||||
description of all its native functions. See the
|
||||
<a href="pcre2.html"><b>pcre2</b></a>
|
||||
document for an overview of all the PCRE2 documentation.
|
||||
</P>
|
||||
|
@ -145,6 +145,10 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_max_pattern_length(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -328,7 +332,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
These functions became obsolete at release 10.30 and are retained only for
|
||||
backward compatibility. They should not be used in new code. The first is
|
||||
replaced by <b>pcre2_set_depth_limit()</b>; the second is no longer needed and
|
||||
no longer has any effect (it always returns zero).
|
||||
has no effect (it always returns zero).
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES</a><br>
|
||||
<P>
|
||||
|
@ -389,23 +393,23 @@ For example, if you want to run a match using a pattern that was compiled with
|
|||
<P>
|
||||
In the function summaries above, and in the rest of this document and other
|
||||
PCRE2 documents, functions and data types are described using their generic
|
||||
names, without the 8, 16, or 32 suffix.
|
||||
names, without the _8, _16, or _32 suffix.
|
||||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">PCRE2 API OVERVIEW</a><br>
|
||||
<P>
|
||||
PCRE2 has its own native API, which is described in this document. There are
|
||||
also some wrapper functions for the 8-bit library that correspond to the
|
||||
POSIX regular expression API, but they do not give access to all the
|
||||
functionality. They are described in the
|
||||
functionality of PCRE2. They are described in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
documentation. Both these APIs define a set of C function calls.
|
||||
</P>
|
||||
<P>
|
||||
The native API C data types, function prototypes, option values, and error
|
||||
codes are defined in the header file <b>pcre2.h</b>, which contains definitions
|
||||
of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers for the
|
||||
library. Applications can use these to include support for different releases
|
||||
of PCRE2.
|
||||
codes are defined in the header file <b>pcre2.h</b>, which also contains
|
||||
definitions of PCRE2_MAJOR and PCRE2_MINOR, the major and minor release numbers
|
||||
for the library. Applications can use these to include support for different
|
||||
releases of PCRE2.
|
||||
</P>
|
||||
<P>
|
||||
In a Windows environment, if you want to statically link an application program
|
||||
|
@ -478,7 +482,7 @@ been matched by <b>pcre2_match()</b>. They are:
|
|||
<b>pcre2_substring_number_from_name()</b>
|
||||
</pre>
|
||||
<b>pcre2_substring_free()</b> and <b>pcre2_substring_list_free()</b> are also
|
||||
provided, to free the memory used for extracted strings.
|
||||
provided, to free memory used for extracted strings.
|
||||
</P>
|
||||
<P>
|
||||
The function <b>pcre2_substitute()</b> can be called to match a pattern and
|
||||
|
@ -595,7 +599,7 @@ required. JIT compilation updates a pointer within the compiled code block, so
|
|||
a thread must gain unique write access to the pointer before calling
|
||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> or
|
||||
<b>pcre2_code_copy_with_tables()</b> can be used to obtain a private copy of the
|
||||
compiled code.
|
||||
compiled code before calling the JIT compiler.
|
||||
</P>
|
||||
<br><b>
|
||||
Context blocks
|
||||
|
@ -649,6 +653,8 @@ library. The context is named `general' rather than specifically `memory'
|
|||
because in future other fields may be added. If you do not want to supply your
|
||||
own custom memory management functions, you do not need to bother with a
|
||||
general context. A general context is created by:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_general_context *pcre2_general_context_create(</b>
|
||||
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
||||
<b> void (*<i>private_free</i>)(void *, void *), void *<i>memory_data</i>);</b>
|
||||
|
@ -675,11 +681,15 @@ used. When the time comes to free the block, this function is called.
|
|||
</P>
|
||||
<P>
|
||||
A general context can be copied by calling:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_general_context *pcre2_general_context_copy(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
The memory used for a general context should be freed by calling:
|
||||
<br>
|
||||
<br>
|
||||
<b>void pcre2_general_context_free(pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<a name="compilecontext"></a></P>
|
||||
<br><b>
|
||||
|
@ -695,6 +705,7 @@ following compile-time parameters:
|
|||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The extra option bits (none set by default)
|
||||
</pre>
|
||||
A compile context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -702,6 +713,8 @@ If none of these apply, just pass NULL as the context argument of
|
|||
</P>
|
||||
<P>
|
||||
A compile context is created, copied, and freed by the following functions:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_compile_context *pcre2_compile_context_create(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
|
@ -716,6 +729,8 @@ A compile context is created, copied, and freed by the following functions:
|
|||
A compile context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -725,6 +740,8 @@ or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
|||
ending sequence. The value is used by the JIT compiler and by the two
|
||||
interpreted matching functions, <i>pcre2_match()</i> and
|
||||
<i>pcre2_dfa_match()</i>.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
|
@ -732,6 +749,22 @@ interpreted matching functions, <i>pcre2_match()</i> and
|
|||
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
|
||||
argument is a general context. This function builds a set of character tables
|
||||
in the current locale.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_extra_options(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>extra_options</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
As PCRE2 has developed, almost all the 32 option bits that are available in
|
||||
the <i>options</i> argument of <b>pcre2_compile()</b> have been used up. To avoid
|
||||
running out, the compile context contains a set of extra option bits which are
|
||||
used for some newer, assumed rarer, options. This function sets those bits. It
|
||||
always sets all the bits (either on or off): the new value replaces any existing
|
||||
setting rather than being merged with it. The available options are defined in the section entitled "Extra
|
||||
compile options"
|
||||
<a href="#extracompileoptions">below.</a>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_max_pattern_length(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -741,6 +774,8 @@ compiled with this context. If the pattern is longer, an error is generated.
|
|||
This facility is provided so that applications that accept patterns from
|
||||
external sources can limit their size. The default is the largest number that a
|
||||
PCRE2_SIZE variable can hold, which is effectively unlimited.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -758,11 +793,13 @@ sequence such as (*CRLF). See the
|
|||
page for details.
|
||||
</P>
|
||||
<P>
|
||||
When a pattern is compiled with the PCRE2_EXTENDED option, the newline
|
||||
convention affects the recognition of white space and the end of internal
|
||||
comments starting with #. The value is saved with the compiled pattern for
|
||||
subsequent use by the JIT compiler and by the two interpreted matching
|
||||
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||
When a pattern is compiled with the PCRE2_EXTENDED or PCRE2_EXTENDED_MORE
|
||||
option, the newline convention affects the recognition of white space and the
|
||||
end of internal comments starting with #. The value is saved with the compiled
|
||||
pattern for subsequent use by the JIT compiler and by the two interpreted
|
||||
matching functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -771,6 +808,8 @@ This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
|
|||
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
||||
using up too much system stack when being compiled. The limit applies to
|
||||
parentheses of all kinds, not just capturing parentheses.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
|
@ -778,10 +817,10 @@ parentheses of all kinds, not just capturing parentheses.
|
|||
There is at least one application that runs PCRE2 in threads with very limited
|
||||
system stack, where running out of stack is to be avoided at all costs. The
|
||||
parenthesis limit above cannot take account of how much stack is actually
|
||||
available. For a finer control, you can supply a function that is called
|
||||
whenever <b>pcre2_compile()</b> starts to compile a parenthesized part of a
|
||||
pattern. This function can check the actual stack size (or anything else that
|
||||
it wants to, of course).
|
||||
available during compilation. For a finer control, you can supply a function
|
||||
that is called whenever <b>pcre2_compile()</b> starts to compile a parenthesized
|
||||
part of a pattern. This function can check the actual stack size (or anything
|
||||
else that it wants to, of course).
|
||||
</P>
|
||||
<P>
|
||||
The first argument to the callout function gives the current depth of
|
||||
|
@ -807,6 +846,8 @@ If none of these apply, just pass NULL as the context argument of
|
|||
</P>
|
||||
<P>
|
||||
A match context is created, copied, and freed by the following functions:
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_match_context *pcre2_match_context_create(</b>
|
||||
<b> pcre2_general_context *<i>gcontext</i>);</b>
|
||||
<br>
|
||||
|
@ -821,6 +862,8 @@ A match context is created, copied, and freed by the following functions:
|
|||
A match context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *, void *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
|
@ -830,6 +873,8 @@ This sets up a "callout" function for PCRE2 to call at specified points
|
|||
during a matching operation. Details are given in the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_offset_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -856,6 +901,8 @@ subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
|||
start within the first line of the subject. If this is set with an offset
|
||||
limit, a match must occur in the first line and also within the offset limit.
|
||||
In other words, whichever limit comes first is used.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_heap_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -889,6 +936,8 @@ Heap memory is used only if the initial vector is too small. If the heap limit
|
|||
is set to a value less than 21 (in particular, zero) no heap memory will be
|
||||
used. In this case, only patterns that do not have a lot of nested backtracking
|
||||
can be successfully processed.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -926,6 +975,8 @@ of the form
|
|||
where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
||||
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
||||
limit is set, less than the default.
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_depth_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -1281,9 +1332,10 @@ parenthesis. The name is not processed in any way, and it is not possible to
|
|||
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
|
||||
option is set, normal backslash processing is applied to verb names and only an
|
||||
unescaped closing parenthesis terminates the name. A closing parenthesis can be
|
||||
included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED
|
||||
option is set, unescaped whitespace in verb names is skipped and #-comments are
|
||||
recognized in this mode, exactly as in the rest of the pattern.
|
||||
included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED
|
||||
or PCRE2_EXTENDED_MORE option is set, unescaped whitespace in verb names is
|
||||
skipped and #-comments are recognized in this mode, exactly as in the rest of
|
||||
the pattern.
|
||||
<pre>
|
||||
PCRE2_AUTO_CALLOUT
|
||||
</pre>
|
||||
|
@ -1298,7 +1350,13 @@ documentation.
|
|||
</pre>
|
||||
If this bit is set, letters in the pattern match both upper and lower case
|
||||
letters in the subject. It is equivalent to Perl's /i option, and it can be
|
||||
changed within a pattern by a (?i) option setting.
|
||||
changed within a pattern by a (?i) option setting. If PCRE2_UTF is set, Unicode
|
||||
properties are used for all characters with more than one other case, and for
|
||||
all characters whose code points are greater than U+007f. For lower valued
|
||||
characters with only one other case, a lookup table is used for speed. When
|
||||
PCRE2_UTF is not set, a lookup table is used for all code points less than 256,
|
||||
and higher code points (available only in 16-bit or 32-bit mode) are treated as
|
||||
not having another case.
|
||||
<pre>
|
||||
PCRE2_DOLLAR_ENDONLY
|
||||
</pre>
|
||||
|
@ -1380,18 +1438,18 @@ built.
|
|||
<pre>
|
||||
PCRE2_EXTENDED_MORE
|
||||
</pre>
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition, space and
|
||||
horizontal tab characters are also ignored inside a character class.
|
||||
This option has the effect of PCRE2_EXTENDED, but, in addition, unescaped space
|
||||
and horizontal tab characters are ignored inside a character class.
|
||||
PCRE2_EXTENDED_MORE is equivalent to the /xx option of Perl 5.26, and it can be
|
||||
changed within a pattern by a (?xx) option setting.
|
||||
<pre>
|
||||
PCRE2_FIRSTLINE
|
||||
</pre>
|
||||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||
general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a
|
||||
match must occur in the first line and also within the offset limit. In other
|
||||
If this option is set, the start of an unanchored pattern match must be before
|
||||
or at the first newline in the subject string, though the matched text may
|
||||
continue over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a
|
||||
more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit,
|
||||
a match must occur in the first line and also within the offset limit. In other
|
||||
words, whichever limit comes first is used.
|
||||
<pre>
|
||||
PCRE2_MATCH_UNSET_BACKREF
|
||||
|
@ -1457,8 +1515,8 @@ PCRE2_NEVER_UTF causes an error.
|
|||
If this option is set, it disables the use of numbered capturing parentheses in
|
||||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||
were followed by ?: but named parentheses can still be used for capturing (and
|
||||
they acquire numbers in the usual way). There is no equivalent of this option
|
||||
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||
they acquire numbers in the usual way). This is the same as Perl's /n option.
|
||||
Note that, when this option is set, references to capturing groups (back
|
||||
references or recursion/subroutine calls) may only refer to named groups,
|
||||
though the reference can be by name or by number.
|
||||
<pre>
|
||||
|
@ -1494,8 +1552,8 @@ compiler.
|
|||
<P>
|
||||
There are a number of optimizations that may occur at the start of a match, in
|
||||
order to speed up the process. For example, if it is known that an unanchored
|
||||
match must start with a specific character, the matching code searches the
|
||||
subject for that character, and fails immediately if it cannot find it, without
|
||||
match must start with a specific code unit value, the matching code searches
|
||||
the subject for that value, and fails immediately if it cannot find it, without
|
||||
actually running the main matching function. This means that a special item
|
||||
such as (*COMMIT) at the start of a pattern is not considered until after a
|
||||
suitable starting point for the match has been found. Also, when callouts or
|
||||
|
@ -1524,9 +1582,11 @@ current starting position, which in this case, it does. However, if the same
|
|||
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
||||
subject string does not happen. The first match attempt is run starting from
|
||||
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
||||
the overall result is "no match". There are also other start-up optimizations.
|
||||
For example, a minimum length for the subject may be recorded. Consider the
|
||||
pattern
|
||||
the overall result is "no match".
|
||||
</P>
|
||||
<P>
|
||||
There are also other start-up optimizations. For example, a minimum length for
|
||||
the subject may be recorded. Consider the pattern
|
||||
<pre>
|
||||
(*MARK:A)(X|Y)
|
||||
</pre>
|
||||
|
@ -1548,15 +1608,29 @@ and
|
|||
in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
document. If an invalid UTF sequence is found, <b>pcre2_compile()</b> returns a
|
||||
negative error code.
|
||||
negative error code.
|
||||
</P>
|
||||
<P>
|
||||
If you know that your pattern is valid, and you want to skip this check for
|
||||
performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
|
||||
the effect of passing an invalid UTF string as a pattern is undefined. It may
|
||||
cause your program to crash or loop. Note that this option can also be passed
|
||||
to <b>pcre2_match()</b> and <b>pcre_dfa_match()</b>, to suppress validity
|
||||
checking of the subject string.
|
||||
If you know that your pattern is a valid UTF string, and you want to skip this
|
||||
check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When
|
||||
it is set, the effect of passing an invalid UTF string as a pattern is
|
||||
undefined. It may cause your program to crash or loop.
|
||||
</P>
|
||||
<P>
|
||||
Note that this option can also be passed to <b>pcre2_match()</b> and
|
||||
<b>pcre2_dfa_match()</b>, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
error that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. In particular, the so-called "surrogate" code
|
||||
points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences
|
||||
such as \x{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
|
||||
option, as described in the section entitled "Extra compile options"
|
||||
<a href="#extracompileoptions">below.</a>
|
||||
However, this is possible only in UTF-8 and UTF-32 modes, because these values
|
||||
are not representable in UTF-16.
|
||||
<pre>
|
||||
PCRE2_UCP
|
||||
</pre>
|
||||
|
@ -1594,10 +1668,42 @@ This option causes PCRE2 to regard both the pattern and the subject strings
|
|||
that are subsequently processed as strings of UTF characters instead of
|
||||
single-code-unit strings. It is available when PCRE2 is built to include
|
||||
Unicode support (which is the default). If Unicode support is not available,
|
||||
the use of this option provokes an error. Details of how this option changes
|
||||
the behaviour of PCRE2 are given in the
|
||||
the use of this option provokes an error. Details of how PCRE2_UTF changes the
|
||||
behaviour of PCRE2 are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
<a name="extracompileoptions"></a></P>
|
||||
<br><b>
|
||||
Extra compile options
|
||||
</b><br>
|
||||
<P>
|
||||
Unlike the main compile-time options, the extra options are not saved with the
|
||||
compiled pattern. The option bits that can be set in a compile context by
|
||||
calling the <b>pcre2_set_compile_extra_options()</b> function are as follows:
|
||||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
</pre>
|
||||
This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is
|
||||
forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate"
|
||||
code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode
|
||||
code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot
|
||||
therefore be represented in UTF-16. They can be represented in UTF-8 and
|
||||
UTF-32, but are defined as invalid code points, and cause errors if encountered
|
||||
in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2.
|
||||
</P>
|
||||
<P>
|
||||
These values also cause errors if encountered in escape sequences such as
|
||||
\x{d912} within a pattern. However, it seems that some applications, when
|
||||
using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test
|
||||
for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does
|
||||
not disable the error that occurs, because it applies only to the testing of
|
||||
input strings for UTF validity.
|
||||
</P>
|
||||
<P>
|
||||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code
|
||||
point values in UTF-8 and UTF-32 patterns no longer provoke errors and are
|
||||
incorporated in the compiled pattern. However, they can only match subject
|
||||
characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
|
||||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">COMPILATION ERROR CODES</a><br>
|
||||
<P>
|
||||
|
@ -1806,7 +1912,9 @@ The third argument should point to an <b>uint32_t</b> variable.
|
|||
If the pattern set a backtracking depth limit by including an item of the form
|
||||
(*LIMIT_DEPTH=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
</pre>
|
||||
|
@ -1824,15 +1932,15 @@ returned. Otherwise NULL is returned. The third argument should point to an
|
|||
Return information about the first code unit of any matched string, for a
|
||||
non-anchored pattern. The third argument should point to an <b>uint32_t</b>
|
||||
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||
it is known that a match can occur only at the start of the subject or
|
||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||
patterns, 0 is returned.
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the value can be retrieved
|
||||
using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but it is
|
||||
known that a match can occur only at the start of the subject or following a
|
||||
newline in the subject, 2 is returned. Otherwise, and for anchored patterns, 0
|
||||
is returned.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTCODEUNIT
|
||||
</pre>
|
||||
Return the value of the first code unit of any matched string in the situation
|
||||
Return the value of the first code unit of any matched string for a pattern
|
||||
where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0. The third
|
||||
argument should point to an <b>uint32_t</b> variable. In the 8-bit library, the
|
||||
value is always less than 256. In the 16-bit library the value can be up to
|
||||
|
@ -1864,7 +1972,9 @@ the equivalent hexadecimal or octal escape sequences.
|
|||
If the pattern set a heap memory limit by including an item of the form
|
||||
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_JCHANGED
|
||||
</pre>
|
||||
|
@ -1891,10 +2001,10 @@ PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
|
|||
<pre>
|
||||
PCRE2_INFO_LASTCODEUNIT
|
||||
</pre>
|
||||
Return the value of the rightmost literal data unit that must exist in any
|
||||
matched string, other than at its start, if such a value has been recorded. The
|
||||
third argument should point to an <b>uint32_t</b> variable. If there is no such
|
||||
value, 0 is returned.
|
||||
Return the value of the rightmost literal code unit that must exist in any
|
||||
matched string, other than at its start, for a pattern where
|
||||
PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argument
|
||||
should point to an <b>uint32_t</b> variable.
|
||||
<pre>
|
||||
PCRE2_INFO_MATCHEMPTY
|
||||
</pre>
|
||||
|
@ -1909,7 +2019,9 @@ in such cases.
|
|||
If the pattern set a match limit by including an item of the form
|
||||
(*LIMIT_MATCH=nnnn) at the start, the value is returned. The third argument
|
||||
should point to an unsigned 32-bit integer. If no such value has been set, the
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET.
|
||||
call to <b>pcre2_pattern_info()</b> returns the error PCRE2_ERROR_UNSET. Note
|
||||
that this limit will only be used during matching if it is less than the limit
|
||||
set or defaulted by the caller of the match function.
|
||||
<pre>
|
||||
PCRE2_INFO_MAXLOOKBEHIND
|
||||
</pre>
|
||||
|
@ -1921,7 +2033,8 @@ require a one-character lookbehind. \A also registers a one-character
|
|||
lookbehind, though it does not actually inspect the previous character. This is
|
||||
to ensure that at least one character from the old segment is retained when a
|
||||
new segment is processed. Otherwise, if there are no lookbehinds in the
|
||||
pattern, \A might match incorrectly at the start of a new segment.
|
||||
pattern, \A might match incorrectly at the start of a second or subsequent
|
||||
segment.
|
||||
<pre>
|
||||
PCRE2_INFO_MINLENGTH
|
||||
</pre>
|
||||
|
@ -2216,7 +2329,7 @@ character is CR followed by LF, advance the starting offset by two characters
|
|||
instead of one.
|
||||
</P>
|
||||
<P>
|
||||
If a non-zero starting offset is passed when the pattern is anchored, an single
|
||||
If a non-zero starting offset is passed when the pattern is anchored, a single
|
||||
attempt to match at the given offset is made. This can only succeed if the
|
||||
pattern does not require the match to be at the start of the subject. In other
|
||||
words, the anchoring must be the result of setting the PCRE2_ANCHORED option or
|
||||
|
@ -2611,6 +2724,10 @@ documentation for details.
|
|||
PCRE2_ERROR_DEPTHLIMIT
|
||||
</pre>
|
||||
The nested backtracking depth limit was reached.
|
||||
<pre>
|
||||
PCRE2_ERROR_HEAPLIMIT
|
||||
</pre>
|
||||
The heap limit was reached.
|
||||
<pre>
|
||||
PCRE2_ERROR_INTERNAL
|
||||
</pre>
|
||||
|
@ -3290,7 +3407,7 @@ NOTE: PCRE2's "auto-possessification" optimization usually applies to character
|
|||
repeats at the end of a pattern (as well as internally). For example, the
|
||||
pattern "a\d+" is compiled as if it were "a\d++". For DFA matching, this
|
||||
means that only one possible match is found. If you really do want multiple
|
||||
matches in such cases, either use an ungreedy repeat auch as "a\d+?" or set
|
||||
matches in such cases, either use an ungreedy repeat such as "a\d+?" or set
|
||||
the PCRE2_NO_AUTO_POSSESS option when compiling.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -3351,7 +3468,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1545,12 +1545,13 @@ alternative in the subpattern.
|
|||
<br><a name="SEC13" href="#TOC1">INTERNAL OPTION SETTING</a><br>
|
||||
<P>
|
||||
The settings of the PCRE2_CASELESS, PCRE2_MULTILINE, PCRE2_DOTALL,
|
||||
PCRE2_EXTENDED, and PCRE2_EXTENDED_MORE options (which are Perl-compatible) can
|
||||
be changed from within the pattern by a sequence of Perl option letters
|
||||
enclosed between "(?" and ")". The option letters are
|
||||
PCRE2_EXTENDED, PCRE2_EXTENDED_MORE, and PCRE2_NO_AUTO_CAPTURE options (which
|
||||
are Perl-compatible) can be changed from within the pattern by a sequence of
|
||||
Perl option letters enclosed between "(?" and ")". The option letters are
|
||||
<pre>
|
||||
i for PCRE2_CASELESS
|
||||
m for PCRE2_MULTILINE
|
||||
n for PCRE2_NO_AUTO_CAPTURE
|
||||
s for PCRE2_DOTALL
|
||||
x for PCRE2_EXTENDED
|
||||
xx for PCRE2_EXTENDED_MORE
|
||||
|
|
|
@ -430,6 +430,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
(?i) caseless
|
||||
(?J) allow duplicate names
|
||||
(?m) multiline
|
||||
(?n) no auto capture
|
||||
(?s) single line (dotall)
|
||||
(?U) default ungreedy (lazy)
|
||||
(?x) extended: ignore white space except in classes
|
||||
|
|
|
@ -559,14 +559,19 @@ by a previous <b>#pattern</b> command.
|
|||
Setting compilation options
|
||||
</b><br>
|
||||
<P>
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. The most common
|
||||
ones have single-letter abbreviations, with special handling for /x (to make
|
||||
it like Perl). If a second x is present, PCRE2_EXTENDED is converted into
|
||||
PCRE2_EXTENDED_MORE. A third appearance adds PCRE2_EXTENDED as well. See
|
||||
The following modifiers set options for <b>pcre2_compile()</b>. Most of them set
|
||||
bits in the options argument of that function, but those whose names start with
|
||||
PCRE2_EXTRA are additional options that are set in the compile context. For the
|
||||
main options, there are some single-letter abbreviations that are the same as
|
||||
Perl options. There is special handling for /x: if a second x is present,
|
||||
PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third
|
||||
appearance adds PCRE2_EXTENDED as well, though this makes no difference to the
|
||||
way <b>pcre2_compile()</b> behaves. See
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
for a description of the effects of these options.
|
||||
<pre>
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
|
@ -585,7 +590,7 @@ for a description of the effects of these options.
|
|||
never_backslash_c set PCRE2_NEVER_BACKSLASH_C
|
||||
never_ucp set PCRE2_NEVER_UCP
|
||||
never_utf set PCRE2_NEVER_UTF
|
||||
no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
/n no_auto_capture set PCRE2_NO_AUTO_CAPTURE
|
||||
no_auto_possess set PCRE2_NO_AUTO_POSSESS
|
||||
no_dotstar_anchor set PCRE2_NO_DOTSTAR_ANCHOR
|
||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
||||
|
@ -607,7 +612,8 @@ Setting compilation controls
|
|||
</b><br>
|
||||
<P>
|
||||
The following modifiers affect the compilation process or request information
|
||||
about the pattern:
|
||||
about the pattern. There are single-letter abbreviations for some that are
|
||||
heavily used in the test files.
|
||||
<pre>
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
/B bincode show binary code without lengths
|
||||
|
@ -1810,7 +1816,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 April 2017
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -47,7 +47,7 @@ and
|
|||
documentation. Only the short names for properties are supported. For example,
|
||||
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
|
||||
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
|
||||
compatibility with Perl 5.6. PCRE does not support this.
|
||||
compatibility with Perl 5.6. PCRE2 does not support this.
|
||||
</P>
|
||||
<br><b>
|
||||
WIDE CHARACTERS AND UTF MODES
|
||||
|
@ -109,10 +109,15 @@ However, the special horizontal and vertical white space matching escapes (\h,
|
|||
\H, \v, and \V) do match all the appropriate Unicode characters, whether or
|
||||
not PCRE2_UCP is set.
|
||||
</P>
|
||||
<br><b>
|
||||
CASE-EQUIVALENCE IN UTF MODES
|
||||
</b><br>
|
||||
<P>
|
||||
Case-insensitive matching in UTF mode makes use of Unicode properties. A few
|
||||
Unicode characters such as Greek sigma have more than two codepoints that are
|
||||
case-equivalent, and these are treated as such.
|
||||
Case-insensitive matching in a UTF mode makes use of Unicode properties except
|
||||
for characters whose code points are less than 128 and that have at most two
|
||||
case-equivalent values. For these, a direct table lookup is used for speed. A
|
||||
few Unicode characters such as Greek sigma have more than two codepoints that
|
||||
are case-equivalent, and these are treated as such.
|
||||
</P>
|
||||
<br><b>
|
||||
VALIDITY OF UTF STRINGS
|
||||
|
@ -173,6 +178,15 @@ or <b>pcre2_dfa_match()</b>.
|
|||
<P>
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
|
||||
is undefined and your program may crash or loop indefinitely.
|
||||
</P>
|
||||
<P>
|
||||
Note that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error
|
||||
that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. If you want to allow escape sequences such as
|
||||
\x{d800} (a surrogate code point) you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible
|
||||
only in UTF-8 and UTF-32 modes, because these values are not representable in
|
||||
UTF-16.
|
||||
<a name="utf8strings"></a></P>
|
||||
<br><b>
|
||||
Errors in UTF-8 strings
|
||||
|
@ -280,9 +294,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 03 July 2016
|
||||
Last updated: 17 May 2017
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
Copyright © 1997-2017 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -207,6 +207,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_set_character_tables.html">pcre2_set_character_tables</a></td>
|
||||
<td> Set character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_extra_options.html">pcre2_set_compile_extra_options</a></td>
|
||||
<td> Set compile time extra options</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_set_compile_recursion_guard.html">pcre2_set_compile_recursion_guard</a></td>
|
||||
<td> Set up a compile recursion guard function</td></tr>
|
||||
|
||||
|
|
3689
doc/pcre2.txt
3689
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "04 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2_COMPILE 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -35,6 +35,7 @@ system stack size checking, or to change one or more of these parameters:
|
|||
The newline character sequence;
|
||||
The compile time nested parentheses limit;
|
||||
The maximum pattern length (in code units) that is allowed;
|
||||
The additional option bits.
|
||||
.sp
|
||||
The option bits are:
|
||||
.sp
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function sets additional option bits for \fBpcre2_compile()\fP that are
|
||||
housed in a compile context. It completely replaces all the bits. The extra
|
||||
options are:
|
||||
.sp
|
||||
.\" JOIN
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff}
|
||||
in UTF-8 and UTF-32 modes
|
||||
.sp
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "20 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2API 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -90,6 +90,9 @@ document for an overview of all the PCRE2 documentation.
|
|||
.B int pcre2_set_character_tables(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " const unsigned char *\fItables\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.sp
|
||||
.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.sp
|
||||
|
@ -643,6 +646,7 @@ following compile-time parameters:
|
|||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The extra options bits (none set by default)
|
||||
.sp
|
||||
A compile context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -685,6 +689,23 @@ argument is a general context. This function builds a set of character tables
|
|||
in the current locale.
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_compile_extra_options(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " uint32_t \fIextra_options\fP);"
|
||||
.fi
|
||||
.sp
|
||||
As PCRE2 has developed, almost all the 32 option bits that are available in
|
||||
the \fIoptions\fP argument of \fBpcre2_compile()\fP have been used up. To avoid
|
||||
running out, the compile context contains a set of extra option bits which are
|
||||
used for some newer, assumed rarer, options. This function sets those bits. It
|
||||
always sets all the bits (either on or off), replacing rather than modifying any
|
||||
existing setting. The available options are defined in the section entitled "Extra
|
||||
compile options"
|
||||
.\" HTML <a href="#extracompileoptions">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
.sp
|
||||
.nf
|
||||
.B int pcre2_set_max_pattern_length(pcre2_compile_context *\fIccontext\fP,
|
||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.fi
|
||||
|
@ -1533,14 +1554,29 @@ in the
|
|||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
document. If an invalid UTF sequence is found, \fBpcre2_compile()\fP returns a
|
||||
negative error code.
|
||||
negative error code.
|
||||
.P
|
||||
If you know that your pattern is valid, and you want to skip this check for
|
||||
performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it is set,
|
||||
the effect of passing an invalid UTF string as a pattern is undefined. It may
|
||||
cause your program to crash or loop. Note that this option can also be passed
|
||||
to \fBpcre2_match()\fP and \fBpcre_dfa_match()\fP, to suppress validity
|
||||
checking of the subject string.
|
||||
If you know that your pattern is a valid UTF string, and you want to skip this
|
||||
check for performance reasons, you can set the PCRE2_NO_UTF_CHECK option. When
|
||||
it is set, the effect of passing an invalid UTF string as a pattern is
|
||||
undefined. It may cause your program to crash or loop.
|
||||
.P
|
||||
Note that this option can also be passed to \fBpcre2_match()\fP and
|
||||
\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject
|
||||
string.
|
||||
.P
|
||||
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
|
||||
error that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. In particular, the so-called "surrogate" code
|
||||
points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences
|
||||
such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
|
||||
option, as described in the section entitled "Extra compile options"
|
||||
.\" HTML <a href="#extracompileoptions">
|
||||
.\" </a>
|
||||
below.
|
||||
.\"
|
||||
However, this is possible only in UTF-8 and UTF-32 modes, because these values
|
||||
are not representable in UTF-16.
|
||||
.sp
|
||||
PCRE2_UCP
|
||||
.sp
|
||||
|
@ -1594,6 +1630,37 @@ behaviour of PCRE2 are given in the
|
|||
page.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="extracompileoptions"></a>
|
||||
.SS "Extra compile options"
|
||||
.rs
|
||||
.sp
|
||||
Unlike the main compile-time options, the extra options are not saved with the
|
||||
compiled pattern. The option bits that can be set in a compile context by
|
||||
calling the \fBpcre2_set_compile_extra_options()\fP function are as follows:
|
||||
.sp
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
.sp
|
||||
This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is
|
||||
forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate"
|
||||
code points in the range 0xd800 to 0xdfff are used in pairs in UTF-16 to encode
|
||||
code points with values in the range 0x10000 to 0x10ffff. The surrogates cannot
|
||||
therefore be represented in UTF-16. They can be represented in UTF-8 and
|
||||
UTF-32, but are defined as invalid code points, and cause errors if encountered
|
||||
in a UTF-8 or UTF-32 string that is being checked for validity by PCRE2.
|
||||
.P
|
||||
These values also cause errors if encountered in escape sequences such as
|
||||
\ex{d912} within a pattern. However, it seems that some applications, when
|
||||
using PCRE2 to check for unwanted characters in UTF-8 strings, explicitly test
|
||||
for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does
|
||||
not disable the error that occurs, because it applies only to the testing of
|
||||
input strings for UTF validity.
|
||||
.P
|
||||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code
|
||||
point values in UTF-8 and UTF-32 patterns no longer provoke errors and are
|
||||
incorporated in the compiled pattern. However, they can only match subject
|
||||
characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
|
||||
.
|
||||
.
|
||||
.SH "COMPILATION ERROR CODES"
|
||||
.rs
|
||||
.sp
|
||||
|
@ -3421,6 +3488,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "18 April 2017" "PCRE 10.30"
|
||||
.TH PCRE2TEST 1 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -519,17 +519,21 @@ by a previous \fB#pattern\fP command.
|
|||
.SS "Setting compilation options"
|
||||
.rs
|
||||
.sp
|
||||
The following modifiers set options for \fBpcre2_compile()\fP. There are some
|
||||
single-letter abbreviations that are the same as Perl options. There is special
|
||||
handling for /x: if a second x is present, PCRE2_EXTENDED is converted into
|
||||
PCRE2_EXTENDED_MORE as in Perl. A third appearance adds PCRE2_EXTENDED as well,
|
||||
though this makes no difference to the way \fBpcre2_compile()\fP behaves. See
|
||||
The following modifiers set options for \fBpcre2_compile()\fP. Most of them set
|
||||
bits in the options argument of that function, but those whose names start with
|
||||
PCRE2_EXTRA are additional options that are set in the compile context. For the
|
||||
main options, there are some single-letter abbreviations that are the same as
|
||||
Perl options. There is special handling for /x: if a second x is present,
|
||||
PCRE2_EXTENDED is converted into PCRE2_EXTENDED_MORE as in Perl. A third
|
||||
appearance adds PCRE2_EXTENDED as well, though this makes no difference to the
|
||||
way \fBpcre2_compile()\fP behaves. See
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
for a description of the effects of these options.
|
||||
.sp
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
alt_verbnames set PCRE2_ALT_VERBNAMES
|
||||
|
@ -1788,6 +1792,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2UNICODE 3 "20 April 2017" "PCRE2 10.30"
|
||||
.TH PCRE2UNICODE 3 "17 May 2017" "PCRE2 10.30"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "UNICODE AND UTF SUPPORT"
|
||||
|
@ -164,6 +164,14 @@ or \fBpcre2_dfa_match()\fP.
|
|||
.P
|
||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
|
||||
is undefined and your program may crash or loop indefinitely.
|
||||
.P
|
||||
Note that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error
|
||||
that is given if an escape sequence for an invalid Unicode code point is
|
||||
encountered in the pattern. If you want to allow escape sequences such as
|
||||
\ex{d800} (a surrogate code point) you can set the
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option. However, this is possible
|
||||
only in UTF-8 and UTF-32 modes, because these values are not representable in
|
||||
UTF-16.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="utf8strings"></a>
|
||||
|
@ -272,6 +280,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 April 2017
|
||||
Last updated: 17 May 2017
|
||||
Copyright (c) 1997-2017 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -139,6 +139,10 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */
|
||||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
|
||||
|
@ -448,6 +452,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
|||
pcre2_set_bsr(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
|
@ -721,6 +727,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_)
|
||||
#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_)
|
||||
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
|
||||
#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_)
|
||||
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
|
||||
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
|
|
|
@ -139,6 +139,10 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_USE_OFFSET_LIMIT 0x00800000u /* J M D */
|
||||
#define PCRE2_EXTENDED_MORE 0x01000000u /* C */
|
||||
|
||||
/* An additional compile options word is available in the compile context. */
|
||||
|
||||
#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
#define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */
|
||||
|
@ -448,6 +452,8 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
|||
pcre2_set_bsr(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_character_tables(pcre2_compile_context *, const unsigned char *); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
|
@ -721,6 +727,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_bsr PCRE2_SUFFIX(pcre2_set_bsr_)
|
||||
#define pcre2_set_callout PCRE2_SUFFIX(pcre2_set_callout_)
|
||||
#define pcre2_set_character_tables PCRE2_SUFFIX(pcre2_set_character_tables_)
|
||||
#define pcre2_set_compile_extra_options PCRE2_SUFFIX(pcre2_set_compile_extra_options_)
|
||||
#define pcre2_set_compile_recursion_guard PCRE2_SUFFIX(pcre2_set_compile_recursion_guard_)
|
||||
#define pcre2_set_depth_limit PCRE2_SUFFIX(pcre2_set_depth_limit_)
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
|
|
|
@ -717,7 +717,8 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
|||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90 };
|
||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88, ERR89, ERR90,
|
||||
ERR91};
|
||||
|
||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||
|
@ -728,7 +729,7 @@ enum { PSO_OPT, /* Value is an option bit */
|
|||
PSO_FLG, /* Value is a flag bit */
|
||||
PSO_NL, /* Value is a newline type */
|
||||
PSO_BSR, /* Value is a \R type */
|
||||
PSO_LIMH, /* Read integer value for heap limit */
|
||||
PSO_LIMH, /* Read integer value for heap limit */
|
||||
PSO_LIMM, /* Read integer value for match limit */
|
||||
PSO_LIMD }; /* Read integer value for depth limit */
|
||||
|
||||
|
@ -1474,7 +1475,10 @@ else
|
|||
if (utf)
|
||||
{
|
||||
if (c > 0x10ffffU) *errorcodeptr = ERR77;
|
||||
else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
|
||||
else
|
||||
if (c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
*errorcodeptr = ERR73;
|
||||
}
|
||||
else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
|
||||
}
|
||||
|
@ -1663,7 +1667,8 @@ else
|
|||
}
|
||||
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
|
||||
{
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff)
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
{
|
||||
ptr--;
|
||||
*errorcodeptr = ERR73;
|
||||
|
@ -1732,7 +1737,8 @@ else
|
|||
}
|
||||
else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
|
||||
{
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff)
|
||||
if (utf && c >= 0xd800 && c <= 0xdfff &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
|
||||
{
|
||||
ptr--;
|
||||
*errorcodeptr = ERR73;
|
||||
|
@ -2227,7 +2233,7 @@ typedef struct nest_save {
|
|||
uint16_t reset_group;
|
||||
uint16_t max_group;
|
||||
uint16_t flags;
|
||||
uint32_t options;
|
||||
uint32_t options;
|
||||
} nest_save;
|
||||
|
||||
#define NSF_RESET 0x0001u
|
||||
|
@ -2297,10 +2303,10 @@ creating a nest_save that spans the end of the workspace. */
|
|||
|
||||
end_nests = (nest_save *)((char *)end_nests -
|
||||
((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
|
||||
|
||||
|
||||
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
|
||||
|
||||
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
|
||||
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
|
||||
|
||||
/* Now scan the pattern */
|
||||
|
||||
|
@ -2969,7 +2975,7 @@ while (ptr < ptrend)
|
|||
for (;;)
|
||||
{
|
||||
BOOL char_is_literal = TRUE;
|
||||
|
||||
|
||||
/* Inside \Q...\E everything is literal except \E */
|
||||
|
||||
if (inescq)
|
||||
|
@ -2982,11 +2988,11 @@ while (ptr < ptrend)
|
|||
}
|
||||
goto CLASS_LITERAL;
|
||||
}
|
||||
|
||||
|
||||
/* Skip over space and tab (only) in extended-more mode. */
|
||||
|
||||
if ((options & PCRE2_EXTENDED_MORE) != 0 &&
|
||||
(c == CHAR_SPACE || c == CHAR_HT))
|
||||
|
||||
if ((options & PCRE2_EXTENDED_MORE) != 0 &&
|
||||
(c == CHAR_SPACE || c == CHAR_HT))
|
||||
goto CLASS_CONTINUE;
|
||||
|
||||
/* Handle POSIX class names. Perl allows a negation extension of the
|
||||
|
@ -3448,12 +3454,12 @@ while (ptr < ptrend)
|
|||
case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
|
||||
case CHAR_s: *optset |= PCRE2_DOTALL; break;
|
||||
case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
|
||||
|
||||
|
||||
/* If x appears twice it sets the extended extended option. */
|
||||
|
||||
case CHAR_x:
|
||||
|
||||
case CHAR_x:
|
||||
*optset |= ((*optset & PCRE2_EXTENDED) != 0)?
|
||||
PCRE2_EXTENDED_MORE : PCRE2_EXTENDED;
|
||||
PCRE2_EXTENDED_MORE : PCRE2_EXTENDED;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -3463,10 +3469,10 @@ while (ptr < ptrend)
|
|||
}
|
||||
}
|
||||
options = (options | set) & (~unset);
|
||||
|
||||
|
||||
/* Unsetting extended should also get rid of extended-more. */
|
||||
|
||||
if ((options & PCRE2_EXTENDED) == 0) options &= ~PCRE2_EXTENDED_MORE;
|
||||
|
||||
if ((options & PCRE2_EXTENDED) == 0) options &= ~PCRE2_EXTENDED_MORE;
|
||||
|
||||
/* If the options ended with ')' this is not the start of a nested
|
||||
group with option changes, so the options change at this level.
|
||||
|
@ -4190,18 +4196,18 @@ for (;;)
|
|||
case OP_CALLOUT_STR:
|
||||
code += GET(code, 1 + 2*LINK_SIZE);
|
||||
break;
|
||||
|
||||
|
||||
case OP_SKIPZERO:
|
||||
code += 2 + GET(code, 2) + LINK_SIZE;
|
||||
break;
|
||||
|
||||
break;
|
||||
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
if (code[1+LINK_SIZE] != OP_FALSE || /* Not DEFINE */
|
||||
code[GET(code, 1)] != OP_KET) /* More than one branch */
|
||||
return code;
|
||||
code += GET(code, 1) + 1 + LINK_SIZE;
|
||||
break;
|
||||
break;
|
||||
|
||||
default:
|
||||
return code;
|
||||
|
@ -8150,7 +8156,7 @@ uint32_t nestlevel = 0;
|
|||
for (;; pptr++)
|
||||
{
|
||||
uint32_t meta = META_CODE(*pptr);
|
||||
|
||||
|
||||
switch(meta)
|
||||
{
|
||||
default: /* Just skip over most items */
|
||||
|
@ -8265,8 +8271,8 @@ int branchlength;
|
|||
int grouplength = -1;
|
||||
|
||||
/* The cache can be used only if there is no possibility of there being two
|
||||
groups with the same number. We do not need to set the end pointer for a group
|
||||
that is being processed as a back reference or recursion, but we must do so for
|
||||
groups with the same number. We do not need to set the end pointer for a group
|
||||
that is being processed as a back reference or recursion, but we must do so for
|
||||
an inline group. */
|
||||
|
||||
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
|
||||
|
@ -8438,7 +8444,7 @@ for (;; pptr++)
|
|||
}
|
||||
break;
|
||||
|
||||
/* Lookaheads can be ignored, but we must start the skip inside the group
|
||||
/* Lookaheads can be ignored, but we must start the skip inside the group
|
||||
so that it isn't treated as a group within the branch. */
|
||||
|
||||
case META_LOOKAHEAD:
|
||||
|
@ -8464,7 +8470,7 @@ for (;; pptr++)
|
|||
case META_BACKREF_BYNAME:
|
||||
if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
|
||||
goto ISNOTFIXED;
|
||||
/* Fall through */
|
||||
/* Fall through */
|
||||
|
||||
case META_RECURSE_BYNAME:
|
||||
{
|
||||
|
@ -8542,7 +8548,7 @@ for (;; pptr++)
|
|||
else if (*gptr == (META_CAPTURE | group)) break;
|
||||
}
|
||||
|
||||
/* We must start the search for the end of the group at the first meta code
|
||||
/* We must start the search for the end of the group at the first meta code
|
||||
inside the group. Otherwise it will be treated as an enclosed group. */
|
||||
|
||||
gptrend = parsed_skip(gptr + 1, PSKIP_KET);
|
||||
|
@ -8552,12 +8558,12 @@ for (;; pptr++)
|
|||
if (r != NULL) goto ISNOTFIXED; /* Mutual recursion */
|
||||
this_recurse.prev = recurses;
|
||||
this_recurse.groupptr = gptr;
|
||||
|
||||
|
||||
/* We do not need to know the position of the end of the group, that is,
|
||||
gptr is not used after the call to get_grouplength(). Setting the second
|
||||
argument FALSE stops it scanning for the end when the length can be found
|
||||
in the cache. */
|
||||
|
||||
gptr is not used after the call to get_grouplength(). Setting the second
|
||||
argument FALSE stops it scanning for the end when the length can be found
|
||||
in the cache. */
|
||||
|
||||
gptr++;
|
||||
grouplength = get_grouplength(&gptr, FALSE, errcodeptr, lcptr, group,
|
||||
&this_recurse, cb);
|
||||
|
@ -8596,7 +8602,7 @@ for (;; pptr++)
|
|||
case META_NOCAPTURE:
|
||||
pptr++;
|
||||
CHECK_GROUP:
|
||||
grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
|
||||
grouplength = get_grouplength(&pptr, TRUE, errcodeptr, lcptr, group,
|
||||
recurses, cb);
|
||||
if (grouplength < 0) return -1;
|
||||
itemlength = grouplength;
|
||||
|
@ -9053,7 +9059,7 @@ while (patlen - skipatstart >= 2 &&
|
|||
|
||||
case PSO_LIMM:
|
||||
case PSO_LIMD:
|
||||
case PSO_LIMH:
|
||||
case PSO_LIMH:
|
||||
c = 0;
|
||||
pp = skipatstart;
|
||||
if (!IS_DIGIT(ptr[pp]))
|
||||
|
@ -9100,7 +9106,9 @@ if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
|
|||
#endif
|
||||
|
||||
/* Check UTF. We have the original options in 'options', with that value as
|
||||
modified by (*UTF) etc in cb->external_options. */
|
||||
modified by (*UTF) etc in cb->external_options. The extra option
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
|
||||
surrogate code points cannot be represented in UTF-16. */
|
||||
|
||||
utf = (cb.external_options & PCRE2_UTF) != 0;
|
||||
if (utf)
|
||||
|
@ -9113,6 +9121,14 @@ if (utf)
|
|||
if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
|
||||
(errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
|
||||
goto HAD_ERROR; /* Offset was set by valid_utf() */
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
|
||||
{
|
||||
errorcode = ERR91;
|
||||
goto HAD_EARLY_ERROR;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Check UCP lockout. */
|
||||
|
@ -9299,7 +9315,7 @@ possible because nowadays we limit the maximum value of cb.names_found and
|
|||
cb.name_entry_size. */
|
||||
|
||||
re_blocksize = sizeof(pcre2_real_code) +
|
||||
CU2BYTES(length +
|
||||
CU2BYTES(length +
|
||||
(PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
|
||||
re = (pcre2_real_code *)
|
||||
ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
|
||||
|
@ -9308,11 +9324,11 @@ if (re == NULL)
|
|||
errorcode = ERR21;
|
||||
goto HAD_CB_ERROR;
|
||||
}
|
||||
|
||||
/* The compiler may put padding at the end of the pcre2_real_code structure in
|
||||
order to round it up to a multiple of 4 or 8 bytes. This means that when a
|
||||
compiled pattern is copied (for example, when serialized) undefined bytes are
|
||||
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
|
||||
|
||||
/* The compiler may put padding at the end of the pcre2_real_code structure in
|
||||
order to round it up to a multiple of 4 or 8 bytes. This means that when a
|
||||
compiled pattern is copied (for example, when serialized) undefined bytes are
|
||||
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
|
||||
write to the last 8 bytes of the structure before setting the fields. */
|
||||
|
||||
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
|
||||
|
|
|
@ -138,7 +138,8 @@ const pcre2_compile_context PRIV(default_compile_context) = {
|
|||
PCRE2_UNSET, /* Max pattern length */
|
||||
BSR_DEFAULT, /* Backslash R default */
|
||||
NEWLINE_DEFAULT, /* Newline convention */
|
||||
PARENS_NEST_LIMIT }; /* As it says */
|
||||
PARENS_NEST_LIMIT, /* As it says */
|
||||
0 }; /* Extra options */
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
|
@ -168,7 +169,7 @@ const pcre2_match_context PRIV(default_match_context) = {
|
|||
NULL,
|
||||
NULL,
|
||||
PCRE2_UNSET, /* Offset limit */
|
||||
HEAP_LIMIT,
|
||||
HEAP_LIMIT,
|
||||
MATCH_LIMIT,
|
||||
MATCH_LIMIT_DEPTH };
|
||||
|
||||
|
@ -197,7 +198,7 @@ const pcre2_convert_context PRIV(default_convert_context) = {
|
|||
CHAR_BACKSLASH /* Default path separator */
|
||||
#else /* is OS dependent */
|
||||
CHAR_SLASH /* Not Windows */
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
|
@ -371,6 +372,13 @@ ccontext->parens_nest_limit = limit;
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Set the extra compile options word in a compile context. The supplied value
is stored as-is in the context's extra_options field, replacing whatever was
there before. The function always returns 0. */
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *ccontext, uint32_t options)
|
||||
{
|
||||
ccontext->extra_options = options;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||
int (*guard)(uint32_t, void *), void *user_data)
|
||||
|
@ -420,7 +428,7 @@ mcontext->offset_limit = limit;
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* This function became obsolete at release 10.30. It is kept as a no-op for
|
||||
/* This function became obsolete at release 10.30. It is kept as a no-op for
|
||||
backwards compatibility. */
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
|
@ -448,3 +456,4 @@ return 0;
|
|||
|
||||
|
||||
/* End of pcre2_context.c */
|
||||
|
||||
|
|
|
@ -176,6 +176,7 @@ static const unsigned char compile_error_texts[] =
|
|||
"internal error: unknown code in parsed pattern\0"
|
||||
/* 90 */
|
||||
"internal error: bad code value in parsed_skip()\0"
|
||||
"PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
|
|
|
@ -572,6 +572,7 @@ typedef struct pcre2_real_compile_context {
|
|||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
uint32_t extra_options;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
|
|
|
@ -194,6 +194,7 @@ void vms_setsymbol( char *, char *, int );
|
|||
#define LOCALESIZE 32 /* Size of locale name */
|
||||
#define LOOPREPEAT 500000 /* Default loop count for timing */
|
||||
#define MALLOCLISTSIZE 20 /* For remembering mallocs */
|
||||
#define PARENS_NEST_DEFAULT 220 /* Default parentheses nest limit */
|
||||
#define PATSTACKSIZE 20 /* Pattern stack for save/restore testing */
|
||||
#define REPLACE_MODSIZE 100 /* Field for reading 8-bit replacement */
|
||||
#define VERSION_SIZE 64 /* Size of buffer for the version strings */
|
||||
|
@ -577,6 +578,7 @@ static modstruct modlist[] = {
|
|||
{ "allaftertext", MOD_PNDP, MOD_CTL, CTL_ALLAFTERTEXT, PO(control) },
|
||||
{ "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) },
|
||||
{ "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) },
|
||||
{ "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) },
|
||||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||
|
@ -685,6 +687,8 @@ static modstruct modlist[] = {
|
|||
#define POSIX_SUPPORTED_COMPILE_OPTIONS ( \
|
||||
PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_UCP|PCRE2_UTF| \
|
||||
PCRE2_UNGREEDY)
|
||||
|
||||
#define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0)
|
||||
|
||||
#define POSIX_SUPPORTED_COMPILE_CONTROLS ( \
|
||||
CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_EXPAND|CTL_POSIX|CTL_POSIX_NOSUB)
|
||||
|
@ -4025,6 +4029,32 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%
|
|||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Show compile extra options *
|
||||
*************************************************/
|
||||
|
||||
/* Called for unsupported POSIX options. Writes to outfile the "before" text,
then the names of the extra options that are set in the options word (or
" <none>" if the word is zero), then the "after" text. The only option name
recognized here is allow_surrogate_escapes.
|
||||
|
||||
Arguments:
|
||||
options an options word (extra compile option bits)
|
||||
before text to print before
|
||||
after text to print after
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
show_compile_extra_options(uint32_t options, const char *before,
|
||||
const char *after)
|
||||
{
|
||||
/* A set bit contributes its name with a leading space; a non-zero word whose
bits have no name here prints only the before and after text. */
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
||||
after);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef SUPPORT_PCRE2_8
|
||||
/*************************************************
|
||||
|
@ -5161,6 +5191,16 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
|||
pat_patctl.options & ~POSIX_SUPPORTED_COMPILE_OPTIONS, msg, "");
|
||||
msg = "";
|
||||
}
|
||||
|
||||
if ((FLD(pat_context, extra_options) &
|
||||
~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS) != 0)
|
||||
{
|
||||
show_compile_extra_options(
|
||||
FLD(pat_context, extra_options) & ~POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS,
|
||||
msg, "");
|
||||
msg = "";
|
||||
}
|
||||
|
||||
if ((pat_patctl.control & ~POSIX_SUPPORTED_COMPILE_CONTROLS) != 0 ||
|
||||
(pat_patctl.control2 & ~POSIX_SUPPORTED_COMPILE_CONTROLS2) != 0)
|
||||
{
|
||||
|
@ -5170,7 +5210,11 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
|||
}
|
||||
|
||||
if (local_newline_default != 0) prmsg(&msg, "#newline_default");
|
||||
|
||||
if (FLD(pat_context, max_pattern_length) != PCRE2_UNSET)
|
||||
prmsg(&msg, "max_pattern_length");
|
||||
if (FLD(pat_context, parens_nest_limit) != PARENS_NEST_DEFAULT)
|
||||
prmsg(&msg, "parens_nest_limit");
|
||||
|
||||
if (msg[0] == 0) fprintf(outfile, "\n");
|
||||
|
||||
/* Translate PCRE2 options to POSIX options and then compile. */
|
||||
|
@ -8123,6 +8167,7 @@ max_oveccount = DEFAULT_OVECCOUNT;
|
|||
G(match_data,BITS) = G(pcre2_match_data_create_,BITS)(max_oveccount, G(general_context,BITS))
|
||||
|
||||
#define CONTEXTTESTS \
|
||||
(void)G(pcre2_set_compile_extra_options_,BITS)(G(pat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_max_pattern_length_,BITS)(G(pat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_offset_limit_,BITS)(G(dat_context,BITS), 0); \
|
||||
(void)G(pcre2_set_recursion_memory_management_,BITS)(G(dat_context,BITS), my_malloc, my_free, NULL)
|
||||
|
@ -8163,7 +8208,7 @@ if (test_mode == PCRE32_MODE)
|
|||
/* Set a default parentheses nest limit that is large enough to run the
|
||||
standard tests (this also exercises the function). */
|
||||
|
||||
PCRE2_SET_PARENS_NEST_LIMIT(default_pat_context, 220);
|
||||
PCRE2_SET_PARENS_NEST_LIMIT(default_pat_context, PARENS_NEST_DEFAULT);
|
||||
|
||||
/* Handle command line modifier settings, sending any error messages to
|
||||
stderr. We need to know the mode before modifying the context, and it is tidier
|
||||
|
|
|
@ -458,4 +458,13 @@
|
|||
|
||||
/[\s[:^ascii:]]/B,ucp
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 8-bit mode,
|
||||
# but subjects containing them must not be UTF-checked.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -363,4 +363,14 @@
|
|||
/\pP/ucp
|
||||
\x{7fffffff}
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#forbid_utf
|
||||
#pattern posix
|
||||
|
||||
# Test invalid options
|
||||
# Test some invalid options
|
||||
|
||||
/abc/auto_callout
|
||||
|
||||
|
@ -14,6 +14,10 @@
|
|||
|
||||
/abc/
|
||||
abc\=partial_hard
|
||||
|
||||
/a(())bc/parens_nest_limit=1
|
||||
|
||||
/abc/allow_surrogate_escapes,max_pattern_length=2
|
||||
|
||||
# Real tests
|
||||
|
||||
|
|
|
@ -1575,4 +1575,15 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP),
|
|||
End
|
||||
------------------------------------------------------------------
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 8-bit mode,
|
||||
# but subjects containing them must not be UTF-checked.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
0: \x{d800}
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
0: \x{dfff}\x{df01}
|
||||
|
||||
# End of testinput10
|
||||
|
|
|
@ -1421,4 +1421,16 @@ No match
|
|||
** Truncation will probably give the wrong result.
|
||||
No match
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
\x{d800}\=no_utf_check
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -1413,4 +1413,16 @@ No match
|
|||
\x{7fffffff}
|
||||
No match
|
||||
|
||||
# A special extra option allows escaped surrogate code points in 32-bit mode,
|
||||
# but subjects containing them must not be UTF-checked. These patterns give
|
||||
# errors in 16-bit mode.
|
||||
|
||||
/\x{d800}/utf,allow_surrogate_escapes
|
||||
\x{d800}\=no_utf_check
|
||||
0: \x{d800}
|
||||
|
||||
/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
|
||||
\x{dfff}\x{df01}\=no_utf_check
|
||||
0: \x{dfff}\x{df01}
|
||||
|
||||
# End of testinput12
|
||||
|
|
|
@ -15970,7 +15970,6 @@ Error -2: partial match
|
|||
Error -1: no match
|
||||
Error 0: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error 100: no error
|
||||
Error 188: pattern string is longer than the limit set by the application
|
||||
Error 189: internal error: unknown code in parsed pattern
|
||||
Error 190: internal error: bad code value in parsed_skip()
|
||||
Error 191: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error 101: \ at end of pattern
|
||||
Error 191: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode
|
||||
Error 192: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
|
|
Loading…
Reference in New Issue