Partial documentation and partial code tweaks.
This commit is contained in:
parent
a6302442f2
commit
26cd0bccb3
|
@ -34,6 +34,7 @@ dist_html_DATA = \
|
||||||
doc/html/pcre2jit.html \
|
doc/html/pcre2jit.html \
|
||||||
doc/html/pcre2limits.html \
|
doc/html/pcre2limits.html \
|
||||||
doc/html/pcre2matching.html \
|
doc/html/pcre2matching.html \
|
||||||
|
doc/html/pcre2partial.html \
|
||||||
doc/html/pcre2test.html \
|
doc/html/pcre2test.html \
|
||||||
doc/html/pcre2unicode.html
|
doc/html/pcre2unicode.html
|
||||||
|
|
||||||
|
@ -64,7 +65,6 @@ dist_html_DATA = \
|
||||||
# doc/html/pcre2_utf16_to_host_byte_order.html \
|
# doc/html/pcre2_utf16_to_host_byte_order.html \
|
||||||
# doc/html/pcre2_utf32_to_host_byte_order.html \
|
# doc/html/pcre2_utf32_to_host_byte_order.html \
|
||||||
# doc/html/pcre2_version.html \
|
# doc/html/pcre2_version.html \
|
||||||
# doc/html/pcre2partial.html \
|
|
||||||
# doc/html/pcre2pattern.html \
|
# doc/html/pcre2pattern.html \
|
||||||
# doc/html/pcre2perform.html \
|
# doc/html/pcre2perform.html \
|
||||||
# doc/html/pcre2posix.html \
|
# doc/html/pcre2posix.html \
|
||||||
|
@ -86,6 +86,7 @@ dist_man_MANS = \
|
||||||
doc/pcre2jit.3 \
|
doc/pcre2jit.3 \
|
||||||
doc/pcre2limits.3 \
|
doc/pcre2limits.3 \
|
||||||
doc/pcre2matching.3 \
|
doc/pcre2matching.3 \
|
||||||
|
doc/pcre2partial.3 \
|
||||||
doc/pcre2test.1 \
|
doc/pcre2test.1 \
|
||||||
doc/pcre2unicode.3
|
doc/pcre2unicode.3
|
||||||
|
|
||||||
|
@ -118,7 +119,6 @@ dist_man_MANS = \
|
||||||
# doc/pcre2_utf16_to_host_byte_order.3 \
|
# doc/pcre2_utf16_to_host_byte_order.3 \
|
||||||
# doc/pcre2_utf32_to_host_byte_order.3 \
|
# doc/pcre2_utf32_to_host_byte_order.3 \
|
||||||
# doc/pcre2_version.3 \
|
# doc/pcre2_version.3 \
|
||||||
# doc/pcre2partial.3 \
|
|
||||||
# doc/pcre2pattern.3 \
|
# doc/pcre2pattern.3 \
|
||||||
# doc/pcre2perform.3 \
|
# doc/pcre2perform.3 \
|
||||||
# doc/pcre2posix.3 \
|
# doc/pcre2posix.3 \
|
||||||
|
|
|
@ -90,9 +90,6 @@ document for an overview of all the PCRE2 documentation.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br>
|
<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -102,9 +99,6 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br>
|
<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br>
|
||||||
|
@ -133,7 +127,7 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
|
<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -141,7 +135,7 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b> const unsigned char *<i>tables</i>);</b>
|
<b> const unsigned char *<i>tables</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -165,10 +159,6 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
|
<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
|
|
||||||
<b> uint32_t <i>value</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||||
<b> void *<i>callout_data</i>);</b>
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
@ -178,10 +168,6 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
|
|
||||||
<b> uint32_t <i>value</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -596,7 +582,7 @@ A compile context is created, copied, and freed by the following functions:
|
||||||
A compile context is created with default values for its parameters. These can
|
A compile context is created with default values for its parameters. These can
|
||||||
be changed by calling the following functions, which return 0 on success, or
|
be changed by calling the following functions, which return 0 on success, or
|
||||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -605,8 +591,7 @@ or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
||||||
ending sequence. The value of this parameter does not affect what is compiled;
|
ending sequence. The value of this parameter does not affect what is compiled;
|
||||||
it is just saved with the compiled pattern. The value is used by the JIT
|
it is just saved with the compiled pattern. The value is used by the JIT
|
||||||
compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and
|
compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and
|
||||||
<i>pcre2_dfa_match()</i>. You can change the value when calling these functions,
|
<i>pcre2_dfa_match()</i>.
|
||||||
but doing so disables the use of JIT.
|
|
||||||
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> const unsigned char *<i>tables</i>);</b>
|
<b> const unsigned char *<i>tables</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -614,7 +599,7 @@ but doing so disables the use of JIT.
|
||||||
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
|
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
|
||||||
argument is a general context. This function builds a set of character tables
|
argument is a general context. This function builds a set of character tables
|
||||||
in the current locale.
|
in the current locale.
|
||||||
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
@ -629,8 +614,7 @@ When a pattern is compiled with the PCRE2_EXTENDED option, the value of this
|
||||||
parameter affects the recognition of white space and the end of internal
|
parameter affects the recognition of white space and the end of internal
|
||||||
comments starting with #. The value is saved with the compiled pattern for
|
comments starting with #. The value is saved with the compiled pattern for
|
||||||
subsequent use by the JIT compiler and by the two interpreted matching
|
subsequent use by the JIT compiler and by the two interpreted matching
|
||||||
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>. You can change the
|
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||||
value when calling these functions, but doing so disables the use of JIT.
|
|
||||||
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -685,14 +669,6 @@ A match context is created, copied, and freed by the following functions:
|
||||||
A match context is created with default values for its parameters. These can
|
A match context is created with default values for its parameters. These can
|
||||||
be changed by calling the following functions, which return 0 on success, or
|
be changed by calling the following functions, which return 0 on success, or
|
||||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
|
|
||||||
<b> uint32_t <i>value</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
|
|
||||||
or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
|
||||||
ending sequence. If you want to make use of JIT matching, you should not use
|
|
||||||
this function, but instead set the value in a compile context.
|
|
||||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||||
<b> void *<i>callout_data</i>);</b>
|
<b> void *<i>callout_data</i>);</b>
|
||||||
|
@ -769,17 +745,6 @@ pattern of the form
|
||||||
where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
||||||
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
||||||
limit is set, less than the default.
|
limit is set, less than the default.
|
||||||
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
|
|
||||||
<b> uint32_t <i>value</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
This specifies which characters or character sequences are to be recognized as
|
|
||||||
newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
|
|
||||||
PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
|
|
||||||
sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
|
|
||||||
PCRE2_NEWLINE_ANY (any Unicode newline sequence). If you want to make use of
|
|
||||||
JIT matching, you should not use this function, but instead set the value in a
|
|
||||||
compile context.
|
|
||||||
<b>int pcre2_set_recursion_memory_management(</b>
|
<b>int pcre2_set_recursion_memory_management(</b>
|
||||||
<b> pcre2_match_context *<i>mcontext</i>,</b>
|
<b> pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
||||||
|
@ -956,9 +921,8 @@ documentation).
|
||||||
<P>
|
<P>
|
||||||
For those options that can be different in different parts of the pattern, the
|
For those options that can be different in different parts of the pattern, the
|
||||||
contents of the <i>options</i> argument specifies their settings at the start of
|
contents of the <i>options</i> argument specifies their settings at the start of
|
||||||
compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
|
compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK options can be set at
|
||||||
PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as well as
|
the time of matching as well as at compile time.
|
||||||
at compile time.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Other, less frequently required compile-time parameters (for example, the
|
Other, less frequently required compile-time parameters (for example, the
|
||||||
|
@ -1176,14 +1140,55 @@ purposes.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NO_START_OPTIMIZE
|
PCRE2_NO_START_OPTIMIZE
|
||||||
</pre>
|
</pre>
|
||||||
This is an option that acts at matching time; that is, it is really an option
|
This is an option whose main effect is at matching time. It does not change
|
||||||
for <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. If it is set at compile
|
what <b>pcre2_compile()</b> generates, but it does affect the output of the JIT
|
||||||
time, it is remembered with the compiled pattern and assumed at matching time.
|
compiler.
|
||||||
This is necessary if you want to use JIT execution, because the JIT compiler
|
</P>
|
||||||
needs to know whether or not this option is set. For details, see the
|
<P>
|
||||||
discussion of PCRE2_NO_START_OPTIMIZE in the section on <b>pcre2_match()</b>
|
There are a number of optimizations that may occur at the start of a match, in
|
||||||
options
|
order to speed up the process. For example, if it is known that an unanchored
|
||||||
<a href="#matchoptions">below.</a>
|
match must start with a specific character, the matching code searches the
|
||||||
|
subject for that character, and fails immediately if it cannot find it, without
|
||||||
|
actually running the main matching function. This means that a special item
|
||||||
|
such as (*COMMIT) at the start of a pattern is not considered until after a
|
||||||
|
suitable starting point for the match has been found. Also, when callouts or
|
||||||
|
(*MARK) items are in use, these "start-up" optimizations can cause them to be
|
||||||
|
skipped if the pattern is never actually used. The start-up optimizations are
|
||||||
|
in effect a pre-scan of the subject that takes place before the pattern is run.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
||||||
|
possibly causing performance to suffer, but ensuring that in cases where the
|
||||||
|
result is "no match", the callouts do occur, and that items such as (*COMMIT)
|
||||||
|
and (*MARK) are considered at every possible starting position in the subject
|
||||||
|
string.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation.
|
||||||
|
Consider the pattern
|
||||||
|
<pre>
|
||||||
|
(*COMMIT)ABC
|
||||||
|
</pre>
|
||||||
|
When this is compiled, PCRE2 records the fact that a match must start with the
|
||||||
|
character "A". Suppose the subject string is "DEFABC". The start-up
|
||||||
|
optimization scans along the subject, finds "A" and runs the first match
|
||||||
|
attempt from there. The (*COMMIT) item means that the pattern must match the
|
||||||
|
current starting position, which in this case, it does. However, if the same
|
||||||
|
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
||||||
|
subject string does not happen. The first match attempt is run starting from
|
||||||
|
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
||||||
|
the overall result is "no match". There are also other start-up optimizations.
|
||||||
|
For example, a minimum length for the subject may be recorded. Consider the
|
||||||
|
pattern
|
||||||
|
<pre>
|
||||||
|
(*MARK:A)(X|Y)
|
||||||
|
</pre>
|
||||||
|
The minimum length for a match is one character. If the subject is "ABC", there
|
||||||
|
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
|
||||||
|
string at the end of the subject does not take place, because PCRE2 knows that
|
||||||
|
the subject is now too short, and so the (*MARK) is never encountered. In this
|
||||||
|
case, the optimization does not affect the overall match result, which is still
|
||||||
|
"no match", but it does affect the auxiliary information that is returned.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NO_UTF_CHECK
|
PCRE2_NO_UTF_CHECK
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -1653,8 +1658,10 @@ match data block by calling one of the creation functions above. For
|
||||||
<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of
|
<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of
|
||||||
offsets in the <i>ovector</i>. One pair of offsets is required to identify the
|
offsets in the <i>ovector</i>. One pair of offsets is required to identify the
|
||||||
string that matched the whole pattern, with another pair for each captured
|
string that matched the whole pattern, with another pair for each captured
|
||||||
substring. For example, a value of 4 creates enough space to record the
|
substring. For example, a value of 4 creates enough space to record the matched
|
||||||
matched portion of the subject plus three captured substrings.
|
portion of the subject plus three captured substrings. A minimum of 1
|
||||||
|
pair is imposed by <b>pcre2_match_data_create()</b>, so it is always possible to
|
||||||
|
return the overall matched string.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
|
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
|
||||||
|
@ -1779,10 +1786,9 @@ Option bits for <b>pcre2_match()</b>
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
|
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
|
||||||
zero. The only bits that may be set are PCRE2_ANCHORED,
|
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||||
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
|
||||||
PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
|
PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
|
||||||
PCRE2_PARTIAL_SOFT. Their action is described below.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the pattern was successfully processed by the just-in-time (JIT) compiler,
|
If the pattern was successfully processed by the just-in-time (JIT) compiler,
|
||||||
|
@ -1833,56 +1839,6 @@ valid, so PCRE2 searches further into the string for occurrences of "a" or "b".
|
||||||
This is like PCRE2_NOTEMPTY, except that an empty string match that is not at
|
This is like PCRE2_NOTEMPTY, except that an empty string match that is not at
|
||||||
the start of the subject is permitted. If the pattern is anchored, such a match
|
the start of the subject is permitted. If the pattern is anchored, such a match
|
||||||
can occur only if the pattern contains \K.
|
can occur only if the pattern contains \K.
|
||||||
<pre>
|
|
||||||
PCRE2_NO_START_OPTIMIZE
|
|
||||||
</pre>
|
|
||||||
There are a number of optimizations that <b>pcre2_match()</b> uses at the start
|
|
||||||
of a match, in order to speed up the process. For example, if it is known that
|
|
||||||
an unanchored match must start with a specific character, it searches the
|
|
||||||
subject for that character, and fails immediately if it cannot find it, without
|
|
||||||
actually running the main matching function. This means that a special item
|
|
||||||
such as (*COMMIT) at the start of a pattern is not considered until after a
|
|
||||||
suitable starting point for the match has been found. Also, when callouts or
|
|
||||||
(*MARK) items are in use, these "start-up" optimizations can cause them to be
|
|
||||||
skipped if the pattern is never actually used. The start-up optimizations are
|
|
||||||
in effect a pre-scan of the subject that takes place before the pattern is run.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
|
||||||
possibly causing performance to suffer, but ensuring that in cases where the
|
|
||||||
result is "no match", the callouts do occur, and that items such as (*COMMIT)
|
|
||||||
and (*MARK) are considered at every possible starting position in the subject
|
|
||||||
string. If PCRE2_NO_START_OPTIMIZE is set at compile time, it cannot be unset
|
|
||||||
at matching time. The use of PCRE2_NO_START_OPTIMIZE at matching time (that is,
|
|
||||||
passing it to <b>pcre2_match()</b>) disables JIT execution; in this situation,
|
|
||||||
matching is always done interpretively.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching operation.
|
|
||||||
Consider the pattern
|
|
||||||
<pre>
|
|
||||||
(*COMMIT)ABC
|
|
||||||
</pre>
|
|
||||||
When this is compiled, PCRE2 records the fact that a match must start with the
|
|
||||||
character "A". Suppose the subject string is "DEFABC". The start-up
|
|
||||||
optimization scans along the subject, finds "A" and runs the first match
|
|
||||||
attempt from there. The (*COMMIT) item means that the pattern must match the
|
|
||||||
current starting position, which in this case, it does. However, if the same
|
|
||||||
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
|
||||||
subject string does not happen. The first match attempt is run starting from
|
|
||||||
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
|
||||||
the overall result is "no match". There are also other start-up optimizations.
|
|
||||||
For example, a minimum length for the subject may be recorded. Consider the
|
|
||||||
pattern
|
|
||||||
<pre>
|
|
||||||
(*MARK:A)(X|Y)
|
|
||||||
</pre>
|
|
||||||
The minimum length for a match is one character. If the subject is "ABC", there
|
|
||||||
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
|
|
||||||
string at the end of the subject does not take place, because PCRE2 knows that
|
|
||||||
the subject is now too short, and so the (*MARK) is never encountered. In this
|
|
||||||
case, the optimization does not affect the overall match result, which is still
|
|
||||||
"no match", but it does affect the auxiliary information that is returned.
|
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NO_UTF_CHECK
|
PCRE2_NO_UTF_CHECK
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2035,13 +1991,13 @@ returned.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the ovector is too small to hold all the captured substring offsets, as much
|
If the ovector is too small to hold all the captured substring offsets, as much
|
||||||
as possible is filled in, and the function returns a value of zero. If neither
|
as possible is filled in, and the function returns a value of zero. If captured
|
||||||
the actual string matched nor any captured substrings are of interest,
|
substrings are not of interest, <b>pcre2_match()</b> may be called with a match
|
||||||
<b>pcre2_match()</b> may be called with a match data block whose ovector is of
|
data block whose ovector is of minimum length (that is, one pair). However, if
|
||||||
zero length. However, if the pattern contains back references and the
|
the pattern contains back references and the <i>ovector</i> is not big enough to
|
||||||
<i>ovector</i> is not big enough to remember the related substrings, PCRE2 has
|
remember the related substrings, PCRE2 has to get additional memory for use
|
||||||
to get additional memory for use during matching. Thus it is usually advisable
|
during matching. Thus it is usually advisable to set up a match data block
|
||||||
to set up a match data block containing an ovector of reasonable size.
|
containing an ovector of reasonable size.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
It is possible for capturing subpattern number <i>n+1</i> to match some part of
|
It is possible for capturing subpattern number <i>n+1</i> to match some part of
|
||||||
|
@ -2074,12 +2030,6 @@ Other information about the match
|
||||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -2093,39 +2043,10 @@ Otherwise NULL is returned. A (*MARK) name may be available after a failed
|
||||||
match or a partial match, as well as after a successful one.
|
match or a partial match, as well as after a successful one.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The other three functions yield values that give information about the part of
|
The offset of the character at which the successful match started is
|
||||||
the subject string that was inspected during a successful match or a partial
|
returned by <b>pcre2_get_startchar()</b>. This can be different to the value of
|
||||||
match. Their results are undefined after a failed match. They return the
|
<i>ovector[0]</i> if the pattern contains the \K escape sequence. Note,
|
||||||
following values, respectively:
|
however, that \K has no effect for a partial match.
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
(1) The offset of the leftmost character that was inspected during the match.
|
|
||||||
This can be earlier than the point at which the match started if the pattern
|
|
||||||
contains lookbehind assertions or \b or \B at the start.
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
(2) The offset of the character that follows the rightmost character that was
|
|
||||||
inspected during the match. This can be after the end of the match if the
|
|
||||||
pattern contains lookahead assertions.
|
|
||||||
<br>
|
|
||||||
<br>
|
|
||||||
(3) The offset of the character at which the successful or partial match
|
|
||||||
started. This can be different to the value of <i>ovector[0]</i> if the pattern
|
|
||||||
contains the \K escape sequence.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against the
|
|
||||||
string "123abcxxyydef123", the resulting offsets are:
|
|
||||||
<pre>
|
|
||||||
ovector[0] 8
|
|
||||||
ovector[1] 10
|
|
||||||
leftchar 3
|
|
||||||
rightchar 13
|
|
||||||
startchar 6
|
|
||||||
</pre>
|
|
||||||
The <b>allusedtext</b> modifier in <b>pcre2test</b> can be used to display a
|
|
||||||
longer string that shows the leftmost and rightmost characters in a match
|
|
||||||
instead of just the matched string.
|
|
||||||
<a name="errorlist"></a></P>
|
<a name="errorlist"></a></P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Error return values from <b>pcre2_match()</b>
|
Error return values from <b>pcre2_match()</b>
|
||||||
|
@ -2513,10 +2434,9 @@ Option bits for <b>pcre_dfa_match()</b>
|
||||||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||||
be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
|
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
|
||||||
PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and
|
||||||
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are
|
PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for
|
||||||
exactly the same as for <b>pcre2_match()</b>, so their description is not
|
<b>pcre2_match()</b>, so their description is not repeated here.
|
||||||
repeated here.
|
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_PARTIAL_HARD
|
PCRE2_PARTIAL_HARD
|
||||||
PCRE2_PARTIAL_SOFT
|
PCRE2_PARTIAL_SOFT
|
||||||
|
@ -2650,7 +2570,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 September 2014
|
Last updated: 14 October 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -131,7 +131,7 @@ long enough, or, for unanchored patterns, if it has been scanned far enough.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
|
You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
|
||||||
option to the matching function, or by starting the pattern with
|
option to <b>pcre2_compile()</b>, or by starting the pattern with
|
||||||
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
||||||
callouts such as the example above are obeyed.
|
callouts such as the example above are obeyed.
|
||||||
</P>
|
</P>
|
||||||
|
|
|
@ -128,9 +128,8 @@ or the JIT compiler was not able to handle the pattern.
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_match()</b> options that are supported for JIT matching are
|
The <b>pcre2_match()</b> options that are supported for JIT matching are
|
||||||
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||||
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The options
|
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The
|
||||||
that are not supported at match time are PCRE2_ANCHORED and
|
PCRE2_ANCHORED option is not supported at match time.
|
||||||
PCRE2_NO_START_OPTIMIZE, though they are supported if given at compile time.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The only unsupported pattern items are \C (match a single data unit) when
|
The only unsupported pattern items are \C (match a single data unit) when
|
||||||
|
|
|
@ -0,0 +1,464 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2partial specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2partial man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<ul>
|
||||||
|
<li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE2</a>
|
||||||
|
<li><a name="TOC2" href="#SEC2">PARTIAL MATCHING USING pcre2_match()</a>
|
||||||
|
<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_dfa_match()</a>
|
||||||
|
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHING AND WORD BOUNDARIES</a>
|
||||||
|
<li><a name="TOC5" href="#SEC5">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a>
|
||||||
|
<li><a name="TOC6" href="#SEC6">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a>
|
||||||
|
<li><a name="TOC7" href="#SEC7">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
|
||||||
|
<li><a name="TOC8" href="#SEC8">ISSUES WITH MULTI-SEGMENT MATCHING</a>
|
||||||
|
<li><a name="TOC9" href="#SEC9">AUTHOR</a>
|
||||||
|
<li><a name="TOC10" href="#SEC10">REVISION</a>
|
||||||
|
</ul>
|
||||||
|
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE2</a><br>
|
||||||
|
<P>
|
||||||
|
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||||
|
function matches as far as it goes, but is too short to match the entire
|
||||||
|
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||||
|
might be helpful to distinguish this case from other cases in which there is no
|
||||||
|
match.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Consider, for example, an application where a human is required to type in data
|
||||||
|
for a field with specific formatting requirements. An example might be a date
|
||||||
|
in the form <i>ddmmmyy</i>, defined by this pattern:
|
||||||
|
<pre>
|
||||||
|
^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
|
||||||
|
</pre>
|
||||||
|
If the application sees the user's keystrokes one by one, and can check that
|
||||||
|
what has been typed so far is potentially valid, it is able to raise an error
|
||||||
|
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||||
|
has been typed, for example. This immediate feedback is likely to be a better
|
||||||
|
user interface than a check that is delayed until the entire string has been
|
||||||
|
entered. Partial matching can also be useful when the subject string is very
|
||||||
|
long and is not all available at once.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||||
|
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||||
|
The difference between the two options is whether or not a partial match is
|
||||||
|
preferred to an alternative complete match, though the details differ between
|
||||||
|
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||||
|
takes precedence.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If you want to use partial matching with just-in-time optimized code, you must
|
||||||
|
call <b>pcre2_jit_compile()</b> with one or both of these options:
|
||||||
|
<pre>
|
||||||
|
PCRE2_JIT_PARTIAL_SOFT
|
||||||
|
PCRE2_JIT_PARTIAL_HARD
|
||||||
|
</pre>
|
||||||
|
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||||
|
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||||
|
interpretive matching code is used.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Setting a partial matching option disables two of PCRE2's standard
|
||||||
|
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||||
|
abandons matching immediately if it is not present in the subject string. This
|
||||||
|
optimization cannot be used for a subject string that might match only
|
||||||
|
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||||
|
not bother to run the matching function on shorter strings. This optimization
|
||||||
|
is also disabled for partial matching.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
|
||||||
|
<P>
|
||||||
|
A partial match occurs during a call to <b>pcre2_match()</b> when the end of the
|
||||||
|
subject string is reached successfully, but matching cannot continue because
|
||||||
|
more characters are needed. However, at least one character in the subject must
|
||||||
|
have been inspected. This character need not form part of the final matched
|
||||||
|
string; lookbehind assertions and the \K escape sequence provide ways of
|
||||||
|
inspecting characters before the start of a matched string. The requirement for
|
||||||
|
inspecting at least one character exists because an empty string can always be
|
||||||
|
matched; without such a restriction there would always be a partial match of an
|
||||||
|
empty string at the end of the subject.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
When a partial match is returned, the first two elements in the ovector point
|
||||||
|
to the portion of the subject that was matched. The appearance of \K in the
|
||||||
|
pattern has no effect for a partial match. Consider this pattern:
|
||||||
|
<pre>
|
||||||
|
/abc\K123/
|
||||||
|
</pre>
|
||||||
|
If it is matched against "456abc123xyz" the result is a complete match, and the
|
||||||
|
ovector defines the matched string as "123", because \K resets the "start of
|
||||||
|
match" point. However, if a partial match is requested and the subject string
|
||||||
|
is "456abc12", a partial match is found for the string "abc12", because all
|
||||||
|
these characters are needed for a subsequent re-match with additional
|
||||||
|
characters.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
What happens when a partial match is identified depends on which of the two
|
||||||
|
partial matching options is set.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
PCRE2_PARTIAL_SOFT WITH pcre2_match()
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
If PCRE2_PARTIAL_SOFT is set when <b>pcre2_match()</b> identifies a partial
|
||||||
|
match, the partial match is remembered, but matching continues as normal, and
|
||||||
|
other alternatives in the pattern are tried. If no complete match can be found,
|
||||||
|
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
This option is "soft" because it prefers a complete match over a partial match.
|
||||||
|
All the various matching items in a pattern behave as if the subject string is
|
||||||
|
potentially complete. For example, \z, \Z, and $ match at the end of the
|
||||||
|
subject, as normal, and for \b and \B the end of the subject is treated as a
|
||||||
|
non-alphanumeric.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If there is more than one partial match, the first one that was found provides
|
||||||
|
the data that is returned. Consider this pattern:
|
||||||
|
<pre>
|
||||||
|
/123\w+X|dogY/
|
||||||
|
</pre>
|
||||||
|
If this is matched against the subject string "abc123dog", both
|
||||||
|
alternatives fail to match, but the end of the subject is reached during
|
||||||
|
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||||
|
identifying "123dog" as the first partial match that was found. (In this
|
||||||
|
example, there are two partial matches, because "dog" on its own partially
|
||||||
|
matches the second alternative.)
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
PCRE2_PARTIAL_HARD WITH pcre2_match()
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
If PCRE2_PARTIAL_HARD is set for <b>pcre2_match()</b>, PCRE2_ERROR_PARTIAL is
|
||||||
|
returned as soon as a partial match is found, without continuing to search for
|
||||||
|
possible complete matches. This option is "hard" because it prefers an earlier
|
||||||
|
partial match over a later complete match. For this reason, the assumption is
|
||||||
|
made that the end of the supplied subject string may not be the true end of the
|
||||||
|
available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end
|
||||||
|
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||||
|
character in the subject has been inspected.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
Comparing hard and soft partial matching
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
The difference between the two partial matching options can be illustrated by a
|
||||||
|
pattern such as:
|
||||||
|
<pre>
|
||||||
|
/dog(sbody)?/
|
||||||
|
</pre>
|
||||||
|
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
|
||||||
|
longer string if possible). If it is matched against the string "dog" with
|
||||||
|
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
|
||||||
|
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
|
||||||
|
hand, if the pattern is made ungreedy the result is different:
|
||||||
|
<pre>
|
||||||
|
/dog(sbody)??/
|
||||||
|
</pre>
|
||||||
|
In this case the result is always a complete match because that is found first,
|
||||||
|
and matching never continues after finding a complete match. It might be easier
|
||||||
|
to follow this explanation by thinking of the two patterns like this:
|
||||||
|
<pre>
|
||||||
|
/dog(sbody)?/ is the same as /dogsbody|dog/
|
||||||
|
/dog(sbody)??/ is the same as /dog|dogsbody/
|
||||||
|
</pre>
|
||||||
|
The second pattern will never match "dogsbody", because it will always find the
|
||||||
|
shorter match first.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
|
||||||
|
<P>
|
||||||
|
The DFA functions move along the subject string character by character, without
|
||||||
|
backtracking, searching for all possible matches simultaneously. If the end of
|
||||||
|
the subject is reached before the end of the pattern, there is the possibility
|
||||||
|
of a partial match, again provided that at least one character has been
|
||||||
|
inspected.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||||
|
have been no complete matches. Otherwise, the complete matches are returned.
|
||||||
|
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||||
|
any complete matches. The portion of the string that was matched when the
|
||||||
|
longest partial match was found is set as the first matching string.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Because the DFA functions always search for all possible matches, and there is
|
||||||
|
no difference between greedy and ungreedy repetition, their behaviour is
|
||||||
|
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||||
|
the string "dog" matched against the ungreedy pattern shown above:
|
||||||
|
<pre>
|
||||||
|
/dog(sbody)??/
|
||||||
|
</pre>
|
||||||
|
Whereas the standard functions stop as soon as they find the complete match for
|
||||||
|
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||||
|
return that when PCRE2_PARTIAL_HARD is set.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
||||||
|
<P>
|
||||||
|
If a pattern ends with one of the sequences \b or \B, which test for word
|
||||||
|
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||||
|
results. Consider this pattern:
|
||||||
|
<pre>
|
||||||
|
/\bcat\b/
|
||||||
|
</pre>
|
||||||
|
This matches "cat", provided there is a word boundary at either end. If the
|
||||||
|
subject string is "the cat", the comparison of the final "t" with a following
|
||||||
|
character cannot take place, so a partial match is found. However, normal
|
||||||
|
matching carries on, and \b matches at the end of the subject when the last
|
||||||
|
character is a letter, so a complete match is found. The result, therefore, is
|
||||||
|
<i>not</i> PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||||
|
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC5" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a><br>
|
||||||
|
<P>
|
||||||
|
If the <b>partial_soft</b> (or <b>ps</b>) modifier is present on a
|
||||||
|
<b>pcre2test</b> data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||||
|
Here is a run of <b>pcre2test</b> that uses the date example quoted above:
|
||||||
|
<pre>
|
||||||
|
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||||
|
data> 25jun04\=ps
|
||||||
|
0: 25jun04
|
||||||
|
1: jun
|
||||||
|
data> 25dec3\=ps
|
||||||
|
Partial match: 25dec3
|
||||||
|
data> 3ju\=ps
|
||||||
|
Partial match: 3ju
|
||||||
|
data> 3juj\=ps
|
||||||
|
No match
|
||||||
|
data> j\=ps
|
||||||
|
No match
|
||||||
|
</pre>
|
||||||
|
The first data string is matched completely, so <b>pcre2test</b> shows the
|
||||||
|
matched substrings. The remaining four strings do not match the complete
|
||||||
|
pattern, but the first two are partial matches. Similar output is obtained
|
||||||
|
if DFA matching is used.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If the <b>partial_hard</b> (or <b>ph</b>) modifier is present on a
|
||||||
|
<b>pcre2test</b> data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC6" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a><br>
|
||||||
|
<P>
|
||||||
|
When a partial match has been found using a DFA matching function, it is
|
||||||
|
possible to continue the match by providing additional subject data and calling
|
||||||
|
the function again with the same compiled regular expression, this time setting
|
||||||
|
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||||
|
because this is where details of the previous partial match are stored. Here is
|
||||||
|
an example using <b>pcre2test</b>:
|
||||||
|
<pre>
|
||||||
|
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||||
|
data> 23ja\=dfa,ps
|
||||||
|
Partial match: 23ja
|
||||||
|
data> n05\=dfa,dfa_restart
|
||||||
|
0: n05
|
||||||
|
</pre>
|
||||||
|
The first call has "23ja" as the subject, and requests partial matching; the
|
||||||
|
second call has "n05" as the subject for the continued (restarted) match.
|
||||||
|
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||||
|
not retain the previously partially-matched string. It is up to the calling
|
||||||
|
program to do that if it needs to.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||||
|
not possible to try again at a new starting point. All this facility is capable
|
||||||
|
of doing is continuing with the previous match attempt. In the previous
|
||||||
|
example, if the second set of data is "ug23" the result is no match, even
|
||||||
|
though there would be a match for "aug23" if the entire string were given at
|
||||||
|
once. Depending on the application, this may or may not be what you want.
|
||||||
|
The only way to allow for starting again at the next character is to retain the
|
||||||
|
matched part of the subject and try a new complete match.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||||
|
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||||
|
facility can be used to pass very long subject strings to the DFA matching
|
||||||
|
functions.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC7" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
|
||||||
|
<P>
|
||||||
|
Unlike the DFA function, it is not possible to restart the previous match with
|
||||||
|
a new segment of data when using <b>pcre2_match()</b>. Instead, new data must be
|
||||||
|
added to the previous subject string, and the entire match re-run, starting
|
||||||
|
from the point where the partial match occurred. Earlier data can be discarded.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||||
|
treat the end of a segment as the end of the subject when matching \z, \Z,
|
||||||
|
\b, \B, and $. Consider an unanchored pattern that matches dates:
|
||||||
|
<pre>
|
||||||
|
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
|
||||||
|
data> The date is 23ja\=ph
|
||||||
|
Partial match: 23ja
|
||||||
|
</pre>
|
||||||
|
At this stage, an application could discard the text preceding "23ja", add on
|
||||||
|
text from the next segment, and call the matching function again. Unlike the
|
||||||
|
DFA matching function, the entire matching string must always be available,
|
||||||
|
and the complete matching process occurs for each call, so more memory and more
|
||||||
|
processing time are needed.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC8" href="#TOC1">ISSUES WITH MULTI-SEGMENT MATCHING</a><br>
|
||||||
|
<P>
|
||||||
|
Certain types of pattern may give problems with multi-segment matching,
|
||||||
|
whichever matching function is used.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||||
|
the PCRE2_NOTBOL option when the subject string for any call does not start at the
|
||||||
|
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||||
|
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||||
|
includes the effect of PCRE2_NOTEOL.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||||
|
start of the partial match may have been inspected during the matching process.
|
||||||
|
When using <b>pcre2_match()</b>, sufficient characters must be retained for the
|
||||||
|
next match attempt. You can ensure that enough characters are retained by doing
|
||||||
|
the following:
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Before doing any matching, find the length of the longest lookbehind in the
|
||||||
|
pattern by calling <b>pcre2_pattern_info()</b> with the PCRE2_INFO_MAXLOOKBEHIND
|
||||||
|
option. Note that the resulting count is in characters, not code units. After a
|
||||||
|
partial match, moving back from the ovector[0] offset in the subject by the
|
||||||
|
number of characters given for the maximum lookbehind gets you to the earliest
|
||||||
|
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||||
|
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||||
|
while moving back through the code units.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Characters before the point you have now reached can be discarded, and after
|
||||||
|
the next segment has been added to what is retained, you should run the next
|
||||||
|
match with the <b>startoffset</b> argument set so that the match begins at the
|
||||||
|
same point as before.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||||
|
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||||
|
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||||
|
value of <b>startoffset</b> for the next match should be 3. When <b>pcre2test</b>
|
||||||
|
displays a partial match, it indicates the lookbehind characters with '<'
|
||||||
|
characters:
|
||||||
|
<pre>
|
||||||
|
re> "(?<=123)abc"
|
||||||
|
data> xx123ab\=ph
|
||||||
|
Partial match: 123ab
|
||||||
|
<<<
|
||||||
|
</PRE>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
3. Because a partial match must always contain at least one character, what
|
||||||
|
might be considered a partial match of an empty string actually gives a "no
|
||||||
|
match" result. For example:
|
||||||
|
<pre>
|
||||||
|
re> /c(?<=abc)x/
|
||||||
|
data> ab\=ps
|
||||||
|
No match
|
||||||
|
</pre>
|
||||||
|
If the next segment begins "cx", a match should be found, but this will only
|
||||||
|
happen if characters from the previous segment are retained. For this reason, a
|
||||||
|
"no match" result should be interpreted as "partial match of an empty string"
|
||||||
|
when the pattern contains lookbehinds.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
4. Matching a subject string that is split into multiple segments may not
|
||||||
|
always produce exactly the same result as matching over one single long string,
|
||||||
|
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||||
|
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||||
|
\b or \B. Another kind of difference may occur when there are multiple
|
||||||
|
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||||
|
is given only when there are no completed matches. This means that as soon as
|
||||||
|
the shortest match has been found, continuation to a new subject segment is no
|
||||||
|
longer possible. Consider this <b>pcre2test</b> example:
|
||||||
|
<pre>
|
||||||
|
re> /dog(sbody)?/
|
||||||
|
data> dogsb\=ps
|
||||||
|
0: dog
|
||||||
|
data> do\=ps,dfa
|
||||||
|
Partial match: do
|
||||||
|
data> gsb\=ps,dfa,dfa_restart
|
||||||
|
0: g
|
||||||
|
data> dogsbody\=dfa
|
||||||
|
0: dogsbody
|
||||||
|
1: dog
|
||||||
|
</pre>
|
||||||
|
The first data line passes the string "dogsb" to a standard matching function,
|
||||||
|
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||||
|
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||||
|
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||||
|
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||||
|
the match stops when "dog" has been found, and it is not possible to continue.
|
||||||
|
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||||
|
matching function finds both matches.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||||
|
multi-segment data. The example above then behaves differently:
|
||||||
|
<pre>
|
||||||
|
re> /dog(sbody)?/
|
||||||
|
data> dogsb\=ph
|
||||||
|
Partial match: dogsb
|
||||||
|
data> do\=ps,dfa
|
||||||
|
Partial match: do
|
||||||
|
data> gsb\=ph,dfa,dfa_restart
|
||||||
|
Partial match: gsb
|
||||||
|
</pre>
|
||||||
|
5. Patterns that contain alternatives at the top level which do not all start
|
||||||
|
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||||
|
used. For example, consider this pattern:
|
||||||
|
<pre>
|
||||||
|
1234|3789
|
||||||
|
</pre>
|
||||||
|
If the first part of the subject is "ABC123", a partial match of the first
|
||||||
|
alternative is found at offset 3. There is no partial match for the second
|
||||||
|
alternative, because such a match does not start at the same point in the
|
||||||
|
subject string. Attempting to continue with the string "7890" does not yield a
|
||||||
|
match because only those alternatives that match at one point in the subject
|
||||||
|
are remembered. The problem arises because the start of the second alternative
|
||||||
|
matches within the first alternative. There is no problem with anchored
|
||||||
|
patterns or patterns such as:
|
||||||
|
<pre>
|
||||||
|
1234|ABCD
|
||||||
|
</pre>
|
||||||
|
where no string can be a partial match for both alternatives. This is not a
|
||||||
|
problem if a standard matching function is used, because the entire match has
|
||||||
|
to be rerun each time:
|
||||||
|
<pre>
|
||||||
|
re> /1234|3789/
|
||||||
|
data> ABC123\=ph
|
||||||
|
Partial match: 123
|
||||||
|
data> 1237890
|
||||||
|
0: 3789
|
||||||
|
</pre>
|
||||||
|
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||||
|
the entire match can also be used with the DFA matching function. Another
|
||||||
|
possibility is to work with two buffers. If a partial match at offset <i>n</i>
|
||||||
|
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||||
|
the second buffer, you can then try a new match starting at offset <i>n+1</i> in
|
||||||
|
the first buffer.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC9" href="#TOC1">AUTHOR</a><br>
|
||||||
|
<P>
|
||||||
|
Philip Hazel
|
||||||
|
<br>
|
||||||
|
University Computing Service
|
||||||
|
<br>
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
<br>
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
|
||||||
|
<P>
|
||||||
|
Last updated: 14 October 2014
|
||||||
|
<br>
|
||||||
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
|
<br>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -476,6 +476,7 @@ about the pattern:
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex pattern is coded in hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
|
jitverify verify JIT use
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
|
@ -503,10 +504,6 @@ The <b>newline</b> modifier specifies which characters are to be interpreted as
|
||||||
newlines, both in the pattern and (by default) in subject lines. The type must
|
newlines, both in the pattern and (by default) in subject lines. The type must
|
||||||
be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
|
||||||
Both the \R and newline settings can be changed at match time, but if this is
|
|
||||||
done, JIT matching is disabled.
|
|
||||||
</P>
|
|
||||||
<br><b>
|
<br><b>
|
||||||
Information about a pattern
|
Information about a pattern
|
||||||
</b><br>
|
</b><br>
|
||||||
|
@ -556,29 +553,32 @@ length of the pattern is passed. This is implied if <b>hex</b> is set.
|
||||||
JIT compilation
|
JIT compilation
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>/jit</b> modifier may optionally be followed by a number in the range 0
|
The <b>/jit</b> modifier may optionally be followed by an equals sign and a
|
||||||
to 7:
|
number in the range 0 to 7:
|
||||||
<pre>
|
<pre>
|
||||||
0 disable JIT
|
0 disable JIT
|
||||||
1 normal match only
|
1 use JIT for normal match only
|
||||||
2 soft partial match only
|
2 use JIT for soft partial match only
|
||||||
3 normal match and soft partial match
|
3 use JIT for normal match and soft partial match
|
||||||
4 hard partial match only
|
4 use JIT for hard partial match only
|
||||||
6 soft and hard partial match
|
6 use JIT for soft and hard partial match
|
||||||
7 all three modes
|
7 all three modes
|
||||||
</pre>
|
</pre>
|
||||||
If no number is given, 7 is assumed. If JIT compilation is successful, the
|
If no number is given, 7 is assumed. If JIT compilation is successful, the
|
||||||
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run,
|
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run
|
||||||
except when incompatible run-time options are specified. For more details, see
|
for the appropriate type of match, except when incompatible run-time options
|
||||||
the
|
are specified. For more details, see the
|
||||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||||
documentation. See also the <b>jitstack</b> modifier below for a way of
|
documentation. See also the <b>jitstack</b> modifier below for a way of
|
||||||
setting the size of the JIT stack.
|
setting the size of the JIT stack.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the <b>jitverify</b> modifier is specified, the text "(JIT)" is added to the
|
If the <b>jitverify</b> modifier is specified, information about the compiled
|
||||||
first output line after a match or non match when JIT-compiled code was
|
pattern shows whether JIT compilation was or was not successful. If
|
||||||
actually used. This modifier can also be set on a subject line.
|
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
|
||||||
|
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
|
||||||
|
added to the first output line after a match or non match when JIT-compiled
|
||||||
|
code was actually used.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Setting a locale
|
Setting a locale
|
||||||
|
@ -680,7 +680,6 @@ not affect the compilation process.
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitverify verify JIT usage
|
|
||||||
mark show mark values
|
mark show mark values
|
||||||
</pre>
|
</pre>
|
||||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||||
|
@ -703,7 +702,6 @@ for a description of their effects.
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
dfa_restart set PCRE2_DFA_RESTART
|
dfa_restart set PCRE2_DFA_RESTART
|
||||||
dfa_shortest set PCRE2_DFA_SHORTEST
|
dfa_shortest set PCRE2_DFA_SHORTEST
|
||||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
notbol set PCRE2_NOTBOL
|
notbol set PCRE2_NOTBOL
|
||||||
notempty set PCRE2_NOTEMPTY
|
notempty set PCRE2_NOTEMPTY
|
||||||
|
@ -734,9 +732,8 @@ pattern.
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
bsr=[anycrlf|unicode] specify \R handling
|
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
callout_data=<n> set a value to pass via callouts
|
callout_data=<n> set a value to pass via callouts
|
||||||
callout_fail=<n>[:<m>] control callout failure
|
callout_fail=<n>[:<m>] control callout failure
|
||||||
|
@ -748,11 +745,9 @@ pattern.
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
jitverify verify JIT usage
|
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=<n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
newline=<type> set newline type
|
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
|
@ -761,14 +756,6 @@ The effects of these modifiers are described in the following sections.
|
||||||
FIXME: Give more examples.
|
FIXME: Give more examples.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Newline and \R handling
|
|
||||||
</b><br>
|
|
||||||
<P>
|
|
||||||
These modifiers set the newline and \R processing conventions for the subject
|
|
||||||
line, overriding any values that were set at compile time (as described above).
|
|
||||||
JIT matching is disabled if these settings are changed at match time.
|
|
||||||
</P>
|
|
||||||
<br><b>
|
|
||||||
Showing more text
|
Showing more text
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -782,10 +769,12 @@ plus character following the capture number.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||||
during a successful pattern match be shown. This affects the output if there
|
during a successful pattern match by the interpreter should be shown. This
|
||||||
is a lookbehind at the start of a match, or a lookahead at the end, or if \K
|
feature is not supported for JIT matching, and if requested with JIT it is
|
||||||
is used in the pattern. Characters that precede or follow the start and end of
|
ignored (with a warning message). Setting this modifier affects the output if
|
||||||
the actual match are indicated in the output by '<' or '>' characters
|
there is a lookbehind at the start of a match, or a lookahead at the end, or if
|
||||||
|
\K is used in the pattern. Characters that precede or follow the start and end
|
||||||
|
of the actual match are indicated in the output by '<' or '>' characters
|
||||||
underneath them. Here is an example:
|
underneath them. Here is an example:
|
||||||
<pre>
|
<pre>
|
||||||
/(?<=pqr)abc(?=xyz)/
|
/(?<=pqr)abc(?=xyz)/
|
||||||
|
@ -903,6 +892,11 @@ until it finds the minimum values for each parameter that allow
|
||||||
<b>pcre2_match()</b> to complete without error.
|
<b>pcre2_match()</b> to complete without error.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
If JIT is being used, only the match limit is relevant. If DFA matching is
|
||||||
|
being used, neither limit is relevant, and this modifier is ignored (with a
|
||||||
|
warning message).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
The <i>match_limit</i> number is a measure of the amount of backtracking
|
The <i>match_limit</i> number is a measure of the amount of backtracking
|
||||||
that takes place, and learning the minimum value can be instructive. For most
|
that takes place, and learning the minimum value can be instructive. For most
|
||||||
simple matches, the number is quite small, but for patterns with very large
|
simple matches, the number is quite small, but for patterns with very large
|
||||||
|
@ -944,6 +938,13 @@ appears, though of course it can also be used to set a default in a
|
||||||
<b>#subject</b> command. It specifies the number of pairs of offsets that are
|
<b>#subject</b> command. It specifies the number of pairs of offsets that are
|
||||||
available for storing matching information. The default is 15.
|
available for storing matching information. The default is 15.
|
||||||
</P>
|
</P>
|
||||||
|
<P>
|
||||||
|
At least one pair of offsets is always created by
|
||||||
|
<b>pcre2_match_data_create()</b>, for matching with PCRE2's native API, so a
|
||||||
|
value of 0 is the same as 1. However, a value of 0 is useful when testing the
|
||||||
|
POSIX API because it causes <b>regexec()</b> to be called with a NULL capture
|
||||||
|
vector.
|
||||||
|
</P>
|
||||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||||
<P>
|
<P>
|
||||||
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
||||||
|
@ -1190,7 +1191,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 19 August 2014
|
Last updated: 11 October 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
231
doc/pcre2.txt
231
doc/pcre2.txt
|
@ -52,16 +52,12 @@ PCRE2 NATIVE API BASIC FUNCTIONS
|
||||||
|
|
||||||
PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS
|
PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data);
|
|
||||||
|
|
||||||
PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
|
PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
|
||||||
|
|
||||||
uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
|
uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
|
||||||
|
|
||||||
PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
|
PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data);
|
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
|
PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,13 +83,13 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS
|
||||||
|
|
||||||
void pcre2_compile_context_free(pcre2_compile_context *ccontext);
|
void pcre2_compile_context_free(pcre2_compile_context *ccontext);
|
||||||
|
|
||||||
int pcre2_set_bsr_compile(pcre2_compile_context *ccontext,
|
int pcre2_set_bsr(pcre2_compile_context *ccontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
int pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
int pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
||||||
const unsigned char *tables);
|
const unsigned char *tables);
|
||||||
|
|
||||||
int pcre2_set_newline_compile(pcre2_compile_context *ccontext,
|
int pcre2_set_newline(pcre2_compile_context *ccontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
|
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
|
||||||
|
@ -113,9 +109,6 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
||||||
|
|
||||||
void pcre2_match_context_free(pcre2_match_context *mcontext);
|
void pcre2_match_context_free(pcre2_match_context *mcontext);
|
||||||
|
|
||||||
int pcre2_set_bsr_match(pcre2_match_context *mcontext,
|
|
||||||
uint32_t value);
|
|
||||||
|
|
||||||
int pcre2_set_callout(pcre2_match_context *mcontext,
|
int pcre2_set_callout(pcre2_match_context *mcontext,
|
||||||
int (*callout_function)(pcre2_callout_block *),
|
int (*callout_function)(pcre2_callout_block *),
|
||||||
void *callout_data);
|
void *callout_data);
|
||||||
|
@ -123,9 +116,6 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS
|
||||||
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
int pcre2_set_match_limit(pcre2_match_context *mcontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
int pcre2_set_newline_match(pcre2_match_context *mcontext,
|
|
||||||
uint32_t value);
|
|
||||||
|
|
||||||
int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
|
int pcre2_set_recursion_limit(pcre2_match_context *mcontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
|
@ -501,7 +491,7 @@ PCRE2 CONTEXTS
|
||||||
These can be changed by calling the following functions, which return 0
|
These can be changed by calling the following functions, which return 0
|
||||||
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
|
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
|
|
||||||
int pcre2_set_bsr_compile(pcre2_compile_context *ccontext,
|
int pcre2_set_bsr(pcre2_compile_context *ccontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
|
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
|
||||||
|
@ -509,9 +499,7 @@ PCRE2 CONTEXTS
|
||||||
Unicode line ending sequence. The value of this parameter does not
|
Unicode line ending sequence. The value of this parameter does not
|
||||||
affect what is compiled; it is just saved with the compiled pattern.
|
affect what is compiled; it is just saved with the compiled pattern.
|
||||||
The value is used by the JIT compiler and by the two interpreted match-
|
The value is used by the JIT compiler and by the two interpreted match-
|
||||||
ing functions, pcre2_match() and pcre2_dfa_match(). You can change the
|
ing functions, pcre2_match() and pcre2_dfa_match().
|
||||||
value when calling these functions, but doing so disables the use of
|
|
||||||
JIT.
|
|
||||||
|
|
||||||
int pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
int pcre2_set_character_tables(pcre2_compile_context *ccontext,
|
||||||
const unsigned char *tables);
|
const unsigned char *tables);
|
||||||
|
@ -520,7 +508,7 @@ PCRE2 CONTEXTS
|
||||||
only argument is a general context. This function builds a set of char-
|
only argument is a general context. This function builds a set of char-
|
||||||
acter tables in the current locale.
|
acter tables in the current locale.
|
||||||
|
|
||||||
int pcre2_set_newline_compile(pcre2_compile_context *ccontext,
|
int pcre2_set_newline(pcre2_compile_context *ccontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
|
||||||
This specifies which characters or character sequences are to be recog-
|
This specifies which characters or character sequences are to be recog-
|
||||||
|
@ -533,9 +521,7 @@ PCRE2 CONTEXTS
|
||||||
this parameter affects the recognition of white space and the end of
|
this parameter affects the recognition of white space and the end of
|
||||||
internal comments starting with #. The value is saved with the compiled
|
internal comments starting with #. The value is saved with the compiled
|
||||||
pattern for subsequent use by the JIT compiler and by the two inter-
|
pattern for subsequent use by the JIT compiler and by the two inter-
|
||||||
preted matching functions, pcre2_match() and pcre2_dfa_match(). You can
|
preted matching functions, pcre2_match() and pcre2_dfa_match().
|
||||||
change the value when calling these functions, but doing so disables
|
|
||||||
the use of JIT.
|
|
||||||
|
|
||||||
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
|
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
|
||||||
uint32_t value);
|
uint32_t value);
|
||||||
|
@ -588,15 +574,6 @@ PCRE2 CONTEXTS
|
||||||
These can be changed by calling the following functions, which return 0
|
These can be changed by calling the following functions, which return 0
|
||||||
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
|
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||||
|
|
||||||
int pcre2_set_bsr_match(pcre2_match_context *mcontext,
|
|
||||||
uint32_t value);
|
|
||||||
|
|
||||||
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only
|
|
||||||
CR, LF, or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any
|
|
||||||
Unicode line ending sequence. If you want to make use of JIT matching,
|
|
||||||
you should not use this function, but instead set the value in a com-
|
|
||||||
pile context.
|
|
||||||
|
|
||||||
int pcre2_set_callout(pcre2_match_context *mcontext,
|
int pcre2_set_callout(pcre2_match_context *mcontext,
|
||||||
int (*callout_function)(pcre2_callout_block *),
|
int (*callout_function)(pcre2_callout_block *),
|
||||||
void *callout_data);
|
void *callout_data);
|
||||||
|
@ -668,17 +645,6 @@ PCRE2 CONTEXTS
|
||||||
unless ddd is less than the limit set by the caller of pcre2_match()
|
unless ddd is less than the limit set by the caller of pcre2_match()
|
||||||
or, if no such limit is set, less than the default.
|
or, if no such limit is set, less than the default.
|
||||||
|
|
||||||
int pcre2_set_newline_match(pcre2_match_context *mcontext,
|
|
||||||
uint32_t value);
|
|
||||||
|
|
||||||
This specifies which characters or character sequences are to be recog-
|
|
||||||
nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
|
|
||||||
return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
|
|
||||||
two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
|
|
||||||
of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence). If
|
|
||||||
you want to make use of JIT matching, you should not use this function,
|
|
||||||
but instead set the value in a compile context.
|
|
||||||
|
|
||||||
int pcre2_set_recursion_memory_management(
|
int pcre2_set_recursion_memory_management(
|
||||||
pcre2_match_context *mcontext,
|
pcre2_match_context *mcontext,
|
||||||
void *(*private_malloc)(PCRE2_SIZE, void *),
|
void *(*private_malloc)(PCRE2_SIZE, void *),
|
||||||
|
@ -852,9 +818,8 @@ COMPILING A PATTERN
|
||||||
|
|
||||||
For those options that can be different in different parts of the pat-
|
For those options that can be different in different parts of the pat-
|
||||||
tern, the contents of the options argument specifies their settings at
|
tern, the contents of the options argument specifies their settings at
|
||||||
the start of compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
|
the start of compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK
|
||||||
PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as
|
options can be set at the time of matching as well as at compile time.
|
||||||
well as at compile time.
|
|
||||||
|
|
||||||
Other, less frequently required compile-time parameters (for example,
|
Other, less frequently required compile-time parameters (for example,
|
||||||
the newline setting) can be provided in a compile context (as described
|
the newline setting) can be provided in a compile context (as described
|
||||||
|
@ -1065,13 +1030,56 @@ COMPILING A PATTERN
|
||||||
|
|
||||||
PCRE2_NO_START_OPTIMIZE
|
PCRE2_NO_START_OPTIMIZE
|
||||||
|
|
||||||
This is an option that acts at matching time; that is, it is really an
|
This is an option whose main effect is at matching time. It does not
|
||||||
option for pcre2_match() or pcre2_dfa_match(). If it is set at compile
|
change what pcre2_compile() generates, but it does affect the output of
|
||||||
time, it is remembered with the compiled pattern and assumed at match-
|
the JIT compiler.
|
||||||
ing time. This is necessary if you want to use JIT execution, because
|
|
||||||
the JIT compiler needs to know whether or not this option is set. For
|
There are a number of optimizations that may occur at the start of a
|
||||||
details, see the discussion of PCRE2_NO_START_OPTIMIZE in the section
|
match, in order to speed up the process. For example, if it is known
|
||||||
on pcre2_match() options below.
|
that an unanchored match must start with a specific character, the
|
||||||
|
matching code searches the subject for that character, and fails imme-
|
||||||
|
diately if it cannot find it, without actually running the main match-
|
||||||
|
ing function. This means that a special item such as (*COMMIT) at the
|
||||||
|
start of a pattern is not considered until after a suitable starting
|
||||||
|
point for the match has been found. Also, when callouts or (*MARK)
|
||||||
|
items are in use, these "start-up" optimizations can cause them to be
|
||||||
|
skipped if the pattern is never actually used. The start-up optimiza-
|
||||||
|
tions are in effect a pre-scan of the subject that takes place before
|
||||||
|
the pattern is run.
|
||||||
|
|
||||||
|
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
||||||
|
possibly causing performance to suffer, but ensuring that in cases
|
||||||
|
where the result is "no match", the callouts do occur, and that items
|
||||||
|
such as (*COMMIT) and (*MARK) are considered at every possible starting
|
||||||
|
position in the subject string.
|
||||||
|
|
||||||
|
Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching
|
||||||
|
operation. Consider the pattern
|
||||||
|
|
||||||
|
(*COMMIT)ABC
|
||||||
|
|
||||||
|
When this is compiled, PCRE2 records the fact that a match must start
|
||||||
|
with the character "A". Suppose the subject string is "DEFABC". The
|
||||||
|
start-up optimization scans along the subject, finds "A" and runs the
|
||||||
|
first match attempt from there. The (*COMMIT) item means that the pat-
|
||||||
|
tern must match the current starting position, which in this case, it
|
||||||
|
does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
|
||||||
|
set, the initial scan along the subject string does not happen. The
|
||||||
|
first match attempt is run starting from "D" and when this fails,
|
||||||
|
(*COMMIT) prevents any further matches being tried, so the overall
|
||||||
|
result is "no match". There are also other start-up optimizations. For
|
||||||
|
example, a minimum length for the subject may be recorded. Consider the
|
||||||
|
pattern
|
||||||
|
|
||||||
|
(*MARK:A)(X|Y)
|
||||||
|
|
||||||
|
The minimum length for a match is one character. If the subject is
|
||||||
|
"ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
|
||||||
|
to match an empty string at the end of the subject does not take place,
|
||||||
|
because PCRE2 knows that the subject is now too short, and so the
|
||||||
|
(*MARK) is never encountered. In this case, the optimization does not
|
||||||
|
affect the overall match result, which is still "no match", but it does
|
||||||
|
affect the auxiliary information that is returned.
|
||||||
|
|
||||||
PCRE2_NO_UTF_CHECK
|
PCRE2_NO_UTF_CHECK
|
||||||
|
|
||||||
|
@ -1524,7 +1532,9 @@ THE MATCH DATA BLOCK
|
||||||
string that matched the whole pattern, with another pair for each cap-
|
string that matched the whole pattern, with another pair for each cap-
|
||||||
tured substring. For example, a value of 4 creates enough space to
|
tured substring. For example, a value of 4 creates enough space to
|
||||||
record the matched portion of the subject plus three captured sub-
|
record the matched portion of the subject plus three captured sub-
|
||||||
strings.
|
strings. A minimum of 1 pair is imposed by
|
||||||
|
pcre2_match_data_create(), so it is always possible to return the over-
|
||||||
|
all matched string.
|
||||||
|
|
||||||
For pcre2_match_data_create_from_pattern(), the first argument is a
|
For pcre2_match_data_create_from_pattern(), the first argument is a
|
||||||
pointer to a compiled pattern. In this case the ovector is created to
|
pointer to a compiled pattern. In this case the ovector is created to
|
||||||
|
@ -1636,8 +1646,8 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
||||||
The unused bits of the options argument for pcre2_match() must be zero.
|
The unused bits of the options argument for pcre2_match() must be zero.
|
||||||
The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||||
PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
|
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their
|
||||||
PCRE2_PARTIAL_SOFT. Their action is described below.
|
action is described below.
|
||||||
|
|
||||||
If the pattern was successfully processed by the just-in-time (JIT)
|
If the pattern was successfully processed by the just-in-time (JIT)
|
||||||
compiler, the only supported options for matching using the JIT code
|
compiler, the only supported options for matching using the JIT code
|
||||||
|
@ -1691,58 +1701,6 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION
|
||||||
not at the start of the subject is permitted. If the pattern is
|
not at the start of the subject is permitted. If the pattern is
|
||||||
anchored, such a match can occur only if the pattern contains \K.
|
anchored, such a match can occur only if the pattern contains \K.
|
||||||
|
|
||||||
PCRE2_NO_START_OPTIMIZE
|
|
||||||
|
|
||||||
There are a number of optimizations that pcre2_match() uses at the
|
|
||||||
start of a match, in order to speed up the process. For example, if it
|
|
||||||
is known that an unanchored match must start with a specific character,
|
|
||||||
it searches the subject for that character, and fails immediately if it
|
|
||||||
cannot find it, without actually running the main matching function.
|
|
||||||
This means that a special item such as (*COMMIT) at the start of a pat-
|
|
||||||
tern is not considered until after a suitable starting point for the
|
|
||||||
match has been found. Also, when callouts or (*MARK) items are in use,
|
|
||||||
these "start-up" optimizations can cause them to be skipped if the pat-
|
|
||||||
tern is never actually used. The start-up optimizations are in effect a
|
|
||||||
pre-scan of the subject that takes place before the pattern is run.
|
|
||||||
|
|
||||||
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
|
||||||
possibly causing performance to suffer, but ensuring that in cases
|
|
||||||
where the result is "no match", the callouts do occur, and that items
|
|
||||||
such as (*COMMIT) and (*MARK) are considered at every possible starting
|
|
||||||
position in the subject string. If PCRE2_NO_START_OPTIMIZE is set at
|
|
||||||
compile time, it cannot be unset at matching time. The use of
|
|
||||||
PCRE2_NO_START_OPTIMIZE at matching time (that is, passing it to
|
|
||||||
pcre2_match()) disables JIT execution; in this situation, matching is
|
|
||||||
always done interpretively.
|
|
||||||
|
|
||||||
Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching
|
|
||||||
operation. Consider the pattern
|
|
||||||
|
|
||||||
(*COMMIT)ABC
|
|
||||||
|
|
||||||
When this is compiled, PCRE2 records the fact that a match must start
|
|
||||||
with the character "A". Suppose the subject string is "DEFABC". The
|
|
||||||
start-up optimization scans along the subject, finds "A" and runs the
|
|
||||||
first match attempt from there. The (*COMMIT) item means that the pat-
|
|
||||||
tern must match the current starting position, which in this case, it
|
|
||||||
does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
|
|
||||||
set, the initial scan along the subject string does not happen. The
|
|
||||||
first match attempt is run starting from "D" and when this fails,
|
|
||||||
(*COMMIT) prevents any further matches being tried, so the overall
|
|
||||||
result is "no match". There are also other start-up optimizations. For
|
|
||||||
example, a minimum length for the subject may be recorded. Consider the
|
|
||||||
pattern
|
|
||||||
|
|
||||||
(*MARK:A)(X|Y)
|
|
||||||
|
|
||||||
The minimum length for a match is one character. If the subject is
|
|
||||||
"ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
|
|
||||||
to match an empty string at the end of the subject does not take place,
|
|
||||||
because PCRE2 knows that the subject is now too short, and so the
|
|
||||||
(*MARK) is never encountered. In this case, the optimization does not
|
|
||||||
affect the overall match result, which is still "no match", but it does
|
|
||||||
affect the auxiliary information that is returned.
|
|
||||||
|
|
||||||
PCRE2_NO_UTF_CHECK
|
PCRE2_NO_UTF_CHECK
|
||||||
|
|
||||||
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
When PCRE2_UTF is set at compile time, the validity of the subject as a
|
||||||
|
@ -1871,13 +1829,13 @@ HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
|
||||||
|
|
||||||
If the ovector is too small to hold all the captured substring offsets,
|
If the ovector is too small to hold all the captured substring offsets,
|
||||||
as much as possible is filled in, and the function returns a value of
|
as much as possible is filled in, and the function returns a value of
|
||||||
zero. If neither the actual string matched nor any captured substrings
|
zero. If captured substrings are not of interest, pcre2_match() may be
|
||||||
are of interest, pcre2_match() may be called with a match data block
|
called with a match data block whose ovector is of minimum length (that
|
||||||
whose ovector is of zero length. However, if the pattern contains back
|
is, one pair). However, if the pattern contains back references and the
|
||||||
references and the ovector is not big enough to remember the related
|
ovector is not big enough to remember the related substrings, PCRE2 has
|
||||||
substrings, PCRE2 has to get additional memory for use during matching.
|
to get additional memory for use during matching. Thus it is usually
|
||||||
Thus it is usually advisable to set up a match data block containing an
|
advisable to set up a match data block containing an ovector of reason-
|
||||||
ovector of reasonable size.
|
able size.
|
||||||
|
|
||||||
It is possible for capturing subpattern number n+1 to match some part
|
It is possible for capturing subpattern number n+1 to match some part
|
||||||
of the subject when subpattern n has not been used at all. For example,
|
of the subject when subpattern n has not been used at all. For example,
|
||||||
|
@ -1904,10 +1862,6 @@ HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
|
||||||
|
|
||||||
PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
|
PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *match_data);
|
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *match_data);
|
|
||||||
|
|
||||||
PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
|
PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *match_data);
|
||||||
|
|
||||||
In addition to the offsets in the ovector, other information about a
|
In addition to the offsets in the ovector, other information about a
|
||||||
|
@ -1920,35 +1874,10 @@ HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
|
||||||
after a failed match or a partial match, as well as after a successful
|
after a failed match or a partial match, as well as after a successful
|
||||||
one.
|
one.
|
||||||
|
|
||||||
The other three functions yield values that give information about the
|
The offset of the character at which the successful match started is
|
||||||
part of the subject string that was inspected during a successful match
|
returned by pcre2_get_startchar(). This can be different to the value
|
||||||
or a partial match. Their results are undefined after a failed match.
|
of ovector[0] if the pattern contains the \K escape sequence. Note,
|
||||||
They return the following values, respectively:
|
however, that \K has no effect for a partial match.
|
||||||
|
|
||||||
(1) The offset of the leftmost character that was inspected during the
|
|
||||||
match. This can be earlier than the point at which the match started
|
|
||||||
if the pattern contains lookbehind assertions or \b or \B at the start.
|
|
||||||
|
|
||||||
(2) The offset of the character that follows the rightmost character
|
|
||||||
that was inspected during the match. This can be after the end of the
|
|
||||||
match if the pattern contains lookahead assertions.
|
|
||||||
|
|
||||||
(3) The offset of the character at which the successful or partial
|
|
||||||
match started. This can be different to the value of ovector[0] if the
|
|
||||||
pattern contains the \K escape sequence.
|
|
||||||
|
|
||||||
For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against
|
|
||||||
the string "123abcxxyydef123", the resulting offsets are:
|
|
||||||
|
|
||||||
ovector[0] 8
|
|
||||||
ovector[1] 10
|
|
||||||
leftchar 3
|
|
||||||
rightchar 13
|
|
||||||
startchar 6
|
|
||||||
|
|
||||||
The allusedtext modifier in pcre2test can be used to display a longer
|
|
||||||
string that shows the leftmost and rightmost characters in a match
|
|
||||||
instead of just the matched string.
|
|
||||||
|
|
||||||
Error return values from pcre2_match()
|
Error return values from pcre2_match()
|
||||||
|
|
||||||
|
@ -2303,10 +2232,10 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
|
||||||
The unused bits of the options argument for pcre2_dfa_match() must be
|
The unused bits of the options argument for pcre2_dfa_match() must be
|
||||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||||
PCRE2_NO_UTF_CHECK, PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD,
|
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
||||||
PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but
|
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of
|
||||||
the last four of these are exactly the same as for pcre2_match(), so
|
these are exactly the same as for pcre2_match(), so their description
|
||||||
their description is not repeated here.
|
is not repeated here.
|
||||||
|
|
||||||
PCRE2_PARTIAL_HARD
|
PCRE2_PARTIAL_HARD
|
||||||
PCRE2_PARTIAL_SOFT
|
PCRE2_PARTIAL_SOFT
|
||||||
|
@ -2434,7 +2363,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 16 September 2014
|
Last updated: 14 October 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -2551,7 +2480,7 @@ MISSING CALLOUTS
|
||||||
has been scanned far enough.
|
has been scanned far enough.
|
||||||
|
|
||||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
You can disable these optimizations by passing the PCRE2_NO_START_OPTI-
|
||||||
MIZE option to the matching function, or by starting the pattern with
|
MIZE option to pcre2_compile(), or by starting the pattern with
|
||||||
(*NO_START_OPT). This slows down the matching process, but does ensure
|
(*NO_START_OPT). This slows down the matching process, but does ensure
|
||||||
that callouts such as the example above are obeyed.
|
that callouts such as the example above are obeyed.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "10 October 2014" "PCRE2 10.00"
|
.TH PCRE2API 3 "14 October 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -2061,15 +2061,10 @@ pointer to the zero-terminated name, which is within the compiled pattern.
|
||||||
Otherwise NULL is returned. A (*MARK) name may be available after a failed
|
Otherwise NULL is returned. A (*MARK) name may be available after a failed
|
||||||
match or a partial match, as well as after a successful one.
|
match or a partial match, as well as after a successful one.
|
||||||
.P
|
.P
|
||||||
The offset of the character at which the successful or partial match started is
|
The offset of the character at which the successful match started is
|
||||||
returned by \fBpcre2_get_startchar()\fP. This can be different to the value of
|
returned by \fBpcre2_get_startchar()\fP. This can be different to the value of
|
||||||
\fIovector[0]\fP if the pattern contains the \eK escape sequence. This
|
\fIovector[0]\fP if the pattern contains the \eK escape sequence. Note,
|
||||||
information is needed when doing partial matching over multiple data segments
|
however, that \eK has no effect for a partial match.
|
||||||
(see the
|
|
||||||
.\" HREF
|
|
||||||
\fBpcre2partial\fP
|
|
||||||
.\"
|
|
||||||
documentation).
|
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.\" HTML <a name="errorlist"></a>
|
.\" HTML <a name="errorlist"></a>
|
||||||
|
@ -2626,6 +2621,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 10 October 2014
|
Last updated: 14 October 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -0,0 +1,433 @@
|
||||||
|
.TH PCRE2PARTIAL 3 "14 October 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions
|
||||||
|
.SH "PARTIAL MATCHING IN PCRE2"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||||
|
function matches as far as it goes, but is too short to match the entire
|
||||||
|
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||||
|
might be helpful to distinguish this case from other cases in which there is no
|
||||||
|
match.
|
||||||
|
.P
|
||||||
|
Consider, for example, an application where a human is required to type in data
|
||||||
|
for a field with specific formatting requirements. An example might be a date
|
||||||
|
in the form \fIddmmmyy\fP, defined by this pattern:
|
||||||
|
.sp
|
||||||
|
^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
|
||||||
|
.sp
|
||||||
|
If the application sees the user's keystrokes one by one, and can check that
|
||||||
|
what has been typed so far is potentially valid, it is able to raise an error
|
||||||
|
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||||
|
has been typed, for example. This immediate feedback is likely to be a better
|
||||||
|
user interface than a check that is delayed until the entire string has been
|
||||||
|
entered. Partial matching can also be useful when the subject string is very
|
||||||
|
long and is not all available at once.
|
||||||
|
.P
|
||||||
|
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||||
|
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||||
|
The difference between the two options is whether or not a partial match is
|
||||||
|
preferred to an alternative complete match, though the details differ between
|
||||||
|
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||||
|
takes precedence.
|
||||||
|
.P
|
||||||
|
If you want to use partial matching with just-in-time optimized code, you must
|
||||||
|
call \fBpcre2_jit_compile()\fP with one or both of these options:
|
||||||
|
.sp
|
||||||
|
PCRE2_JIT_PARTIAL_SOFT
|
||||||
|
PCRE2_JIT_PARTIAL_HARD
|
||||||
|
.sp
|
||||||
|
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||||
|
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||||
|
interpretive matching code is used.
|
||||||
|
.P
|
||||||
|
Setting a partial matching option disables two of PCRE2's standard
|
||||||
|
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||||
|
abandons matching immediately if it is not present in the subject string. This
|
||||||
|
optimization cannot be used for a subject string that might match only
|
||||||
|
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||||
|
not bother to run the matching function on shorter strings. This optimization
|
||||||
|
is also disabled for partial matching.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PARTIAL MATCHING USING pcre2_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
A partial match occurs during a call to \fBpcre2_match()\fP when the end of the
|
||||||
|
subject string is reached successfully, but matching cannot continue because
|
||||||
|
more characters are needed. However, at least one character in the subject must
|
||||||
|
have been inspected. This character need not form part of the final matched
|
||||||
|
string; lookbehind assertions and the \eK escape sequence provide ways of
|
||||||
|
inspecting characters before the start of a matched string. The requirement for
|
||||||
|
inspecting at least one character exists because an empty string can always be
|
||||||
|
matched; without such a restriction there would always be a partial match of an
|
||||||
|
empty string at the end of the subject.
|
||||||
|
.P
|
||||||
|
When a partial match is returned, the first two elements in the ovector point
|
||||||
|
to the portion of the subject that was matched. The appearance of \eK in the
|
||||||
|
pattern has no effect for a partial match. Consider this pattern:
|
||||||
|
.sp
|
||||||
|
/abc\eK123/
|
||||||
|
.sp
|
||||||
|
If it is matched against "456abc123xyz" the result is a complete match, and the
|
||||||
|
ovector defines the matched string as "123", because \eK resets the "start of
|
||||||
|
match" point. However, if a partial match is requested and the subject string
|
||||||
|
is "456abc12", a partial match is found for the string "abc12", because all
|
||||||
|
these characters are needed for a subsequent re-match with additional
|
||||||
|
characters.
|
||||||
|
.P
|
||||||
|
What happens when a partial match is identified depends on which of the two
|
||||||
|
partial matching options is set.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS "PCRE2_PARTIAL_SOFT WITH pcre2_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If PCRE2_PARTIAL_SOFT is set when \fBpcre2_match()\fP identifies a partial
|
||||||
|
match, the partial match is remembered, but matching continues as normal, and
|
||||||
|
other alternatives in the pattern are tried. If no complete match can be found,
|
||||||
|
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||||
|
.P
|
||||||
|
This option is "soft" because it prefers a complete match over a partial match.
|
||||||
|
All the various matching items in a pattern behave as if the subject string is
|
||||||
|
potentially complete. For example, \ez, \eZ, and $ match at the end of the
|
||||||
|
subject, as normal, and for \eb and \eB the end of the subject is treated as a
|
||||||
|
non-alphanumeric.
|
||||||
|
.P
|
||||||
|
If there is more than one partial match, the first one that was found provides
|
||||||
|
the data that is returned. Consider this pattern:
|
||||||
|
.sp
|
||||||
|
/123\ew+X|dogY/
|
||||||
|
.sp
|
||||||
|
If this is matched against the subject string "abc123dog", both
|
||||||
|
alternatives fail to match, but the end of the subject is reached during
|
||||||
|
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||||
|
identifying "123dog" as the first partial match that was found. (In this
|
||||||
|
example, there are two partial matches, because "dog" on its own partially
|
||||||
|
matches the second alternative.)
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS "PCRE2_PARTIAL_HARD WITH pcre2_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If PCRE2_PARTIAL_HARD is set for \fBpcre2_match()\fP, PCRE2_ERROR_PARTIAL is
|
||||||
|
returned as soon as a partial match is found, without continuing to search for
|
||||||
|
possible complete matches. This option is "hard" because it prefers an earlier
|
||||||
|
partial match over a later complete match. For this reason, the assumption is
|
||||||
|
made that the end of the supplied subject string may not be the true end of the
|
||||||
|
available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end
|
||||||
|
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||||
|
character in the subject has been inspected.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SS "Comparing hard and soft partial matching"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The difference between the two partial matching options can be illustrated by a
|
||||||
|
pattern such as:
|
||||||
|
.sp
|
||||||
|
/dog(sbody)?/
|
||||||
|
.sp
|
||||||
|
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
|
||||||
|
longer string if possible). If it is matched against the string "dog" with
|
||||||
|
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
|
||||||
|
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
|
||||||
|
hand, if the pattern is made ungreedy the result is different:
|
||||||
|
.sp
|
||||||
|
/dog(sbody)??/
|
||||||
|
.sp
|
||||||
|
In this case the result is always a complete match because that is found first,
|
||||||
|
and matching never continues after finding a complete match. It might be easier
|
||||||
|
to follow this explanation by thinking of the two patterns like this:
|
||||||
|
.sp
|
||||||
|
/dog(sbody)?/ is the same as /dogsbody|dog/
|
||||||
|
/dog(sbody)??/ is the same as /dog|dogsbody/
|
||||||
|
.sp
|
||||||
|
The second pattern will never match "dogsbody", because it will always find the
|
||||||
|
shorter match first.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PARTIAL MATCHING USING pcre2_dfa_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The DFA functions move along the subject string character by character, without
|
||||||
|
backtracking, searching for all possible matches simultaneously. If the end of
|
||||||
|
the subject is reached before the end of the pattern, there is the possibility
|
||||||
|
of a partial match, again provided that at least one character has been
|
||||||
|
inspected.
|
||||||
|
.P
|
||||||
|
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||||
|
have been no complete matches. Otherwise, the complete matches are returned.
|
||||||
|
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||||
|
any complete matches. The portion of the string that was matched when the
|
||||||
|
longest partial match was found is set as the first matching string.
|
||||||
|
.P
|
||||||
|
Because the DFA functions always search for all possible matches, and there is
|
||||||
|
no difference between greedy and ungreedy repetition, their behaviour is
|
||||||
|
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||||
|
the string "dog" matched against the ungreedy pattern shown above:
|
||||||
|
.sp
|
||||||
|
/dog(sbody)??/
|
||||||
|
.sp
|
||||||
|
Whereas the standard functions stop as soon as they find the complete match for
|
||||||
|
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||||
|
return that when PCRE2_PARTIAL_HARD is set.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If a pattern ends with one of the sequences \eb or \eB, which test for word
|
||||||
|
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||||
|
results. Consider this pattern:
|
||||||
|
.sp
|
||||||
|
/\ebcat\eb/
|
||||||
|
.sp
|
||||||
|
This matches "cat", provided there is a word boundary at either end. If the
|
||||||
|
subject string is "the cat", the comparison of the final "t" with a following
|
||||||
|
character cannot take place, so a partial match is found. However, normal
|
||||||
|
matching carries on, and \eb matches at the end of the subject when the last
|
||||||
|
character is a letter, so a complete match is found. The result, therefore, is
|
||||||
|
\fInot\fP PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||||
|
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If the \fBpartial_soft\fP (or \fBps\fP) modifier is present on a
|
||||||
|
\fBpcre2test\fP data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||||
|
Here is a run of \fBpcre2test\fP that uses the date example quoted above:
|
||||||
|
.sp
|
||||||
|
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||||
|
data> 25jun04\e=ps
|
||||||
|
0: 25jun04
|
||||||
|
1: jun
|
||||||
|
data> 25dec3\e=ps
|
||||||
|
Partial match: 25dec3
|
||||||
|
data> 3ju\e=ps
|
||||||
|
Partial match: 3ju
|
||||||
|
data> 3juj\e=ps
|
||||||
|
No match
|
||||||
|
data> j\e=ps
|
||||||
|
No match
|
||||||
|
.sp
|
||||||
|
The first data string is matched completely, so \fBpcre2test\fP shows the
|
||||||
|
matched substrings. The remaining four strings do not match the complete
|
||||||
|
pattern, but the first two are partial matches. Similar output is obtained
|
||||||
|
if DFA matching is used.
|
||||||
|
.P
|
||||||
|
If the \fBpartial_hard\fP (or \fBph\fP) modifier is present on a
|
||||||
|
\fBpcre2test\fP data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
When a partial match has been found using a DFA matching function, it is
|
||||||
|
possible to continue the match by providing additional subject data and calling
|
||||||
|
the function again with the same compiled regular expression, this time setting
|
||||||
|
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||||
|
because this is where details of the previous partial match are stored. Here is
|
||||||
|
an example using \fBpcre2test\fP:
|
||||||
|
.sp
|
||||||
|
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||||
|
data> 23ja\e=dfa,ps
|
||||||
|
Partial match: 23ja
|
||||||
|
data> n05\e=dfa,dfa_restart
|
||||||
|
0: n05
|
||||||
|
.sp
|
||||||
|
The first call has "23ja" as the subject, and requests partial matching; the
|
||||||
|
second call has "n05" as the subject for the continued (restarted) match.
|
||||||
|
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||||
|
not retain the previously partially-matched string. It is up to the calling
|
||||||
|
program to do that if it needs to.
|
||||||
|
.P
|
||||||
|
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||||
|
not possible to try again at a new starting point. All this facility is capable
|
||||||
|
of doing is continuing with the previous match attempt. In the previous
|
||||||
|
example, if the second set of data is "ug23" the result is no match, even
|
||||||
|
though there would be a match for "aug23" if the entire string were given at
|
||||||
|
once. Depending on the application, this may or may not be what you want.
|
||||||
|
The only way to allow for starting again at the next character is to retain the
|
||||||
|
matched part of the subject and try a new complete match.
|
||||||
|
.P
|
||||||
|
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||||
|
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||||
|
facility can be used to pass very long subject strings to the DFA matching
|
||||||
|
functions.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Unlike the DFA function, it is not possible to restart the previous match with
|
||||||
|
a new segment of data when using \fBpcre2_match()\fP. Instead, new data must be
|
||||||
|
added to the previous subject string, and the entire match re-run, starting
|
||||||
|
from the point where the partial match occurred. Earlier data can be discarded.
|
||||||
|
.P
|
||||||
|
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||||
|
treat the end of a segment as the end of the subject when matching \ez, \eZ,
|
||||||
|
\eb, \eB, and $. Consider an unanchored pattern that matches dates:
|
||||||
|
.sp
|
||||||
|
re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
|
||||||
|
data> The date is 23ja\e=ph
|
||||||
|
Partial match: 23ja
|
||||||
|
.sp
|
||||||
|
At this stage, an application could discard the text preceding "23ja", add on
|
||||||
|
text from the next segment, and call the matching function again. Unlike the
|
||||||
|
DFA matching function, the entire matching string must always be available,
|
||||||
|
and the complete matching process occurs for each call, so more memory and more
|
||||||
|
processing time are needed.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "ISSUES WITH MULTI-SEGMENT MATCHING"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Certain types of pattern may give problems with multi-segment matching,
|
||||||
|
whichever matching function is used.
|
||||||
|
.P
|
||||||
|
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||||
|
the PCRE2_NOTBOL option when the subject string for any call does not start at the
|
||||||
|
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||||
|
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||||
|
includes the effect of PCRE2_NOTEOL.
|
||||||
|
.P
|
||||||
|
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||||
|
start of the partial match may have been inspected during the matching process.
|
||||||
|
When using \fBpcre2_match()\fP, sufficient characters must be retained for the
|
||||||
|
next match attempt. You can ensure that enough characters are retained by doing
|
||||||
|
the following:
|
||||||
|
.P
|
||||||
|
Before doing any matching, find the length of the longest lookbehind in the
|
||||||
|
pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND
|
||||||
|
option. Note that the resulting count is in characters, not code units. After a
|
||||||
|
partial match, moving back from the ovector[0] offset in the subject by the
|
||||||
|
number of characters given for the maximum lookbehind gets you to the earliest
|
||||||
|
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||||
|
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||||
|
while moving back through the code units.
|
||||||
|
.P
|
||||||
|
Characters before the point you have now reached can be discarded, and after
|
||||||
|
the next segment has been added to what is retained, you should run the next
|
||||||
|
match with the \fBstartoffset\fP argument set so that the match begins at the
|
||||||
|
same point as before.
|
||||||
|
.P
|
||||||
|
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||||
|
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||||
|
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||||
|
value of \fBstartoffset\fP for the next match should be 3. When \fBpcre2test\fP
|
||||||
|
displays a partial match, it indicates the lookbehind characters with '<'
|
||||||
|
characters:
|
||||||
|
.sp
|
||||||
|
re> "(?<=123)abc"
|
||||||
|
data> xx123ab\e=ph
|
||||||
|
Partial match: 123ab
|
||||||
|
<<<
|
||||||
|
.P
|
||||||
|
3. Because a partial match must always contain at least one character, what
|
||||||
|
might be considered a partial match of an empty string actually gives a "no
|
||||||
|
match" result. For example:
|
||||||
|
.sp
|
||||||
|
re> /c(?<=abc)x/
|
||||||
|
data> ab\e=ps
|
||||||
|
No match
|
||||||
|
.sp
|
||||||
|
If the next segment begins "cx", a match should be found, but this will only
|
||||||
|
happen if characters from the previous segment are retained. For this reason, a
|
||||||
|
"no match" result should be interpreted as "partial match of an empty string"
|
||||||
|
when the pattern contains lookbehinds.
|
||||||
|
.P
|
||||||
|
4. Matching a subject string that is split into multiple segments may not
|
||||||
|
always produce exactly the same result as matching over one single long string,
|
||||||
|
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||||
|
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||||
|
\eb or \eB. Another kind of difference may occur when there are multiple
|
||||||
|
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||||
|
is given only when there are no completed matches. This means that as soon as
|
||||||
|
the shortest match has been found, continuation to a new subject segment is no
|
||||||
|
longer possible. Consider this \fBpcre2test\fP example:
|
||||||
|
.sp
|
||||||
|
re> /dog(sbody)?/
|
||||||
|
data> dogsb\e=ps
|
||||||
|
0: dog
|
||||||
|
data> do\e=ps,dfa
|
||||||
|
Partial match: do
|
||||||
|
data> gsb\e=ps,dfa,dfa_restart
|
||||||
|
0: g
|
||||||
|
data> dogsbody\e=dfa
|
||||||
|
0: dogsbody
|
||||||
|
1: dog
|
||||||
|
.sp
|
||||||
|
The first data line passes the string "dogsb" to a standard matching function,
|
||||||
|
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||||
|
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||||
|
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||||
|
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||||
|
the match stops when "dog" has been found, and it is not possible to continue.
|
||||||
|
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||||
|
matching function finds both matches.
|
||||||
|
.P
|
||||||
|
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||||
|
multi-segment data. The example above then behaves differently:
|
||||||
|
.sp
|
||||||
|
re> /dog(sbody)?/
|
||||||
|
data> dogsb\e=ph
|
||||||
|
Partial match: dogsb
|
||||||
|
data> do\e=ps,dfa
|
||||||
|
Partial match: do
|
||||||
|
data> gsb\e=ph,dfa,dfa_restart
|
||||||
|
Partial match: gsb
|
||||||
|
.sp
|
||||||
|
5. Patterns that contain alternatives at the top level which do not all start
|
||||||
|
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||||
|
used. For example, consider this pattern:
|
||||||
|
.sp
|
||||||
|
1234|3789
|
||||||
|
.sp
|
||||||
|
If the first part of the subject is "ABC123", a partial match of the first
|
||||||
|
alternative is found at offset 3. There is no partial match for the second
|
||||||
|
alternative, because such a match does not start at the same point in the
|
||||||
|
subject string. Attempting to continue with the string "7890" does not yield a
|
||||||
|
match because only those alternatives that match at one point in the subject
|
||||||
|
are remembered. The problem arises because the start of the second alternative
|
||||||
|
matches within the first alternative. There is no problem with anchored
|
||||||
|
patterns or patterns such as:
|
||||||
|
.sp
|
||||||
|
1234|ABCD
|
||||||
|
.sp
|
||||||
|
where no string can be a partial match for both alternatives. This is not a
|
||||||
|
problem if a standard matching function is used, because the entire match has
|
||||||
|
to be rerun each time:
|
||||||
|
.sp
|
||||||
|
re> /1234|3789/
|
||||||
|
data> ABC123\e=ph
|
||||||
|
Partial match: 123
|
||||||
|
data> 1237890
|
||||||
|
0: 3789
|
||||||
|
.sp
|
||||||
|
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||||
|
the entire match can also be used with the DFA matching function. Another
|
||||||
|
possibility is to work with two buffers. If a partial match at offset \fIn\fP
|
||||||
|
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||||
|
the second buffer, you can then try a new match starting at offset \fIn+1\fP in
|
||||||
|
the first buffer.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 14 October 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
.fi
|
|
@ -424,6 +424,7 @@ PATTERN MODIFIERS
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex pattern is coded in hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
|
jitverify verify JIT use
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
|
@ -448,9 +449,6 @@ PATTERN MODIFIERS
|
||||||
as newlines, both in the pattern and (by default) in subject lines. The
|
as newlines, both in the pattern and (by default) in subject lines. The
|
||||||
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||||
|
|
||||||
Both the \R and newline settings can be changed at match time, but if
|
|
||||||
this is done, JIT matching is disabled.
|
|
||||||
|
|
||||||
Information about a pattern
|
Information about a pattern
|
||||||
|
|
||||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||||
|
@ -490,26 +488,30 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
JIT compilation
|
JIT compilation
|
||||||
|
|
||||||
The /jit modifier may optionally be followed by a number in the range 0
|
The /jit modifier may optionally be followed by an equals sign and a
|
||||||
to 7:
|
number in the range 0 to 7:
|
||||||
|
|
||||||
0 disable JIT
|
0 disable JIT
|
||||||
1 normal match only
|
1 use JIT for normal match only
|
||||||
2 soft partial match only
|
2 use JIT for soft partial match only
|
||||||
3 normal match and soft partial match
|
3 use JIT for normal match and soft partial match
|
||||||
4 hard partial match only
|
4 use JIT for hard partial match only
|
||||||
6 soft and hard partial match
|
6 use JIT for soft and hard partial match
|
||||||
7 all three modes
|
7 all three modes
|
||||||
|
|
||||||
If no number is given, 7 is assumed. If JIT compilation is successful,
|
If no number is given, 7 is assumed. If JIT compilation is successful,
|
||||||
the compiled JIT code will automatically be used when pcre2_match() is
|
the compiled JIT code will automatically be used when pcre2_match() is
|
||||||
run, except when incompatible run-time options are specified. For more
|
run for the appropriate type of match, except when incompatible run-
|
||||||
details, see the pcre2jit documentation. See also the jitstack modifier
|
time options are specified. For more details, see the pcre2jit documen-
|
||||||
below for a way of setting the size of the JIT stack.
|
tation. See also the jitstack modifier below for a way of setting the
|
||||||
|
size of the JIT stack.
|
||||||
|
|
||||||
If the jitverify modifier is specified, the text "(JIT)" is added to
|
If the jitverify modifier is specified, information about the compiled
|
||||||
|
pattern shows whether JIT compilation was or was not successful. If
|
||||||
|
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||||
|
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||||
the first output line after a match or non match when JIT-compiled code
|
the first output line after a match or non match when JIT-compiled code
|
||||||
was actually used. This modifier can also be set on a subject line.
|
was actually used.
|
||||||
|
|
||||||
Setting a locale
|
Setting a locale
|
||||||
|
|
||||||
|
@ -597,7 +599,6 @@ PATTERN MODIFIERS
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitverify verify JIT usage
|
|
||||||
mark show mark values
|
mark show mark values
|
||||||
|
|
||||||
These modifiers may not appear in a #pattern command. If you want them
|
These modifiers may not appear in a #pattern command. If you want them
|
||||||
|
@ -617,7 +618,6 @@ SUBJECT MODIFIERS
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
dfa_restart set PCRE2_DFA_RESTART
|
dfa_restart set PCRE2_DFA_RESTART
|
||||||
dfa_shortest set PCRE2_DFA_SHORTEST
|
dfa_shortest set PCRE2_DFA_SHORTEST
|
||||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
notbol set PCRE2_NOTBOL
|
notbol set PCRE2_NOTBOL
|
||||||
notempty set PCRE2_NOTEMPTY
|
notempty set PCRE2_NOTEMPTY
|
||||||
|
@ -645,9 +645,8 @@ SUBJECT MODIFIERS
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
bsr=[anycrlf|unicode] specify \R handling
|
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
callout_data=<n> set a value to pass via callouts
|
callout_data=<n> set a value to pass via callouts
|
||||||
callout_fail=<n>[:<m>] control callout failure
|
callout_fail=<n>[:<m>] control callout failure
|
||||||
|
@ -659,11 +658,9 @@ SUBJECT MODIFIERS
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
jitverify verify JIT usage
|
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=<n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
newline=<type> set newline type
|
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
|
@ -671,13 +668,6 @@ SUBJECT MODIFIERS
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
FIXME: Give more examples.
|
FIXME: Give more examples.
|
||||||
|
|
||||||
Newline and \R handling
|
|
||||||
|
|
||||||
These modifiers set the newline and \R processing conventions for the
|
|
||||||
subject line, overriding any values that were set at compile time (as
|
|
||||||
described above). JIT matching is disabled if these settings are
|
|
||||||
changed at match time.
|
|
||||||
|
|
||||||
Showing more text
|
Showing more text
|
||||||
|
|
||||||
The aftertext modifier requests that as well as outputting the sub-
|
The aftertext modifier requests that as well as outputting the sub-
|
||||||
|
@ -690,11 +680,14 @@ SUBJECT MODIFIERS
|
||||||
ture number.
|
ture number.
|
||||||
|
|
||||||
The allusedtext modifier requests that all the text that was consulted
|
The allusedtext modifier requests that all the text that was consulted
|
||||||
during a successful pattern match be shown. This affects the output if
|
during a successful pattern match by the interpreter should be shown.
|
||||||
there is a lookbehind at the start of a match, or a lookahead at the
|
This feature is not supported for JIT matching, and if requested with
|
||||||
end, or if \K is used in the pattern. Characters that precede or follow
|
JIT it is ignored (with a warning message). Setting this modifier
|
||||||
the start and end of the actual match are indicated in the output by
|
affects the output if there is a lookbehind at the start of a match, or
|
||||||
'<' or '>' characters underneath them. Here is an example:
|
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||||
|
that precede or follow the start and end of the actual match are indi-
|
||||||
|
cated in the output by '<' or '>' characters underneath them. Here is
|
||||||
|
an example:
|
||||||
|
|
||||||
/(?<=pqr)abc(?=xyz)/
|
/(?<=pqr)abc(?=xyz)/
|
||||||
123pqrabcxyz456\=allusedtext
|
123pqrabcxyz456\=allusedtext
|
||||||
|
@ -792,6 +785,10 @@ SUBJECT MODIFIERS
|
||||||
the minimum values for each parameter that allow pcre2_match() to com-
|
the minimum values for each parameter that allow pcre2_match() to com-
|
||||||
plete without error.
|
plete without error.
|
||||||
|
|
||||||
|
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||||
|
is being used, neither limit is relevant, and this modifier is ignored
|
||||||
|
(with a warning message).
|
||||||
|
|
||||||
The match_limit number is a measure of the amount of backtracking that
|
The match_limit number is a measure of the amount of backtracking that
|
||||||
takes place, and learning the minimum value can be instructive. For
|
takes place, and learning the minimum value can be instructive. For
|
||||||
most simple matches, the number is quite small, but for patterns with
|
most simple matches, the number is quite small, but for patterns with
|
||||||
|
@ -827,6 +824,11 @@ SUBJECT MODIFIERS
|
||||||
#subject command. It specifies the number of pairs of offsets that are
|
#subject command. It specifies the number of pairs of offsets that are
|
||||||
available for storing matching information. The default is 15.
|
available for storing matching information. The default is 15.
|
||||||
|
|
||||||
|
At least one pair of offsets is always created by pcre2_match_data_cre-
|
||||||
|
ate(), for matching with PCRE2's native API, so a value of 0 is the
|
||||||
|
same as 1. However, a value of 0 is useful when testing the POSIX API
|
||||||
|
because it causes regexec() to be called with a NULL capture vector.
|
||||||
|
|
||||||
|
|
||||||
THE ALTERNATIVE MATCHING FUNCTION
|
THE ALTERNATIVE MATCHING FUNCTION
|
||||||
|
|
||||||
|
@ -1069,5 +1071,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 19 August 2014
|
Last updated: 11 October 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
|
|
@ -612,6 +612,7 @@ clock_t total_match_time = 0;
|
||||||
|
|
||||||
static uint32_t dfa_matched;
|
static uint32_t dfa_matched;
|
||||||
static uint32_t forbid_utf = 0;
|
static uint32_t forbid_utf = 0;
|
||||||
|
static uint32_t maxlookbehind;
|
||||||
static uint32_t max_oveccount;
|
static uint32_t max_oveccount;
|
||||||
static uint32_t callout_count;
|
static uint32_t callout_count;
|
||||||
|
|
||||||
|
@ -2293,6 +2294,55 @@ return 0;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Move back by so many characters *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* Given a code unit offset in a subject string, move backwards by a number of
|
||||||
|
characters, and return the resulting offset.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
subject pointer to the string
|
||||||
|
offset start offset
|
||||||
|
count count to move back by
|
||||||
|
utf TRUE if in UTF mode
|
||||||
|
|
||||||
|
Returns: a possibly changed offset
|
||||||
|
*/
|
||||||
|
|
||||||
|
static PCRE2_SIZE
|
||||||
|
backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf)
|
||||||
|
{
|
||||||
|
long int yield;
|
||||||
|
|
||||||
|
if (!utf || test_mode == PCRE32_MODE) yield = offset - count;
|
||||||
|
|
||||||
|
else if (test_mode == PCRE8_MODE)
|
||||||
|
{
|
||||||
|
PCRE2_SPTR8 pp = (PCRE2_SPTR8)subject + offset;
|
||||||
|
for (; count > 0; count--)
|
||||||
|
{
|
||||||
|
pp--;
|
||||||
|
while ((*pp & 0xc0) == 0x80) pp--;
|
||||||
|
}
|
||||||
|
yield = pp - (PCRE2_SPTR8)subject;
|
||||||
|
}
|
||||||
|
|
||||||
|
else /* 16-bit mode */
|
||||||
|
{
|
||||||
|
PCRE2_SPTR16 pp = (PCRE2_SPTR16)subject + offset;
|
||||||
|
for (; count > 0; count--)
|
||||||
|
{
|
||||||
|
pp--;
|
||||||
|
if ((*pp & 0xfc00) == 0xdc00) pp--;
|
||||||
|
}
|
||||||
|
yield = pp - (PCRE2_SPTR16)subject;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (yield >= 0)? yield : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Read or extend an input line *
|
* Read or extend an input line *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
@ -3099,8 +3149,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
||||||
BOOL match_limit_set, recursion_limit_set;
|
BOOL match_limit_set, recursion_limit_set;
|
||||||
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
|
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
|
||||||
hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit,
|
hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit,
|
||||||
maxlookbehind, minlength, nameentrysize, namecount, newline_convention,
|
minlength, nameentrysize, namecount, newline_convention, recursion_limit;
|
||||||
recursion_limit;
|
|
||||||
|
|
||||||
/* These info requests may return PCRE2_ERROR_UNSET. */
|
/* These info requests may return PCRE2_ERROR_UNSET. */
|
||||||
|
|
||||||
|
@ -3145,7 +3194,6 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
||||||
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) +
|
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) +
|
||||||
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype, FALSE) +
|
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype, FALSE) +
|
||||||
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty, FALSE) +
|
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty, FALSE) +
|
||||||
pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) +
|
|
||||||
pattern_info(PCRE2_INFO_MINLENGTH, &minlength, FALSE) +
|
pattern_info(PCRE2_INFO_MINLENGTH, &minlength, FALSE) +
|
||||||
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount, FALSE) +
|
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount, FALSE) +
|
||||||
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize, FALSE) +
|
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize, FALSE) +
|
||||||
|
@ -3701,6 +3749,11 @@ if (TEST(compiled_code, ==, NULL))
|
||||||
return PR_SKIP;
|
return PR_SKIP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Remember the maximum lookbehind, for partial matching. */
|
||||||
|
|
||||||
|
if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
|
||||||
|
return PR_ABEND;
|
||||||
|
|
||||||
/* Call the JIT compiler if requested. */
|
/* Call the JIT compiler if requested. */
|
||||||
|
|
||||||
if (pat_patctl.jit != 0)
|
if (pat_patctl.jit != 0)
|
||||||
|
@ -4875,22 +4928,41 @@ for (gmatched = 0;; gmatched++)
|
||||||
} /* End of handling a successful match */
|
} /* End of handling a successful match */
|
||||||
|
|
||||||
/* There was a partial match. The value of ovector[0] is the bumpalong point,
|
/* There was a partial match. The value of ovector[0] is the bumpalong point,
|
||||||
not any \K point that might exist. */
|
that is, startchar, not any \K point that might have been passed. */
|
||||||
|
|
||||||
else if (capcount == PCRE2_ERROR_PARTIAL)
|
else if (capcount == PCRE2_ERROR_PARTIAL)
|
||||||
{
|
{
|
||||||
|
PCRE2_SIZE poffset;
|
||||||
|
int backlength;
|
||||||
|
int rubriclength = 0;
|
||||||
|
|
||||||
fprintf(outfile, "Partial match");
|
fprintf(outfile, "Partial match");
|
||||||
if ((dat_datctl.control & CTL_MARK) != 0 &&
|
if ((dat_datctl.control & CTL_MARK) != 0 &&
|
||||||
TESTFLD(match_data, mark, !=, NULL))
|
TESTFLD(match_data, mark, !=, NULL))
|
||||||
{
|
{
|
||||||
fprintf(outfile, ", mark=");
|
fprintf(outfile, ", mark=");
|
||||||
PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
|
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
|
||||||
|
rubriclength += 7;
|
||||||
}
|
}
|
||||||
fprintf(outfile, ": ");
|
fprintf(outfile, ": ");
|
||||||
|
rubriclength += 15;
|
||||||
|
|
||||||
|
poffset = backchars(pp, ovector[0], maxlookbehind, utf);
|
||||||
|
PCHARS(backlength, pp, poffset, ovector[0] - poffset, utf, outfile);
|
||||||
PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
|
PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
|
||||||
|
|
||||||
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
|
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
|
||||||
fprintf(outfile, " (JIT)");
|
fprintf(outfile, " (JIT)");
|
||||||
fprintf(outfile, "\n");
|
fprintf(outfile, "\n");
|
||||||
|
|
||||||
|
if (backlength != 0)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < rubriclength; i++) fprintf(outfile, " ");
|
||||||
|
for (i = 0; i < backlength; i++) fprintf(outfile, "<");
|
||||||
|
fprintf(outfile, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
break; /* Out of the /g loop */
|
break; /* Out of the /g loop */
|
||||||
} /* End of handling partial match */
|
} /* End of handling partial match */
|
||||||
|
|
||||||
|
|
|
@ -9286,17 +9286,21 @@ Partial match: abc12
|
||||||
xyzabc123pqr
|
xyzabc123pqr
|
||||||
0: 123
|
0: 123
|
||||||
xyzabc12\=ps
|
xyzabc12\=ps
|
||||||
Partial match: 12
|
Partial match: abc12
|
||||||
|
<<<
|
||||||
xyzabc12\=ph
|
xyzabc12\=ph
|
||||||
Partial match: 12
|
Partial match: abc12
|
||||||
|
<<<
|
||||||
|
|
||||||
/\babc\b/
|
/\babc\b/
|
||||||
+++abc+++
|
+++abc+++
|
||||||
0: abc
|
0: abc
|
||||||
+++ab\=ps
|
+++ab\=ps
|
||||||
Partial match: ab
|
Partial match: +ab
|
||||||
|
<
|
||||||
+++ab\=ph
|
+++ab\=ph
|
||||||
Partial match: ab
|
Partial match: +ab
|
||||||
|
<
|
||||||
|
|
||||||
/(?&word)(?&element)(?(DEFINE)(?<element><[^m][^>]>[^<])(?<word>\w*+))/B
|
/(?&word)(?&element)(?(DEFINE)(?<element><[^m][^>]>[^<])(?<word>\w*+))/B
|
||||||
------------------------------------------------------------------
|
------------------------------------------------------------------
|
||||||
|
@ -10324,7 +10328,8 @@ No match
|
||||||
|
|
||||||
/(?<=abc)def/
|
/(?<=abc)def/
|
||||||
abc\=ph
|
abc\=ph
|
||||||
Partial match:
|
Partial match: abc
|
||||||
|
<<<
|
||||||
|
|
||||||
/abc$/
|
/abc$/
|
||||||
abc
|
abc
|
||||||
|
@ -11877,9 +11882,11 @@ Callout 2: last capture = 0
|
||||||
|
|
||||||
/(?<=123)(*MARK:xx)abc/mark
|
/(?<=123)(*MARK:xx)abc/mark
|
||||||
xxxx123a\=ph
|
xxxx123a\=ph
|
||||||
Partial match, mark=xx: a
|
Partial match, mark=xx: 123a
|
||||||
|
<<<
|
||||||
xxxx123a\=ps
|
xxxx123a\=ps
|
||||||
Partial match, mark=xx: a
|
Partial match, mark=xx: 123a
|
||||||
|
<<<
|
||||||
|
|
||||||
/123\Kabc/
|
/123\Kabc/
|
||||||
xxxx123a\=ph
|
xxxx123a\=ph
|
||||||
|
|
|
@ -947,7 +947,8 @@ Partial match: abc
|
||||||
xyzfo\=ps
|
xyzfo\=ps
|
||||||
No match
|
No match
|
||||||
foob\=ps,offset=2
|
foob\=ps,offset=2
|
||||||
Partial match: b
|
Partial match: foob
|
||||||
|
<<<
|
||||||
foobar...\=ps,dfa_restart,offset=4
|
foobar...\=ps,dfa_restart,offset=4
|
||||||
0: ar
|
0: ar
|
||||||
xyzfo\=ps
|
xyzfo\=ps
|
||||||
|
@ -7092,17 +7093,21 @@ Failed: error -40: item unsupported for DFA matching
|
||||||
xyzabc123pqr
|
xyzabc123pqr
|
||||||
0: 123
|
0: 123
|
||||||
xyzabc12\=ps
|
xyzabc12\=ps
|
||||||
Partial match: 12
|
Partial match: abc12
|
||||||
|
<<<
|
||||||
xyzabc12\=ph
|
xyzabc12\=ph
|
||||||
Partial match: 12
|
Partial match: abc12
|
||||||
|
<<<
|
||||||
|
|
||||||
/\babc\b/
|
/\babc\b/
|
||||||
+++abc+++
|
+++abc+++
|
||||||
0: abc
|
0: abc
|
||||||
+++ab\=ps
|
+++ab\=ps
|
||||||
Partial match: ab
|
Partial match: +ab
|
||||||
|
<
|
||||||
+++ab\=ph
|
+++ab\=ph
|
||||||
Partial match: ab
|
Partial match: +ab
|
||||||
|
<
|
||||||
|
|
||||||
/(?=C)/g,aftertext
|
/(?=C)/g,aftertext
|
||||||
ABCDECBA
|
ABCDECBA
|
||||||
|
@ -7226,7 +7231,8 @@ Failed: error -40: item unsupported for DFA matching
|
||||||
|
|
||||||
/(?<=abc)def/
|
/(?<=abc)def/
|
||||||
abc\=ph
|
abc\=ph
|
||||||
Partial match:
|
Partial match: abc
|
||||||
|
<<<
|
||||||
|
|
||||||
/abc$/
|
/abc$/
|
||||||
abc
|
abc
|
||||||
|
|
Loading…
Reference in New Issue