Partial documentation and partial code tweaks.
This commit is contained in:
parent
a6302442f2
commit
26cd0bccb3
|
@ -34,6 +34,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2jit.html \
|
||||
doc/html/pcre2limits.html \
|
||||
doc/html/pcre2matching.html \
|
||||
doc/html/pcre2partial.html \
|
||||
doc/html/pcre2test.html \
|
||||
doc/html/pcre2unicode.html
|
||||
|
||||
|
@ -64,7 +65,6 @@ dist_html_DATA = \
|
|||
# doc/html/pcre2_utf16_to_host_byte_order.html \
|
||||
# doc/html/pcre2_utf32_to_host_byte_order.html \
|
||||
# doc/html/pcre2_version.html \
|
||||
# doc/html/pcre2partial.html \
|
||||
# doc/html/pcre2pattern.html \
|
||||
# doc/html/pcre2perform.html \
|
||||
# doc/html/pcre2posix.html \
|
||||
|
@ -86,6 +86,7 @@ dist_man_MANS = \
|
|||
doc/pcre2jit.3 \
|
||||
doc/pcre2limits.3 \
|
||||
doc/pcre2matching.3 \
|
||||
doc/pcre2partial.3 \
|
||||
doc/pcre2test.1 \
|
||||
doc/pcre2unicode.3
|
||||
|
||||
|
@ -118,7 +119,6 @@ dist_man_MANS = \
|
|||
# doc/pcre2_utf16_to_host_byte_order.3 \
|
||||
# doc/pcre2_utf32_to_host_byte_order.3 \
|
||||
# doc/pcre2_version.3 \
|
||||
# doc/pcre2partial.3 \
|
||||
# doc/pcre2pattern.3 \
|
||||
# doc/pcre2perform.3 \
|
||||
# doc/pcre2posix.3 \
|
||||
|
|
|
@ -90,9 +90,6 @@ document for an overview of all the PCRE2 documentation.
|
|||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br>
|
||||
<P>
|
||||
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -102,9 +99,6 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br>
|
||||
|
@ -133,7 +127,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -141,7 +135,7 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -165,10 +159,6 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
|
@ -178,10 +168,6 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -596,7 +582,7 @@ A compile context is created, copied, and freed by the following functions:
|
|||
A compile context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -605,8 +591,7 @@ or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
|||
ending sequence. The value of this parameter does not affect what is compiled;
|
||||
it is just saved with the compiled pattern. The value is used by the JIT
|
||||
compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and
|
||||
<i>pcre2_dfa_match()</i>. You can change the value when calling these functions,
|
||||
but doing so disables the use of JIT.
|
||||
<i>pcre2_dfa_match()</i>.
|
||||
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> const unsigned char *<i>tables</i>);</b>
|
||||
<br>
|
||||
|
@ -614,7 +599,7 @@ but doing so disables the use of JIT.
|
|||
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
|
||||
argument is a general context. This function builds a set of character tables
|
||||
in the current locale.
|
||||
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
|
@ -629,8 +614,7 @@ When a pattern is compiled with the PCRE2_EXTENDED option, the value of this
|
|||
parameter affects the recognition of white space and the end of internal
|
||||
comments starting with #. The value is saved with the compiled pattern for
|
||||
subsequent use by the JIT compiler and by the two interpreted matching
|
||||
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>. You can change the
|
||||
value when calling these functions, but doing so disables the use of JIT.
|
||||
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -685,14 +669,6 @@ A match context is created, copied, and freed by the following functions:
|
|||
A match context is created with default values for its parameters. These can
|
||||
be changed by calling the following functions, which return 0 on success, or
|
||||
PCRE2_ERROR_BADDATA if invalid data is detected.
|
||||
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
|
||||
or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
|
||||
ending sequence. If you want to make use of JIT matching, you should not use
|
||||
this function, but instead set the value in a compile context.
|
||||
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
|
||||
<b> void *<i>callout_data</i>);</b>
|
||||
|
@ -769,17 +745,6 @@ pattern of the form
|
|||
where ddd is a decimal number. However, such a setting is ignored unless ddd is
|
||||
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
|
||||
limit is set, less than the default.
|
||||
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
This specifies which characters or character sequences are to be recognized as
|
||||
newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
|
||||
PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
|
||||
sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
|
||||
PCRE2_NEWLINE_ANY (any Unicode newline sequence). If you want to make use of
|
||||
JIT matching, you should not use this function, but instead set the value in a
|
||||
compile context.
|
||||
<b>int pcre2_set_recursion_memory_management(</b>
|
||||
<b> pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
|
||||
|
@ -956,9 +921,8 @@ documentation).
|
|||
<P>
|
||||
For those options that can be different in different parts of the pattern, the
|
||||
contents of the <i>options</i> argument specifies their settings at the start of
|
||||
compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
|
||||
PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as well as
|
||||
at compile time.
|
||||
compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK options can be set at
|
||||
the time of matching as well as at compile time.
|
||||
</P>
|
||||
<P>
|
||||
Other, less frequently required compile-time parameters (for example, the
|
||||
|
@ -1176,14 +1140,55 @@ purposes.
|
|||
<pre>
|
||||
PCRE2_NO_START_OPTIMIZE
|
||||
</pre>
|
||||
This is an option that acts at matching time; that is, it is really an option
|
||||
for <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>. If it is set at compile
|
||||
time, it is remembered with the compiled pattern and assumed at matching time.
|
||||
This is necessary if you want to use JIT execution, because the JIT compiler
|
||||
needs to know whether or not this option is set. For details, see the
|
||||
discussion of PCRE2_NO_START_OPTIMIZE in the section on <b>pcre2_match()</b>
|
||||
options
|
||||
<a href="#matchoptions">below.</a>
|
||||
This is an option whose main effect is at matching time. It does not change
|
||||
what <b>pcre2_compile()</b> generates, but it does affect the output of the JIT
|
||||
compiler.
|
||||
</P>
|
||||
<P>
|
||||
There are a number of optimizations that may occur at the start of a match, in
|
||||
order to speed up the process. For example, if it is known that an unanchored
|
||||
match must start with a specific character, the matching code searches the
|
||||
subject for that character, and fails immediately if it cannot find it, without
|
||||
actually running the main matching function. This means that a special item
|
||||
such as (*COMMIT) at the start of a pattern is not considered until after a
|
||||
suitable starting point for the match has been found. Also, when callouts or
|
||||
(*MARK) items are in use, these "start-up" optimizations can cause them to be
|
||||
skipped if the pattern is never actually used. The start-up optimizations are
|
||||
in effect a pre-scan of the subject that takes place before the pattern is run.
|
||||
</P>
|
||||
<P>
|
||||
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
||||
possibly causing performance to suffer, but ensuring that in cases where the
|
||||
result is "no match", the callouts do occur, and that items such as (*COMMIT)
|
||||
and (*MARK) are considered at every possible starting position in the subject
|
||||
string.
|
||||
</P>
|
||||
<P>
|
||||
Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation.
|
||||
Consider the pattern
|
||||
<pre>
|
||||
(*COMMIT)ABC
|
||||
</pre>
|
||||
When this is compiled, PCRE2 records the fact that a match must start with the
|
||||
character "A". Suppose the subject string is "DEFABC". The start-up
|
||||
optimization scans along the subject, finds "A" and runs the first match
|
||||
attempt from there. The (*COMMIT) item means that the pattern must match the
|
||||
current starting position, which in this case, it does. However, if the same
|
||||
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
||||
subject string does not happen. The first match attempt is run starting from
|
||||
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
||||
the overall result is "no match". There are also other start-up optimizations.
|
||||
For example, a minimum length for the subject may be recorded. Consider the
|
||||
pattern
|
||||
<pre>
|
||||
(*MARK:A)(X|Y)
|
||||
</pre>
|
||||
The minimum length for a match is one character. If the subject is "ABC", there
|
||||
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
|
||||
string at the end of the subject does not take place, because PCRE2 knows that
|
||||
the subject is now too short, and so the (*MARK) is never encountered. In this
|
||||
case, the optimization does not affect the overall match result, which is still
|
||||
"no match", but it does affect the auxiliary information that is returned.
|
||||
<pre>
|
||||
PCRE2_NO_UTF_CHECK
|
||||
</pre>
|
||||
|
@ -1648,13 +1653,15 @@ string that define the matched part of the subject and any substrings that were
|
|||
captured. This is known as the <i>ovector</i>.
|
||||
</P>
|
||||
<P>
|
||||
Before calling <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b> you must create a
|
||||
Before calling <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b> you must create a
|
||||
match data block by calling one of the creation functions above. For
|
||||
<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of
|
||||
offsets in the <i>ovector</i>. One pair of offsets is required to identify the
|
||||
string that matched the whole pattern, with another pair for each captured
|
||||
substring. For example, a value of 4 creates enough space to record the
|
||||
matched portion of the subject plus three captured substrings.
|
||||
substring. For example, a value of 4 creates enough space to record the matched
|
||||
portion of the subject plus three captured substrings. A minimum of 1
|
||||
pair is imposed by <b>pcre2_match_data_create()</b>, so it is always possible to
|
||||
return the overall matched string.
|
||||
</P>
|
||||
<P>
|
||||
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
|
||||
|
@ -1779,10 +1786,9 @@ Option bits for <b>pcre2_match()</b>
|
|||
</b><br>
|
||||
<P>
|
||||
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED,
|
||||
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||
PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
|
||||
PCRE2_PARTIAL_SOFT. Their action is described below.
|
||||
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
|
||||
PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
|
||||
</P>
|
||||
<P>
|
||||
If the pattern was successfully processed by the just-in-time (JIT) compiler,
|
||||
|
@ -1833,56 +1839,6 @@ valid, so PCRE2 searches further into the string for occurrences of "a" or "b".
|
|||
This is like PCRE2_NOTEMPTY, except that an empty string match that is not at
|
||||
the start of the subject is permitted. If the pattern is anchored, such a match
|
||||
can occur only if the pattern contains \K.
|
||||
<pre>
|
||||
PCRE2_NO_START_OPTIMIZE
|
||||
</pre>
|
||||
There are a number of optimizations that <b>pcre2_match()</b> uses at the start
|
||||
of a match, in order to speed up the process. For example, if it is known that
|
||||
an unanchored match must start with a specific character, it searches the
|
||||
subject for that character, and fails immediately if it cannot find it, without
|
||||
actually running the main matching function. This means that a special item
|
||||
such as (*COMMIT) at the start of a pattern is not considered until after a
|
||||
suitable starting point for the match has been found. Also, when callouts or
|
||||
(*MARK) items are in use, these "start-up" optimizations can cause them to be
|
||||
skipped if the pattern is never actually used. The start-up optimizations are
|
||||
in effect a pre-scan of the subject that takes place before the pattern is run.
|
||||
</P>
|
||||
<P>
|
||||
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
|
||||
possibly causing performance to suffer, but ensuring that in cases where the
|
||||
result is "no match", the callouts do occur, and that items such as (*COMMIT)
|
||||
and (*MARK) are considered at every possible starting position in the subject
|
||||
string. If PCRE2_NO_START_OPTIMIZE is set at compile time, it cannot be unset
|
||||
at matching time. The use of PCRE2_NO_START_OPTIMIZE at matching time (that is,
|
||||
passing it to <b>pcre2_match()</b>) disables JIT execution; in this situation,
|
||||
matching is always done interpretively.
|
||||
</P>
|
||||
<P>
|
||||
Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching operation.
|
||||
Consider the pattern
|
||||
<pre>
|
||||
(*COMMIT)ABC
|
||||
</pre>
|
||||
When this is compiled, PCRE2 records the fact that a match must start with the
|
||||
character "A". Suppose the subject string is "DEFABC". The start-up
|
||||
optimization scans along the subject, finds "A" and runs the first match
|
||||
attempt from there. The (*COMMIT) item means that the pattern must match the
|
||||
current starting position, which in this case, it does. However, if the same
|
||||
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
|
||||
subject string does not happen. The first match attempt is run starting from
|
||||
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
|
||||
the overall result is "no match". There are also other start-up optimizations.
|
||||
For example, a minimum length for the subject may be recorded. Consider the
|
||||
pattern
|
||||
<pre>
|
||||
(*MARK:A)(X|Y)
|
||||
</pre>
|
||||
The minimum length for a match is one character. If the subject is "ABC", there
|
||||
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
|
||||
string at the end of the subject does not take place, because PCRE2 knows that
|
||||
the subject is now too short, and so the (*MARK) is never encountered. In this
|
||||
case, the optimization does not affect the overall match result, which is still
|
||||
"no match", but it does affect the auxiliary information that is returned.
|
||||
<pre>
|
||||
PCRE2_NO_UTF_CHECK
|
||||
</pre>
|
||||
|
@ -2035,13 +1991,13 @@ returned.
|
|||
</P>
|
||||
<P>
|
||||
If the ovector is too small to hold all the captured substring offsets, as much
|
||||
as possible is filled in, and the function returns a value of zero. If neither
|
||||
the actual string matched nor any captured substrings are of interest,
|
||||
<b>pcre2_match()</b> may be called with a match data block whose ovector is of
|
||||
zero length. However, if the pattern contains back references and the
|
||||
<i>ovector</i> is not big enough to remember the related substrings, PCRE2 has
|
||||
to get additional memory for use during matching. Thus it is usually advisable
|
||||
to set up a match data block containing an ovector of reasonable size.
|
||||
as possible is filled in, and the function returns a value of zero. If captured
|
||||
substrings are not of interest, <b>pcre2_match()</b> may be called with a match
|
||||
data block whose ovector is of minimum length (that is, one pair). However, if
|
||||
the pattern contains back references and the <i>ovector</i> is not big enough to
|
||||
remember the related substrings, PCRE2 has to get additional memory for use
|
||||
during matching. Thus it is usually advisable to set up a match data block
|
||||
containing an ovector of reasonable size.
|
||||
</P>
|
||||
<P>
|
||||
It is possible for capturing subpattern number <i>n+1</i> to match some part of
|
||||
|
@ -2074,12 +2030,6 @@ Other information about the match
|
|||
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
|
@ -2093,39 +2043,10 @@ Otherwise NULL is returned. A (*MARK) name may be available after a failed
|
|||
match or a partial match, as well as after a successful one.
|
||||
</P>
|
||||
<P>
|
||||
The other three functions yield values that give information about the part of
|
||||
the subject string that was inspected during a successful match or a partial
|
||||
match. Their results are undefined after a failed match. They return the
|
||||
following values, respectively:
|
||||
<br>
|
||||
<br>
|
||||
(1) The offset of the leftmost character that was inspected during the match.
|
||||
This can be earlier than the point at which the match started if the pattern
|
||||
contains lookbehind assertions or \b or \B at the start.
|
||||
<br>
|
||||
<br>
|
||||
(2) The offset of the character that follows the rightmost character that was
|
||||
inspected during the match. This can be after the end of the match if the
|
||||
pattern contains lookahead assertions.
|
||||
<br>
|
||||
<br>
|
||||
(3) The offset of the character at which the successful or partial match
|
||||
started. This can be different to the value of <i>ovector[0]</i> if the pattern
|
||||
contains the \K escape sequence.
|
||||
</P>
|
||||
<P>
|
||||
For example, if the pattern (?<=abc)xx\Kyy(?=def) is matched against the
|
||||
string "123abcxxyydef123", the resulting offsets are:
|
||||
<pre>
|
||||
ovector[0] 8
|
||||
ovector[1] 10
|
||||
leftchar 3
|
||||
rightchar 13
|
||||
startchar 6
|
||||
</pre>
|
||||
The <b>allusedtext</b> modifier in <b>pcre2test</b> can be used to display a
|
||||
longer string that shows the leftmost and rightmost characters in a match
|
||||
instead of just the matched string.
|
||||
The offset of the character at which the successful match started is
|
||||
returned by <b>pcre2_get_startchar()</b>. This can be different to the value of
|
||||
<i>ovector[0]</i> if the pattern contains the \K escape sequence. Note,
|
||||
however, that \K has no effect for a partial match.
|
||||
<a name="errorlist"></a></P>
|
||||
<br><b>
|
||||
Error return values from <b>pcre2_match()</b>
|
||||
|
@ -2513,10 +2434,9 @@ Option bits for <b>pcre_dfa_match()</b>
|
|||
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
|
||||
be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
|
||||
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
|
||||
PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
|
||||
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are
|
||||
exactly the same as for <b>pcre2_match()</b>, so their description is not
|
||||
repeated here.
|
||||
PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and
|
||||
PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for
|
||||
<b>pcre2_match()</b>, so their description is not repeated here.
|
||||
<pre>
|
||||
PCRE2_PARTIAL_HARD
|
||||
PCRE2_PARTIAL_SOFT
|
||||
|
@ -2650,7 +2570,7 @@ Cambridge CB2 3QH, England.
|
|||
</P>
|
||||
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 16 September 2014
|
||||
Last updated: 14 October 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -131,7 +131,7 @@ long enough, or, for unanchored patterns, if it has been scanned far enough.
|
|||
</P>
|
||||
<P>
|
||||
You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
|
||||
option to the matching function, or by starting the pattern with
|
||||
option to <b>pcre2_compile()</b>, or by starting the pattern with
|
||||
(*NO_START_OPT). This slows down the matching process, but does ensure that
|
||||
callouts such as the example above are obeyed.
|
||||
</P>
|
||||
|
|
|
@ -128,9 +128,8 @@ or the JIT compiler was not able to handle the pattern.
|
|||
<P>
|
||||
The <b>pcre2_match()</b> options that are supported for JIT matching are
|
||||
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
|
||||
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The options
|
||||
that are not supported at match time are PCRE2_ANCHORED and
|
||||
PCRE2_NO_START_OPTIMIZE, though they are supported if given at compile time.
|
||||
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The
|
||||
PCRE2_ANCHORED option is not supported at match time.
|
||||
</P>
|
||||
<P>
|
||||
The only unsupported pattern items are \C (match a single data unit) when
|
||||
|
|
|
@ -0,0 +1,464 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2partial specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2partial man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<ul>
|
||||
<li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE2</a>
|
||||
<li><a name="TOC2" href="#SEC2">PARTIAL MATCHING USING pcre2_match()</a>
|
||||
<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_dfa_match()</a>
|
||||
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHING AND WORD BOUNDARIES</a>
|
||||
<li><a name="TOC5" href="#SEC5">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a>
|
||||
<li><a name="TOC6" href="#SEC6">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a>
|
||||
<li><a name="TOC7" href="#SEC7">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
|
||||
<li><a name="TOC8" href="#SEC8">ISSUES WITH MULTI-SEGMENT MATCHING</a>
|
||||
<li><a name="TOC9" href="#SEC9">AUTHOR</a>
|
||||
<li><a name="TOC10" href="#SEC10">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE2</a><br>
|
||||
<P>
|
||||
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||
function matches as far as it goes, but is too short to match the entire
|
||||
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||
might be helpful to distinguish this case from other cases in which there is no
|
||||
match.
|
||||
</P>
|
||||
<P>
|
||||
Consider, for example, an application where a human is required to type in data
|
||||
for a field with specific formatting requirements. An example might be a date
|
||||
in the form <i>ddmmmyy</i>, defined by this pattern:
|
||||
<pre>
|
||||
^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
|
||||
</pre>
|
||||
If the application sees the user's keystrokes one by one, and can check that
|
||||
what has been typed so far is potentially valid, it is able to raise an error
|
||||
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||
has been typed, for example. This immediate feedback is likely to be a better
|
||||
user interface than a check that is delayed until the entire string has been
|
||||
entered. Partial matching can also be useful when the subject string is very
|
||||
long and is not all available at once.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||
The difference between the two options is whether or not a partial match is
|
||||
preferred to an alternative complete match, though the details differ between
|
||||
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||
takes precedence.
|
||||
</P>
|
||||
<P>
|
||||
If you want to use partial matching with just-in-time optimized code, you must
|
||||
call <b>pcre2_jit_compile()</b> with one or both of these options:
|
||||
<pre>
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
PCRE2_JIT_PARTIAL_HARD
|
||||
</pre>
|
||||
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||
interpretive matching code is used.
|
||||
</P>
|
||||
<P>
|
||||
Setting a partial matching option disables two of PCRE2's standard
|
||||
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||
abandons matching immediately if it is not present in the subject string. This
|
||||
optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||
not bother to run the matching function on shorter strings. This optimization
|
||||
is also disabled for partial matching.
|
||||
</P>
|
||||
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
|
||||
<P>
|
||||
A partial match occurs during a call to <b>pcre2_match()</b> when the end of the
|
||||
subject string is reached successfully, but matching cannot continue because
|
||||
more characters are needed. However, at least one character in the subject must
|
||||
have been inspected. This character need not form part of the final matched
|
||||
string; lookbehind assertions and the \K escape sequence provide ways of
|
||||
inspecting characters before the start of a matched string. The requirement for
|
||||
inspecting at least one character exists because an empty string can always be
|
||||
matched; without such a restriction there would always be a partial match of an
|
||||
empty string at the end of the subject.
|
||||
</P>
|
||||
<P>
|
||||
When a partial match is returned, the first two elements in the ovector point
|
||||
to the portion of the subject that was matched. The appearance of \K in the
|
||||
pattern has no effect for a partial match. Consider this pattern:
|
||||
<pre>
|
||||
/abc\K123/
|
||||
</pre>
|
||||
If it is matched against "456abc123xyz" the result is a complete match, and the
|
||||
ovector defines the matched string as "123", because \K resets the "start of
|
||||
match" point. However, if a partial match is requested and the subject string
|
||||
is "456abc12", a partial match is found for the string "abc12", because all
|
||||
these characters are needed for a subsequent re-match with additional
|
||||
characters.
|
||||
</P>
|
||||
<P>
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options is set.
|
||||
</P>
|
||||
<br><b>
|
||||
PCRE2_PARTIAL_SOFT WITH pcre2_match()
|
||||
</b><br>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_SOFT is set when <b>pcre2_match()</b> identifies a partial
|
||||
match, the partial match is remembered, but matching continues as normal, and
|
||||
other alternatives in the pattern are tried. If no complete match can be found,
|
||||
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||
</P>
|
||||
<P>
|
||||
This option is "soft" because it prefers a complete match over a partial match.
|
||||
All the various matching items in a pattern behave as if the subject string is
|
||||
potentially complete. For example, \z, \Z, and $ match at the end of the
|
||||
subject, as normal, and for \b and \B the end of the subject is treated as a
|
||||
non-alphanumeric.
|
||||
</P>
|
||||
<P>
|
||||
If there is more than one partial match, the first one that was found provides
|
||||
the data that is returned. Consider this pattern:
|
||||
<pre>
|
||||
/123\w+X|dogY/
|
||||
</pre>
|
||||
If this is matched against the subject string "abc123dog", both
|
||||
alternatives fail to match, but the end of the subject is reached during
|
||||
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||
identifying "123dog" as the first partial match that was found. (In this
|
||||
example, there are two partial matches, because "dog" on its own partially
|
||||
matches the second alternative.)
|
||||
</P>
|
||||
<br><b>
|
||||
PCRE2_PARTIAL_HARD WITH pcre2_match()
|
||||
</b><br>
|
||||
<P>
|
||||
If PCRE2_PARTIAL_HARD is set for <b>pcre2_match()</b>, PCRE2_ERROR_PARTIAL is
|
||||
returned as soon as a partial match is found, without continuing to search for
|
||||
possible complete matches. This option is "hard" because it prefers an earlier
|
||||
partial match over a later complete match. For this reason, the assumption is
|
||||
made that the end of the supplied subject string may not be the true end of the
|
||||
available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end
|
||||
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||
character in the subject has been inspected.
|
||||
</P>
|
||||
<br><b>
|
||||
Comparing hard and soft partial matching
|
||||
</b><br>
|
||||
<P>
|
||||
The difference between the two partial matching options can be illustrated by a
|
||||
pattern such as:
|
||||
<pre>
|
||||
/dog(sbody)?/
|
||||
</pre>
|
||||
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
|
||||
longer string if possible). If it is matched against the string "dog" with
|
||||
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
|
||||
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
|
||||
hand, if the pattern is made ungreedy the result is different:
|
||||
<pre>
|
||||
/dog(sbody)??/
|
||||
</pre>
|
||||
In this case the result is always a complete match because that is found first,
|
||||
and matching never continues after finding a complete match. It might be easier
|
||||
to follow this explanation by thinking of the two patterns like this:
|
||||
<pre>
|
||||
/dog(sbody)?/ is the same as /dogsbody|dog/
|
||||
/dog(sbody)??/ is the same as /dog|dogsbody/
|
||||
</pre>
|
||||
The second pattern will never match "dogsbody", because it will always find the
|
||||
shorter match first.
|
||||
</P>
|
||||
<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
|
||||
<P>
|
||||
The DFA functions move along the subject string character by character, without
|
||||
backtracking, searching for all possible matches simultaneously. If the end of
|
||||
the subject is reached before the end of the pattern, there is the possibility
|
||||
of a partial match, again provided that at least one character has been
|
||||
inspected.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||
have been no complete matches. Otherwise, the complete matches are returned.
|
||||
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||
any complete matches. The portion of the string that was matched when the
|
||||
longest partial match was found is set as the first matching string.
|
||||
</P>
|
||||
<P>
|
||||
Because the DFA functions always search for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, their behaviour is
|
||||
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||
the string "dog" matched against the ungreedy pattern shown above:
|
||||
<pre>
|
||||
/dog(sbody)??/
|
||||
</pre>
|
||||
Whereas the standard functions stop as soon as they find the complete match for
|
||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||
return that when PCRE2_PARTIAL_HARD is set.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
||||
<P>
|
||||
If a pattern ends with one of the sequences \b or \B, which test for word
|
||||
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||
results. Consider this pattern:
|
||||
<pre>
|
||||
/\bcat\b/
|
||||
</pre>
|
||||
This matches "cat", provided there is a word boundary at either end. If the
|
||||
subject string is "the cat", the comparison of the final "t" with a following
|
||||
character cannot take place, so a partial match is found. However, normal
|
||||
matching carries on, and \b matches at the end of the subject when the last
|
||||
character is a letter, so a complete match is found. The result, therefore, is
|
||||
<i>not</i> PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a><br>
|
||||
<P>
|
||||
If the <b>partial_soft</b> (or <b>ps</b>) modifier is present on a
|
||||
<b>pcre2test</b> data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||
Here is a run of <b>pcre2test</b> that uses the date example quoted above:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 25jun04\=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25dec3\=ps
|
||||
Partial match: 25dec3
|
||||
data> 3ju\=ps
|
||||
Partial match: 3ju
|
||||
data> 3juj\=ps
|
||||
No match
|
||||
data> j\=ps
|
||||
No match
|
||||
</pre>
|
||||
The first data string is matched completely, so <b>pcre2test</b> shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. Similar output is obtained
|
||||
if DFA matching is used.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>partial_hard</b> (or <b>ph</b>) modifier is present on a
|
||||
<b>pcre2test</b> data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a><br>
|
||||
<P>
|
||||
When a partial match has been found using a DFA matching function, it is
|
||||
possible to continue the match by providing additional subject data and calling
|
||||
the function again with the same compiled regular expression, this time setting
|
||||
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||
because this is where details of the previous partial match are stored. Here is
|
||||
an example using <b>pcre2test</b>:
|
||||
<pre>
|
||||
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
|
||||
data> 23ja\=dfa,ps
|
||||
Partial match: 23ja
|
||||
data> n05\=dfa,dfa_restart
|
||||
0: n05
|
||||
</pre>
|
||||
The first call has "23ja" as the subject, and requests partial matching; the
|
||||
second call has "n05" as the subject for the continued (restarted) match.
|
||||
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||
not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
</P>
|
||||
<P>
|
||||
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||
not possible to try again at a new starting point. All this facility is capable
|
||||
of doing is continuing with the previous match attempt. In the previous
|
||||
example, if the second set of data is "ug23" the result is no match, even
|
||||
though there would be a match for "aug23" if the entire string were given at
|
||||
once. Depending on the application, this may or may not be what you want.
|
||||
The only way to allow for starting again at the next character is to retain the
|
||||
matched part of the subject and try a new complete match.
|
||||
</P>
|
||||
<P>
|
||||
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||
facility can be used to pass very long subject strings to the DFA matching
|
||||
functions.
|
||||
</P>
|
||||
<br><a name="SEC7" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
|
||||
<P>
|
||||
Unlike the DFA function, it is not possible to restart the previous match with
|
||||
a new segment of data when using <b>pcre2_match()</b>. Instead, new data must be
|
||||
added to the previous subject string, and the entire match re-run, starting
|
||||
from the point where the partial match occurred. Earlier data can be discarded.
|
||||
</P>
|
||||
<P>
|
||||
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||
treat the end of a segment as the end of the subject when matching \z, \Z,
|
||||
\b, \B, and $. Consider an unanchored pattern that matches dates:
|
||||
<pre>
|
||||
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
|
||||
data> The date is 23ja\=ph
|
||||
Partial match: 23ja
|
||||
</pre>
|
||||
At this stage, an application could discard the text preceding "23ja", add on
|
||||
text from the next segment, and call the matching function again. Unlike the
|
||||
DFA matching function, the entire matching string must always be available,
|
||||
and the complete matching process occurs for each call, so more memory and more
|
||||
processing time are needed.
|
||||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">ISSUES WITH MULTI-SEGMENT MATCHING</a><br>
|
||||
<P>
|
||||
Certain types of pattern may give problems with multi-segment matching,
|
||||
whichever matching function is used.
|
||||
</P>
|
||||
<P>
|
||||
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||
the PCRE2_NOTBOL option when the subject string for any call does not start at the
|
||||
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||
includes the effect of PCRE2_NOTEOL.
|
||||
</P>
|
||||
<P>
|
||||
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||
start of the partial match may have been inspected during the matching process.
|
||||
When using <b>pcre2_match()</b>, sufficient characters must be retained for the
|
||||
next match attempt. You can ensure that enough characters are retained by doing
|
||||
the following:
|
||||
</P>
|
||||
<P>
|
||||
Before doing any matching, find the length of the longest lookbehind in the
|
||||
pattern by calling <b>pcre2_pattern_info()</b> with the PCRE2_INFO_MAXLOOKBEHIND
|
||||
option. Note that the resulting count is in characters, not code units. After a
|
||||
partial match, moving back from the ovector[0] offset in the subject by the
|
||||
number of characters given for the maximum lookbehind gets you to the earliest
|
||||
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||
while moving back through the code units.
|
||||
</P>
|
||||
<P>
|
||||
Characters before the point you have now reached can be discarded, and after
|
||||
the next segment has been added to what is retained, you should run the next
|
||||
match with the <b>startoffset</b> argument set so that the match begins at the
|
||||
same point as before.
|
||||
</P>
|
||||
<P>
|
||||
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||
value of <b>startoffset</b> for the next match should be 3. When <b>pcre2test</b>
|
||||
displays a partial match, it indicates the lookbehind characters with '<'
|
||||
characters:
|
||||
<pre>
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\=ph
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
3. Because a partial match must always contain at least one character, what
|
||||
might be considered a partial match of an empty string actually gives a "no
|
||||
match" result. For example:
|
||||
<pre>
|
||||
re> /c(?<=abc)x/
|
||||
data> ab\=ps
|
||||
No match
|
||||
</pre>
|
||||
If the next segment begins "cx", a match should be found, but this will only
|
||||
happen if characters from the previous segment are retained. For this reason, a
|
||||
"no match" result should be interpreted as "partial match of an empty string"
|
||||
when the pattern contains lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
4. Matching a subject string that is split into multiple segments may not
|
||||
always produce exactly the same result as matching over one single long string,
|
||||
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||
\b or \B. Another kind of difference may occur when there are multiple
|
||||
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||
is given only when there are no completed matches. This means that as soon as
|
||||
the shortest match has been found, continuation to a new subject segment is no
|
||||
longer possible. Consider this <b>pcre2test</b> example:
|
||||
<pre>
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\=ps
|
||||
0: dog
|
||||
data> do\=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\=ps,dfa,dfa_restart
|
||||
0: g
|
||||
data> dogsbody\=dfa
|
||||
0: dogsbody
|
||||
1: dog
|
||||
</pre>
|
||||
The first data line passes the string "dogsb" to a standard matching function,
|
||||
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||
the match stops when "dog" has been found, and it is not possible to continue.
|
||||
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||
matching function finds both matches.
|
||||
</P>
|
||||
<P>
|
||||
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||
multi-segment data. The example above then behaves differently:
|
||||
<pre>
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\=ph
|
||||
Partial match: dogsb
|
||||
data> do\=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\=ph,dfa,dfa_restart
|
||||
Partial match: gsb
|
||||
</pre>
|
||||
5. Patterns that contain alternatives at the top level which do not all start
|
||||
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||
used. For example, consider this pattern:
|
||||
<pre>
|
||||
1234|3789
|
||||
</pre>
|
||||
If the first part of the subject is "ABC123", a partial match of the first
|
||||
alternative is found at offset 3. There is no partial match for the second
|
||||
alternative, because such a match does not start at the same point in the
|
||||
subject string. Attempting to continue with the string "7890" does not yield a
|
||||
match because only those alternatives that match at one point in the subject
|
||||
are remembered. The problem arises because the start of the second alternative
|
||||
matches within the first alternative. There is no problem with anchored
|
||||
patterns or patterns such as:
|
||||
<pre>
|
||||
1234|ABCD
|
||||
</pre>
|
||||
where no string can be a partial match for both alternatives. This is not a
|
||||
problem if a standard matching function is used, because the entire match has
|
||||
to be rerun each time:
|
||||
<pre>
|
||||
re> /1234|3789/
|
||||
data> ABC123\=ph
|
||||
Partial match: 123
|
||||
data> 1237890
|
||||
0: 3789
|
||||
</pre>
|
||||
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||
the entire match can also be used with the DFA matching function. Another
|
||||
possibility is to work with two buffers. If a partial match at offset <i>n</i>
|
||||
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||
the second buffer, you can then try a new match starting at offset <i>n+1</i> in
|
||||
the first buffer.
|
||||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
University Computing Service
|
||||
<br>
|
||||
Cambridge CB2 3QH, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 October 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -476,6 +476,7 @@ about the pattern:
|
|||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitverify verify JIT use
|
||||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
|
@ -503,10 +504,6 @@ The <b>newline</b> modifier specifies which characters are to be interpreted as
|
|||
newlines, both in the pattern and (by default) in subject lines. The type must
|
||||
be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||
</P>
|
||||
<P>
|
||||
Both the \R and newline settings can be changed at match time, but if this is
|
||||
done, JIT matching is disabled.
|
||||
</P>
|
||||
<br><b>
|
||||
Information about a pattern
|
||||
</b><br>
|
||||
|
@ -556,29 +553,32 @@ length of the pattern is passed. This is implied if <b>hex</b> is set.
|
|||
JIT compilation
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>/jit</b> modifier may optionally be followed by a number in the range 0
|
||||
to 7:
|
||||
The <b>/jit</b> modifier may optionally be followed by an equals sign and a
|
||||
number in the range 0 to 7:
|
||||
<pre>
|
||||
0 disable JIT
|
||||
1 normal match only
|
||||
2 soft partial match only
|
||||
3 normal match and soft partial match
|
||||
4 hard partial match only
|
||||
6 soft and hard partial match
|
||||
1 use JIT for normal match only
|
||||
2 use JIT for soft partial match only
|
||||
3 use JIT for normal match and soft partial match
|
||||
4 use JIT for hard partial match only
|
||||
6 use JIT for soft and hard partial match
|
||||
7 all three modes
|
||||
</pre>
|
||||
If no number is given, 7 is assumed. If JIT compilation is successful, the
|
||||
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run,
|
||||
except when incompatible run-time options are specified. For more details, see
|
||||
the
|
||||
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run
|
||||
for the appropriate type of match, except when incompatible run-time options
|
||||
are specified. For more details, see the
|
||||
<a href="pcre2jit.html"><b>pcre2jit</b></a>
|
||||
documentation. See also the <b>jitstack</b> modifier below for a way of
|
||||
setting the size of the JIT stack.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>jitverify</b> modifier is specified, the text "(JIT)" is added to the
|
||||
first output line after a match or non match when JIT-compiled code was
|
||||
actually used. This modifier can also be set on a subject line.
|
||||
If the <b>jitverify</b> modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
|
||||
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
|
||||
added to the first output line after a match or non match when JIT-compiled
|
||||
code was actually used.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting a locale
|
||||
|
@ -678,9 +678,8 @@ not affect the compilation process.
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
jitverify verify JIT usage
|
||||
mark show mark values
|
||||
</pre>
|
||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||
|
@ -703,7 +702,6 @@ for a description of their effects.
|
|||
anchored set PCRE2_ANCHORED
|
||||
dfa_restart set PCRE2_DFA_RESTART
|
||||
dfa_shortest set PCRE2_DFA_SHORTEST
|
||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
notbol set PCRE2_NOTBOL
|
||||
notempty set PCRE2_NOTEMPTY
|
||||
|
@ -734,9 +732,8 @@ pattern.
|
|||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
|
@ -748,11 +745,9 @@ pattern.
|
|||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
jitverify verify JIT usage
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
newline=<type> set newline type
|
||||
offset=<n> set starting offset
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
|
@ -761,14 +756,6 @@ The effects of these modifiers are described in the following sections.
|
|||
FIXME: Give more examples.
|
||||
</P>
|
||||
<br><b>
|
||||
Newline and \R handling
|
||||
</b><br>
|
||||
<P>
|
||||
These modifiers set the newline and \R processing conventions for the subject
|
||||
line, overriding any values that were set at compile time (as described above).
|
||||
JIT matching is disabled if these settings are changed at match time.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing more text
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -781,11 +768,13 @@ substring. In each case the remainder is output on the following line with a
|
|||
plus character following the capture number.
|
||||
</P>
|
||||
<P>
|
||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||
during a successful pattern match be shown. This affects the output if there
|
||||
is a lookbehind at the start of a match, or a lookahead at the end, or if \K
|
||||
is used in the pattern. Characters that precede or follow the start and end of
|
||||
the actual match are indicated in the output by '<' or '>' characters
|
||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown. This
|
||||
feature is not supported for JIT matching, and if requested with JIT it is
|
||||
ignored (with a warning message). Setting this modifier affects the output if
|
||||
there is a lookbehind at the start of a match, or a lookahead at the end, or if
|
||||
\K is used in the pattern. Characters that precede or follow the start and end
|
||||
of the actual match are indicated in the output by '<' or '>' characters
|
||||
underneath them. Here is an example:
|
||||
<pre>
|
||||
/(?<=pqr)abc(?=xyz)/
|
||||
|
@ -903,6 +892,11 @@ until it finds the minimum values for each parameter that allow
|
|||
<b>pcre2_match()</b> to complete without error.
|
||||
</P>
|
||||
<P>
|
||||
If JIT is being used, only the match limit is relevant. If DFA matching is
|
||||
being used, neither limit is relevant, and this modifier is ignored (with a
|
||||
warning message).
|
||||
</P>
|
||||
<P>
|
||||
The <i>match_limit</i> number is a measure of the amount of backtracking
|
||||
that takes place, and learning the minimum value can be instructive. For most
|
||||
simple matches, the number is quite small, but for patterns with very large
|
||||
|
@ -944,6 +938,13 @@ appears, though of course it can also be used to set a default in a
|
|||
<b>#subject</b> command. It specifies the number of pairs of offsets that are
|
||||
available for storing matching information. The default is 15.
|
||||
</P>
|
||||
<P>
|
||||
At least one pair of offsets is always created by
|
||||
<b>pcre2_match_data_create()</b>, for matching with PCRE2's native API, so a
|
||||
value of 0 is the same as 1. However a value of 0 is useful when testing the
|
||||
POSIX API because it causes <b>regexec()</b> to be called with a NULL capture
|
||||
vector.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
|
||||
|
@ -1190,7 +1191,7 @@ Cambridge CB2 3QH, England.
|
|||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 19 August 2014
|
||||
Last updated: 11 October 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
701
doc/pcre2.txt
701
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "10 October 2014" "PCRE2 10.00"
|
||||
.TH PCRE2API 3 "14 October 2014" "PCRE2 10.00"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -2061,15 +2061,10 @@ pointer to the zero-terminated name, which is within the compiled pattern.
|
|||
Otherwise NULL is returned. A (*MARK) name may be available after a failed
|
||||
match or a partial match, as well as after a successful one.
|
||||
.P
|
||||
The offset of the character at which the successful or partial match started is
|
||||
The offset of the character at which the successful match started is
|
||||
returned by \fBpcre2_get_startchar()\fP. This can be different to the value of
|
||||
\fIovector[0]\fP if the pattern contains the \eK escape sequence. This
|
||||
information is needed when doing partial matching over multiple data segments
|
||||
(see the
|
||||
.\" HREF
|
||||
\fBpcre2partial\fP
|
||||
.\"
|
||||
documentation).
|
||||
\fIovector[0]\fP if the pattern contains the \eK escape sequence. Note,
|
||||
however, that \eK has no effect for a partial match.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="errorlist"></a>
|
||||
|
@ -2626,6 +2621,6 @@ Cambridge CB2 3QH, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 10 October 2014
|
||||
Last updated: 14 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -0,0 +1,433 @@
|
|||
.TH PCRE2PARTIAL 3 "14 October 2014" "PCRE2 10.00"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions
|
||||
.SH "PARTIAL MATCHING IN PCRE2"
|
||||
.rs
|
||||
.sp
|
||||
In normal use of PCRE2, if the subject string that is passed to a matching
|
||||
function matches as far as it goes, but is too short to match the entire
|
||||
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
|
||||
might be helpful to distinguish this case from other cases in which there is no
|
||||
match.
|
||||
.P
|
||||
Consider, for example, an application where a human is required to type in data
|
||||
for a field with specific formatting requirements. An example might be a date
|
||||
in the form \fIddmmmyy\fP, defined by this pattern:
|
||||
.sp
|
||||
^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
|
||||
.sp
|
||||
If the application sees the user's keystrokes one by one, and can check that
|
||||
what has been typed so far is potentially valid, it is able to raise an error
|
||||
as soon as a mistake is made, by beeping and not reflecting the character that
|
||||
has been typed, for example. This immediate feedback is likely to be a better
|
||||
user interface than a check that is delayed until the entire string has been
|
||||
entered. Partial matching can also be useful when the subject string is very
|
||||
long and is not all available at once.
|
||||
.P
|
||||
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
|
||||
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
|
||||
The difference between the two options is whether or not a partial match is
|
||||
preferred to an alternative complete match, though the details differ between
|
||||
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
|
||||
takes precedence.
|
||||
.P
|
||||
If you want to use partial matching with just-in-time optimized code, you must
|
||||
call \fBpcre2_jit_compile()\fP with one or both of these options:
|
||||
.sp
|
||||
PCRE2_JIT_PARTIAL_SOFT
|
||||
PCRE2_JIT_PARTIAL_HARD
|
||||
.sp
|
||||
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
|
||||
matches on the same pattern. If the appropriate JIT mode has not been compiled,
|
||||
interpretive matching code is used.
|
||||
.P
|
||||
Setting a partial matching option disables two of PCRE2's standard
|
||||
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
|
||||
abandons matching immediately if it is not present in the subject string. This
|
||||
optimization cannot be used for a subject string that might match only
|
||||
partially. PCRE2 also knows the minimum length of a matching string, and does
|
||||
not bother to run the matching function on shorter strings. This optimization
|
||||
is also disabled for partial matching.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING USING pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
A partial match occurs during a call to \fBpcre2_match()\fP when the end of the
|
||||
subject string is reached successfully, but matching cannot continue because
|
||||
more characters are needed. However, at least one character in the subject must
|
||||
have been inspected. This character need not form part of the final matched
|
||||
string; lookbehind assertions and the \eK escape sequence provide ways of
|
||||
inspecting characters before the start of a matched string. The requirement for
|
||||
inspecting at least one character exists because an empty string can always be
|
||||
matched; without such a restriction there would always be a partial match of an
|
||||
empty string at the end of the subject.
|
||||
.P
|
||||
When a partial match is returned, the first two elements in the ovector point
|
||||
to the portion of the subject that was matched. The appearance of \eK in the
|
||||
pattern has no effect for a partial match. Consider this pattern:
|
||||
.sp
|
||||
/abc\eK123/
|
||||
.sp
|
||||
If it is matched against "456abc123xyz" the result is a complete match, and the
|
||||
ovector defines the matched string as "123", because \eK resets the "start of
|
||||
match" point. However, if a partial match is requested and the subject string
|
||||
is "456abc12", a partial match is found for the string "abc12", because all
|
||||
these characters are needed for a subsequent re-match with additional
|
||||
characters.
|
||||
.P
|
||||
What happens when a partial match is identified depends on which of the two
|
||||
partial matching options is set.
|
||||
.
|
||||
.
|
||||
.SS "PCRE2_PARTIAL_SOFT WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
If PCRE2_PARTIAL_SOFT is set when \fBpcre2_match()\fP identifies a partial
|
||||
match, the partial match is remembered, but matching continues as normal, and
|
||||
other alternatives in the pattern are tried. If no complete match can be found,
|
||||
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
|
||||
.P
|
||||
This option is "soft" because it prefers a complete match over a partial match.
|
||||
All the various matching items in a pattern behave as if the subject string is
|
||||
potentially complete. For example, \ez, \eZ, and $ match at the end of the
|
||||
subject, as normal, and for \eb and \eB the end of the subject is treated as a
|
||||
non-alphanumeric.
|
||||
.P
|
||||
If there is more than one partial match, the first one that was found provides
|
||||
the data that is returned. Consider this pattern:
|
||||
.sp
|
||||
/123\ew+X|dogY/
|
||||
.sp
|
||||
If this is matched against the subject string "abc123dog", both
|
||||
alternatives fail to match, but the end of the subject is reached during
|
||||
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
|
||||
identifying "123dog" as the first partial match that was found. (In this
|
||||
example, there are two partial matches, because "dog" on its own partially
|
||||
matches the second alternative.)
|
||||
.
|
||||
.
|
||||
.SS "PCRE2_PARTIAL_HARD WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
If PCRE2_PARTIAL_HARD is set for \fBpcre2_match()\fP, PCRE2_ERROR_PARTIAL is
|
||||
returned as soon as a partial match is found, without continuing to search for
|
||||
possible complete matches. This option is "hard" because it prefers an earlier
|
||||
partial match over a later complete match. For this reason, the assumption is
|
||||
made that the end of the supplied subject string may not be the true end of the
|
||||
available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end
|
||||
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
|
||||
character in the subject has been inspected.
|
||||
.
|
||||
.
|
||||
.SS "Comparing hard and soft partial matching"
|
||||
.rs
|
||||
.sp
|
||||
The difference between the two partial matching options can be illustrated by a
|
||||
pattern such as:
|
||||
.sp
|
||||
/dog(sbody)?/
|
||||
.sp
|
||||
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
|
||||
longer string if possible). If it is matched against the string "dog" with
|
||||
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
|
||||
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
|
||||
hand, if the pattern is made ungreedy the result is different:
|
||||
.sp
|
||||
/dog(sbody)??/
|
||||
.sp
|
||||
In this case the result is always a complete match because that is found first,
|
||||
and matching never continues after finding a complete match. It might be easier
|
||||
to follow this explanation by thinking of the two patterns like this:
|
||||
.sp
|
||||
/dog(sbody)?/ is the same as /dogsbody|dog/
|
||||
/dog(sbody)??/ is the same as /dog|dogsbody/
|
||||
.sp
|
||||
The second pattern will never match "dogsbody", because it will always find the
|
||||
shorter match first.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING USING pcre2_dfa_match()"
|
||||
.rs
|
||||
.sp
|
||||
The DFA functions move along the subject string character by character, without
|
||||
backtracking, searching for all possible matches simultaneously. If the end of
|
||||
the subject is reached before the end of the pattern, there is the possibility
|
||||
of a partial match, again provided that at least one character has been
|
||||
inspected.
|
||||
.P
|
||||
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
|
||||
have been no complete matches. Otherwise, the complete matches are returned.
|
||||
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
|
||||
any complete matches. The portion of the string that was matched when the
|
||||
longest partial match was found is set as the first matching string.
|
||||
.P
|
||||
Because the DFA functions always search for all possible matches, and there is
|
||||
no difference between greedy and ungreedy repetition, their behaviour is
|
||||
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
|
||||
the string "dog" matched against the ungreedy pattern shown above:
|
||||
.sp
|
||||
/dog(sbody)??/
|
||||
.sp
|
||||
Whereas the standard functions stop as soon as they find the complete match for
|
||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||
return that when PCRE2_PARTIAL_HARD is set.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
||||
.rs
|
||||
.sp
|
||||
If a pattern ends with one of the sequences \eb or \eB, which test for word
|
||||
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
|
||||
results. Consider this pattern:
|
||||
.sp
|
||||
/\ebcat\eb/
|
||||
.sp
|
||||
This matches "cat", provided there is a word boundary at either end. If the
|
||||
subject string is "the cat", the comparison of the final "t" with a following
|
||||
character cannot take place, so a partial match is found. However, normal
|
||||
matching carries on, and \eb matches at the end of the subject when the last
|
||||
character is a letter, so a complete match is found. The result, therefore, is
|
||||
\fInot\fP PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
|
||||
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
|
||||
.
|
||||
.
|
||||
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST"
|
||||
.rs
|
||||
.sp
|
||||
If the \fBpartial_soft\fP (or \fBps\fP) modifier is present on a
|
||||
\fBpcre2test\fP data line, the PCRE2_PARTIAL_SOFT option is used for the match.
|
||||
Here is a run of \fBpcre2test\fP that uses the date example quoted above:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 25jun04\e=ps
|
||||
0: 25jun04
|
||||
1: jun
|
||||
data> 25dec3\e=ps
|
||||
Partial match: 25dec3
|
||||
data> 3ju\e=ps
|
||||
Partial match: 3ju
|
||||
data> 3juj\e=ps
|
||||
No match
|
||||
data> j\e=ps
|
||||
No match
|
||||
.sp
|
||||
The first data string is matched completely, so \fBpcre2test\fP shows the
|
||||
matched substrings. The remaining four strings do not match the complete
|
||||
pattern, but the first two are partial matches. Similar output is obtained
|
||||
if DFA matching is used.
|
||||
.P
|
||||
If the \fBpartial_hard\fP (or \fBph\fP) modifier is present on a
|
||||
\fBpcre2test\fP data line, the PCRE2_PARTIAL_HARD option is set for the match.
|
||||
.
|
||||
.
|
||||
.SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()"
|
||||
.rs
|
||||
.sp
|
||||
When a partial match has been found using a DFA matching function, it is
|
||||
possible to continue the match by providing additional subject data and calling
|
||||
the function again with the same compiled regular expression, this time setting
|
||||
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
|
||||
because this is where details of the previous partial match are stored. Here is
|
||||
an example using \fBpcre2test\fP:
|
||||
.sp
|
||||
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
|
||||
data> 23ja\e=dfa,ps
|
||||
Partial match: 23ja
|
||||
data> n05\e=dfa,dfa_restart
|
||||
0: n05
|
||||
.sp
|
||||
The first call has "23ja" as the subject, and requests partial matching; the
|
||||
second call has "n05" as the subject for the continued (restarted) match.
|
||||
Notice that when the match is complete, only the last part is shown; PCRE2 does
|
||||
not retain the previously partially-matched string. It is up to the calling
|
||||
program to do that if it needs to.
|
||||
.P
|
||||
That means that, for an unanchored pattern, if a continued match fails, it is
|
||||
not possible to try again at a new starting point. All this facility is capable
|
||||
of doing is continuing with the previous match attempt. In the previous
|
||||
example, if the second set of data is "ug23" the result is no match, even
|
||||
though there would be a match for "aug23" if the entire string were given at
|
||||
once. Depending on the application, this may or may not be what you want.
|
||||
The only way to allow for starting again at the next character is to retain the
|
||||
matched part of the subject and try a new complete match.
|
||||
.P
|
||||
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
|
||||
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
|
||||
facility can be used to pass very long subject strings to the DFA matching
|
||||
functions.
|
||||
.
|
||||
.
|
||||
.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
|
||||
.rs
|
||||
.sp
|
||||
Unlike the DFA function, it is not possible to restart the previous match with
|
||||
a new segment of data when using \fBpcre2_match()\fP. Instead, new data must be
|
||||
added to the previous subject string, and the entire match re-run, starting
|
||||
from the point where the partial match occurred. Earlier data can be discarded.
|
||||
.P
|
||||
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
|
||||
treat the end of a segment as the end of the subject when matching \ez, \eZ,
|
||||
\eb, \eB, and $. Consider an unanchored pattern that matches dates:
|
||||
.sp
|
||||
re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
|
||||
data> The date is 23ja\e=ph
|
||||
Partial match: 23ja
|
||||
.sp
|
||||
At this stage, an application could discard the text preceding "23ja", add on
|
||||
text from the next segment, and call the matching function again. Unlike the
|
||||
DFA matching function, the entire matching string must always be available,
|
||||
and the complete matching process occurs for each call, so more memory and more
|
||||
processing time are needed.
|
||||
.
|
||||
.
|
||||
.SH "ISSUES WITH MULTI-SEGMENT MATCHING"
|
||||
.rs
|
||||
.sp
|
||||
Certain types of pattern may give problems with multi-segment matching,
|
||||
whichever matching function is used.
|
||||
.P
|
||||
1. If the pattern contains a test for the beginning of a line, you need to pass
|
||||
the PCRE2_NOTBOL option when the subject string for any call does not start at the
|
||||
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
|
||||
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
|
||||
includes the effect of PCRE2_NOTEOL.
|
||||
.P
|
||||
2. If a pattern contains a lookbehind assertion, characters that precede the
|
||||
start of the partial match may have been inspected during the matching process.
|
||||
When using \fBpcre2_match()\fP, sufficient characters must be retained for the
|
||||
next match attempt. You can ensure that enough characters are retained by doing
|
||||
the following:
|
||||
.P
|
||||
Before doing any matching, find the length of the longest lookbehind in the
|
||||
pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND
|
||||
option. Note that the resulting count is in characters, not code units. After a
|
||||
partial match, moving back from the ovector[0] offset in the subject by the
|
||||
number of characters given for the maximum lookbehind gets you to the earliest
|
||||
character that must be retained. In a non-UTF or a 32-bit situation, moving
|
||||
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
|
||||
while moving back through the code units.
|
||||
.P
|
||||
Characters before the point you have now reached can be discarded, and after
|
||||
the next segment has been added to what is retained, you should run the next
|
||||
match with the \fBstartoffset\fP argument set so that the match begins at the
|
||||
same point as before.
|
||||
.P
|
||||
For example, if the pattern "(?<=123)abc" is partially matched against the
|
||||
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
|
||||
lookbehind count is 3, so all characters before offset 2 can be discarded. The
|
||||
value of \fBstartoffset\fP for the next match should be 3. When \fBpcre2test\fP
|
||||
displays a partial match, it indicates the lookbehind characters with '<'
|
||||
characters:
|
||||
.sp
|
||||
re> "(?<=123)abc"
|
||||
data> xx123ab\e=ph
|
||||
Partial match: 123ab
|
||||
<<<
|
||||
.P
|
||||
3. Because a partial match must always contain at least one character, what
|
||||
might be considered a partial match of an empty string actually gives a "no
|
||||
match" result. For example:
|
||||
.sp
|
||||
re> /c(?<=abc)x/
|
||||
data> ab\e=ps
|
||||
No match
|
||||
.sp
|
||||
If the next segment begins "cx", a match should be found, but this will only
|
||||
happen if characters from the previous segment are retained. For this reason, a
|
||||
"no match" result should be interpreted as "partial match of an empty string"
|
||||
when the pattern contains lookbehinds.
|
||||
.P
|
||||
4. Matching a subject string that is split into multiple segments may not
|
||||
always produce exactly the same result as matching over one single long string,
|
||||
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
|
||||
Word Boundaries" above describes an issue that arises if the pattern ends with
|
||||
\eb or \eB. Another kind of difference may occur when there are multiple
|
||||
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
|
||||
is given only when there are no completed matches. This means that as soon as
|
||||
the shortest match has been found, continuation to a new subject segment is no
|
||||
longer possible. Consider this \fBpcre2test\fP example:
|
||||
.sp
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\e=ps
|
||||
0: dog
|
||||
data> do\e=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\e=ps,dfa,dfa_restart
|
||||
0: g
|
||||
data> dogsbody\e=dfa
|
||||
0: dogsbody
|
||||
1: dog
|
||||
.sp
|
||||
The first data line passes the string "dogsb" to a standard matching function,
|
||||
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
|
||||
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
|
||||
string "dog" is a complete match. Similarly, when the subject is presented to
|
||||
a DFA matching function in several parts ("do" and "gsb" being the first two)
|
||||
the match stops when "dog" has been found, and it is not possible to continue.
|
||||
On the other hand, if "dogsbody" is presented as a single string, a DFA
|
||||
matching function finds both matches.
|
||||
.P
|
||||
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
|
||||
multi-segment data. The example above then behaves differently:
|
||||
.sp
|
||||
re> /dog(sbody)?/
|
||||
data> dogsb\e=ph
|
||||
Partial match: dogsb
|
||||
data> do\e=ps,dfa
|
||||
Partial match: do
|
||||
data> gsb\e=ph,dfa,dfa_restart
|
||||
Partial match: gsb
|
||||
.sp
|
||||
5. Patterns that contain alternatives at the top level which do not all start
|
||||
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
|
||||
used. For example, consider this pattern:
|
||||
.sp
|
||||
1234|3789
|
||||
.sp
|
||||
If the first part of the subject is "ABC123", a partial match of the first
|
||||
alternative is found at offset 3. There is no partial match for the second
|
||||
alternative, because such a match does not start at the same point in the
|
||||
subject string. Attempting to continue with the string "7890" does not yield a
|
||||
match because only those alternatives that match at one point in the subject
|
||||
are remembered. The problem arises because the start of the second alternative
|
||||
matches within the first alternative. There is no problem with anchored
|
||||
patterns or patterns such as:
|
||||
.sp
|
||||
1234|ABCD
|
||||
.sp
|
||||
where no string can be a partial match for both alternatives. This is not a
|
||||
problem if a standard matching function is used, because the entire match has
|
||||
to be rerun each time:
|
||||
.sp
|
||||
re> /1234|3789/
|
||||
data> ABC123\e=ph
|
||||
Partial match: 123
|
||||
data> 1237890
|
||||
0: 3789
|
||||
.sp
|
||||
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
|
||||
the entire match can also be used with the DFA matching function. Another
|
||||
possibility is to work with two buffers. If a partial match at offset \fIn\fP
|
||||
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
|
||||
the second buffer, you can then try a new match starting at offset \fIn+1\fP in
|
||||
the first buffer.
|
||||
.
|
||||
.
|
||||
.SH AUTHOR
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Philip Hazel
|
||||
University Computing Service
|
||||
Cambridge CB2 3QH, England.
|
||||
.fi
|
||||
.
|
||||
.
|
||||
.SH REVISION
|
||||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 14 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
.fi
|
|
@ -424,6 +424,7 @@ PATTERN MODIFIERS
|
|||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitverify verify JIT use
|
||||
locale=<name> use this locale
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
|
@ -448,68 +449,69 @@ PATTERN MODIFIERS
|
|||
as newlines, both in the pattern and (by default) in subject lines. The
|
||||
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||
|
||||
Both the \R and newline settings can be changed at match time, but if
|
||||
this is done, JIT matching is disabled.
|
||||
|
||||
Information about a pattern
|
||||
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
available information.
|
||||
|
||||
The bincode modifier causes a representation of the compiled code to be
|
||||
output after compilation. This information does not contain length and
|
||||
output after compilation. This information does not contain length and
|
||||
offset values, which ensures that the same output is generated for dif-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ments.
|
||||
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
values. This is used in a few special tests and is also useful for one-
|
||||
off tests.
|
||||
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
information is obtained from the pcre2_pattern_info() function.
|
||||
|
||||
Specifying a pattern in hex
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
between pairs. For example:
|
||||
|
||||
/ab 32 59/hex
|
||||
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero characters. When hex is set, it implies use_length.
|
||||
|
||||
Using the pattern's length
|
||||
|
||||
By default, pcre2test passes patterns as zero-terminated strings to
|
||||
pcre2_compile(), giving the length as -1. If use_length is set, the
|
||||
By default, pcre2test passes patterns as zero-terminated strings to
|
||||
pcre2_compile(), giving the length as -1. If use_length is set, the
|
||||
length of the pattern is passed. This is implied if hex is set.
|
||||
|
||||
JIT compilation
|
||||
|
||||
The /jit modifier may optionally be followed by a number in the range 0
|
||||
to 7:
|
||||
The /jit modifier may optionally be followed by an equals sign and a
|
||||
number in the range 0 to 7:
|
||||
|
||||
0 disable JIT
|
||||
1 normal match only
|
||||
2 soft partial match only
|
||||
3 normal match and soft partial match
|
||||
4 hard partial match only
|
||||
6 soft and hard partial match
|
||||
1 use JIT for normal match only
|
||||
2 use JIT for soft partial match only
|
||||
3 use JIT for normal match and soft partial match
|
||||
4 use JIT for hard partial match only
|
||||
6 use JIT for soft and hard partial match
|
||||
7 all three modes
|
||||
|
||||
If no number is given, 7 is assumed. If JIT compilation is successful,
|
||||
the compiled JIT code will automatically be used when pcre2_match() is
|
||||
run, except when incompatible run-time options are specified. For more
|
||||
details, see the pcre2jit documentation. See also the jitstack modifier
|
||||
below for a way of setting the size of the JIT stack.
|
||||
If no number is given, 7 is assumed. If JIT compilation is successful,
|
||||
the compiled JIT code will automatically be used when pcre2_match() is
|
||||
run for the appropriate type of match, except when incompatible run-
|
||||
time options are specified. For more details, see the pcre2jit documen-
|
||||
tation. See also the jitstack modifier below for a way of setting the
|
||||
size of the JIT stack.
|
||||
|
||||
If the jitverify modifier is specified, the text "(JIT)" is added to
|
||||
If the jitverify modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
the first output line after a match or non match when JIT-compiled code
|
||||
was actually used. This modifier can also be set on a subject line.
|
||||
was actually used.
|
||||
|
||||
Setting a locale
|
||||
|
||||
|
@ -518,31 +520,31 @@ PATTERN MODIFIERS
|
|||
/pattern/locale=fr_FR
|
||||
|
||||
The given locale is set, pcre2_maketables() is called to build a set of
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
when matching the following subject lines. The /locale modifier applies
|
||||
only to the pattern on which it appears, but can be given in a #pattern
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
ter tables are mutually exclusive.
|
||||
|
||||
Showing pattern memory
|
||||
|
||||
The /memory modifier causes the size in bytes of the memory block used
|
||||
to hold the compiled pattern to be output. This does not include the
|
||||
size of the pcre2_code block; it is just the actual compiled data. If
|
||||
The /memory modifier causes the size in bytes of the memory block used
|
||||
to hold the compiled pattern to be output. This does not include the
|
||||
size of the pcre2_code block; it is just the actual compiled data. If
|
||||
the pattern is subsequently passed to the JIT compiler, the size of the
|
||||
JIT compiled code is also output.
|
||||
|
||||
Limiting nested parentheses
|
||||
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
error.
|
||||
|
||||
Using the POSIX wrapper API
|
||||
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
library. When the POSIX API is being used, the following pattern modi-
|
||||
fiers set options for the regcomp() function:
|
||||
|
||||
|
@ -554,25 +556,25 @@ PATTERN MODIFIERS
|
|||
ucp REG_UCP ) the POSIX standard
|
||||
utf REG_UTF8 )
|
||||
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
|
||||
Testing the stack guard feature
|
||||
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
||||
up callback from pcre2_compile() to a local function. The argument it
|
||||
is passed is the current nesting parenthesis depth; if this is greater
|
||||
up callback from pcre2_compile() to a local function. The argument it
|
||||
is passed is the current nesting parenthesis depth; if this is greater
|
||||
than the value given by the modifier, non-zero is returned, causing the
|
||||
compilation to be aborted.
|
||||
|
||||
Using alternative character tables
|
||||
|
||||
The /tables modifier must be followed by a single digit. It causes a
|
||||
specific set of built-in character tables to be passed to pcre2_com-
|
||||
The /tables modifier must be followed by a single digit. It causes a
|
||||
specific set of built-in character tables to be passed to pcre2_com-
|
||||
pile(). This is used in the PCRE2 tests to check behaviour with differ-
|
||||
ent character tables. The digit specifies the tables as follows:
|
||||
|
||||
|
@ -581,15 +583,15 @@ PATTERN MODIFIERS
|
|||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
tables and a locale are mutually exclusive.
|
||||
|
||||
Setting certain match controls
|
||||
|
||||
The following modifiers are really subject modifiers, and are described
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
with that pattern. They do not affect the compilation process.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -597,10 +599,9 @@ PATTERN MODIFIERS
|
|||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
jitverify verify JIT usage
|
||||
mark show mark values
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
||||
|
||||
|
@ -611,13 +612,12 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting match options
|
||||
|
||||
The following modifiers set options for pcre2_match() or
|
||||
The following modifiers set options for pcre2_match() or
|
||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||
|
||||
anchored set PCRE2_ANCHORED
|
||||
dfa_restart set PCRE2_DFA_RESTART
|
||||
dfa_shortest set PCRE2_DFA_SHORTEST
|
||||
no_start_optimize set PCRE2_NO_START_OPTIMIZE
|
||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
notbol set PCRE2_NOTBOL
|
||||
notempty set PCRE2_NOTEMPTY
|
||||
|
@ -626,28 +626,27 @@ SUBJECT MODIFIERS
|
|||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
they appear frequently in tests.
|
||||
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
wrapper API to be used, the only option-setting modifiers that have any
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
Any other modifiers cause an error.
|
||||
|
||||
Setting match controls
|
||||
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
|
@ -659,11 +658,9 @@ SUBJECT MODIFIERS
|
|||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
jitverify verify JIT usage
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
newline=<type> set newline type
|
||||
offset=<n> set starting offset
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
|
@ -671,13 +668,6 @@ SUBJECT MODIFIERS
|
|||
The effects of these modifiers are described in the following sections.
|
||||
FIXME: Give more examples.
|
||||
|
||||
Newline and \R handling
|
||||
|
||||
These modifiers set the newline and \R processing conventions for the
|
||||
subject line, overriding any values that were set at compile time (as
|
||||
described above). JIT matching is disabled if these settings are
|
||||
changed at match time.
|
||||
|
||||
Showing more text
|
||||
|
||||
The aftertext modifier requests that as well as outputting the sub-
|
||||
|
@ -690,18 +680,21 @@ SUBJECT MODIFIERS
|
|||
ture number.
|
||||
|
||||
The allusedtext modifier requests that all the text that was consulted
|
||||
during a successful pattern match be shown. This affects the output if
|
||||
there is a lookbehind at the start of a match, or a lookahead at the
|
||||
end, or if \K is used in the pattern. Characters that precede or follow
|
||||
the start and end of the actual match are indicated in the output by
|
||||
'<' or '>' characters underneath them. Here is an example:
|
||||
during a successful pattern match by the interpreter should be shown.
|
||||
This feature is not supported for JIT matching, and if requested with
|
||||
JIT it is ignored (with a warning message). Setting this modifier
|
||||
affects the output if there is a lookbehind at the start of a match, or
|
||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||
that precede or follow the start and end of the actual match are indi-
|
||||
cated in the output by '<' or '>' characters underneath them. Here is
|
||||
an example:
|
||||
|
||||
/(?<=pqr)abc(?=xyz)/
|
||||
123pqrabcxyz456\=allusedtext
|
||||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
following strings "pqr" and "xyz" also consulted during the match.
|
||||
|
||||
Showing the value of all capture groups
|
||||
|
@ -709,124 +702,133 @@ SUBJECT MODIFIERS
|
|||
The allcaptures modifier requests that the values of all potential cap-
|
||||
tured parentheses be output after a match. By default, only those up to
|
||||
the highest one actually used in the match are output (corresponding to
|
||||
the return code from pcre2_match()). Groups that did not take part in
|
||||
the return code from pcre2_match()). Groups that did not take part in
|
||||
the match are output as "<unset>".
|
||||
|
||||
Testing callouts
|
||||
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. If callout_capture is
|
||||
A callout function is supplied when pcre2test calls the library match-
|
||||
ing functions, unless callout_none is specified. If callout_capture is
|
||||
set, the current captured groups are output when a callout occurs.
|
||||
|
||||
The callout_fail modifier can be given one or two numbers. If there is
|
||||
The callout_fail modifier can be given one or two numbers. If there is
|
||||
only one number, 1 is returned instead of 0 when a callout of that num-
|
||||
ber is reached. If two numbers are given, 1 is returned when callout
|
||||
ber is reached. If two numbers are given, 1 is returned when callout
|
||||
<n> is reached for the <m>th time.
|
||||
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. Any value other than zero is used as a return from pcre2test's
|
||||
The callout_data modifier can be given an unsigned or a negative num-
|
||||
ber. Any value other than zero is used as a return from pcre2test's
|
||||
callout function.
|
||||
|
||||
Testing substring extraction functions
|
||||
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
The copy and get modifiers can be used to test the pcre2_sub-
|
||||
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
|
||||
given more than once, and each can specify a group name or number, for
|
||||
given more than once, and each can specify a group name or number, for
|
||||
example:
|
||||
|
||||
abcd\=copy=1,copy=3,get=G1
|
||||
|
||||
If the #subject command is used to set default copy and get lists,
|
||||
these can be unset by specifying a negative number for numbered groups
|
||||
If the #subject command is used to set default copy and get lists,
|
||||
these can be unset by specifying a negative number for numbered groups
|
||||
and an empty name for named groups.
|
||||
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
all captured substrings.
|
||||
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
If the subject line is successfully matched, the substrings extracted
|
||||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
function) is given in parentheses after each substring.
|
||||
|
||||
Finding all matches in a string
|
||||
|
||||
Searching for all possible matches within a subject can be requested by
|
||||
the global or /altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
the global or /altglobal modifier. After finding a match, the matching
|
||||
function is called again to search the remainder of the subject. The
|
||||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
does), whereas the latter passes over a shortened substring. This makes
|
||||
a difference to the matching process if the pattern begins with a look-
|
||||
behind assertion (including \b or \B).
|
||||
|
||||
If an empty string is matched, the next match is done with the
|
||||
If an empty string is matched, the next match is done with the
|
||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||
for another, non-empty, match at the same point in the subject. If this
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
match fails, the start offset is advanced, and the normal match is
|
||||
retried. This imitates the way Perl handles such cases when using the
|
||||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
advance of two is used.
|
||||
|
||||
Setting the JIT stack size
|
||||
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. Providing a stack that is larger
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. Providing a stack that is larger
|
||||
than the default 32K is necessary only for very complicated patterns.
|
||||
|
||||
Setting match and recursion limits
|
||||
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
its in the match context. These values are ignored when the find_limits
|
||||
modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
plete without error.
|
||||
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
recursion_limit number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||
is being used, neither limit is relevant, and this modifier is ignored
|
||||
(with a warning message).
|
||||
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
recursion_limit number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
complete the match attempt.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
||||
The mark modifier causes the names from backtracking control verbs that
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
it is added to the non-match message.
|
||||
|
||||
Showing memory usage
|
||||
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
freeing calls that occur during a match operation.
|
||||
|
||||
Setting a starting offset
|
||||
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
matching starts. Its value is a number of code units, not characters.
|
||||
|
||||
Setting the size of the output vector
|
||||
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
available for storing matching information. The default is 15.
|
||||
|
||||
At least one pair of offsets is always created by pcre2_match_data_cre-
|
||||
ate(), for matching with PCRE2's native API, so a value of 0 is the
|
||||
same as 1. However a value of 0 is useful when testing the POSIX API
|
||||
because it causes regexec() to be called with a NULL capture vector.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
|
@ -1069,5 +1071,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 19 August 2014
|
||||
Last updated: 11 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
|
|
|
@ -612,6 +612,7 @@ clock_t total_match_time = 0;
|
|||
|
||||
static uint32_t dfa_matched;
|
||||
static uint32_t forbid_utf = 0;
|
||||
static uint32_t maxlookbehind;
|
||||
static uint32_t max_oveccount;
|
||||
static uint32_t callout_count;
|
||||
|
||||
|
@ -2293,6 +2294,55 @@ return 0;
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Move back by so many characters *
|
||||
*************************************************/
|
||||
|
||||
/* Given a code unit offset in a subject string, move backwards by a number of
|
||||
characters, and return the resulting offset.
|
||||
|
||||
Arguments:
|
||||
subject pointer to the string
|
||||
offset start offset
|
||||
count count to move back by
|
||||
utf TRUE if in UTF mode
|
||||
|
||||
Returns: a possibly changed offset
|
||||
*/
|
||||
|
||||
static PCRE2_SIZE
|
||||
backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf)
|
||||
{
|
||||
long int yield;
|
||||
|
||||
if (!utf || test_mode == PCRE32_MODE) yield = offset - count;
|
||||
|
||||
else if (test_mode == PCRE8_MODE)
|
||||
{
|
||||
PCRE2_SPTR8 pp = (PCRE2_SPTR8)subject + offset;
|
||||
for (; count > 0; count--)
|
||||
{
|
||||
pp--;
|
||||
while ((*pp & 0xc0) == 0x80) pp--;
|
||||
}
|
||||
yield = pp - (PCRE2_SPTR8)subject;
|
||||
}
|
||||
|
||||
else /* 16-bit mode */
|
||||
{
|
||||
PCRE2_SPTR16 pp = (PCRE2_SPTR16)subject + offset;
|
||||
for (; count > 0; count--)
|
||||
{
|
||||
pp--;
|
||||
if ((*pp & 0xfc00) == 0xdc00) pp--;
|
||||
}
|
||||
yield = pp - (PCRE2_SPTR16)subject;
|
||||
}
|
||||
|
||||
return (yield >= 0)? yield : 0;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Read or extend an input line *
|
||||
*************************************************/
|
||||
|
@ -3099,8 +3149,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
|||
BOOL match_limit_set, recursion_limit_set;
|
||||
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
|
||||
hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit,
|
||||
maxlookbehind, minlength, nameentrysize, namecount, newline_convention,
|
||||
recursion_limit;
|
||||
minlength, nameentrysize, namecount, newline_convention, recursion_limit;
|
||||
|
||||
/* These info requests may return PCRE2_ERROR_UNSET. */
|
||||
|
||||
|
@ -3145,7 +3194,6 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
|||
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) +
|
||||
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype, FALSE) +
|
||||
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty, FALSE) +
|
||||
pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) +
|
||||
pattern_info(PCRE2_INFO_MINLENGTH, &minlength, FALSE) +
|
||||
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount, FALSE) +
|
||||
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize, FALSE) +
|
||||
|
@ -3700,6 +3748,11 @@ if (TEST(compiled_code, ==, NULL))
|
|||
fprintf(outfile, "\n");
|
||||
return PR_SKIP;
|
||||
}
|
||||
|
||||
/* Remember the maximum lookbehind, for partial matching. */
|
||||
|
||||
if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
|
||||
return PR_ABEND;
|
||||
|
||||
/* Call the JIT compiler if requested. */
|
||||
|
||||
|
@ -4875,22 +4928,41 @@ for (gmatched = 0;; gmatched++)
|
|||
} /* End of handling a successful match */
|
||||
|
||||
/* There was a partial match. The value of ovector[0] is the bumpalong point,
|
||||
not any \K point that might exist. */
|
||||
that is, startchar, not any \K point that might have been passed. */
|
||||
|
||||
else if (capcount == PCRE2_ERROR_PARTIAL)
|
||||
{
|
||||
PCRE2_SIZE poffset;
|
||||
int backlength;
|
||||
int rubriclength = 0;
|
||||
|
||||
fprintf(outfile, "Partial match");
|
||||
if ((dat_datctl.control & CTL_MARK) != 0 &&
|
||||
TESTFLD(match_data, mark, !=, NULL))
|
||||
{
|
||||
fprintf(outfile, ", mark=");
|
||||
PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
|
||||
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
|
||||
rubriclength += 7;
|
||||
}
|
||||
fprintf(outfile, ": ");
|
||||
rubriclength += 15;
|
||||
|
||||
poffset = backchars(pp, ovector[0], maxlookbehind, utf);
|
||||
PCHARS(backlength, pp, poffset, ovector[0] - poffset, utf, outfile);
|
||||
PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
|
||||
|
||||
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
|
||||
fprintf(outfile, " (JIT)");
|
||||
fprintf(outfile, "\n");
|
||||
|
||||
if (backlength != 0)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < rubriclength; i++) fprintf(outfile, " ");
|
||||
for (i = 0; i < backlength; i++) fprintf(outfile, "<");
|
||||
fprintf(outfile, "\n");
|
||||
}
|
||||
|
||||
break; /* Out of the /g loop */
|
||||
} /* End of handling partial match */
|
||||
|
||||
|
|
|
@ -9286,17 +9286,21 @@ Partial match: abc12
|
|||
xyzabc123pqr
|
||||
0: 123
|
||||
xyzabc12\=ps
|
||||
Partial match: 12
|
||||
Partial match: abc12
|
||||
<<<
|
||||
xyzabc12\=ph
|
||||
Partial match: 12
|
||||
Partial match: abc12
|
||||
<<<
|
||||
|
||||
/\babc\b/
|
||||
+++abc+++
|
||||
0: abc
|
||||
+++ab\=ps
|
||||
Partial match: ab
|
||||
Partial match: +ab
|
||||
<
|
||||
+++ab\=ph
|
||||
Partial match: ab
|
||||
Partial match: +ab
|
||||
<
|
||||
|
||||
/(?&word)(?&element)(?(DEFINE)(?<element><[^m][^>]>[^<])(?<word>\w*+))/B
|
||||
------------------------------------------------------------------
|
||||
|
@ -10324,7 +10328,8 @@ No match
|
|||
|
||||
/(?<=abc)def/
|
||||
abc\=ph
|
||||
Partial match:
|
||||
Partial match: abc
|
||||
<<<
|
||||
|
||||
/abc$/
|
||||
abc
|
||||
|
@ -11877,9 +11882,11 @@ Callout 2: last capture = 0
|
|||
|
||||
/(?<=123)(*MARK:xx)abc/mark
|
||||
xxxx123a\=ph
|
||||
Partial match, mark=xx: a
|
||||
Partial match, mark=xx: 123a
|
||||
<<<
|
||||
xxxx123a\=ps
|
||||
Partial match, mark=xx: a
|
||||
Partial match, mark=xx: 123a
|
||||
<<<
|
||||
|
||||
/123\Kabc/
|
||||
xxxx123a\=ph
|
||||
|
|
|
@ -947,7 +947,8 @@ Partial match: abc
|
|||
xyzfo\=ps
|
||||
No match
|
||||
foob\=ps,offset=2
|
||||
Partial match: b
|
||||
Partial match: foob
|
||||
<<<
|
||||
foobar...\=ps,dfa_restart,offset=4
|
||||
0: ar
|
||||
xyzfo\=ps
|
||||
|
@ -7092,17 +7093,21 @@ Failed: error -40: item unsupported for DFA matching
|
|||
xyzabc123pqr
|
||||
0: 123
|
||||
xyzabc12\=ps
|
||||
Partial match: 12
|
||||
Partial match: abc12
|
||||
<<<
|
||||
xyzabc12\=ph
|
||||
Partial match: 12
|
||||
Partial match: abc12
|
||||
<<<
|
||||
|
||||
/\babc\b/
|
||||
+++abc+++
|
||||
0: abc
|
||||
+++ab\=ps
|
||||
Partial match: ab
|
||||
Partial match: +ab
|
||||
<
|
||||
+++ab\=ph
|
||||
Partial match: ab
|
||||
Partial match: +ab
|
||||
<
|
||||
|
||||
/(?=C)/g,aftertext
|
||||
ABCDECBA
|
||||
|
@ -7226,7 +7231,8 @@ Failed: error -40: item unsupported for DFA matching
|
|||
|
||||
/(?<=abc)def/
|
||||
abc\=ph
|
||||
Partial match:
|
||||
Partial match: abc
|
||||
<<<
|
||||
|
||||
/abc$/
|
||||
abc
|
||||
|
|
Loading…
Reference in New Issue