Partial documentation and partial code tweaks.

This commit is contained in:
Philip.Hazel 2014-10-14 16:23:57 +00:00
parent a6302442f2
commit 26cd0bccb3
13 changed files with 1576 additions and 748 deletions

View File

@ -34,6 +34,7 @@ dist_html_DATA = \
doc/html/pcre2jit.html \
doc/html/pcre2limits.html \
doc/html/pcre2matching.html \
doc/html/pcre2partial.html \
doc/html/pcre2test.html \
doc/html/pcre2unicode.html
@ -64,7 +65,6 @@ dist_html_DATA = \
# doc/html/pcre2_utf16_to_host_byte_order.html \
# doc/html/pcre2_utf32_to_host_byte_order.html \
# doc/html/pcre2_version.html \
# doc/html/pcre2partial.html \
# doc/html/pcre2pattern.html \
# doc/html/pcre2perform.html \
# doc/html/pcre2posix.html \
@ -86,6 +86,7 @@ dist_man_MANS = \
doc/pcre2jit.3 \
doc/pcre2limits.3 \
doc/pcre2matching.3 \
doc/pcre2partial.3 \
doc/pcre2test.1 \
doc/pcre2unicode.3
@ -118,7 +119,6 @@ dist_man_MANS = \
# doc/pcre2_utf16_to_host_byte_order.3 \
# doc/pcre2_utf32_to_host_byte_order.3 \
# doc/pcre2_version.3 \
# doc/pcre2partial.3 \
# doc/pcre2pattern.3 \
# doc/pcre2perform.3 \
# doc/pcre2posix.3 \

View File

@ -90,9 +90,6 @@ document for an overview of all the PCRE2 documentation.
</P>
<br><a name="SEC2" href="#TOC1">PCRE2 NATIVE API AUXILIARY MATCH FUNCTIONS</a><br>
<P>
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
@ -102,9 +99,6 @@ document for an overview of all the PCRE2 documentation.
<b>PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
</P>
<br><a name="SEC3" href="#TOC1">PCRE2 NATIVE API GENERAL CONTEXT FUNCTIONS</a><br>
@ -133,7 +127,7 @@ document for an overview of all the PCRE2 documentation.
<b>void pcre2_compile_context_free(pcre2_compile_context *<i>ccontext</i>);</b>
<br>
<br>
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
@ -141,7 +135,7 @@ document for an overview of all the PCRE2 documentation.
<b> const unsigned char *<i>tables</i>);</b>
<br>
<br>
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
@ -165,10 +159,6 @@ document for an overview of all the PCRE2 documentation.
<b>void pcre2_match_context_free(pcre2_match_context *<i>mcontext</i>);</b>
<br>
<br>
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
<b> void *<i>callout_data</i>);</b>
@ -178,10 +168,6 @@ document for an overview of all the PCRE2 documentation.
<b> uint32_t <i>value</i>);</b>
<br>
<br>
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
<b>int pcre2_set_recursion_limit(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
@ -596,7 +582,7 @@ A compile context is created, copied, and freed by the following functions:
A compile context is created with default values for its parameters. These can
be changed by calling the following functions, which return 0 on success, or
PCRE2_ERROR_BADDATA if invalid data is detected.
<b>int pcre2_set_bsr_compile(pcre2_compile_context *<i>ccontext</i>,</b>
<b>int pcre2_set_bsr(pcre2_compile_context *<i>ccontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
@ -605,8 +591,7 @@ or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
ending sequence. The value of this parameter does not affect what is compiled;
it is just saved with the compiled pattern. The value is used by the JIT
compiler and by the two interpreted matching functions, <i>pcre2_match()</i> and
<i>pcre2_dfa_match()</i>. You can change the value when calling these functions,
but doing so disables the use of JIT.
<i>pcre2_dfa_match()</i>.
<b>int pcre2_set_character_tables(pcre2_compile_context *<i>ccontext</i>,</b>
<b> const unsigned char *<i>tables</i>);</b>
<br>
@ -614,7 +599,7 @@ but doing so disables the use of JIT.
The value must be the result of a call to <i>pcre2_maketables()</i>, whose only
argument is a general context. This function builds a set of character tables
in the current locale.
<b>int pcre2_set_newline_compile(pcre2_compile_context *<i>ccontext</i>,</b>
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
@ -629,8 +614,7 @@ When a pattern is compiled with the PCRE2_EXTENDED option, the value of this
parameter affects the recognition of white space and the end of internal
comments starting with #. The value is saved with the compiled pattern for
subsequent use by the JIT compiler and by the two interpreted matching
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>. You can change the
value when calling these functions, but doing so disables the use of JIT.
functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
<b>int pcre2_set_parens_nest_limit(pcre2_compile_context *<i>ccontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
@ -685,14 +669,6 @@ A match context is created, copied, and freed by the following functions:
A match context is created with default values for its parameters. These can
be changed by calling the following functions, which return 0 on success, or
PCRE2_ERROR_BADDATA if invalid data is detected.
<b>int pcre2_set_bsr_match(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
The value must be PCRE2_BSR_ANYCRLF, to specify that \R matches only CR, LF,
or CRLF, or PCRE2_BSR_UNICODE, to specify that \R matches any Unicode line
ending sequence. If you want to make use of JIT matching, you should not use
this function, but instead set the value in a compile context.
<b>int pcre2_set_callout(pcre2_match_context *<i>mcontext</i>,</b>
<b> int (*<i>callout_function</i>)(pcre2_callout_block *),</b>
<b> void *<i>callout_data</i>);</b>
@ -769,17 +745,6 @@ pattern of the form
where ddd is a decimal number. However, such a setting is ignored unless ddd is
less than the limit set by the caller of <b>pcre2_match()</b> or, if no such
limit is set, less than the default.
<b>int pcre2_set_newline_match(pcre2_match_context *<i>mcontext</i>,</b>
<b> uint32_t <i>value</i>);</b>
<br>
<br>
This specifies which characters or character sequences are to be recognized as
newlines. The value must be one of PCRE2_NEWLINE_CR (carriage return only),
PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character
sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or
PCRE2_NEWLINE_ANY (any Unicode newline sequence). If you want to make use of
JIT matching, you should not use this function, but instead set the value in a
compile context.
<b>int pcre2_set_recursion_memory_management(</b>
<b> pcre2_match_context *<i>mcontext</i>,</b>
<b> void *(*<i>private_malloc</i>)(PCRE2_SIZE, void *),</b>
@ -956,9 +921,8 @@ documentation).
<P>
For those options that can be different in different parts of the pattern, the
contents of the <i>options</i> argument specifies their settings at the start of
compilation. The PCRE2_ANCHORED, PCRE2_NO_UTF_CHECK, and
PCRE2_NO_START_OPTIMIZE options can be set at the time of matching as well as
at compile time.
compilation. The PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK options can be set at
the time of matching as well as at compile time.
</P>
<P>
Other, less frequently required compile-time parameters (for example, the
@ -1176,14 +1140,55 @@ purposes.
<pre>
PCRE2_NO_START_OPTIMIZE
</pre>
This is an option that acts at matching time; that is, it is really an option
for <b>pcre2_match()</b> or <b>pcre_dfa_match()</b>. If it is set at compile
time, it is remembered with the compiled pattern and assumed at matching time.
This is necessary if you want to use JIT execution, because the JIT compiler
needs to know whether or not this option is set. For details, see the
discussion of PCRE2_NO_START_OPTIMIZE in the section on <b>pcre2_match()</b>
options
<a href="#matchoptions">below.</a>
This is an option whose main effect is at matching time. It does not change
what <b>pcre2_compile()</b> generates, but it does affect the output of the JIT
compiler.
</P>
<P>
There are a number of optimizations that may occur at the start of a match, in
order to speed up the process. For example, if it is known that an unanchored
match must start with a specific character, the matching code searches the
subject for that character, and fails immediately if it cannot find it, without
actually running the main matching function. This means that a special item
such as (*COMMIT) at the start of a pattern is not considered until after a
suitable starting point for the match has been found. Also, when callouts or
(*MARK) items are in use, these "start-up" optimizations can cause them to be
skipped if the pattern is never actually used. The start-up optimizations are
in effect a pre-scan of the subject that takes place before the pattern is run.
</P>
<P>
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
possibly causing performance to suffer, but ensuring that in cases where the
result is "no match", the callouts do occur, and that items such as (*COMMIT)
and (*MARK) are considered at every possible starting position in the subject
string.
</P>
<P>
Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching operation.
Consider the pattern
<pre>
(*COMMIT)ABC
</pre>
When this is compiled, PCRE2 records the fact that a match must start with the
character "A". Suppose the subject string is "DEFABC". The start-up
optimization scans along the subject, finds "A" and runs the first match
attempt from there. The (*COMMIT) item means that the pattern must match the
current starting position, which in this case, it does. However, if the same
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
subject string does not happen. The first match attempt is run starting from
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
the overall result is "no match". There are also other start-up optimizations.
For example, a minimum length for the subject may be recorded. Consider the
pattern
<pre>
(*MARK:A)(X|Y)
</pre>
The minimum length for a match is one character. If the subject is "ABC", there
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
string at the end of the subject does not take place, because PCRE2 knows that
the subject is now too short, and so the (*MARK) is never encountered. In this
case, the optimization does not affect the overall match result, which is still
"no match", but it does affect the auxiliary information that is returned.
<pre>
PCRE2_NO_UTF_CHECK
</pre>
@ -1653,8 +1658,10 @@ match data block by calling one of the creation functions above. For
<b>pcre2_match_data_create()</b>, the first argument is the number of pairs of
offsets in the <i>ovector</i>. One pair of offsets is required to identify the
string that matched the whole pattern, with another pair for each captured
substring. For example, a value of 4 creates enough space to record the
matched portion of the subject plus three captured substrings.
substring. For example, a value of 4 creates enough space to record the matched
portion of the subject plus three captured substrings. A minimum of 1 pair is
imposed by <b>pcre2_match_data_create()</b>, so it is always possible to
return the overall matched string.
</P>
<P>
For <b>pcre2_match_data_create_from_pattern()</b>, the first argument is a
@ -1779,10 +1786,9 @@ Option bits for <b>pcre2_match()</b>
</b><br>
<P>
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
zero. The only bits that may be set are PCRE2_ANCHORED,
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
PCRE2_NO_START_OPTIMIZE, PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and
PCRE2_PARTIAL_SOFT. Their action is described below.
zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
</P>
<P>
If the pattern was successfully processed by the just-in-time (JIT) compiler,
@ -1833,56 +1839,6 @@ valid, so PCRE2 searches further into the string for occurrences of "a" or "b".
This is like PCRE2_NOTEMPTY, except that an empty string match that is not at
the start of the subject is permitted. If the pattern is anchored, such a match
can occur only if the pattern contains \K.
<pre>
PCRE2_NO_START_OPTIMIZE
</pre>
There are a number of optimizations that <b>pcre2_match()</b> uses at the start
of a match, in order to speed up the process. For example, if it is known that
an unanchored match must start with a specific character, it searches the
subject for that character, and fails immediately if it cannot find it, without
actually running the main matching function. This means that a special item
such as (*COMMIT) at the start of a pattern is not considered until after a
suitable starting point for the match has been found. Also, when callouts or
(*MARK) items are in use, these "start-up" optimizations can cause them to be
skipped if the pattern is never actually used. The start-up optimizations are
in effect a pre-scan of the subject that takes place before the pattern is run.
</P>
<P>
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
possibly causing performance to suffer, but ensuring that in cases where the
result is "no match", the callouts do occur, and that items such as (*COMMIT)
and (*MARK) are considered at every possible starting position in the subject
string. If PCRE2_NO_START_OPTIMIZE is set at compile time, it cannot be unset
at matching time. The use of PCRE2_NO_START_OPTIMIZE at matching time (that is,
passing it to <b>pcre2_match()</b>) disables JIT execution; in this situation,
matching is always done using interpretively.
</P>
<P>
Setting PCRE2_NO_START_OPTIMIZE can change the outcome of a matching operation.
Consider the pattern
<pre>
(*COMMIT)ABC
</pre>
When this is compiled, PCRE2 records the fact that a match must start with the
character "A". Suppose the subject string is "DEFABC". The start-up
optimization scans along the subject, finds "A" and runs the first match
attempt from there. The (*COMMIT) item means that the pattern must match the
current starting position, which in this case, it does. However, if the same
match is run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the
subject string does not happen. The first match attempt is run starting from
"D" and when this fails, (*COMMIT) prevents any further matches being tried, so
the overall result is "no match". There are also other start-up optimizations.
For example, a minimum length for the subject may be recorded. Consider the
pattern
<pre>
(*MARK:A)(X|Y)
</pre>
The minimum length for a match is one character. If the subject is "ABC", there
will be attempts to match "ABC", "BC", and "C". An attempt to match an empty
string at the end of the subject does not take place, because PCRE2 knows that
the subject is now too short, and so the (*MARK) is never encountered. In this
case, the optimization does not affect the overall match result, which is still
"no match", but it does affect the auxiliary information that is returned.
<pre>
PCRE2_NO_UTF_CHECK
</pre>
@ -2035,13 +1991,13 @@ returned.
</P>
<P>
If the ovector is too small to hold all the captured substring offsets, as much
as possible is filled in, and the function returns a value of zero. If neither
the actual string matched nor any captured substrings are of interest,
<b>pcre2_match()</b> may be called with a match data block whose ovector is of
zero length. However, if the pattern contains back references and the
<i>ovector</i> is not big enough to remember the related substrings, PCRE2 has
to get additional memory for use during matching. Thus it is usually advisable
to set up a match data block containing an ovector of reasonable size.
as possible is filled in, and the function returns a value of zero. If captured
substrings are not of interest, <b>pcre2_match()</b> may be called with a match
data block whose ovector is of minimum length (that is, one pair). However, if
the pattern contains back references and the <i>ovector</i> is not big enough to
remember the related substrings, PCRE2 has to get additional memory for use
during matching. Thus it is usually advisable to set up a match data block
containing an ovector of reasonable size.
</P>
<P>
It is possible for capturing subpattern number <i>n+1</i> to match some part of
@ -2074,12 +2030,6 @@ Other information about the match
<b>PCRE2_SPTR pcre2_get_mark(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SIZE pcre2_get_rightchar(pcre2_match_data *<i>match_data</i>);</b>
<br>
<br>
<b>PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *<i>match_data</i>);</b>
</P>
<P>
@ -2093,39 +2043,10 @@ Otherwise NULL is returned. A (*MARK) name may be available after a failed
match or a partial match, as well as after a successful one.
</P>
<P>
The other three functions yield values that give information about the part of
the subject string that was inspected during a successful match or a partial
match. Their results are undefined after a failed match. They return the
following values, respectively:
<br>
<br>
(1) The offset of the leftmost character that was inspected during the match.
This can be earlier than the point at which the match started if the pattern
contains lookbehind assertions or \b or \B at the start.
<br>
<br>
(2) The offset of the character that follows the rightmost character that was
inspected during the match. This can be after the end of the match if the
pattern contains lookahead assertions.
<br>
<br>
(3) The offset of the character at which the successful or partial match
started. This can be different to the value of <i>ovector[0]</i> if the pattern
contains the \K escape sequence.
</P>
<P>
For example, if the pattern (?&#60;=abc)xx\Kyy(?=def) is matched against the
string "123abcxxyydef123", the resulting offsets are:
<pre>
ovector[0] 8
ovector[1] 10
leftchar 3
rightchar 13
startchar 6
</pre>
The <b>allusedtext</b> modifier in <b>pcre2test</b> can be used to display a
longer string that shows the leftmost and rightmost characters in a match
instead of just the matched string.
The offset of the character at which the successful match started is
returned by <b>pcre2_get_startchar()</b>. This can be different to the value of
<i>ovector[0]</i> if the pattern contains the \K escape sequence. Note,
however, that \K has no effect for a partial match.
<a name="errorlist"></a></P>
<br><b>
Error return values from <b>pcre2_match()</b>
@ -2513,10 +2434,9 @@ Option bits for <b>pcre_dfa_match()</b>
The unused bits of the <i>options</i> argument for <b>pcre2_dfa_match()</b> must
be zero. The only bits that may be set are PCRE2_ANCHORED, PCRE2_NOTBOL,
PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_UTF_CHECK,
PCRE2_NO_START_OPTIMIZE, PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT,
PCRE2_DFA_SHORTEST, and PCRE2_DFA_RESTART. All but the last four of these are
exactly the same as for <b>pcre2_match()</b>, so their description is not
repeated here.
PCRE2_PARTIAL_HARD, PCRE2_PARTIAL_SOFT, PCRE2_DFA_SHORTEST, and
PCRE2_DFA_RESTART. All but the last four of these are exactly the same as for
<b>pcre2_match()</b>, so their description is not repeated here.
<pre>
PCRE2_PARTIAL_HARD
PCRE2_PARTIAL_SOFT
@ -2650,7 +2570,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
<P>
Last updated: 16 September 2014
Last updated: 14 October 2014
<br>
Copyright &copy; 1997-2014 University of Cambridge.
<br>

View File

@ -131,7 +131,7 @@ long enough, or, for unanchored patterns, if it has been scanned far enough.
</P>
<P>
You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
option to the matching function, or by starting the pattern with
option to <b>pcre2_compile()</b>, or by starting the pattern with
(*NO_START_OPT). This slows down the matching process, but does ensure that
callouts such as the example above are obeyed.
</P>

View File

@ -128,9 +128,8 @@ or the JIT compiler was not able to handle the pattern.
<P>
The <b>pcre2_match()</b> options that are supported for JIT matching are
PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART,
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The options
that are not supported at match time are PCRE2_ANCHORED and
PCRE2_NO_START_OPTIMIZE, though they are supported if given at compile time.
PCRE2_NO_UTF_CHECK, PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. The
PCRE2_ANCHORED option is not supported at match time.
</P>
<P>
The only unsupported pattern items are \C (match a single data unit) when

464
doc/html/pcre2partial.html Normal file
View File

@ -0,0 +1,464 @@
<html>
<head>
<title>pcre2partial specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre2partial man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">PARTIAL MATCHING IN PCRE2</a>
<li><a name="TOC2" href="#SEC2">PARTIAL MATCHING USING pcre2_match()</a>
<li><a name="TOC3" href="#SEC3">PARTIAL MATCHING USING pcre2_dfa_match()</a>
<li><a name="TOC4" href="#SEC4">PARTIAL MATCHING AND WORD BOUNDARIES</a>
<li><a name="TOC5" href="#SEC5">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a>
<li><a name="TOC6" href="#SEC6">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a>
<li><a name="TOC7" href="#SEC7">MULTI-SEGMENT MATCHING WITH pcre2_match()</a>
<li><a name="TOC8" href="#SEC8">ISSUES WITH MULTI-SEGMENT MATCHING</a>
<li><a name="TOC9" href="#SEC9">AUTHOR</a>
<li><a name="TOC10" href="#SEC10">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">PARTIAL MATCHING IN PCRE2</a><br>
<P>
In normal use of PCRE2, if the subject string that is passed to a matching
function matches as far as it goes, but is too short to match the entire
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
might be helpful to distinguish this case from other cases in which there is no
match.
</P>
<P>
Consider, for example, an application where a human is required to type in data
for a field with specific formatting requirements. An example might be a date
in the form <i>ddmmmyy</i>, defined by this pattern:
<pre>
^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$
</pre>
If the application sees the user's keystrokes one by one, and can check that
what has been typed so far is potentially valid, it is able to raise an error
as soon as a mistake is made, by beeping and not reflecting the character that
has been typed, for example. This immediate feedback is likely to be a better
user interface than a check that is delayed until the entire string has been
entered. Partial matching can also be useful when the subject string is very
long and is not all available at once.
</P>
<P>
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
The difference between the two options is whether or not a partial match is
preferred to an alternative complete match, though the details differ between
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
takes precedence.
</P>
<P>
If you want to use partial matching with just-in-time optimized code, you must
call <b>pcre2_jit_compile()</b> with one or both of these options:
<pre>
PCRE2_JIT_PARTIAL_SOFT
PCRE2_JIT_PARTIAL_HARD
</pre>
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
matches on the same pattern. If the appropriate JIT mode has not been compiled,
interpretive matching code is used.
</P>
<P>
Setting a partial matching option disables two of PCRE2's standard
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
abandons matching immediately if it is not present in the subject string. This
optimization cannot be used for a subject string that might match only
partially. PCRE2 also knows the minimum length of a matching string, and does
not bother to run the matching function on shorter strings. This optimization
is also disabled for partial matching.
</P>
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre2_match()</a><br>
<P>
A partial match occurs during a call to <b>pcre2_match()</b> when the end of the
subject string is reached successfully, but matching cannot continue because
more characters are needed. However, at least one character in the subject must
have been inspected. This character need not form part of the final matched
string; lookbehind assertions and the \K escape sequence provide ways of
inspecting characters before the start of a matched string. The requirement for
inspecting at least one character exists because an empty string can always be
matched; without such a restriction there would always be a partial match of an
empty string at the end of the subject.
</P>
<P>
When a partial match is returned, the first two elements in the ovector point
to the portion of the subject that was matched. The appearance of \K in the
pattern has no effect for a partial match. Consider this pattern:
<pre>
/abc\K123/
</pre>
If it is matched against "456abc123xyz" the result is a complete match, and the
ovector defines the matched string as "123", because \K resets the "start of
match" point. However, if a partial match is requested and the subject string
is "456abc12", a partial match is found for the string "abc12", because all
these characters are needed for a subsequent re-match with additional
characters.
</P>
<P>
What happens when a partial match is identified depends on which of the two
partial matching options is set.
</P>
<br><b>
PCRE2_PARTIAL_SOFT WITH pcre2_match()
</b><br>
<P>
If PCRE2_PARTIAL_SOFT is set when <b>pcre2_match()</b> identifies a partial
match, the partial match is remembered, but matching continues as normal, and
other alternatives in the pattern are tried. If no complete match can be found,
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
</P>
<P>
This option is "soft" because it prefers a complete match over a partial match.
All the various matching items in a pattern behave as if the subject string is
potentially complete. For example, \z, \Z, and $ match at the end of the
subject, as normal, and for \b and \B the end of the subject is treated as a
non-alphanumeric.
</P>
<P>
If there is more than one partial match, the first one that was found provides
the data that is returned. Consider this pattern:
<pre>
/123\w+X|dogY/
</pre>
If this is matched against the subject string "abc123dog", both
alternatives fail to match, but the end of the subject is reached during
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
identifying "123dog" as the first partial match that was found. (In this
example, there are two partial matches, because "dog" on its own partially
matches the second alternative.)
</P>
<br><b>
PCRE2_PARTIAL_HARD WITH pcre2_match()
</b><br>
<P>
If PCRE2_PARTIAL_HARD is set for <b>pcre2_match()</b>, PCRE2_ERROR_PARTIAL is
returned as soon as a partial match is found, without continuing to search for
possible complete matches. This option is "hard" because it prefers an earlier
partial match over a later complete match. For this reason, the assumption is
made that the end of the supplied subject string may not be the true end of the
available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
character in the subject has been inspected.
</P>
<br><b>
Comparing hard and soft partial matching
</b><br>
<P>
The difference between the two partial matching options can be illustrated by a
pattern such as:
<pre>
/dog(sbody)?/
</pre>
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
longer string if possible). If it is matched against the string "dog" with
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
hand, if the pattern is made ungreedy the result is different:
<pre>
/dog(sbody)??/
</pre>
In this case the result is always a complete match because that is found first,
and matching never continues after finding a complete match. It might be easier
to follow this explanation by thinking of the two patterns like this:
<pre>
/dog(sbody)?/ is the same as /dogsbody|dog/
/dog(sbody)??/ is the same as /dog|dogsbody/
</pre>
The second pattern will never match "dogsbody", because it will always find the
shorter match first.
</P>
<br><a name="SEC3" href="#TOC1">PARTIAL MATCHING USING pcre2_dfa_match()</a><br>
<P>
The DFA functions move along the subject string character by character, without
backtracking, searching for all possible matches simultaneously. If the end of
the subject is reached before the end of the pattern, there is the possibility
of a partial match, again provided that at least one character has been
inspected.
</P>
<P>
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
have been no complete matches. Otherwise, the complete matches are returned.
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
any complete matches. The portion of the string that was matched when the
longest partial match was found is set as the first matching string.
</P>
<P>
Because the DFA functions always search for all possible matches, and there is
no difference between greedy and ungreedy repetition, their behaviour is
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
the string "dog" matched against the ungreedy pattern shown above:
<pre>
/dog(sbody)??/
</pre>
Whereas the standard functions stop as soon as they find the complete match for
"dog", the DFA functions also find the partial match for "dogsbody", and so
return that when PCRE2_PARTIAL_HARD is set.
</P>
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
<P>
If a pattern ends with one of the sequences \b or \B, which test for word
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
results. Consider this pattern:
<pre>
/\bcat\b/
</pre>
This matches "cat", provided there is a word boundary at either end. If the
subject string is "the cat", the comparison of the final "t" with a following
character cannot take place, so a partial match is found. However, normal
matching carries on, and \b matches at the end of the subject when the last
character is a letter, so a complete match is found. The result, therefore, is
<i>not</i> PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
</P>
<br><a name="SEC5" href="#TOC1">EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST</a><br>
<P>
If the <b>partial_soft</b> (or <b>ps</b>) modifier is present on a
<b>pcre2test</b> data line, the PCRE2_PARTIAL_SOFT option is used for the match.
Here is a run of <b>pcre2test</b> that uses the date example quoted above:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 25jun04\=ps
0: 25jun04
1: jun
data&#62; 25dec3\=ps
Partial match: 25dec3
data&#62; 3ju\=ps
Partial match: 3ju
data&#62; 3juj\=ps
No match
data&#62; j\=ps
No match
</pre>
The first data string is matched completely, so <b>pcre2test</b> shows the
matched substrings. The remaining four strings do not match the complete
pattern, but the first two are partial matches. Similar output is obtained
if DFA matching is used.
</P>
<P>
If the <b>partial_hard</b> (or <b>ph</b>) modifier is present on a
<b>pcre2test</b> data line, the PCRE2_PARTIAL_HARD option is set for the match.
</P>
<br><a name="SEC6" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()</a><br>
<P>
When a partial match has been found using a DFA matching function, it is
possible to continue the match by providing additional subject data and calling
the function again with the same compiled regular expression, this time setting
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
because this is where details of the previous partial match are stored. Here is
an example using <b>pcre2test</b>:
<pre>
re&#62; /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
data&#62; 23ja\=dfa,ps
Partial match: 23ja
data&#62; n05\=dfa,dfa_restart
0: n05
</pre>
The first call has "23ja" as the subject, and requests partial matching; the
second call has "n05" as the subject for the continued (restarted) match.
Notice that when the match is complete, only the last part is shown; PCRE2 does
not retain the previously partially-matched string. It is up to the calling
program to do that if it needs to.
</P>
<P>
That means that, for an unanchored pattern, if a continued match fails, it is
not possible to try again at a new starting point. All this facility is capable
of doing is continuing with the previous match attempt. In the previous
example, if the second set of data is "ug23" the result is no match, even
though there would be a match for "aug23" if the entire string were given at
once. Depending on the application, this may or may not be what you want.
The only way to allow for starting again at the next character is to retain the
matched part of the subject and try a new complete match.
</P>
<P>
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
facility can be used to pass very long subject strings to the DFA matching
functions.
</P>
<br><a name="SEC7" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre2_match()</a><br>
<P>
Unlike the DFA function, it is not possible to restart the previous match with
a new segment of data when using <b>pcre2_match()</b>. Instead, new data must be
added to the previous subject string, and the entire match re-run, starting
from the point where the partial match occurred. Earlier data can be discarded.
</P>
<P>
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
treat the end of a segment as the end of the subject when matching \z, \Z,
\b, \B, and $. Consider an unanchored pattern that matches dates:
<pre>
re&#62; /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
data&#62; The date is 23ja\=ph
Partial match: 23ja
</pre>
At this stage, an application could discard the text preceding "23ja", add on
text from the next segment, and call the matching function again. Unlike the
DFA matching function, the entire matching string must always be available,
and the complete matching process occurs for each call, so more memory and more
processing time are needed.
</P>
<br><a name="SEC8" href="#TOC1">ISSUES WITH MULTI-SEGMENT MATCHING</a><br>
<P>
Certain types of pattern may give problems with multi-segment matching,
whichever matching function is used.
</P>
<P>
1. If the pattern contains a test for the beginning of a line, you need to pass
the PCRE2_NOTBOL option when the subject string for any call does not start at the
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
includes the effect of PCRE2_NOTEOL.
</P>
<P>
2. If a pattern contains a lookbehind assertion, characters that precede the
start of the partial match may have been inspected during the matching process.
When using <b>pcre2_match()</b>, sufficient characters must be retained for the
next match attempt. You can ensure that enough characters are retained by doing
the following:
</P>
<P>
Before doing any matching, find the length of the longest lookbehind in the
pattern by calling <b>pcre2_pattern_info()</b> with the PCRE2_INFO_MAXLOOKBEHIND
option. Note that the resulting count is in characters, not code units. After a
partial match, moving back from the ovector[0] offset in the subject by the
number of characters given for the maximum lookbehind gets you to the earliest
character that must be retained. In a non-UTF or a 32-bit situation, moving
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
while moving back through the code units.
</P>
<P>
Characters before the point you have now reached can be discarded, and after
the next segment has been added to what is retained, you should run the next
match with the <b>startoffset</b> argument set so that the match begins at the
same point as before.
</P>
<P>
For example, if the pattern "(?&#60;=123)abc" is partially matched against the
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
lookbehind count is 3, so all characters before offset 2 can be discarded. The
value of <b>startoffset</b> for the next match should be 3. When <b>pcre2test</b>
displays a partial match, it indicates the lookbehind characters with '&#60;'
characters:
<pre>
re&#62; "(?&#60;=123)abc"
data&#62; xx123ab\=ph
Partial match: 123ab
&#60;&#60;&#60;
</PRE>
</P>
<P>
3. Because a partial match must always contain at least one character, what
might be considered a partial match of an empty string actually gives a "no
match" result. For example:
<pre>
re&#62; /c(?&#60;=abc)x/
data&#62; ab\=ps
No match
</pre>
If the next segment begins "cx", a match should be found, but this will only
happen if characters from the previous segment are retained. For this reason, a
"no match" result should be interpreted as "partial match of an empty string"
when the pattern contains lookbehinds.
</P>
<P>
4. Matching a subject string that is split into multiple segments may not
always produce exactly the same result as matching over one single long string,
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
Word Boundaries" above describes an issue that arises if the pattern ends with
\b or \B. Another kind of difference may occur when there are multiple
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
is given only when there are no completed matches. This means that as soon as
the shortest match has been found, continuation to a new subject segment is no
longer possible. Consider this <b>pcre2test</b> example:
<pre>
re&#62; /dog(sbody)?/
data&#62; dogsb\=ps
0: dog
data&#62; do\=ps,dfa
Partial match: do
data&#62; gsb\=ps,dfa,dfa_restart
0: g
data&#62; dogsbody\=dfa
0: dogsbody
1: dog
</pre>
The first data line passes the string "dogsb" to a standard matching function,
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
string "dog" is a complete match. Similarly, when the subject is presented to
a DFA matching function in several parts ("do" and "gsb" being the first two)
the match stops when "dog" has been found, and it is not possible to continue.
On the other hand, if "dogsbody" is presented as a single string, a DFA
matching function finds both matches.
</P>
<P>
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
multi-segment data. The example above then behaves differently:
<pre>
re&#62; /dog(sbody)?/
data&#62; dogsb\=ph
Partial match: dogsb
data&#62; do\=ps,dfa
Partial match: do
data&#62; gsb\=ph,dfa,dfa_restart
Partial match: gsb
</pre>
5. Patterns that contain alternatives at the top level which do not all start
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
used. For example, consider this pattern:
<pre>
1234|3789
</pre>
If the first part of the subject is "ABC123", a partial match of the first
alternative is found at offset 3. There is no partial match for the second
alternative, because such a match does not start at the same point in the
subject string. Attempting to continue with the string "7890" does not yield a
match because only those alternatives that match at one point in the subject
are remembered. The problem arises because the start of the second alternative
matches within the first alternative. There is no problem with anchored
patterns or patterns such as:
<pre>
1234|ABCD
</pre>
where no string can be a partial match for both alternatives. This is not a
problem if a standard matching function is used, because the entire match has
to be rerun each time:
<pre>
re&#62; /1234|3789/
data&#62; ABC123\=ph
Partial match: 123
data&#62; 1237890
0: 3789
</pre>
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
the entire match can also be used with the DFA matching function. Another
possibility is to work with two buffers. If a partial match at offset <i>n</i>
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
the second buffer, you can then try a new match starting at offset <i>n+1</i> in
the first buffer.
</P>
<br><a name="SEC9" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC10" href="#TOC1">REVISION</a><br>
<P>
Last updated: 14 October 2014
<br>
Copyright &copy; 1997-2014 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>

View File

@ -476,6 +476,7 @@ about the pattern:
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
jit[=&#60;number&#62;] use JIT
jitverify verify JIT use
locale=&#60;name&#62; use this locale
memory show memory used
newline=&#60;type&#62; set newline type
@ -503,10 +504,6 @@ The <b>newline</b> modifier specifies which characters are to be interpreted as
newlines, both in the pattern and (by default) in subject lines. The type must
be one of CR, LF, CRLF, ANYCRLF, or ANY.
</P>
<P>
Both the \R and newline settings can be changed at match time, but if this is
done, JIT matching is disabled.
</P>
<br><b>
Information about a pattern
</b><br>
@ -556,29 +553,32 @@ length of the pattern is passed. This is implied if <b>hex</b> is set.
JIT compilation
</b><br>
<P>
The <b>/jit</b> modifier may optionally be followed by a number in the range 0
to 7:
The <b>/jit</b> modifier may optionally be followed by an equals sign and a
number in the range 0 to 7:
<pre>
0 disable JIT
1 normal match only
2 soft partial match only
3 normal match and soft partial match
4 hard partial match only
6 soft and hard partial match
1 use JIT for normal match only
2 use JIT for soft partial match only
3 use JIT for normal match and soft partial match
4 use JIT for hard partial match only
6 use JIT for soft and hard partial match
7 all three modes
</pre>
If no number is given, 7 is assumed. If JIT compilation is successful, the
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run,
except when incompatible run-time options are specified. For more details, see
the
compiled JIT code will automatically be used when <b>pcre2_match()</b> is run
for the appropriate type of match, except when incompatible run-time options
are specified. For more details, see the
<a href="pcre2jit.html"><b>pcre2jit</b></a>
documentation. See also the <b>jitstack</b> modifier below for a way of
setting the size of the JIT stack.
</P>
<P>
If the <b>jitverify</b> modifier is specified, the text "(JIT)" is added to the
first output line after a match or non match when JIT-compiled code was
actually used. This modifier can also be set on a subject line.
If the <b>jitverify</b> modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
added to the first output line after a match or non match when JIT-compiled
code was actually used.
</P>
<br><b>
Setting a locale
@ -680,7 +680,6 @@ not affect the compilation process.
allcaptures show all captures
allusedtext show all consulted text
/g global global matching
jitverify verify JIT usage
mark show mark values
</pre>
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
@ -703,7 +702,6 @@ for a description of their effects.
anchored set PCRE2_ANCHORED
dfa_restart set PCRE2_DFA_RESTART
dfa_shortest set PCRE2_DFA_SHORTEST
no_start_optimize set PCRE2_NO_START_OPTIMIZE
no_utf_check set PCRE2_NO_UTF_CHECK
notbol set PCRE2_NOTBOL
notempty set PCRE2_NOTEMPTY
@ -734,9 +732,8 @@ pattern.
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
allusedtext show all consulted text
allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching
bsr=[anycrlf|unicode] specify \R handling
callout_capture show captures at callout time
callout_data=&#60;n&#62; set a value to pass via callouts
callout_fail=&#60;n&#62;[:&#60;m&#62;] control callout failure
@ -748,11 +745,9 @@ pattern.
getall extract all captured substrings
/g global global matching
jitstack=&#60;n&#62; set size of JIT stack
jitverify verify JIT usage
mark show mark values
match_limit=&#60;n&#62; set a match limit
memory show memory usage
newline=&#60;type&#62; set newline type
offset=&#60;n&#62; set starting offset
ovector=&#60;n&#62; set size of output vector
recursion_limit=&#60;n&#62; set a recursion limit
@ -761,14 +756,6 @@ The effects of these modifiers are described in the following sections.
FIXME: Give more examples.
</P>
<br><b>
Newline and \R handling
</b><br>
<P>
These modifiers set the newline and \R processing conventions for the subject
line, overriding any values that were set at compile time (as described above).
JIT matching is disabled if these settings are changed at match time.
</P>
<br><b>
Showing more text
</b><br>
<P>
@ -782,10 +769,12 @@ plus character following the capture number.
</P>
<P>
The <b>allusedtext</b> modifier requests that all the text that was consulted
during a successful pattern match be shown. This affects the output if there
is a lookbehind at the start of a match, or a lookahead at the end, or if \K
is used in the pattern. Characters that precede or follow the start and end of
the actual match are indicated in the output by '&#60;' or '&#62;' characters
during a successful pattern match by the interpreter should be shown. This
feature is not supported for JIT matching, and if requested with JIT it is
ignored (with a warning message). Setting this modifier affects the output if
there is a lookbehind at the start of a match, or a lookahead at the end, or if
\K is used in the pattern. Characters that precede or follow the start and end
of the actual match are indicated in the output by '&#60;' or '&#62;' characters
underneath them. Here is an example:
<pre>
/(?&#60;=pqr)abc(?=xyz)/
@ -903,6 +892,11 @@ until it finds the minimum values for each parameter that allow
<b>pcre2_match()</b> to complete without error.
</P>
<P>
If JIT is being used, only the match limit is relevant. If DFA matching is
being used, neither limit is relevant, and this modifier is ignored (with a
warning message).
</P>
<P>
The <i>match_limit</i> number is a measure of the amount of backtracking
that takes place, and learning the minimum value can be instructive. For most
simple matches, the number is quite small, but for patterns with very large
@ -944,6 +938,13 @@ appears, though of course it can also be used to set a default in a
<b>#subject</b> command. It specifies the number of pairs of offsets that are
available for storing matching information. The default is 15.
</P>
<P>
At least one pair of offsets is always created by
<b>pcre2_match_data_create()</b>, for matching with PCRE2's native API, so a
value of 0 is the same as 1. However a value of 0 is useful when testing the
POSIX API because it causes <b>regexec()</b> to be called with a NULL capture
vector.
</P>
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
<P>
By default, <b>pcre2test</b> uses the standard PCRE2 matching function,
@ -1190,7 +1191,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
<P>
Last updated: 19 August 2014
Last updated: 11 October 2014
<br>
Copyright &copy; 1997-2014 University of Cambridge.
<br>

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "10 October 2014" "PCRE2 10.00"
.TH PCRE2API 3 "14 October 2014" "PCRE2 10.00"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.sp
@ -2061,15 +2061,10 @@ pointer to the zero-terminated name, which is within the compiled pattern.
Otherwise NULL is returned. A (*MARK) name may be available after a failed
match or a partial match, as well as after a successful one.
.P
The offset of the character at which the successful or partial match started is
The offset of the character at which the successful match started is
returned by \fBpcre2_get_startchar()\fP. This can be different to the value of
\fIovector[0]\fP if the pattern contains the \eK escape sequence. This
information is needed when doing partial matching over multiple data segments
(see the
.\" HREF
\fBpcre2partial\fP
.\"
documentation).
\fIovector[0]\fP if the pattern contains the \eK escape sequence. Note,
however, that \eK has no effect for a partial match.
.
.
.\" HTML <a name="errorlist"></a>
@ -2626,6 +2621,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
Last updated: 10 October 2014
Last updated: 14 October 2014
Copyright (c) 1997-2014 University of Cambridge.
.fi

433
doc/pcre2partial.3 Normal file
View File

@ -0,0 +1,433 @@
.TH PCRE2PARTIAL 3 "14 October 2014" "PCRE2 10.00"
.SH NAME
PCRE2 - Perl-compatible regular expressions
.SH "PARTIAL MATCHING IN PCRE2"
.rs
.sp
In normal use of PCRE2, if the subject string that is passed to a matching
function matches as far as it goes, but is too short to match the entire
pattern, PCRE2_ERROR_NOMATCH is returned. There are circumstances where it
might be helpful to distinguish this case from other cases in which there is no
match.
.P
Consider, for example, an application where a human is required to type in data
for a field with specific formatting requirements. An example might be a date
in the form \fIddmmmyy\fP, defined by this pattern:
.sp
^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$
.sp
If the application sees the user's keystrokes one by one, and can check that
what has been typed so far is potentially valid, it is able to raise an error
as soon as a mistake is made, by beeping and not reflecting the character that
has been typed, for example. This immediate feedback is likely to be a better
user interface than a check that is delayed until the entire string has been
entered. Partial matching can also be useful when the subject string is very
long and is not all available at once.
.P
PCRE2 supports partial matching by means of the PCRE2_PARTIAL_SOFT and
PCRE2_PARTIAL_HARD options, which can be set when calling a matching function.
The difference between the two options is whether or not a partial match is
preferred to an alternative complete match, though the details differ between
the two types of matching function. If both options are set, PCRE2_PARTIAL_HARD
takes precedence.
.P
If you want to use partial matching with just-in-time optimized code, you must
call \fBpcre2_jit_compile()\fP with one or both of these options:
.sp
PCRE2_JIT_PARTIAL_SOFT
PCRE2_JIT_PARTIAL_HARD
.sp
PCRE2_JIT_COMPLETE should also be set if you are going to run non-partial
matches on the same pattern. If the appropriate JIT mode has not been compiled,
interpretive matching code is used.
.P
Setting a partial matching option disables two of PCRE2's standard
optimizations. PCRE2 remembers the last literal code unit in a pattern, and
abandons matching immediately if it is not present in the subject string. This
optimization cannot be used for a subject string that might match only
partially. PCRE2 also knows the minimum length of a matching string, and does
not bother to run the matching function on shorter strings. This optimization
is also disabled for partial matching.
.
.
.SH "PARTIAL MATCHING USING pcre2_match()"
.rs
.sp
A partial match occurs during a call to \fBpcre2_match()\fP when the end of the
subject string is reached successfully, but matching cannot continue because
more characters are needed. However, at least one character in the subject must
have been inspected. This character need not form part of the final matched
string; lookbehind assertions and the \eK escape sequence provide ways of
inspecting characters before the start of a matched string. The requirement for
inspecting at least one character exists because an empty string can always be
matched; without such a restriction there would always be a partial match of an
empty string at the end of the subject.
.P
When a partial match is returned, the first two elements in the ovector point
to the portion of the subject that was matched. The appearance of \eK in the
pattern has no effect for a partial match. Consider this pattern:
.sp
/abc\eK123/
.sp
If it is matched against "456abc123xyz" the result is a complete match, and the
ovector defines the matched string as "123", because \eK resets the "start of
match" point. However, if a partial match is requested and the subject string
is "456abc12", a partial match is found for the string "abc12", because all
these characters are needed for a subsequent re-match with additional
characters.
.P
What happens when a partial match is identified depends on which of the two
partial matching options is set.
.
.
.SS "PCRE2_PARTIAL_SOFT WITH pcre2_match()"
.rs
.sp
If PCRE2_PARTIAL_SOFT is set when \fBpcre2_match()\fP identifies a partial
match, the partial match is remembered, but matching continues as normal, and
other alternatives in the pattern are tried. If no complete match can be found,
PCRE2_ERROR_PARTIAL is returned instead of PCRE2_ERROR_NOMATCH.
.P
This option is "soft" because it prefers a complete match over a partial match.
All the various matching items in a pattern behave as if the subject string is
potentially complete. For example, \ez, \eZ, and $ match at the end of the
subject, as normal, and for \eb and \eB the end of the subject is treated as a
non-alphanumeric.
.P
If there is more than one partial match, the first one that was found provides
the data that is returned. Consider this pattern:
.sp
/123\ew+X|dogY/
.sp
If this is matched against the subject string "abc123dog", both
alternatives fail to match, but the end of the subject is reached during
matching, so PCRE2_ERROR_PARTIAL is returned. The offsets are set to 3 and 9,
identifying "123dog" as the first partial match that was found. (In this
example, there are two partial matches, because "dog" on its own partially
matches the second alternative.)
.
.
.SS "PCRE2_PARTIAL_HARD WITH pcre2_match()"
.rs
.sp
If PCRE2_PARTIAL_HARD is set for \fBpcre2_match()\fP, PCRE2_ERROR_PARTIAL is
returned as soon as a partial match is found, without continuing to search for
possible complete matches. This option is "hard" because it prefers an earlier
partial match over a later complete match. For this reason, the assumption is
made that the end of the supplied subject string may not be the true end of the
available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end
of the subject, the result is PCRE2_ERROR_PARTIAL, provided that at least one
character in the subject has been inspected.
.
.
.SS "Comparing hard and soft partial matching"
.rs
.sp
The difference between the two partial matching options can be illustrated by a
pattern such as:
.sp
/dog(sbody)?/
.sp
This matches either "dog" or "dogsbody", greedily (that is, it prefers the
longer string if possible). If it is matched against the string "dog" with
PCRE2_PARTIAL_SOFT, it yields a complete match for "dog". However, if
PCRE2_PARTIAL_HARD is set, the result is PCRE2_ERROR_PARTIAL. On the other
hand, if the pattern is made ungreedy the result is different:
.sp
/dog(sbody)??/
.sp
In this case the result is always a complete match because that is found first,
and matching never continues after finding a complete match. It might be easier
to follow this explanation by thinking of the two patterns like this:
.sp
/dog(sbody)?/ is the same as /dogsbody|dog/
/dog(sbody)??/ is the same as /dog|dogsbody/
.sp
The second pattern will never match "dogsbody", because it will always find the
shorter match first.
.
.
.SH "PARTIAL MATCHING USING pcre2_dfa_match()"
.rs
.sp
The DFA functions move along the subject string character by character, without
backtracking, searching for all possible matches simultaneously. If the end of
the subject is reached before the end of the pattern, there is the possibility
of a partial match, again provided that at least one character has been
inspected.
.P
When PCRE2_PARTIAL_SOFT is set, PCRE2_ERROR_PARTIAL is returned only if there
have been no complete matches. Otherwise, the complete matches are returned.
However, if PCRE2_PARTIAL_HARD is set, a partial match takes precedence over
any complete matches. The portion of the string that was matched when the
longest partial match was found is set as the first matching string.
.P
Because the DFA functions always search for all possible matches, and there is
no difference between greedy and ungreedy repetition, their behaviour is
different from the standard functions when PCRE2_PARTIAL_HARD is set. Consider
the string "dog" matched against the ungreedy pattern shown above:
.sp
/dog(sbody)??/
.sp
Whereas the standard functions stop as soon as they find the complete match for
"dog", the DFA functions also find the partial match for "dogsbody", and so
return that when PCRE2_PARTIAL_HARD is set.
.
.
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
.rs
.sp
If a pattern ends with one of the sequences \eb or \eB, which test for word
boundaries, partial matching with PCRE2_PARTIAL_SOFT can give counter-intuitive
results. Consider this pattern:
.sp
/\ebcat\eb/
.sp
This matches "cat", provided there is a word boundary at either end. If the
subject string is "the cat", the comparison of the final "t" with a following
character cannot take place, so a partial match is found. However, normal
matching carries on, and \eb matches at the end of the subject when the last
character is a letter, so a complete match is found. The result, therefore, is
\fInot\fP PCRE2_ERROR_PARTIAL. Using PCRE2_PARTIAL_HARD in this case does yield
PCRE2_ERROR_PARTIAL, because then the partial match takes precedence.
.
.
.SH "EXAMPLE OF PARTIAL MATCHING USING PCRE2TEST"
.rs
.sp
If the \fBpartial_soft\fP (or \fBps\fP) modifier is present on a
\fBpcre2test\fP data line, the PCRE2_PARTIAL_SOFT option is used for the match.
Here is a run of \fBpcre2test\fP that uses the date example quoted above:
.sp
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
data> 25jun04\e=ps
0: 25jun04
1: jun
data> 25dec3\e=ps
Partial match: 25dec3
data> 3ju\e=ps
Partial match: 3ju
data> 3juj\e=ps
No match
data> j\e=ps
No match
.sp
The first data string is matched completely, so \fBpcre2test\fP shows the
matched substrings. The remaining four strings do not match the complete
pattern, but the first two are partial matches. Similar output is obtained
if DFA matching is used.
.P
If the \fBpartial_hard\fP (or \fBph\fP) modifier is present on a
\fBpcre2test\fP data line, the PCRE2_PARTIAL_HARD option is set for the match.
.
.
.SH "MULTI-SEGMENT MATCHING WITH pcre2_dfa_match()"
.rs
.sp
When a partial match has been found using a DFA matching function, it is
possible to continue the match by providing additional subject data and calling
the function again with the same compiled regular expression, this time setting
the PCRE2_DFA_RESTART option. You must pass the same working space as before,
because this is where details of the previous partial match are stored. Here is
an example using \fBpcre2test\fP:
.sp
re> /^\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed$/
data> 23ja\e=dfa,ps
Partial match: 23ja
data> n05\e=dfa,dfa_restart
0: n05
.sp
The first call has "23ja" as the subject, and requests partial matching; the
second call has "n05" as the subject for the continued (restarted) match.
Notice that when the match is complete, only the last part is shown; PCRE2 does
not retain the previously partially-matched string. It is up to the calling
program to do that if it needs to.
.P
That means that, for an unanchored pattern, if a continued match fails, it is
not possible to try again at a new starting point. All this facility is capable
of doing is continuing with the previous match attempt. In the previous
example, if the second set of data is "ug23" the result is no match, even
though there would be a match for "aug23" if the entire string were given at
once. Depending on the application, this may or may not be what you want.
The only way to allow for starting again at the next character is to retain the
matched part of the subject and try a new complete match.
.P
You can set the PCRE2_PARTIAL_SOFT or PCRE2_PARTIAL_HARD options with
PCRE2_DFA_RESTART to continue partial matching over multiple segments. This
facility can be used to pass very long subject strings to the DFA matching
functions.
.
.
.SH "MULTI-SEGMENT MATCHING WITH pcre2_match()"
.rs
.sp
Unlike the DFA function, it is not possible to restart the previous match with
a new segment of data when using \fBpcre2_match()\fP. Instead, new data must be
added to the previous subject string, and the entire match re-run, starting
from the point where the partial match occurred. Earlier data can be discarded.
.P
It is best to use PCRE2_PARTIAL_HARD in this situation, because it does not
treat the end of a segment as the end of the subject when matching \ez, \eZ,
\eb, \eB, and $. Consider an unanchored pattern that matches dates:
.sp
re> /\ed?\ed(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\ed\ed/
data> The date is 23ja\e=ph
Partial match: 23ja
.sp
At this stage, an application could discard the text preceding "23ja", add on
text from the next segment, and call the matching function again. Unlike the
DFA matching function, the entire matching string must always be available,
and the complete matching process occurs for each call, so more memory and more
processing time are needed.
.
.
.SH "ISSUES WITH MULTI-SEGMENT MATCHING"
.rs
.sp
Certain types of pattern may give problems with multi-segment matching,
whichever matching function is used.
.P
1. If the pattern contains a test for the beginning of a line, you need to pass
the PCRE2_NOTBOL option when the subject string for any call does not start at the
beginning of a line. There is also a PCRE2_NOTEOL option, but in practice when
doing multi-segment matching you should be using PCRE2_PARTIAL_HARD, which
includes the effect of PCRE2_NOTEOL.
.P
2. If a pattern contains a lookbehind assertion, characters that precede the
start of the partial match may have been inspected during the matching process.
When using \fBpcre2_match()\fP, sufficient characters must be retained for the
next match attempt. You can ensure that enough characters are retained by doing
the following:
.P
Before doing any matching, find the length of the longest lookbehind in the
pattern by calling \fBpcre2_pattern_info()\fP with the PCRE2_INFO_MAXLOOKBEHIND
option. Note that the resulting count is in characters, not code units. After a
partial match, moving back from the ovector[0] offset in the subject by the
number of characters given for the maximum lookbehind gets you to the earliest
character that must be retained. In a non-UTF or a 32-bit situation, moving
back is just a subtraction, but in UTF-8 or UTF-16 you have to count characters
while moving back through the code units.
.P
Characters before the point you have now reached can be discarded, and after
the next segment has been added to what is retained, you should run the next
match with the \fBstartoffset\fP argument set so that the match begins at the
same point as before.
.P
For example, if the pattern "(?<=123)abc" is partially matched against the
string "xx123ab", the ovector offsets are 5 and 7 ("ab"). The maximum
lookbehind count is 3, so all characters before offset 2 can be discarded. The
value of \fBstartoffset\fP for the next match should be 3. When \fBpcre2test\fP
displays a partial match, it indicates the lookbehind characters with '<'
characters:
.sp
re> "(?<=123)abc"
data> xx123ab\e=ph
Partial match: 123ab
<<<
.P
3. Because a partial match must always contain at least one character, what
might be considered a partial match of an empty string actually gives a "no
match" result. For example:
.sp
re> /c(?<=abc)x/
data> ab\e=ps
No match
.sp
If the next segment begins "cx", a match should be found, but this will only
happen if characters from the previous segment are retained. For this reason, a
"no match" result should be interpreted as "partial match of an empty string"
when the pattern contains lookbehinds.
.P
4. Matching a subject string that is split into multiple segments may not
always produce exactly the same result as matching over one single long string,
especially when PCRE2_PARTIAL_SOFT is used. The section "Partial Matching and
Word Boundaries" above describes an issue that arises if the pattern ends with
\eb or \eB. Another kind of difference may occur when there are multiple
matching possibilities, because (for PCRE2_PARTIAL_SOFT) a partial match result
is given only when there are no completed matches. This means that as soon as
the shortest match has been found, continuation to a new subject segment is no
longer possible. Consider this \fBpcre2test\fP example:
.sp
re> /dog(sbody)?/
data> dogsb\e=ps
0: dog
data> do\e=ps,dfa
Partial match: do
data> gsb\e=ps,dfa,dfa_restart
0: g
data> dogsbody\e=dfa
0: dogsbody
1: dog
.sp
The first data line passes the string "dogsb" to a standard matching function,
setting the PCRE2_PARTIAL_SOFT option. Although the string is a partial match
for "dogsbody", the result is not PCRE2_ERROR_PARTIAL, because the shorter
string "dog" is a complete match. Similarly, when the subject is presented to
a DFA matching function in several parts ("do" and "gsb" being the first two)
the match stops when "dog" has been found, and it is not possible to continue.
On the other hand, if "dogsbody" is presented as a single string, a DFA
matching function finds both matches.
.P
Because of these problems, it is best to use PCRE2_PARTIAL_HARD when matching
multi-segment data. The example above then behaves differently:
.sp
re> /dog(sbody)?/
data> dogsb\e=ph
Partial match: dogsb
data> do\e=ps,dfa
Partial match: do
data> gsb\e=ph,dfa,dfa_restart
Partial match: gsb
.sp
5. Patterns that contain alternatives at the top level which do not all start
with the same pattern item may not work as expected when PCRE2_DFA_RESTART is
used. For example, consider this pattern:
.sp
1234|3789
.sp
If the first part of the subject is "ABC123", a partial match of the first
alternative is found at offset 3. There is no partial match for the second
alternative, because such a match does not start at the same point in the
subject string. Attempting to continue with the string "7890" does not yield a
match because only those alternatives that match at one point in the subject
are remembered. The problem arises because the start of the second alternative
matches within the first alternative. There is no problem with anchored
patterns or patterns such as:
.sp
1234|ABCD
.sp
where no string can be a partial match for both alternatives. This is not a
problem if a standard matching function is used, because the entire match has
to be rerun each time:
.sp
re> /1234|3789/
data> ABC123\e=ph
Partial match: 123
data> 1237890
0: 3789
.sp
Of course, instead of using PCRE2_DFA_RESTART, the same technique of re-running
the entire match can also be used with the DFA matching function. Another
possibility is to work with two buffers. If a partial match at offset \fIn\fP
in the first buffer is followed by "no match" when PCRE2_DFA_RESTART is used on
the second buffer, you can then try a new match starting at offset \fIn+1\fP in
the first buffer.
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 14 October 2014
Copyright (c) 1997-2014 University of Cambridge.
.fi

View File

@ -424,6 +424,7 @@ PATTERN MODIFIERS
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
jit[=<number>] use JIT
jitverify verify JIT use
locale=<name> use this locale
memory show memory used
newline=<type> set newline type
@ -448,68 +449,69 @@ PATTERN MODIFIERS
as newlines, both in the pattern and (by default) in subject lines. The
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
Both the \R and newline settings can be changed at match time, but if
this is done, JIT matching is disabled.
Information about a pattern
The debug modifier is a shorthand for info,fullbincode, requesting all
The debug modifier is a shorthand for info,fullbincode, requesting all
available information.
The bincode modifier causes a representation of the compiled code to be
output after compilation. This information does not contain length and
output after compilation. This information does not contain length and
offset values, which ensures that the same output is generated for dif-
ferent internal link sizes and different code unit widths. By using
bincode, the same regression tests can be used in different environ-
ferent internal link sizes and different code unit widths. By using
bincode, the same regression tests can be used in different environ-
ments.
The fullbincode modifier, by contrast, does include length and offset
The fullbincode modifier, by contrast, does include length and offset
values. This is used in a few special tests and is also useful for one-
off tests.
The info modifier requests information about the compiled pattern
(whether it is anchored, has a fixed first character, and so on). The
The info modifier requests information about the compiled pattern
(whether it is anchored, has a fixed first character, and so on). The
information is obtained from the pcre2_pattern_info() function.
Specifying a pattern in hex
The hex modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted
interpreted as pairs of hexadecimal digits. White space is permitted
between pairs. For example:
/ab 32 59/hex
This feature is provided as a way of creating patterns that contain
This feature is provided as a way of creating patterns that contain
binary zero characters. When hex is set, it implies use_length.
Using the pattern's length
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as -1. If use_length is set, the
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as -1. If use_length is set, the
length of the pattern is passed. This is implied if hex is set.
JIT compilation
The /jit modifier may optionally be followed by a number in the range 0
to 7:
The /jit modifier may optionally be followed by an equals sign and a
number in the range 0 to 7:
0 disable JIT
1 normal match only
2 soft partial match only
3 normal match and soft partial match
4 hard partial match only
6 soft and hard partial match
1 use JIT for normal match only
2 use JIT for soft partial match only
3 use JIT for normal match and soft partial match
4 use JIT for hard partial match only
6 use JIT for soft and hard partial match
7 all three modes
If no number is given, 7 is assumed. If JIT compilation is successful,
the compiled JIT code will automatically be used when pcre2_match() is
run, except when incompatible run-time options are specified. For more
details, see the pcre2jit documentation. See also the jitstack modifier
below for a way of setting the size of the JIT stack.
If no number is given, 7 is assumed. If JIT compilation is successful,
the compiled JIT code will automatically be used when pcre2_match() is
run for the appropriate type of match, except when incompatible run-
time options are specified. For more details, see the pcre2jit documen-
tation. See also the jitstack modifier below for a way of setting the
size of the JIT stack.
If the jitverify modifier is specified, the text "(JIT)" is added to
If the jitverify modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
tion is successful when jitverify is set, the text "(JIT)" is added to
the first output line after a match or non match when JIT-compiled code
was actually used. This modifier can also be set on a subject line.
was actually used.
Setting a locale
@ -518,31 +520,31 @@ PATTERN MODIFIERS
/pattern/locale=fr_FR
The given locale is set, pcre2_maketables() is called to build a set of
character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used
character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used
when matching the following subject lines. The /locale modifier applies
only to the pattern on which it appears, but can be given in a #pattern
command if a default is needed. Setting a locale and alternate charac-
command if a default is needed. Setting a locale and alternate charac-
ter tables are mutually exclusive.
Showing pattern memory
The /memory modifier causes the size in bytes of the memory block used
to hold the compiled pattern to be output. This does not include the
size of the pcre2_code block; it is just the actual compiled data. If
The /memory modifier causes the size in bytes of the memory block used
to hold the compiled pattern to be output. This does not include the
size of the pcre2_code block; it is just the actual compiled data. If
the pattern is subsequently passed to the JIT compiler, the size of the
JIT compiled code is also output.
Limiting nested parentheses
The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation
The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation
error.
Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
per API rather than its native API. This supports only the 8-bit
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
per API rather than its native API. This supports only the 8-bit
library. When the POSIX API is being used, the following pattern modi-
fiers set options for the regcomp() function:
@ -554,25 +556,25 @@ PATTERN MODIFIERS
ucp REG_UCP ) the POSIX standard
utf REG_UTF8 )
The aftertext and allaftertext subject modifiers work as described
The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error.
Testing the stack guard feature
The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is
The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is
greater than zero, pcre2_set_compile_recursion_guard() is called to set
up callback from pcre2_compile() to a local function. The argument it
is passed is the current nesting parenthesis depth; if this is greater
up a callback from pcre2_compile() to a local function. The argument it
is passed is the current nesting parenthesis depth; if this is greater
than the value given by the modifier, non-zero is returned, causing the
compilation to be aborted.
Using alternative character tables
The /tables modifier must be followed by a single digit. It causes a
specific set of built-in character tables to be passed to pcre2_com-
The /tables modifier must be followed by a single digit. It causes a
specific set of built-in character tables to be passed to pcre2_com-
pile(). This is used in the PCRE2 tests to check behaviour with differ-
ent character tables. The digit specifies the tables as follows:
@ -581,15 +583,15 @@ PATTERN MODIFIERS
pcre2_chartables.c.dist
2 a set of tables defining ISO 8859 characters
In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character
In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character
tables and a locale are mutually exclusive.
Setting certain match controls
The following modifiers are really subject modifiers, and are described
below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed
below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed
with that pattern. They do not affect the compilation process.
aftertext show text after match
@ -597,10 +599,9 @@ PATTERN MODIFIERS
allcaptures show all captures
allusedtext show all consulted text
/g global global matching
jitverify verify JIT usage
mark show mark values
These modifiers may not appear in a #pattern command. If you want them
These modifiers may not appear in a #pattern command. If you want them
as defaults, set them in a #subject command.
@ -611,13 +612,12 @@ SUBJECT MODIFIERS
Setting match options
The following modifiers set options for pcre2_match() or
The following modifiers set options for pcre2_match() or
pcre2_dfa_match(). See pcre2api for a description of their effects.
anchored set PCRE2_ANCHORED
dfa_restart set PCRE2_DFA_RESTART
dfa_shortest set PCRE2_DFA_SHORTEST
no_start_optimize set PCRE2_NO_START_OPTIMIZE
no_utf_check set PCRE2_NO_UTF_CHECK
notbol set PCRE2_NOTBOL
notempty set PCRE2_NOTEMPTY
@ -626,28 +626,27 @@ SUBJECT MODIFIERS
partial_hard (or ph) set PCRE2_PARTIAL_HARD
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
The partial matching modifiers are provided with abbreviations because
The partial matching modifiers are provided with abbreviations because
they appear frequently in tests.
If the /posix modifier was present on the pattern, causing the POSIX
If the /posix modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error.
Setting match controls
The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that
The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that
is matched against that pattern.
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
allusedtext show all consulted text
allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching
bsr=[anycrlf|unicode] specify \R handling
callout_capture show captures at callout time
callout_data=<n> set a value to pass via callouts
callout_fail=<n>[:<m>] control callout failure
@ -659,11 +658,9 @@ SUBJECT MODIFIERS
getall extract all captured substrings
/g global global matching
jitstack=<n> set size of JIT stack
jitverify verify JIT usage
mark show mark values
match_limit=<n> set a match limit
memory show memory usage
newline=<type> set newline type
offset=<n> set starting offset
ovector=<n> set size of output vector
recursion_limit=<n> set a recursion limit
@ -671,13 +668,6 @@ SUBJECT MODIFIERS
The effects of these modifiers are described in the following sections.
FIXME: Give more examples.
Newline and \R handling
These modifiers set the newline and \R processing conventions for the
subject line, overriding any values that were set at compile time (as
described above). JIT matching is disabled if these settings are
changed at match time.
Showing more text
The aftertext modifier requests that as well as outputting the sub-
@ -690,18 +680,21 @@ SUBJECT MODIFIERS
ture number.
The allusedtext modifier requests that all the text that was consulted
during a successful pattern match be shown. This affects the output if
there is a lookbehind at the start of a match, or a lookahead at the
end, or if \K is used in the pattern. Characters that precede or follow
the start and end of the actual match are indicated in the output by
'<' or '>' characters underneath them. Here is an example:
during a successful pattern match by the interpreter should be shown.
This feature is not supported for JIT matching, and if requested with
JIT it is ignored (with a warning message). Setting this modifier
affects the output if there is a lookbehind at the start of a match, or
a lookahead at the end, or if \K is used in the pattern. Characters
that precede or follow the start and end of the actual match are indi-
cated in the output by '<' or '>' characters underneath them. Here is
an example:
/(?<=pqr)abc(?=xyz)/
123pqrabcxyz456\=allusedtext
0: pqrabcxyz
<<< >>>
This shows that the matched string is "abc", with the preceding and
This shows that the matched string is "abc", with the preceding and
following strings "pqr" and "xyz" also consulted during the match.
Showing the value of all capture groups
@ -709,124 +702,133 @@ SUBJECT MODIFIERS
The allcaptures modifier requests that the values of all potential cap-
tured parentheses be output after a match. By default, only those up to
the highest one actually used in the match are output (corresponding to
the return code from pcre2_match()). Groups that did not take part in
the return code from pcre2_match()). Groups that did not take part in
the match are output as "<unset>".
Testing callouts
A callout function is supplied when pcre2test calls the library match-
ing functions, unless callout_none is specified. If callout_capture is
A callout function is supplied when pcre2test calls the library match-
ing functions, unless callout_none is specified. If callout_capture is
set, the current captured groups are output when a callout occurs.
The callout_fail modifier can be given one or two numbers. If there is
The callout_fail modifier can be given one or two numbers. If there is
only one number, 1 is returned instead of 0 when a callout of that num-
ber is reached. If two numbers are given, 1 is returned when callout
ber is reached. If two numbers are given, 1 is returned when callout
<n> is reached for the <m>th time.
The callout_data modifier can be given an unsigned or a negative num-
ber. Any value other than zero is used as a return from pcre2test's
The callout_data modifier can be given an unsigned or a negative num-
ber. Any value other than zero is used as a return from pcre2test's
callout function.
Testing substring extraction functions
The copy and get modifiers can be used to test the pcre2_sub-
The copy and get modifiers can be used to test the pcre2_sub-
string_copy_xxx() and pcre2_substring_get_xxx() functions. They can be
given more than once, and each can specify a group name or number, for
given more than once, and each can specify a group name or number, for
example:
abcd\=copy=1,copy=3,get=G1
If the #subject command is used to set default copy and get lists,
these can be unset by specifying a negative number for numbered groups
If the #subject command is used to set default copy and get lists,
these can be unset by specifying a negative number for numbered groups
and an empty name for named groups.
The getall modifier tests pcre2_substring_list_get(), which extracts
The getall modifier tests pcre2_substring_list_get(), which extracts
all captured substrings.
If the subject line is successfully matched, the substrings extracted
by the convenience functions are output with C, G, or L after the
string number instead of a colon. This is in addition to the normal
full list. The string length (that is, the return from the extraction
If the subject line is successfully matched, the substrings extracted
by the convenience functions are output with C, G, or L after the
string number instead of a colon. This is in addition to the normal
full list. The string length (that is, the return from the extraction
function) is given in parentheses after each substring.
Finding all matches in a string
Searching for all possible matches within a subject can be requested by
the global or /altglobal modifier. After finding a match, the matching
function is called again to search the remainder of the subject. The
difference between global and altglobal is that the former uses the
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
searching at a new point within the entire string (which is what Perl
the global or /altglobal modifier. After finding a match, the matching
function is called again to search the remainder of the subject. The
difference between global and altglobal is that the former uses the
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
searching at a new point within the entire string (which is what Perl
does), whereas the latter passes over a shortened substring. This makes
a difference to the matching process if the pattern begins with a look-
behind assertion (including \b or \B).
If an empty string is matched, the next match is done with the
If an empty string is matched, the next match is done with the
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
for another, non-empty, match at the same point in the subject. If this
match fails, the start offset is advanced, and the normal match is
retried. This imitates the way Perl handles such cases when using the
/g modifier or the split() function. Normally, the start offset is
advanced by one character, but if the newline convention recognizes
CRLF as a newline, and the current character is CR followed by LF, an
match fails, the start offset is advanced, and the normal match is
retried. This imitates the way Perl handles such cases when using the
/g modifier or the split() function. Normally, the start offset is
advanced by one character, but if the newline convention recognizes
CRLF as a newline, and the current character is CR followed by LF, an
advance of two is used.
Setting the JIT stack size
The jitstack modifier provides a way of setting the maximum stack size
that is used by the just-in-time optimization code. It is ignored if
JIT optimization is not being used. Providing a stack that is larger
The jitstack modifier provides a way of setting the maximum stack size
that is used by the just-in-time optimization code. It is ignored if
JIT optimization is not being used. Providing a stack that is larger
than the default 32K is necessary only for very complicated patterns.
Setting match and recursion limits
The match_limit and recursion_limit modifiers set the appropriate lim-
The match_limit and recursion_limit modifiers set the appropriate lim-
its in the match context. These values are ignored when the find_limits
modifier is specified.
Finding minimum limits
If the find_limits modifier is present, pcre2test calls pcre2_match()
several times, setting different values in the match context via
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
the minimum values for each parameter that allow pcre2_match() to com-
If the find_limits modifier is present, pcre2test calls pcre2_match()
several times, setting different values in the match context via
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
the minimum values for each parameter that allow pcre2_match() to com-
plete without error.
The match_limit number is a measure of the amount of backtracking that
takes place, and learning the minimum value can be instructive. For
most simple matches, the number is quite small, but for patterns with
very large numbers of matching possibilities, it can become large very
quickly with increasing length of subject string. The
match_limit_recursion number is a measure of how much stack (or, if
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
If JIT is being used, only the match limit is relevant. If DFA matching
is being used, neither limit is relevant, and this modifier is ignored
(with a warning message).
The match_limit number is a measure of the amount of backtracking that
takes place, and learning the minimum value can be instructive. For
most simple matches, the number is quite small, but for patterns with
very large numbers of matching possibilities, it can become large very
quickly with increasing length of subject string. The
recursion_limit number is a measure of how much stack (or, if
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
complete the match attempt.
Showing MARK names
The mark modifier causes the names from backtracking control verbs that
are returned from calls to pcre2_match() to be displayed. If a mark is
returned for a match, non-match, or partial match, pcre2test shows it.
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
are returned from calls to pcre2_match() to be displayed. If a mark is
returned for a match, non-match, or partial match, pcre2test shows it.
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
it is added to the non-match message.
Showing memory usage
The memory modifier causes pcre2test to log all memory allocation and
The memory modifier causes pcre2test to log all memory allocation and
freeing calls that occur during a match operation.
Setting a starting offset
The offset modifier sets an offset in the subject string at which
The offset modifier sets an offset in the subject string at which
matching starts. Its value is a number of code units, not characters.
Setting the size of the output vector
The ovector modifier applies only to the subject line in which it
appears, though of course it can also be used to set a default in a
#subject command. It specifies the number of pairs of offsets that are
The ovector modifier applies only to the subject line in which it
appears, though of course it can also be used to set a default in a
#subject command. It specifies the number of pairs of offsets that are
available for storing matching information. The default is 15.
At least one pair of offsets is always created by pcre2_match_data_cre-
ate(), for matching with PCRE2's native API, so a value of 0 is the
same as 1. However a value of 0 is useful when testing the POSIX API
because it causes regexec() to be called with a NULL capture vector.
THE ALTERNATIVE MATCHING FUNCTION
@ -1069,5 +1071,5 @@ AUTHOR
REVISION
Last updated: 19 August 2014
Last updated: 11 October 2014
Copyright (c) 1997-2014 University of Cambridge.

View File

@ -612,6 +612,7 @@ clock_t total_match_time = 0;
static uint32_t dfa_matched;
static uint32_t forbid_utf = 0;
static uint32_t maxlookbehind;
static uint32_t max_oveccount;
static uint32_t callout_count;
@ -2293,6 +2294,55 @@ return 0;
/*************************************************
* Move back by so many characters *
*************************************************/
/* Given a code unit offset in a subject string, move backwards by a number of
characters, and return the resulting offset. The move never goes back beyond
the start of the subject: if fewer than "count" characters precede the offset,
zero is returned. In the UTF 8-bit and 16-bit modes a character may occupy
more than one code unit, so the string itself has to be scanned; that scan is
bounded so that no code unit before the start of the subject is ever read.

Arguments:
  subject   pointer to the string
  offset    start offset, in code units
  count     count of characters to move back by
  utf       TRUE if in UTF mode

Returns:    a possibly changed offset
*/

static PCRE2_SIZE
backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf)
{
/* When not in UTF mode, or in 32-bit mode, one character is one code unit.
Compare before subtracting so that the unsigned arithmetic cannot wrap. */

if (!utf || test_mode == PCRE32_MODE)
  return (count >= offset)? 0 : (offset - count);

if (test_mode == PCRE8_MODE)
  {
  PCRE2_SPTR8 start = (PCRE2_SPTR8)subject;
  PCRE2_SPTR8 pp = start + offset;
  for (; count > 0 && pp > start; count--)
    {
    pp--;
    /* Step over UTF-8 continuation bytes (10xxxxxx) to the leading byte. */
    while (pp > start && (*pp & 0xc0) == 0x80) pp--;
    }
  return (PCRE2_SIZE)(pp - start);
  }

/* 16-bit mode */

  {
  PCRE2_SPTR16 start = (PCRE2_SPTR16)subject;
  PCRE2_SPTR16 pp = start + offset;
  for (; count > 0 && pp > start; count--)
    {
    pp--;
    /* A low surrogate (0xdc00-0xdfff) is the second unit of a pair. */
    if (pp > start && (*pp & 0xfc00) == 0xdc00) pp--;
    }
  return (PCRE2_SIZE)(pp - start);
  }
}
/*************************************************
* Read or extend an input line *
*************************************************/
@ -3099,8 +3149,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
BOOL match_limit_set, recursion_limit_set;
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit,
maxlookbehind, minlength, nameentrysize, namecount, newline_convention,
recursion_limit;
minlength, nameentrysize, namecount, newline_convention, recursion_limit;
/* These info requests may return PCRE2_ERROR_UNSET. */
@ -3145,7 +3194,6 @@ if ((pat_patctl.control & CTL_INFO) != 0)
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) +
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype, FALSE) +
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty, FALSE) +
pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) +
pattern_info(PCRE2_INFO_MINLENGTH, &minlength, FALSE) +
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount, FALSE) +
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize, FALSE) +
@ -3701,6 +3749,11 @@ if (TEST(compiled_code, ==, NULL))
return PR_SKIP;
}
/* Remember the maximum lookbehind, for partial matching. */
if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
return PR_ABEND;
/* Call the JIT compiler if requested. */
if (pat_patctl.jit != 0)
@ -4875,22 +4928,41 @@ for (gmatched = 0;; gmatched++)
} /* End of handling a successful match */
/* There was a partial match. The value of ovector[0] is the bumpalong point,
not any \K point that might exist. */
that is, startchar, not any \K point that might have been passed. */
else if (capcount == PCRE2_ERROR_PARTIAL)
{
PCRE2_SIZE poffset;
int backlength;
int rubriclength = 0;
fprintf(outfile, "Partial match");
if ((dat_datctl.control & CTL_MARK) != 0 &&
TESTFLD(match_data, mark, !=, NULL))
{
fprintf(outfile, ", mark=");
PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf, outfile);
rubriclength += 7;
}
fprintf(outfile, ": ");
rubriclength += 15;
poffset = backchars(pp, ovector[0], maxlookbehind, utf);
PCHARS(backlength, pp, poffset, ovector[0] - poffset, utf, outfile);
PCHARSV(pp, ovector[0], ulen - ovector[0], utf, outfile);
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
fprintf(outfile, " (JIT)");
fprintf(outfile, "\n");
if (backlength != 0)
{
int i;
for (i = 0; i < rubriclength; i++) fprintf(outfile, " ");
for (i = 0; i < backlength; i++) fprintf(outfile, "<");
fprintf(outfile, "\n");
}
break; /* Out of the /g loop */
} /* End of handling partial match */

21
testdata/testoutput2 vendored
View File

@ -9286,17 +9286,21 @@ Partial match: abc12
xyzabc123pqr
0: 123
xyzabc12\=ps
Partial match: 12
Partial match: abc12
<<<
xyzabc12\=ph
Partial match: 12
Partial match: abc12
<<<
/\babc\b/
+++abc+++
0: abc
+++ab\=ps
Partial match: ab
Partial match: +ab
<
+++ab\=ph
Partial match: ab
Partial match: +ab
<
/(?&word)(?&element)(?(DEFINE)(?<element><[^m][^>]>[^<])(?<word>\w*+))/B
------------------------------------------------------------------
@ -10324,7 +10328,8 @@ No match
/(?<=abc)def/
abc\=ph
Partial match:
Partial match: abc
<<<
/abc$/
abc
@ -11877,9 +11882,11 @@ Callout 2: last capture = 0
/(?<=123)(*MARK:xx)abc/mark
xxxx123a\=ph
Partial match, mark=xx: a
Partial match, mark=xx: 123a
<<<
xxxx123a\=ps
Partial match, mark=xx: a
Partial match, mark=xx: 123a
<<<
/123\Kabc/
xxxx123a\=ph

18
testdata/testoutput6 vendored
View File

@ -947,7 +947,8 @@ Partial match: abc
xyzfo\=ps
No match
foob\=ps,offset=2
Partial match: b
Partial match: foob
<<<
foobar...\=ps,dfa_restart,offset=4
0: ar
xyzfo\=ps
@ -7092,17 +7093,21 @@ Failed: error -40: item unsupported for DFA matching
xyzabc123pqr
0: 123
xyzabc12\=ps
Partial match: 12
Partial match: abc12
<<<
xyzabc12\=ph
Partial match: 12
Partial match: abc12
<<<
/\babc\b/
+++abc+++
0: abc
+++ab\=ps
Partial match: ab
Partial match: +ab
<
+++ab\=ph
Partial match: ab
Partial match: +ab
<
/(?=C)/g,aftertext
ABCDECBA
@ -7226,7 +7231,8 @@ Failed: error -40: item unsupported for DFA matching
/(?<=abc)def/
abc\=ph
Partial match:
Partial match: abc
<<<
/abc$/
abc