File tidies for 10.00-RC2.

This commit is contained in:
Philip.Hazel 2014-12-19 09:55:25 +00:00
parent e34c44e2aa
commit 2a5767d757
19 changed files with 753 additions and 580 deletions

View File

@ -1,7 +1,7 @@
Change Log for PCRE2 Change Log for PCRE2
-------------------- --------------------
Version 10.00 28-November-2014 Version 10.00 19-December-2014
------------------------------ ------------------------------
Version 10.00 is the first release of PCRE2, a revised API for the PCRE Version 10.00 is the first release of PCRE2, a revised API for the PCRE
@ -14,7 +14,8 @@ logged. In addition to the API changes, the following changes were made. They
are either new functionality, or bug fixes and other noticeable changes of are either new functionality, or bug fixes and other noticeable changes of
behaviour that were implemented after the code had been forked. behaviour that were implemented after the code had been forked.
1. Unicode support is now enabled by default. 1. Unicode support is now enabled by default, but it can optionally be
disabled.
2. The test program, now called pcre2test, was re-specified and almost 2. The test program, now called pcre2test, was re-specified and almost
completely re-written. Its input is not compatible with input for pcretest. completely re-written. Its input is not compatible with input for pcretest.

2
NEWS
View File

@ -1,7 +1,7 @@
News about PCRE2 releases News about PCRE2 releases
------------------------- -------------------------
Version 10.00 28-November-2014 Version 10.00 19-December-2014
------------------------------ ------------------------------
Version 10.00 is the first release of PCRE2, a revised API for the PCRE Version 10.00 is the first release of PCRE2, a revised API for the PCRE

View File

@ -11,7 +11,7 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre2_major, [10]) m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [00]) m4_define(pcre2_minor, [00])
m4_define(pcre2_prerelease, [-RC2]) m4_define(pcre2_prerelease, [-RC2])
m4_define(pcre2_date, [2014-11-28]) m4_define(pcre2_date, [2014-12-19])
# NOTE: The CMakeLists.txt file searches for the above variables in the first # NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved. # 50 lines of this file. Please update that if the variables above are moved.

View File

@ -36,8 +36,16 @@ by name, into a given buffer. The arguments are:
</pre> </pre>
The <i>bufflen</i> variable is updated to contain the length of the extracted The <i>bufflen</i> variable is updated to contain the length of the extracted
string, excluding the trailing zero. The yield of the function is zero for string, excluding the trailing zero. The yield of the function is zero for
success, PCRE2_ERROR_NOMEMORY if the buffer is too small, or success or one of the following error numbers:
PCRE2_ERROR_NOSUBSTRING if the string name is invalid. <pre>
PCRE2_ERROR_NOSUBSTRING there are no groups of that name
PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group
PCRE2_ERROR_UNSET the group did not participate in the match
PCRE2_ERROR_NOMEMORY the buffer is not big enough
</pre>
If there is more than one group with the given name, the first one that is set
is returned. In this situation PCRE2_ERROR_UNSET means that no group with the
given name was set.
</P> </P>
<P> <P>
There is a complete description of the PCRE2 native API in the There is a complete description of the PCRE2 native API in the

View File

@ -36,9 +36,15 @@ buffer. The arguments are:
<i>bufflen</i> Length of buffer <i>bufflen</i> Length of buffer
</pre> </pre>
The <i>bufflen</i> variable is updated with the length of the extracted string, The <i>bufflen</i> variable is updated with the length of the extracted string,
excluding the terminating zero. The yield of the function is zero for success, excluding the terminating zero. The yield of the function is zero for success
PCRE2_ERROR_NOMEMORY if the buffer was too small, or PCRE2_ERROR_NOSUBSTRING if or one of the following error numbers:
the string number is invalid. <pre>
PCRE2_ERROR_NOSUBSTRING there are no groups of that number
PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group
PCRE2_ERROR_UNSET the group did not participate in the match
PCRE2_ERROR_NOMEMORY the buffer is too small
</pre>
</P> </P>
<P> <P>
There is a complete description of the PCRE2 native API in the There is a complete description of the PCRE2 native API in the

View File

@ -37,9 +37,17 @@ newly acquired memory. The arguments are:
The memory in which the substring is placed is obtained by calling the same The memory in which the substring is placed is obtained by calling the same
memory allocation function that was used for the match data block. The memory allocation function that was used for the match data block. The
convenience function <b>pcre2_substring_free()</b> can be used to free it when convenience function <b>pcre2_substring_free()</b> can be used to free it when
it is no longer needed. The yield of the function is zero for success, it is no longer needed. The yield of the function is zero for success or one of
PCRE2_ERROR_NOMEMORY if sufficient memory could not be obtained, or the following error numbers:
PCRE2_ERROR_NOSUBSTRING if the string name is invalid. <pre>
PCRE2_ERROR_NOSUBSTRING there are no groups of that name
PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group
PCRE2_ERROR_UNSET the group did not participate in the match
PCRE2_ERROR_NOMEMORY memory could not be obtained
</pre>
If there is more than one group with the given name, the first one that is set
is returned. In this situation PCRE2_ERROR_UNSET means that no group with the
given name was set.
</P> </P>
<P> <P>
There is a complete description of the PCRE2 native API in the There is a complete description of the PCRE2 native API in the

View File

@ -37,9 +37,15 @@ into newly acquired memory. The arguments are:
The memory in which the substring is placed is obtained by calling the same The memory in which the substring is placed is obtained by calling the same
memory allocation function that was used for the match data block. The memory allocation function that was used for the match data block. The
convenience function <b>pcre2_substring_free()</b> can be used to free it when convenience function <b>pcre2_substring_free()</b> can be used to free it when
it is no longer needed. The yield of the function is zero for success, it is no longer needed. The yield of the function is zero for success or one of
PCRE2_ERROR_NOMEMORY if sufficient memory could not be obtained, or the following error numbers:
PCRE2_ERROR_NOSUBSTRING if the string number is invalid. <pre>
PCRE2_ERROR_NOSUBSTRING there are no groups of that number
PCRE2_ERROR_UNAVAILABLE the ovector was too small for that group
PCRE2_ERROR_UNSET the group did not participate in the match
PCRE2_ERROR_NOMEMORY memory could not be obtained
</pre>
</P> </P>
<P> <P>
There is a complete description of the PCRE2 native API in the There is a complete description of the PCRE2 native API in the

View File

@ -947,6 +947,14 @@ contains the compiled pattern and related data. The caller must free the memory
by calling <b>pcre2_code_free()</b> when it is no longer needed. by calling <b>pcre2_code_free()</b> when it is no longer needed.
</P> </P>
<P> <P>
NOTE: When one of the matching functions is called, pointers to the compiled
pattern and the subject string are set in the match data block so that they can
be referenced by the extraction functions. After running a match, you must not
free a compiled pattern (or a subject string) until after all operations on the
<a href="#matchdatablock">match data block</a>
have taken place.
</P>
<P>
If the compile context argument <i>ccontext</i> is NULL, memory for the compiled If the compile context argument <i>ccontext</i> is NULL, memory for the compiled
pattern is obtained by calling <b>malloc()</b>. Otherwise, it is obtained from pattern is obtained by calling <b>malloc()</b>. Otherwise, it is obtained from
the same memory function that was used for the compile context. the same memory function that was used for the compile context.
@ -1690,7 +1698,7 @@ pattern with the JIT compiler does not alter the value returned by this option.
<b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b> <b>void pcre2_match_data_free(pcre2_match_data *<i>match_data</i>);</b>
</P> </P>
<P> <P>
Information about successful and unsuccessful matches is placed in a match Information about a successful or unsuccessful match is placed in a match
data block, which is an opaque structure that is accessed by function calls. In data block, which is an opaque structure that is accessed by function calls. In
particular, the match data block contains a vector of offsets into the subject particular, the match data block contains a vector of offsets into the subject
string that define the matched part of the subject and any substrings that were string that define the matched part of the subject and any substrings that were
@ -1724,15 +1732,24 @@ pattern (custom or default).
</P> </P>
<P> <P>
A match data block can be used many times, with the same or different compiled A match data block can be used many times, with the same or different compiled
patterns. When it is no longer needed, it should be freed by calling patterns. You can extract information from a match data block after a match
<b>pcre2_match_data_free()</b>. You can extract information from a match data operation has finished, using functions that are described in the sections on
block after a match operation has finished, using functions that are described
in the sections on
<a href="#matchedstrings">matched strings</a> <a href="#matchedstrings">matched strings</a>
and and
<a href="#matchotherdata">other match data</a> <a href="#matchotherdata">other match data</a>
below. below.
</P> </P>
<P>
When one of the matching functions is called, pointers to the compiled pattern
and the subject string are set in the match data block so that they can be
referenced by the extraction functions. After running a match, you must not
free a compiled pattern or a subject string until after all operations on the
match data block (for that match) have taken place.
</P>
<P>
When a match data block itself is no longer needed, it should be freed by
calling <b>pcre2_match_data_free()</b>.
</P>
<br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br> <br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE TRADITIONAL FUNCTION</a><br>
<P> <P>
<b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b> <b>int pcre2_match(const pcre2_code *<i>code</i>, PCRE2_SPTR <i>subject</i>,</b>
@ -2034,8 +2051,14 @@ from a successful match is 1, indicating that just the first pair of offsets
has been set. has been set.
</P> </P>
<P> <P>
If a capturing subpattern is matched repeatedly within a single match If a pattern uses the \K escape sequence within a positive assertion, the
operation, it is the last portion of the string that it matched that is reported start of the match can be greater than the end of the match. For
example, if the pattern (?=ab\K) is matched against "ab", the start and end
offset values for the match are 2 and 0.
</P>
<P>
If a capturing group is matched repeatedly within a single match
operation, it is the last portion of the subject that it matched that is
returned. returned.
</P> </P>
<P> <P>
@ -2234,25 +2257,34 @@ Captured substrings can be accessed directly by using the ovector as described
<a href="#matchedstrings">above.</a> <a href="#matchedstrings">above.</a>
For convenience, auxiliary functions are provided for extracting captured For convenience, auxiliary functions are provided for extracting captured
substrings as new, separate, zero-terminated strings. The functions in this substrings as new, separate, zero-terminated strings. The functions in this
section identify substrings by number. The next section describes similar section identify substrings by number. The number zero refers to the entire
functions for extracting substrings by name. A substring that contains a binary matched substring, with higher numbers referring to substrings captured by
zero is correctly extracted and has a further zero added on the end, but the parenthesized groups. The next section describes similar functions for
result is not, of course, a C string. extracting captured substrings by name. A substring that contains a binary zero
is correctly extracted and has a further zero added on the end, but the result
is not, of course, a C string.
</P>
<P>
If a pattern uses the \K escape sequence within a positive assertion, the
reported start of the match can be greater than the end of the match. For
example, if the pattern (?=ab\K) is matched against "ab", the start and end
offset values for the match are 2 and 0. In this situation, calling these
functions with a zero substring number extracts a zero-length empty string.
</P> </P>
<P> <P>
You can find the length in code units of a captured substring without You can find the length in code units of a captured substring without
extracting it by calling <b>pcre2_substring_length_bynumber()</b>. The first extracting it by calling <b>pcre2_substring_length_bynumber()</b>. The first
argument is a pointer to the match data block, the second is the group number, argument is a pointer to the match data block, the second is the group number,
and the third is a pointer to a variable into which the length is placed. and the third is a pointer to a variable into which the length is placed. If
you just want to know whether or not the substring has been captured, you can
pass the third argument as NULL.
</P> </P>
<P> <P>
The <b>pcre2_substring_copy_bynumber()</b> function copies one string into a The <b>pcre2_substring_copy_bynumber()</b> function copies a captured substring
supplied buffer, whereas <b>pcre2_substring_get_bynumber()</b> copies it into into a supplied buffer, whereas <b>pcre2_substring_get_bynumber()</b> copies it
new memory, obtained using the same memory allocation function that was used into new memory, obtained using the same memory allocation function that was
for the match data block. The first two arguments of these functions are a used for the match data block. The first two arguments of these functions are a
pointer to the match data block and a capturing group number. A group number of pointer to the match data block and a capturing group number.
zero extracts the substring that matched the entire pattern, and higher values
extract the captured substrings.
</P> </P>
<P> <P>
The final arguments of <b>pcre2_substring_copy_bynumber()</b> are a pointer to The final arguments of <b>pcre2_substring_copy_bynumber()</b> are a pointer to
@ -2268,8 +2300,9 @@ zero. When the substring is no longer needed, the memory should be freed by
calling <b>pcre2_substring_free()</b>. calling <b>pcre2_substring_free()</b>.
</P> </P>
<P> <P>
The return value from these functions is zero for success, or one of these The return value from all these functions is zero for success, or a negative
error codes: error code. If the pattern match failed, the match failure code is returned.
Other possible error codes are:
<pre> <pre>
PCRE2_ERROR_NOMEMORY PCRE2_ERROR_NOMEMORY
</pre> </pre>
@ -2278,10 +2311,20 @@ attempt to get memory failed for <b>pcre2_substring_get_bynumber()</b>.
<pre> <pre>
PCRE2_ERROR_NOSUBSTRING PCRE2_ERROR_NOSUBSTRING
</pre> </pre>
No substring with the given number was captured. This could be because there is There is no substring with that number in the pattern, that is, the number is
no capturing group of that number in the pattern, or because the group with greater than the number of capturing parentheses.
that number did not participate in the match, or because the ovector was too <pre>
small to capture that group. PCRE2_ERROR_UNAVAILABLE
</pre>
The substring number, though not greater than the number of captures in the
pattern, is greater than the number of slots in the ovector, so the substring
could not be captured.
<pre>
PCRE2_ERROR_UNSET
</pre>
The substring did not participate in the match. For example, if the pattern is
(abc)|(def) and the subject is "def", and the ovector contains at least two
capturing slots, substring number 1 is unset.
</P> </P>
<br><a name="SEC29" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br> <br><a name="SEC29" href="#TOC1">EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS</a><br>
<P> <P>
@ -2316,7 +2359,7 @@ capturing subpattern number <i>n+1</i> matches some part of the subject, but
subpattern <i>n</i> has not been used at all, it returns an empty string. This subpattern <i>n</i> has not been used at all, it returns an empty string. This
can be distinguished from a genuine zero-length substring by inspecting the can be distinguished from a genuine zero-length substring by inspecting the
appropriate offset in the ovector, which contains PCRE2_UNSET for unset appropriate offset in the ovector, which contains PCRE2_UNSET for unset
substrings. substrings, or by calling <b>pcre2_substring_length_bynumber()</b>.
<a name="extractbyname"></a></P> <a name="extractbyname"></a></P>
<br><a name="SEC30" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br> <br><a name="SEC30" href="#TOC1">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a><br>
<P> <P>
@ -2350,14 +2393,22 @@ calling <b>pcre2_substring_number_from_name()</b>. The first argument is the
compiled pattern, and the second is the name. The yield of the function is the compiled pattern, and the second is the name. The yield of the function is the
subpattern number, PCRE2_ERROR_NOSUBSTRING if there is no subpattern of that subpattern number, PCRE2_ERROR_NOSUBSTRING if there is no subpattern of that
name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one subpattern of name, or PCRE2_ERROR_NOUNIQUESUBSTRING if there is more than one subpattern of
that name. that name. Given the number, you can extract the substring directly, or use one
of the functions described above.
</P> </P>
<P> <P>
Given the number, you can extract the substring directly, or use one of the For convenience, there are also "byname" functions that correspond to the
functions described above. For convenience, there are also "byname" functions "bynumber" functions, the only difference being that the second argument is a
that correspond to the "bynumber" functions, the only difference being that the name instead of a number. If PCRE2_DUPNAMES is set and there are duplicate
second argument is a name instead of a number. However, if PCRE2_DUPNAMES is names, these functions scan all the groups with the given name, and return the
set and there are duplicate names, the behaviour may not be what you want. first named string that is set.
</P>
<P>
If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
returned. If all groups with the name have numbers that are greater than the
number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is returned. If there
is at least one group with a slot in the ovector, but no group is found to be
set, PCRE2_ERROR_UNSET is returned.
</P> </P>
<P> <P>
<b>Warning:</b> If the pattern uses the (?| feature to set up multiple <b>Warning:</b> If the pattern uses the (?| feature to set up multiple
@ -2451,9 +2502,9 @@ documentation.
<P> <P>
When duplicates are present, <b>pcre2_substring_copy_byname()</b> and When duplicates are present, <b>pcre2_substring_copy_byname()</b> and
<b>pcre2_substring_get_byname()</b> return the first substring corresponding to <b>pcre2_substring_get_byname()</b> return the first substring corresponding to
the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING is the given name that is set. Only if none are set is PCRE2_ERROR_UNSET
returned. The <b>pcre2_substring_number_from_name()</b> function returns returned. The <b>pcre2_substring_number_from_name()</b> function returns the
the error PCRE2_ERROR_NOUNIQUESUBSTRING. error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names.
</P> </P>
<P> <P>
If you want to get full details of all captured substrings for a given name, If you want to get full details of all captured substrings for a given name,
@ -2607,17 +2658,38 @@ is matched against the string
</pre> </pre>
the three matched strings are the three matched strings are
<pre> <pre>
&#60;something&#62;
&#60;something&#62; &#60;something else&#62;
&#60;something&#62; &#60;something else&#62; &#60;something further&#62; &#60;something&#62; &#60;something else&#62; &#60;something further&#62;
&#60;something&#62; &#60;something else&#62;
&#60;something&#62;
</pre> </pre>
On success, the yield of the function is a number greater than zero, which is On success, the yield of the function is a number greater than zero, which is
the number of matched substrings. The offsets of the substrings are returned in the number of matched substrings. The offsets of the substrings are returned in
the ovector, and can be extracted in the same way as for <b>pcre2_match()</b>. the ovector, and can be extracted by number in the same way as for
They are returned in reverse order of length; that is, the longest <b>pcre2_match()</b>, but the numbers bear no relation to any capturing groups
matching string is given first. If there were too many matches to fit into that may exist in the pattern, because DFA matching does not support group
the ovector, the yield of the function is zero, and the vector is filled with capture.
the longest matches. </P>
<P>
Calls to the convenience functions that extract substrings by name
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used after a
DFA match. The convenience functions that extract substrings by number never
return PCRE2_ERROR_NOSUBSTRING, and the meanings of some other errors are
slightly different:
<pre>
PCRE2_ERROR_UNAVAILABLE
</pre>
The ovector is not big enough to include a slot for the given substring number.
<pre>
PCRE2_ERROR_UNSET
</pre>
There is a slot in the ovector for this substring, but there were insufficient
matches to fill it.
</P>
<P>
The matched strings are stored in the ovector in reverse order of length; that
is, the longest matching string is first. If there were too many matches to fit
into the ovector, the yield of the function is zero, and the vector is filled
with the longest matches.
</P> </P>
<P> <P>
NOTE: PCRE2's "auto-possessification" optimization usually applies to character NOTE: PCRE2's "auto-possessification" optimization usually applies to character
@ -2685,7 +2757,7 @@ Cambridge, England.
</P> </P>
<br><a name="SEC37" href="#TOC1">REVISION</a><br> <br><a name="SEC37" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 01 December 2014 Last updated: 14 December 2014
<br> <br>
Copyright &copy; 1997-2014 University of Cambridge. Copyright &copy; 1997-2014 University of Cambridge.
<br> <br>

View File

@ -995,6 +995,13 @@ COMPILING A PATTERN
must free the memory by calling pcre2_code_free() when it is no longer must free the memory by calling pcre2_code_free() when it is no longer
needed. needed.
NOTE: When one of the matching functions is called, pointers to the
compiled pattern and the subject string are set in the match data block
so that they can be referenced by the extraction functions. After run-
ning a match, you must not free a compiled pattern (or a subject
string) until after all operations on the match data block have taken
place.
If the compile context argument ccontext is NULL, memory for the com- If the compile context argument ccontext is NULL, memory for the com-
piled pattern is obtained by calling malloc(). Otherwise, it is piled pattern is obtained by calling malloc(). Otherwise, it is
obtained from the same memory function that was used for the compile obtained from the same memory function that was used for the compile
@ -1710,7 +1717,7 @@ THE MATCH DATA BLOCK
void pcre2_match_data_free(pcre2_match_data *match_data); void pcre2_match_data_free(pcre2_match_data *match_data);
Information about successful and unsuccessful matches is placed in a Information about a successful or unsuccessful match is placed in a
match data block, which is an opaque structure that is accessed by match data block, which is an opaque structure that is accessed by
function calls. In particular, the match data block contains a vector function calls. In particular, the match data block contains a vector
of offsets into the subject string that define the matched part of the of offsets into the subject string that define the matched part of the
@ -1741,11 +1748,20 @@ THE MATCH DATA BLOCK
was used for the compiled pattern (custom or default). was used for the compiled pattern (custom or default).
A match data block can be used many times, with the same or different A match data block can be used many times, with the same or different
compiled patterns. When it is no longer needed, it should be freed by compiled patterns. You can extract information from a match data block
calling pcre2_match_data_free(). You can extract information from a after a match operation has finished, using functions that are
match data block after a match operation has finished, using functions described in the sections on matched strings and other match data
that are described in the sections on matched strings and other match below.
data below.
When one of the matching functions is called, pointers to the compiled
pattern and the subject string are set in the match data block so that
they can be referenced by the extraction functions. After running a
match, you must not free a compiled pattern or a subject string until
after all operations on the match data block (for that match) have
taken place.
When a match data block itself is no longer needed, it should be freed
by calling pcre2_match_data_free().
MATCHING A PATTERN: THE TRADITIONAL FUNCTION MATCHING A PATTERN: THE TRADITIONAL FUNCTION
@ -2017,9 +2033,14 @@ HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
subpatterns, the return value from a successful match is 1, indicating subpatterns, the return value from a successful match is 1, indicating
that just the first pair of offsets has been set. that just the first pair of offsets has been set.
If a capturing subpattern is matched repeatedly within a single match If a pattern uses the \K escape sequence within a positive assertion,
operation, it is the last portion of the string that it matched that is the reported start of the match can be greater than the end of the
returned. match. For example, if the pattern (?=ab\K) is matched against "ab",
the start and end offset values for the match are 2 and 0.
If a capturing group is matched repeatedly within a single
match operation, it is the last portion of the subject that it matched
that is returned.
If the ovector is too small to hold all the captured substring offsets, If the ovector is too small to hold all the captured substring offsets,
as much as possible is filled in, and the function returns a value of as much as possible is filled in, and the function returns a value of
@ -2205,24 +2226,33 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
described above. For convenience, auxiliary functions are provided for described above. For convenience, auxiliary functions are provided for
extracting captured substrings as new, separate, zero-terminated extracting captured substrings as new, separate, zero-terminated
strings. The functions in this section identify substrings by number. strings. The functions in this section identify substrings by number.
The next section describes similar functions for extracting substrings The number zero refers to the entire matched substring, with higher
by name. A substring that contains a binary zero is correctly extracted numbers referring to substrings captured by parenthesized groups. The
and has a further zero added on the end, but the result is not, of next section describes similar functions for extracting captured sub-
course, a C string. strings by name. A substring that contains a binary zero is correctly
extracted and has a further zero added on the end, but the result is
not, of course, a C string.
If a pattern uses the \K escape sequence within a positive assertion,
the reported start of the match can be greater than the end of the
match. For example, if the pattern (?=ab\K) is matched against "ab",
the start and end offset values for the match are 2 and 0. In this sit-
uation, calling these functions with a zero substring number extracts a
zero-length empty string.
You can find the length in code units of a captured substring without You can find the length in code units of a captured substring without
extracting it by calling pcre2_substring_length_bynumber(). The first extracting it by calling pcre2_substring_length_bynumber(). The first
argument is a pointer to the match data block, the second is the group argument is a pointer to the match data block, the second is the group
number, and the third is a pointer to a variable into which the length number, and the third is a pointer to a variable into which the length
is placed. is placed. If you just want to know whether or not the substring has
been captured, you can pass the third argument as NULL.
The pcre2_substring_copy_bynumber() function copies one string into a The pcre2_substring_copy_bynumber() function copies a captured sub-
supplied buffer, whereas pcre2_substring_get_bynumber() copies it into string into a supplied buffer, whereas pcre2_substring_get_bynumber()
new memory, obtained using the same memory allocation function that was copies it into new memory, obtained using the same memory allocation
used for the match data block. The first two arguments of these func- function that was used for the match data block. The first two argu-
tions are a pointer to the match data block and a capturing group num- ments of these functions are a pointer to the match data block and a
ber. A group number of zero extracts the substring that matched the capturing group number.
entire pattern, and higher values extract the captured substrings.
The final arguments of pcre2_substring_copy_bynumber() are a pointer to The final arguments of pcre2_substring_copy_bynumber() are a pointer to
the buffer and a pointer to a variable that contains its length in code the buffer and a pointer to a variable that contains its length in code
@ -2235,8 +2265,9 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
terminating zero. When the substring is no longer needed, the memory terminating zero. When the substring is no longer needed, the memory
should be freed by calling pcre2_substring_free(). should be freed by calling pcre2_substring_free().
The return value from these functions is zero for success, or one of The return value from all these functions is zero for success, or a
these error codes: negative error code. If the pattern match failed, the match failure
code is returned. Other possible error codes are:
PCRE2_ERROR_NOMEMORY PCRE2_ERROR_NOMEMORY
@ -2245,10 +2276,20 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
PCRE2_ERROR_NOSUBSTRING PCRE2_ERROR_NOSUBSTRING
No substring with the given number was captured. This could be because There is no substring with that number in the pattern, that is, the
there is no capturing group of that number in the pattern, or because number is greater than the number of capturing parentheses.
the group with that number did not participate in the match, or because
the ovector was too small to capture that group. PCRE2_ERROR_UNAVAILABLE
The substring number, though not greater than the number of captures in
the pattern, is greater than the number of slots in the ovector, so the
substring could not be captured.
PCRE2_ERROR_UNSET
The substring did not participate in the match. For example, if the
pattern is (abc)|(def) and the subject is "def", and the ovector con-
tains at least two capturing slots, substring number 1 is unset.
EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS
@ -2280,7 +2321,8 @@ EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS
but subpattern n has not been used at all, it returns an empty string. but subpattern n has not been used at all, it returns an empty string.
This can be distinguished from a genuine zero-length substring by This can be distinguished from a genuine zero-length substring by
inspecting the appropriate offset in the ovector, which contains inspecting the appropriate offset in the ovector, which contains
PCRE2_UNSET for unset substrings. PCRE2_UNSET for unset substrings, or by calling pcre2_sub-
string_length_bynumber().
EXTRACTING CAPTURED SUBSTRINGS BY NAME EXTRACTING CAPTURED SUBSTRINGS BY NAME
@ -2310,14 +2352,21 @@ EXTRACTING CAPTURED SUBSTRINGS BY NAME
ment is the compiled pattern, and the second is the name. The yield of ment is the compiled pattern, and the second is the name. The yield of
the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there the function is the subpattern number, PCRE2_ERROR_NOSUBSTRING if there
is no subpattern of that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if is no subpattern of that name, or PCRE2_ERROR_NOUNIQUESUBSTRING if
there is more than one subpattern of that name. there is more than one subpattern of that name. Given the number, you
can extract the substring directly, or use one of the functions
described above.
Given the number, you can extract the substring directly, or use one of For convenience, there are also "byname" functions that correspond to
the functions described above. For convenience, there are also "byname" the "bynumber" functions, the only difference being that the second
functions that correspond to the "bynumber" functions, the only differ- argument is a name instead of a number. If PCRE2_DUPNAMES is set and
ence being that the second argument is a name instead of a number. How- there are duplicate names, these functions scan all the groups with the
ever, if PCRE2_DUPNAMES is set and there are duplicate names, the be- given name, and return the first named string that is set.
haviour may not be what you want.
If there are no groups with the given name, PCRE2_ERROR_NOSUBSTRING is
returned. If all groups with the name have numbers that are greater
than the number of slots in the ovector, PCRE2_ERROR_UNAVAILABLE is
returned. If there is at least one group with a slot in the ovector,
but no group is found to be set, PCRE2_ERROR_UNSET is returned.
Warning: If the pattern uses the (?| feature to set up multiple subpat- Warning: If the pattern uses the (?| feature to set up multiple subpat-
terns with the same number, as described in the section on duplicate terns with the same number, as described in the section on duplicate
@ -2404,9 +2453,10 @@ DUPLICATE SUBPATTERN NAMES
When duplicates are present, pcre2_substring_copy_byname() and When duplicates are present, pcre2_substring_copy_byname() and
pcre2_substring_get_byname() return the first substring corresponding pcre2_substring_get_byname() return the first substring corresponding
to the given name that is set. If none are set, PCRE2_ERROR_NOSUBSTRING to the given name that is set. Only if none are set is
is returned. The pcre2_substring_number_from_name() function returns PCRE2_ERROR_UNSET returned. The pcre2_substring_number_from_name()
the error PCRE2_ERROR_NOUNIQUESUBSTRING. function returns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are
duplicate names.
If you want to get full details of all captured substrings for a given If you want to get full details of all captured substrings for a given
name, you must use the pcre2_substring_nametable_scan() function. The name, you must use the pcre2_substring_nametable_scan() function. The
@ -2549,17 +2599,37 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
the three matched strings are the three matched strings are
<something>
<something> <something else>
<something> <something else> <something further> <something> <something else> <something further>
<something> <something else>
<something>
On success, the yield of the function is a number greater than zero, On success, the yield of the function is a number greater than zero,
which is the number of matched substrings. The offsets of the sub- which is the number of matched substrings. The offsets of the sub-
strings are returned in the ovector, and can be extracted in the same strings are returned in the ovector, and can be extracted by number in
way as for pcre2_match(). They are returned in reverse order of the same way as for pcre2_match(), but the numbers bear no relation to
length; that is, the longest matching string is given first. If there any capturing groups that may exist in the pattern, because DFA match-
were too many matches to fit into the ovector, the yield of the func- ing does not support group capture.
tion is zero, and the vector is filled with the longest matches.
Calls to the convenience functions that extract substrings by name
return the error PCRE2_ERROR_DFA_UFUNC (unsupported function) if used
after a DFA match. The convenience functions that extract substrings by
number never return PCRE2_ERROR_NOSUBSTRING, and the meanings of some
other errors are slightly different:
PCRE2_ERROR_UNAVAILABLE
The ovector is not big enough to include a slot for the given substring
number.
PCRE2_ERROR_UNSET
There is a slot in the ovector for this substring, but there were
insufficient matches to fill it.
The matched strings are stored in the ovector in reverse order of
length; that is, the longest matching string is first. If there were
too many matches to fit into the ovector, the yield of the function is
zero, and the vector is filled with the longest matches.
NOTE: PCRE2's "auto-possessification" optimization usually applies to NOTE: PCRE2's "auto-possessification" optimization usually applies to
character repeats at the end of a pattern (as well as internally). For character repeats at the end of a pattern (as well as internally). For
@ -2624,7 +2694,7 @@ AUTHOR
REVISION REVISION
Last updated: 01 December 2014 Last updated: 14 December 2014
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2014 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------

View File

@ -201,7 +201,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2" #define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE2 10.00-RC1" #define PACKAGE_STRING "PCRE2 10.00-RC2"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2" #define PACKAGE_TARNAME "pcre2"
@ -210,7 +210,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL "" #define PACKAGE_URL ""
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "10.00-RC1" #define PACKAGE_VERSION "10.00-RC2"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system parentheses (of any kind) in a pattern. This limits the amount of system
@ -288,7 +288,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* #undef SUPPORT_VALGRIND */ /* #undef SUPPORT_VALGRIND */
/* Version number of package */ /* Version number of package */
#define VERSION "10.00-RC1" #define VERSION "10.00-RC2"
/* Define to empty if `const' does not conform to ANSI C. */ /* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */ /* #undef const */

View File

@ -43,8 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
#define PCRE2_MAJOR 10 #define PCRE2_MAJOR 10
#define PCRE2_MINOR 00 #define PCRE2_MINOR 00
#define PCRE2_PRERELEASE -RC1 #define PCRE2_PRERELEASE -RC2
#define PCRE2_DATE 2014-11-28 #define PCRE2_DATE 2014-12-19
/* When an application links to a PCRE DLL in Windows, the symbols that are /* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE2, the appropriate imported have to be identified as such. When building PCRE2, the appropriate
@ -80,20 +80,20 @@ uint8_t, UCHAR_MAX, etc are defined. */
extern "C" { extern "C" {
#endif #endif
/* The following options can be passed to pcre2_compile(), pcre2_match(), or /* The following option bits can be passed to pcre2_compile(), pcre2_match(),
pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it is or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the function to which it
passed. Put these bits at the most significant end of the options word so is passed. Put these bits at the most significant end of the options word so
others can be added next to them */ others can be added next to them */
#define PCRE2_ANCHORED 0x80000000u #define PCRE2_ANCHORED 0x80000000u
#define PCRE2_NO_UTF_CHECK 0x40000000u #define PCRE2_NO_UTF_CHECK 0x40000000u
/* Other options that can be passed to pcre2_compile(). They may affect /* The following option bits can be passed only to pcre2_compile(). However,
compilation, JIT compilation, and/or interpretive execution. The following tags they may affect compilation, JIT compilation, and/or interpretive execution.
indicate which: The following tags indicate which:
C alters what is compiled C alters what is compiled by pcre2_compile()
J alters what JIT compiles J alters what is compiled by pcre2_jit_compile()
M is inspected during pcre2_match() execution M is inspected during pcre2_match() execution
D is inspected during pcre2_dfa_match() execution D is inspected during pcre2_dfa_match() execution
*/ */
@ -212,19 +212,21 @@ context functions. */
#define PCRE2_ERROR_DFA_BADRESTART (-38) #define PCRE2_ERROR_DFA_BADRESTART (-38)
#define PCRE2_ERROR_DFA_RECURSE (-39) #define PCRE2_ERROR_DFA_RECURSE (-39)
#define PCRE2_ERROR_DFA_UCOND (-40) #define PCRE2_ERROR_DFA_UCOND (-40)
#define PCRE2_ERROR_DFA_UITEM (-41) #define PCRE2_ERROR_DFA_UFUNC (-41)
#define PCRE2_ERROR_DFA_WSSIZE (-42) #define PCRE2_ERROR_DFA_UITEM (-42)
#define PCRE2_ERROR_INTERNAL (-43) #define PCRE2_ERROR_DFA_WSSIZE (-43)
#define PCRE2_ERROR_JIT_BADOPTION (-44) #define PCRE2_ERROR_INTERNAL (-44)
#define PCRE2_ERROR_JIT_STACKLIMIT (-45) #define PCRE2_ERROR_JIT_BADOPTION (-45)
#define PCRE2_ERROR_MATCHLIMIT (-46) #define PCRE2_ERROR_JIT_STACKLIMIT (-46)
#define PCRE2_ERROR_NOMEMORY (-47) #define PCRE2_ERROR_MATCHLIMIT (-47)
#define PCRE2_ERROR_NOSUBSTRING (-48) #define PCRE2_ERROR_NOMEMORY (-48)
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-49) #define PCRE2_ERROR_NOSUBSTRING (-49)
#define PCRE2_ERROR_NULL (-50) #define PCRE2_ERROR_NOUNIQUESUBSTRING (-50)
#define PCRE2_ERROR_RECURSELOOP (-51) #define PCRE2_ERROR_NULL (-51)
#define PCRE2_ERROR_RECURSIONLIMIT (-52) #define PCRE2_ERROR_RECURSELOOP (-52)
#define PCRE2_ERROR_UNSET (-53) #define PCRE2_ERROR_RECURSIONLIMIT (-53)
#define PCRE2_ERROR_UNAVAILABLE (-54)
#define PCRE2_ERROR_UNSET (-55)
/* Request types for pcre2_pattern_info() */ /* Request types for pcre2_pattern_info() */
@ -434,16 +436,16 @@ PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *);
PCRE2_EXP_DECL int pcre2_substring_copy_byname(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_copy_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_UCHAR *, PCRE2_SIZE *); \ PCRE2_SPTR, PCRE2_UCHAR *, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_copy_bynumber(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_copy_bynumber(pcre2_match_data *, \
unsigned int, PCRE2_UCHAR *, PCRE2_SIZE *); \ uint32_t, PCRE2_UCHAR *, PCRE2_SIZE *); \
PCRE2_EXP_DECL void pcre2_substring_free(PCRE2_UCHAR *); \ PCRE2_EXP_DECL void pcre2_substring_free(PCRE2_UCHAR *); \
PCRE2_EXP_DECL int pcre2_substring_get_byname(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_get_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_SIZE *); \ PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_get_bynumber(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_get_bynumber(pcre2_match_data *, \
unsigned int, PCRE2_UCHAR **, PCRE2_SIZE *); \ uint32_t, PCRE2_UCHAR **, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_SIZE *); \ PCRE2_SPTR, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \ PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
unsigned int, PCRE2_SIZE *); \ uint32_t, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \ PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \ PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\ PCRE2_EXP_DECL int pcre2_substring_number_from_name(\