diff --git a/CMakeLists.txt b/CMakeLists.txt index 93326cd..eecf208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,7 +160,7 @@ SET(PCRE2GREP_MAX_BUFSIZE "1048576" CACHE STRING "Buffer maximum size parameter for pcre2grep. See PCRE2GREP_MAX_BUFSIZE in config.h.in for details.") SET(PCRE2_NEWLINE "LF" CACHE STRING - "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF).") + "What to recognize as a newline (one of CR, LF, CRLF, ANY, ANYCRLF, NUL).") SET(PCRE2_HEAP_MATCH_RECURSE OFF CACHE BOOL "Obsolete option: do not use") @@ -344,6 +344,9 @@ ENDIF(PCRE2_NEWLINE STREQUAL "ANY") IF(PCRE2_NEWLINE STREQUAL "ANYCRLF") SET(NEWLINE_DEFAULT "5") ENDIF(PCRE2_NEWLINE STREQUAL "ANYCRLF") +IF(PCRE2_NEWLINE STREQUAL "NUL") + SET(NEWLINE_DEFAULT "6") +ENDIF(PCRE2_NEWLINE STREQUAL "NUL") IF(NEWLINE_DEFAULT STREQUAL "") - MESSAGE(FATAL_ERROR "The PCRE2_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\".") + MESSAGE(FATAL_ERROR "The PCRE2_NEWLINE variable must be set to one of the following values: \"LF\", \"CR\", \"CRLF\", \"ANY\", \"ANYCRLF\", \"NUL\".") diff --git a/ChangeLog b/ChangeLog index 1d8f460..d37ec15 100644 --- a/ChangeLog +++ b/ChangeLog @@ -169,6 +169,7 @@ all the tests can run with clang's sanitizing options. 33. Implement extra compile options in the compile context and add the first one: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. +34. Implement newline type PCRE2_NEWLINE_NUL. 
Version 10.23 14-February-2017 diff --git a/RunGrepTest b/RunGrepTest index 4f6393b..205caf0 100755 --- a/RunGrepTest +++ b/RunGrepTest @@ -662,6 +662,11 @@ $valgrind $vjs $pcre2grep -n --newline=any "^(abc|def|ghi|jkl)" testNinputgrep > printf "%c--------------------------- Test N6 ------------------------------\r\n" - >>testtrygrep $valgrind $vjs $pcre2grep -n --newline=anycrlf "^(abc|def|ghi|jkl)" testNinputgrep >>testtrygrep +printf "abc\0def" >testNinputgrep + +printf "%c--------------------------- Test N7 ------------------------------\r\n" - >>testtrygrep +$valgrind $vjs $pcre2grep -na --newline=nul "^(abc|def)" testNinputgrep | sed 's/\x00/ZERO/' >>testtrygrep + $cf $srcdir/testdata/grepoutputN testtrygrep if [ $? != 0 ] ; then exit 1; fi diff --git a/configure.ac b/configure.ac index 821a246..1205039 100644 --- a/configure.ac +++ b/configure.ac @@ -189,6 +189,10 @@ AC_ARG_ENABLE(newline-is-any, AS_HELP_STRING([--enable-newline-is-any], [use any valid Unicode newline sequence]), ac_pcre2_newline=any) +AC_ARG_ENABLE(newline-is-nul, + AS_HELP_STRING([--enable-newline-is-nul], + [use NUL (binary zero) as newline character]), + ac_pcre2_newline=nul) enable_newline="$ac_pcre2_newline" # Handle --enable-bsr-anycrlf @@ -360,6 +364,7 @@ case "$enable_newline" in crlf) ac_pcre2_newline_value=3 ;; any) ac_pcre2_newline_value=4 ;; anycrlf) ac_pcre2_newline_value=5 ;; + nul) ac_pcre2_newline_value=6 ;; *) AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option]) ;; @@ -658,7 +663,7 @@ AC_DEFINE_UNQUOTED([NEWLINE_DEFAULT], [$ac_pcre2_newline_value], [ The value of NEWLINE_DEFAULT determines the default newline character sequence. PCRE2 client programs can override this by selecting other values at run time. 
The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), - and 5 (ANYCRLF).]) + 5 (ANYCRLF), and 6 (NUL).]) if test "$enable_bsr_anycrlf" = "yes"; then AC_DEFINE([BSR_ANYCRLF], [], [ diff --git a/doc/html/pcre2_config.html b/doc/html/pcre2_config.html index 0edce66..7929d62 100644 --- a/doc/html/pcre2_config.html +++ b/doc/html/pcre2_config.html @@ -57,6 +57,7 @@ point to a uint32_t integer variable. The available codes are: PCRE2_NEWLINE_CRLF PCRE2_NEWLINE_ANY PCRE2_NEWLINE_ANYCRLF + PCRE2_NEWLINE_NUL PCRE2_CONFIG_PARENSLIMIT Default parentheses nesting limit PCRE2_CONFIG_RECURSIONLIMIT Obsolete: use PCRE2_CONFIG_DEPTHLIMIT PCRE2_CONFIG_STACKRECURSE Obsolete: always returns 0 diff --git a/doc/html/pcre2_pattern_info.html b/doc/html/pcre2_pattern_info.html index f59e999..d07f9ed 100644 --- a/doc/html/pcre2_pattern_info.html +++ b/doc/html/pcre2_pattern_info.html @@ -71,6 +71,7 @@ request are as follows: PCRE2_NEWLINE_CRLF PCRE2_NEWLINE_ANY PCRE2_NEWLINE_ANYCRLF + PCRE2_NEWLINE_NUL PCRE2_INFO_RECURSIONLIMIT Obsolete synonym for PCRE2_INFO_DEPTHLIMIT PCRE2_INFO_SIZE Size of compiled pattern diff --git a/doc/html/pcre2_set_newline.html b/doc/html/pcre2_set_newline.html index ae6332a..a078f69 100644 --- a/doc/html/pcre2_set_newline.html +++ b/doc/html/pcre2_set_newline.html @@ -35,6 +35,7 @@ matching patterns. The second argument must be one of: PCRE2_NEWLINE_CRLF CR followed by LF only PCRE2_NEWLINE_ANYCRLF Any of the above PCRE2_NEWLINE_ANY Any Unicode newline sequence + PCRE2_NEWLINE_NUL The NUL character (binary zero) The result is zero for success or PCRE2_ERROR_BADDATA if the second argument is invalid. diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 04da2ea..98323c6 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -783,8 +783,9 @@ PCRE2_SIZE variable can hold, which is effectively unlimited. This specifies which characters or character sequences are to be recognized as newlines. 
The value must be one of PCRE2_NEWLINE_CR (carriage return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the two-character -sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), or -PCRE2_NEWLINE_ANY (any Unicode newline sequence). +sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any of the above), +PCRE2_NEWLINE_ANY (any Unicode newline sequence), or PCRE2_NEWLINE_NUL (the +NUL character, that is a binary zero).
A pattern can override the value set in the compile context by starting with a @@ -1106,6 +1107,7 @@ sequence that is recognized as meaning "newline". The values are: PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF + PCRE2_NEWLINE_NUL The NUL character (binary zero) The default should normally correspond to the standard sequence for your operating system. @@ -2121,6 +2123,7 @@ The output is one of the following uint32_t values: PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF + PCRE2_NEWLINE_NUL The NUL character (binary zero) This identifies the character sequence that will be recognized as meaning "newline" while matching. @@ -3468,7 +3471,7 @@ Cambridge, England.
-Last updated: 17 May 2017
+Last updated: 26 May 2017
Copyright © 1997-2017 University of Cambridge.
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html
index 499b2d1..ec2f726 100644
--- a/doc/html/pcre2grep.html
+++ b/doc/html/pcre2grep.html
@@ -142,9 +142,11 @@ standard input is always so treated.
BINARY FILES
By default, a file that contains a binary zero byte within the first 1024 bytes -is identified as a binary file, and is processed specially. (GNU grep also -identifies binary files in this manner.) See the --binary-files option -for a means of changing the way binary files are handled. +is identified as a binary file, and is processed specially. (GNU grep +identifies binary files in this manner.) However, if the newline type is +specified as "nul", that is, the line terminator is a binary zero, the test for +a binary file is not applied. See the --binary-files option for a means +of changing the way binary files are handled.
@@ -934,7 +936,7 @@ Cambridge, England.
-Last updated: 11 April 2017
+Last updated: 26 May 2017
Copyright © 1997-2017 University of Cambridge.
diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html
index 10aeeee..9679933 100644
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@@ -214,10 +214,10 @@ amount of system stack that is used.
Newline conventions
-PCRE2 supports five different conventions for indicating line breaks in +PCRE2 supports six different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) -character, the two-character sequence CRLF, any of the three preceding, or any -Unicode newline sequence. The +character, the two-character sequence CRLF, any of the three preceding, any +Unicode newline sequence, or the NUL character (binary zero). The pcre2api page has further discussion @@ -226,13 +226,14 @@ about newlines, and shows how to set the newline convention when calling
It is also possible to specify a newline convention by starting a pattern -string with one of the following five sequences: +string with one of the following sequences:
(*CR) carriage return (*LF) linefeed (*CRLF) carriage return, followed by linefeed (*ANYCRLF) any of the three above (*ANY) all Unicode newline sequences + (*NUL) the NUL character (binary zero) These override the default and the options given to the compiling function. For example, on a Unix system where LF is the default newline sequence, the pattern @@ -3444,7 +3445,7 @@ Cambridge, England.
-Last updated: 18 April 2017
+Last updated: 26 May 2017
Copyright © 1997-2017 University of Cambridge.
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html
index 3f4a063..ec29303 100644
--- a/doc/html/pcre2syntax.html
+++ b/doc/html/pcre2syntax.html
@@ -468,6 +468,7 @@ settings with a similar syntax.
(*CRLF) carriage return followed by linefeed
(*ANYCRLF) all three of the above
(*ANY) any Unicode newline sequence
+ (*NUL) the NUL character (binary zero)
-Last updated: 18 April 2017
+Last updated: 26 May 2017
Copyright © 1997-2017 University of Cambridge.
diff --git a/doc/html/pcre2test.html b/doc/html/pcre2test.html
index 0788a3f..a8d7d08 100644
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@@ -182,7 +182,7 @@ following options output the value and set the exit code as indicated:
linksize the configured internal link size (2, 3, or 4)
exit code is set to the link size
newline the default newline setting:
- CR, LF, CRLF, ANYCRLF, or ANY
+ CR, LF, CRLF, ANYCRLF, ANY, or NUL
exit code is always 0
bsr the default setting for what \R matches:
ANYCRLF or ANY
@@ -367,8 +367,8 @@ when PCRE2 is compiled with either CR or CRLF as the default newline.
The #newline_default command specifies a list of newline types that are -acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, or -ANY (in upper or lower case), for example: +acceptable as the default. The types must be one of CR, LF, CRLF, ANYCRLF, +ANY, or NUL (in upper or lower case), for example:
#newline_default LF Any anyCRLF @@ -655,7 +655,7 @@ is built, with the default default being Unicode.
The newline modifier specifies which characters are to be interpreted as newlines, both in the pattern and in subject lines. The type must be one of CR, -LF, CRLF, ANYCRLF, or ANY (in upper or lower case). +LF, CRLF, ANYCRLF, ANY, or NUL (in upper or lower case).
-Last updated: 17 May 2017
+Last updated: 26 May 2017
Copyright © 1997-2017 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index 4805b05..d672333 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -817,42 +817,43 @@ PCRE2 CONTEXTS
nized as newlines. The value must be one of PCRE2_NEWLINE_CR (carriage
return only), PCRE2_NEWLINE_LF (linefeed only), PCRE2_NEWLINE_CRLF (the
two-character sequence CR followed by LF), PCRE2_NEWLINE_ANYCRLF (any
- of the above), or PCRE2_NEWLINE_ANY (any Unicode newline sequence).
+ of the above), PCRE2_NEWLINE_ANY (any Unicode newline sequence), or
+ PCRE2_NEWLINE_NUL (the NUL character, that is a binary zero).
A pattern can override the value set in the compile context by starting
with a sequence such as (*CRLF). See the pcre2pattern page for details.
- When a pattern is compiled with the PCRE2_EXTENDED or
+ When a pattern is compiled with the PCRE2_EXTENDED or
PCRE2_EXTENDED_MORE option, the newline convention affects the recogni-
- tion of white space and the end of internal comments starting with #.
- The value is saved with the compiled pattern for subsequent use by the
- JIT compiler and by the two interpreted matching functions,
+ tion of white space and the end of internal comments starting with #.
+ The value is saved with the compiled pattern for subsequent use by the
+ JIT compiler and by the two interpreted matching functions,
pcre2_match() and pcre2_dfa_match().
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext,
uint32_t value);
- This parameter ajusts the limit, set when PCRE2 is built (default 250),
+ This parameter adjusts the limit, set when PCRE2 is built (default 250),
- on the depth of parenthesis nesting in a pattern. This limit stops
- rogue patterns using up too much system stack when being compiled. The
+ on the depth of parenthesis nesting in a pattern. This limit stops
+ rogue patterns using up too much system stack when being compiled. The
limit applies to parentheses of all kinds, not just capturing parenthe-
ses.
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
int (*guard_function)(uint32_t, void *), void *user_data);
- There is at least one application that runs PCRE2 in threads with very
- limited system stack, where running out of stack is to be avoided at
- all costs. The parenthesis limit above cannot take account of how much
- stack is actually available during compilation. For a finer control,
- you can supply a function that is called whenever pcre2_compile()
- starts to compile a parenthesized part of a pattern. This function can
- check the actual stack size (or anything else that it wants to, of
+ There is at least one application that runs PCRE2 in threads with very
+ limited system stack, where running out of stack is to be avoided at
+ all costs. The parenthesis limit above cannot take account of how much
+ stack is actually available during compilation. For a finer control,
+ you can supply a function that is called whenever pcre2_compile()
+ starts to compile a parenthesized part of a pattern. This function can
+ check the actual stack size (or anything else that it wants to, of
course).
- The first argument to the callout function gives the current depth of
- nesting, and the second is user data that is set up by the last argu-
- ment of pcre2_set_compile_recursion_guard(). The callout function
+ The first argument to the callout function gives the current depth of
+ nesting, and the second is user data that is set up by the last argu-
+ ment of pcre2_set_compile_recursion_guard(). The callout function
should return zero if all is well, or non-zero to force an error.
The match context
@@ -866,10 +867,10 @@ PCRE2 CONTEXTS
Change the backtracking depth limit
Set custom memory management specifically for the match
- If none of these apply, just pass NULL as the context argument of
+ If none of these apply, just pass NULL as the context argument of
pcre2_match(), pcre2_dfa_match(), or pcre2_jit_match().
- A match context is created, copied, and freed by the following func-
+ A match context is created, copied, and freed by the following func-
tions:
pcre2_match_context *pcre2_match_context_create(
@@ -880,7 +881,7 @@ PCRE2 CONTEXTS
void pcre2_match_context_free(pcre2_match_context *mcontext);
- A match context is created with default values for its parameters.
+ A match context is created with default values for its parameters.
These can be changed by calling the following functions, which return 0
on success, or PCRE2_ERROR_BADDATA if invalid data is detected.
@@ -895,39 +896,39 @@ PCRE2 CONTEXTS
int pcre2_set_offset_limit(pcre2_match_context *mcontext,
PCRE2_SIZE value);
- The offset_limit parameter limits how far an unanchored search can
- advance in the subject string. The default value is PCRE2_UNSET. The
- pcre2_match() and pcre2_dfa_match() functions return
- PCRE2_ERROR_NOMATCH if a match with a starting point before or at the
+ The offset_limit parameter limits how far an unanchored search can
+ advance in the subject string. The default value is PCRE2_UNSET. The
+ pcre2_match() and pcre2_dfa_match() functions return
+ PCRE2_ERROR_NOMATCH if a match with a starting point before or at the
given offset is not found. For example, if the pattern /abc/ is matched
- against "123abc" with an offset limit less than 3, the result is
- PCRE2_ERROR_NO_MATCH. A match can never be found if the startoffset
+ against "123abc" with an offset limit less than 3, the result is
+ PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset
argument of pcre2_match() or pcre2_dfa_match() is greater than the off-
set limit.
- When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT
+ When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT
option when calling pcre2_compile() so that when JIT is in use, differ-
- ent code can be compiled. If a match is started with a non-default
- match limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is gener-
+ ent code can be compiled. If a match is started with a non-default
+ offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is gener-
ated.
- The offset limit facility can be used to track progress when searching
- large subject strings. See also the PCRE2_FIRSTLINE option, which
+ The offset limit facility can be used to track progress when searching
+ large subject strings. See also the PCRE2_FIRSTLINE option, which
requires a match to start within the first line of the subject. If this
- is set with an offset limit, a match must occur in the first line and
- also within the offset limit. In other words, whichever limit comes
+ is set with an offset limit, a match must occur in the first line and
+ also within the offset limit. In other words, whichever limit comes
first is used.
int pcre2_set_heap_limit(pcre2_match_context *mcontext,
uint32_t value);
- The heap_limit parameter specifies, in units of kilobytes, the maximum
- amount of heap memory that pcre2_match() may use to hold backtracking
- information when running an interpretive match. This limit does not
- apply to matching with the JIT optimization, which has its own memory
+ The heap_limit parameter specifies, in units of kilobytes, the maximum
+ amount of heap memory that pcre2_match() may use to hold backtracking
+ information when running an interpretive match. This limit does not
+ apply to matching with the JIT optimization, which has its own memory
control arrangements (see the pcre2jit documentation for more details),
- nor does it apply to pcre2_dfa_match(). If the limit is reached, the
- negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
+ nor does it apply to pcre2_dfa_match(). If the limit is reached, the
+ negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default
limit is set when PCRE2 is built; the default default is very large and
is essentially "unlimited".
@@ -936,83 +937,83 @@ PCRE2 CONTEXTS
(*LIMIT_HEAP=ddd)
- where ddd is a decimal number. However, such a setting is ignored
- unless ddd is less than the limit set by the caller of pcre2_match()
+ where ddd is a decimal number. However, such a setting is ignored
+ unless ddd is less than the limit set by the caller of pcre2_match()
or, if no such limit is set, less than the default.
- The pcre2_match() function starts out using a 20K vector on the system
- stack for recording backtracking points. The more nested backtracking
+ The pcre2_match() function starts out using a 20K vector on the system
+ stack for recording backtracking points. The more nested backtracking
points there are (that is, the deeper the search tree), the more memory
- is needed. Heap memory is used only if the initial vector is too
+ is needed. Heap memory is used only if the initial vector is too
small. If the heap limit is set to a value less than 21 (in particular,
- zero) no heap memory will be used. In this case, only patterns that do
+ zero) no heap memory will be used. In this case, only patterns that do
not have a lot of nested backtracking can be successfully processed.
int pcre2_set_match_limit(pcre2_match_context *mcontext,
uint32_t value);
- The match_limit parameter provides a means of preventing PCRE2 from
+ The match_limit parameter provides a means of preventing PCRE2 from
using up too many computing resources when processing patterns that are
not going to match, but which have a very large number of possibilities
- in their search trees. The classic example is a pattern that uses
+ in their search trees. The classic example is a pattern that uses
nested unlimited repeats.
- There is an internal counter in pcre2_match() that is incremented each
- time round its main matching loop. If this value reaches the match
+ There is an internal counter in pcre2_match() that is incremented each
+ time round its main matching loop. If this value reaches the match
limit, pcre2_match() returns the negative value PCRE2_ERROR_MATCHLIMIT.
- This has the effect of limiting the amount of backtracking that can
+ This has the effect of limiting the amount of backtracking that can
take place. For patterns that are not anchored, the count restarts from
- zero for each position in the subject string. This limit is not rele-
+ zero for each position in the subject string. This limit is not rele-
vant to pcre2_dfa_match(), which ignores it.
- When pcre2_match() is called with a pattern that was successfully pro-
+ When pcre2_match() is called with a pattern that was successfully pro-
cessed by pcre2_jit_compile(), the way in which matching is executed is
- entirely different. However, there is still the possibility of runaway
- matching that goes on for a very long time, and so the match_limit
- value is also used in this case (but in a different way) to limit how
+ entirely different. However, there is still the possibility of runaway
+ matching that goes on for a very long time, and so the match_limit
+ value is also used in this case (but in a different way) to limit how
long the matching can continue.
- The default value for the limit can be set when PCRE2 is built; the
- default default is 10 million, which handles all but the most extreme
- cases. A value for the match limit may also be supplied by an item at
+ The default value for the limit can be set when PCRE2 is built; the
+ default default is 10 million, which handles all but the most extreme
+ cases. A value for the match limit may also be supplied by an item at
the start of a pattern of the form
(*LIMIT_MATCH=ddd)
- where ddd is a decimal number. However, such a setting is ignored
- unless ddd is less than the limit set by the caller of pcre2_match()
+ where ddd is a decimal number. However, such a setting is ignored
+ unless ddd is less than the limit set by the caller of pcre2_match()
or, if no such limit is set, less than the default.
int pcre2_set_depth_limit(pcre2_match_context *mcontext,
uint32_t value);
- This parameter limits the depth of nested backtracking in
- pcre2_match(). Each time a nested backtracking point is passed, a new
+ This parameter limits the depth of nested backtracking in
+ pcre2_match(). Each time a nested backtracking point is passed, a new
memory "frame" is used to remember the state of matching at that point.
- Thus, this parameter indirectly limits the amount of memory that is
- used in a match. However, because the size of each memory "frame"
+ Thus, this parameter indirectly limits the amount of memory that is
+ used in a match. However, because the size of each memory "frame"
depends on the number of capturing parentheses, the actual memory limit
- varies from pattern to pattern. This limit was more useful in versions
+ varies from pattern to pattern. This limit was more useful in versions
before 10.30, where function recursion was used for backtracking.
- The depth limit is not relevant, and is ignored, when matching is done
+ The depth limit is not relevant, and is ignored, when matching is done
using JIT compiled code. However, it is supported by pcre2_dfa_match(),
- which uses it to limit the depth of internal recursive function calls
+ which uses it to limit the depth of internal recursive function calls
that implement atomic groups, lookaround assertions, and pattern recur-
- sions. This is, therefore, an indirect limit on the amount of system
+ sions. This is, therefore, an indirect limit on the amount of system
stack that is used. A recursive pattern such as /(.)(?1)/, when matched
- to a very long string using pcre2_dfa_match(), can use a great deal of
+ to a very long string using pcre2_dfa_match(), can use a great deal of
stack.
- The default value for the depth limit can be set when PCRE2 is built;
- the default default is the same value as the default for the match
- limit. If the limit is exceeded, pcre2_match() or pcre2_dfa_match()
+ The default value for the depth limit can be set when PCRE2 is built;
+ the default default is the same value as the default for the match
+ limit. If the limit is exceeded, pcre2_match() or pcre2_dfa_match()
returns PCRE2_ERROR_DEPTHLIMIT. A value for the depth limit may also be
supplied by an item at the start of a pattern of the form
(*LIMIT_DEPTH=ddd)
- where ddd is a decimal number. However, such a setting is ignored
+ where ddd is a decimal number. However, such a setting is ignored
unless ddd is less than the limit set by the caller of pcre2_match() or
pcre2_dfa_match() or, if no such limit is set, less than the default.
@@ -1021,88 +1022,88 @@ CHECKING BUILD-TIME OPTIONS
int pcre2_config(uint32_t what, void *where);
- The function pcre2_config() makes it possible for a PCRE2 client to
- discover which optional features have been compiled into the PCRE2
- library. The pcre2build documentation has more details about these
+ The function pcre2_config() makes it possible for a PCRE2 client to
+ discover which optional features have been compiled into the PCRE2
+ library. The pcre2build documentation has more details about these
optional features.
- The first argument for pcre2_config() specifies which information is
- required. The second argument is a pointer to memory into which the
- information is placed. If NULL is passed, the function returns the
- amount of memory that is needed for the requested information. For
- calls that return numerical values, the value is in bytes; when
- requesting these values, where should point to appropriately aligned
- memory. For calls that return strings, the required length is given in
+ The first argument for pcre2_config() specifies which information is
+ required. The second argument is a pointer to memory into which the
+ information is placed. If NULL is passed, the function returns the
+ amount of memory that is needed for the requested information. For
+ calls that return numerical values, the value is in bytes; when
+ requesting these values, where should point to appropriately aligned
+ memory. For calls that return strings, the required length is given in
code units, not counting the terminating zero.
- When requesting information, the returned value from pcre2_config() is
- non-negative on success, or the negative error code PCRE2_ERROR_BADOP-
- TION if the value in the first argument is not recognized. The follow-
+ When requesting information, the returned value from pcre2_config() is
+ non-negative on success, or the negative error code PCRE2_ERROR_BADOP-
+ TION if the value in the first argument is not recognized. The follow-
ing information is available:
PCRE2_CONFIG_BSR
- The output is a uint32_t integer whose value indicates what character
- sequences the \R escape sequence matches by default. A value of
+ The output is a uint32_t integer whose value indicates what character
+ sequences the \R escape sequence matches by default. A value of
PCRE2_BSR_UNICODE means that \R matches any Unicode line ending
- sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR,
+ sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR,
LF, or CRLF. The default can be overridden when a pattern is compiled.
PCRE2_CONFIG_DEPTHLIMIT
- The output is a uint32_t integer that gives the default limit for the
- depth of nested backtracking in pcre2_match() or the depth of nested
- recursions and lookarounds in pcre2_dfa_match(). Further details are
+ The output is a uint32_t integer that gives the default limit for the
+ depth of nested backtracking in pcre2_match() or the depth of nested
+ recursions and lookarounds in pcre2_dfa_match(). Further details are
given with pcre2_set_depth_limit() above.
PCRE2_CONFIG_HEAPLIMIT
- The output is a uint32_t integer that gives, in kilobytes, the default
- limit for the amount of heap memory used by pcre2_match(). Further
+ The output is a uint32_t integer that gives, in kilobytes, the default
+ limit for the amount of heap memory used by pcre2_match(). Further
details are given with pcre2_set_heap_limit() above.
PCRE2_CONFIG_JIT
- The output is a uint32_t integer that is set to one if support for
+ The output is a uint32_t integer that is set to one if support for
just-in-time compiling is available; otherwise it is set to zero.
PCRE2_CONFIG_JITTARGET
- The where argument should point to a buffer that is at least 48 code
- units long. (The exact length required can be found by calling
- pcre2_config() with where set to NULL.) The buffer is filled with a
- string that contains the name of the architecture for which the JIT
- compiler is configured, for example "x86 32bit (little endian +
- unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is
- returned, otherwise the number of code units used is returned. This is
+ The where argument should point to a buffer that is at least 48 code
+ units long. (The exact length required can be found by calling
+ pcre2_config() with where set to NULL.) The buffer is filled with a
+ string that contains the name of the architecture for which the JIT
+ compiler is configured, for example "x86 32bit (little endian +
+ unaligned)". If JIT support is not available, PCRE2_ERROR_BADOPTION is
+ returned, otherwise the number of code units used is returned. This is
the length of the string, plus one unit for the terminating zero.
PCRE2_CONFIG_LINKSIZE
The output is a uint32_t integer that contains the number of bytes used
- for internal linkage in compiled regular expressions. When PCRE2 is
- configured, the value can be set to 2, 3, or 4, with the default being
- 2. This is the value that is returned by pcre2_config(). However, when
- the 16-bit library is compiled, a value of 3 is rounded up to 4, and
- when the 32-bit library is compiled, internal linkages always use 4
+ for internal linkage in compiled regular expressions. When PCRE2 is
+ configured, the value can be set to 2, 3, or 4, with the default being
+ 2. This is the value that is returned by pcre2_config(). However, when
+ the 16-bit library is compiled, a value of 3 is rounded up to 4, and
+ when the 32-bit library is compiled, internal linkages always use 4
bytes, so the configured value is not relevant.
The default value of 2 for the 8-bit and 16-bit libraries is sufficient
- for all but the most massive patterns, since it allows the size of the
+ for all but the most massive patterns, since it allows the size of the
compiled pattern to be up to 64K code units. Larger values allow larger
- regular expressions to be compiled by those two libraries, but at the
+ regular expressions to be compiled by those two libraries, but at the
expense of slower matching.
PCRE2_CONFIG_MATCHLIMIT
The output is a uint32_t integer that gives the default match limit for
- pcre2_match(). Further details are given with pcre2_set_match_limit()
+ pcre2_match(). Further details are given with pcre2_set_match_limit()
above.
PCRE2_CONFIG_NEWLINE
- The output is a uint32_t integer whose value specifies the default
- character sequence that is recognized as meaning "newline". The values
+ The output is a uint32_t integer whose value specifies the default
+ character sequence that is recognized as meaning "newline". The values
are:
PCRE2_NEWLINE_CR Carriage return (CR)
@@ -1110,18 +1111,19 @@ CHECKING BUILD-TIME OPTIONS
PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF)
PCRE2_NEWLINE_ANY Any Unicode line ending
PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF
+ PCRE2_NEWLINE_NUL The NUL character (binary zero)
- The default should normally correspond to the standard sequence for
+ The default should normally correspond to the standard sequence for
your operating system.
PCRE2_CONFIG_PARENSLIMIT
- The output is a uint32_t integer that gives the maximum depth of nest-
+ The output is a uint32_t integer that gives the maximum depth of nest-
ing of parentheses (of any kind) in a pattern. This limit is imposed to
- cap the amount of system stack used when a pattern is compiled. It is
- specified when PCRE2 is built; the default is 250. This limit does not
- take into account the stack that may already be used by the calling
- application. For finer control over compilation stack usage, see
+ cap the amount of system stack used when a pattern is compiled. It is
+ specified when PCRE2 is built; the default is 250. This limit does not
+ take into account the stack that may already be used by the calling
+ application. For finer control over compilation stack usage, see
pcre2_set_compile_recursion_guard().
PCRE2_CONFIG_STACKRECURSE
@@ -1131,25 +1133,25 @@ CHECKING BUILD-TIME OPTIONS
PCRE2_CONFIG_UNICODE_VERSION
- The where argument should point to a buffer that is at least 24 code
- units long. (The exact length required can be found by calling
- pcre2_config() with where set to NULL.) If PCRE2 has been compiled
- without Unicode support, the buffer is filled with the text "Unicode
- not supported". Otherwise, the Unicode version string (for example,
- "8.0.0") is inserted. The number of code units used is returned. This
+ The where argument should point to a buffer that is at least 24 code
+ units long. (The exact length required can be found by calling
+ pcre2_config() with where set to NULL.) If PCRE2 has been compiled
+ without Unicode support, the buffer is filled with the text "Unicode
+ not supported". Otherwise, the Unicode version string (for example,
+ "8.0.0") is inserted. The number of code units used is returned. This
is the length of the string plus one unit for the terminating zero.
PCRE2_CONFIG_UNICODE
- The output is a uint32_t integer that is set to one if Unicode support
- is available; otherwise it is set to zero. Unicode support implies UTF
+ The output is a uint32_t integer that is set to one if Unicode support
+ is available; otherwise it is set to zero. Unicode support implies UTF
support.
PCRE2_CONFIG_VERSION
- The where argument should point to a buffer that is at least 24 code
- units long. (The exact length required can be found by calling
- pcre2_config() with where set to NULL.) The buffer is filled with the
+ The where argument should point to a buffer that is at least 24 code
+ units long. (The exact length required can be found by calling
+ pcre2_config() with where set to NULL.) The buffer is filled with the
PCRE2 version string, zero-terminated. The number of code units used is
returned. This is the length of the string plus one unit for the termi-
nating zero.
@@ -1167,90 +1169,90 @@ COMPILING A PATTERN
pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
- The pcre2_compile() function compiles a pattern into an internal form.
- The pattern is defined by a pointer to a string of code units and a
- length. If the pattern is zero-terminated, the length can be specified
- as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of
- memory that contains the compiled pattern and related data, or NULL if
+ The pcre2_compile() function compiles a pattern into an internal form.
+ The pattern is defined by a pointer to a string of code units and a
+ length. If the pattern is zero-terminated, the length can be specified
+ as PCRE2_ZERO_TERMINATED. The function returns a pointer to a block of
+ memory that contains the compiled pattern and related data, or NULL if
an error occurred.
- If the compile context argument ccontext is NULL, memory for the com-
- piled pattern is obtained by calling malloc(). Otherwise, it is
- obtained from the same memory function that was used for the compile
- context. The caller must free the memory by calling pcre2_code_free()
+ If the compile context argument ccontext is NULL, memory for the com-
+ piled pattern is obtained by calling malloc(). Otherwise, it is
+ obtained from the same memory function that was used for the compile
+ context. The caller must free the memory by calling pcre2_code_free()
when it is no longer needed.
The function pcre2_code_copy() makes a copy of the compiled code in new
- memory, using the same memory allocator as was used for the original.
- However, if the code has been processed by the JIT compiler (see
- below), the JIT information cannot be copied (because it is position-
+ memory, using the same memory allocator as was used for the original.
+ However, if the code has been processed by the JIT compiler (see
+ below), the JIT information cannot be copied (because it is position-
dependent). The new copy can initially be used only for non-JIT match-
ing, though it can be passed to pcre2_jit_compile() if required.
The pcre2_code_copy() function provides a way for individual threads in
- a multithreaded application to acquire a private copy of shared com-
- piled code. However, it does not make a copy of the character tables
- used by the compiled pattern; the new pattern code points to the same
- tables as the original code. (See "Locale Support" below for details
- of these character tables.) In many applications the same tables are
- used throughout, so this behaviour is appropriate. Nevertheless, there
+ a multithreaded application to acquire a private copy of shared com-
+ piled code. However, it does not make a copy of the character tables
+ used by the compiled pattern; the new pattern code points to the same
+ tables as the original code. (See "Locale Support" below for details
+ of these character tables.) In many applications the same tables are
+ used throughout, so this behaviour is appropriate. Nevertheless, there
are occasions when a copy of a compiled pattern and the relevant tables
- are needed. The pcre2_code_copy_with_tables() provides this facility.
- Copies of both the code and the tables are made, with the new code
- pointing to the new tables. The memory for the new tables is automati-
- cally freed when pcre2_code_free() is called for the new copy of the
+ are needed. The pcre2_code_copy_with_tables() provides this facility.
+ Copies of both the code and the tables are made, with the new code
+ pointing to the new tables. The memory for the new tables is automati-
+ cally freed when pcre2_code_free() is called for the new copy of the
compiled code.
- NOTE: When one of the matching functions is called, pointers to the
+ NOTE: When one of the matching functions is called, pointers to the
compiled pattern and the subject string are set in the match data block
- so that they can be referenced by the substring extraction functions.
- After running a match, you must not free a compiled pattern (or a sub-
- ject string) until after all operations on the match data block have
+ so that they can be referenced by the substring extraction functions.
+ After running a match, you must not free a compiled pattern (or a sub-
+ ject string) until after all operations on the match data block have
taken place.
- The options argument for pcre2_compile() contains various bit settings
- that affect the compilation. It should be zero if no options are
- required. The available options are described below. Some of them (in
- particular, those that are compatible with Perl, but some others as
- well) can also be set and unset from within the pattern (see the
+ The options argument for pcre2_compile() contains various bit settings
+ that affect the compilation. It should be zero if no options are
+ required. The available options are described below. Some of them (in
+ particular, those that are compatible with Perl, but some others as
+ well) can also be set and unset from within the pattern (see the
detailed description in the pcre2pattern documentation).
- For those options that can be different in different parts of the pat-
- tern, the contents of the options argument specifies their settings at
- the start of compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and
- PCRE2_NO_UTF_CHECK options can be set at the time of matching as well
+ For those options that can be different in different parts of the pat-
+ tern, the contents of the options argument specify their settings at
+ the start of compilation. The PCRE2_ANCHORED, PCRE2_ENDANCHORED, and
+ PCRE2_NO_UTF_CHECK options can be set at the time of matching as well
as at compile time.
- Other, less frequently required compile-time parameters (for example,
+ Other, less frequently required compile-time parameters (for example,
the newline setting) can be provided in a compile context (as described
above).
If errorcode or erroroffset is NULL, pcre2_compile() returns NULL imme-
- diately. Otherwise, the variables to which these point are set to an
- error code and an offset (number of code units) within the pattern,
- respectively, when pcre2_compile() returns NULL because a compilation
+ diately. Otherwise, the variables to which these point are set to an
+ error code and an offset (number of code units) within the pattern,
+ respectively, when pcre2_compile() returns NULL because a compilation
error has occurred. The values are not defined when compilation is suc-
cessful and pcre2_compile() returns a non-NULL value.
The value returned in erroroffset is an indication of where in the pat-
- tern the error occurred. It is not necessarily the furthest point in
- the pattern that was read. For example, after the error "lookbehind
+ tern the error occurred. It is not necessarily the furthest point in
+ the pattern that was read. For example, after the error "lookbehind
assertion is not fixed length", the error offset points to the start of
the failing assertion.
- The pcre2_get_error_message() function (see "Obtaining a textual error
- message" below) provides a textual message for each error code. Compi-
+ The pcre2_get_error_message() function (see "Obtaining a textual error
+ message" below) provides a textual message for each error code. Compi-
lation errors have positive error codes; UTF formatting error codes are
- negative. For an invalid UTF-8 or UTF-16 string, the offset is that of
+ negative. For an invalid UTF-8 or UTF-16 string, the offset is that of
the first code unit of the failing character.
- Some errors are not detected until the whole pattern has been scanned;
- in these cases, the offset passed back is the length of the pattern.
- Note that the offset is in code units, not characters, even in a UTF
+ Some errors are not detected until the whole pattern has been scanned;
+ in these cases, the offset passed back is the length of the pattern.
+ Note that the offset is in code units, not characters, even in a UTF
mode. It may sometimes point into the middle of a UTF-8 or UTF-16 char-
acter.
- This code fragment shows a typical straightforward call to pcre2_com-
+ This code fragment shows a typical straightforward call to pcre2_com-
pile():
pcre2_code *re;
@@ -1264,437 +1266,437 @@ COMPILING A PATTERN
&erroffset, /* for error offset */
NULL); /* no compile context */
- The following names for option bits are defined in the pcre2.h header
+ The following names for option bits are defined in the pcre2.h header
file:
PCRE2_ANCHORED
If this bit is set, the pattern is forced to be "anchored", that is, it
- is constrained to match only at the first matching point in the string
- that is being searched (the "subject string"). This effect can also be
- achieved by appropriate constructs in the pattern itself, which is the
+ is constrained to match only at the first matching point in the string
+ that is being searched (the "subject string"). This effect can also be
+ achieved by appropriate constructs in the pattern itself, which is the
only way to do it in Perl.
PCRE2_ALLOW_EMPTY_CLASS
- By default, for compatibility with Perl, a closing square bracket that
- immediately follows an opening one is treated as a data character for
- the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the
+ By default, for compatibility with Perl, a closing square bracket that
+ immediately follows an opening one is treated as a data character for
+ the class. When PCRE2_ALLOW_EMPTY_CLASS is set, it terminates the
class, which therefore contains no characters and so can never match.
PCRE2_ALT_BSUX
- This option request alternative handling of three escape sequences,
- which makes PCRE2's behaviour more like ECMAscript (aka JavaScript).
+ This option requests alternative handling of three escape sequences,
+ which makes PCRE2's behaviour more like ECMAscript (aka JavaScript).
When it is set:
(1) \U matches an upper case "U" character; by default \U causes a com-
pile time error (Perl uses \U to upper case subsequent characters).
(2) \u matches a lower case "u" character unless it is followed by four
- hexadecimal digits, in which case the hexadecimal number defines the
- code point to match. By default, \u causes a compile time error (Perl
+ hexadecimal digits, in which case the hexadecimal number defines the
+ code point to match. By default, \u causes a compile time error (Perl
uses it to upper case the following character).
- (3) \x matches a lower case "x" character unless it is followed by two
- hexadecimal digits, in which case the hexadecimal number defines the
- code point to match. By default, as in Perl, a hexadecimal number is
+ (3) \x matches a lower case "x" character unless it is followed by two
+ hexadecimal digits, in which case the hexadecimal number defines the
+ code point to match. By default, as in Perl, a hexadecimal number is
always expected after \x, but it may have zero, one, or two digits (so,
for example, \xz matches a binary zero character followed by z).
PCRE2_ALT_CIRCUMFLEX
In multiline mode (when PCRE2_MULTILINE is set), the circumflex
- metacharacter matches at the start of the subject (unless PCRE2_NOTBOL
- is set), and also after any internal newline. However, it does not
+ metacharacter matches at the start of the subject (unless PCRE2_NOTBOL
+ is set), and also after any internal newline. However, it does not
match after a newline at the end of the subject, for compatibility with
- Perl. If you want a multiline circumflex also to match after a termi-
+ Perl. If you want a multiline circumflex also to match after a termi-
nating newline, you must set PCRE2_ALT_CIRCUMFLEX.
PCRE2_ALT_VERBNAMES
- By default, for compatibility with Perl, the name in any verb sequence
- such as (*MARK:NAME) is any sequence of characters that does not
- include a closing parenthesis. The name is not processed in any way,
- and it is not possible to include a closing parenthesis in the name.
- However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
- processing is applied to verb names and only an unescaped closing
- parenthesis terminates the name. A closing parenthesis can be included
- in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or
- PCRE2_EXTENDED_MORE option is set, unescaped whitespace in verb names
- is skipped and #-comments are recognized in this mode, exactly as in
+ By default, for compatibility with Perl, the name in any verb sequence
+ such as (*MARK:NAME) is any sequence of characters that does not
+ include a closing parenthesis. The name is not processed in any way,
+ and it is not possible to include a closing parenthesis in the name.
+ However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash
+ processing is applied to verb names and only an unescaped closing
+ parenthesis terminates the name. A closing parenthesis can be included
+ in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or
+ PCRE2_EXTENDED_MORE option is set, unescaped whitespace in verb names
+ is skipped and #-comments are recognized in this mode, exactly as in
the rest of the pattern.
PCRE2_AUTO_CALLOUT
- If this bit is set, pcre2_compile() automatically inserts callout
- items, all with number 255, before each pattern item, except immedi-
- ately before or after an explicit callout in the pattern. For discus-
+ If this bit is set, pcre2_compile() automatically inserts callout
+ items, all with number 255, before each pattern item, except immedi-
+ ately before or after an explicit callout in the pattern. For discus-
sion of the callout facility, see the pcre2callout documentation.
PCRE2_CASELESS
- If this bit is set, letters in the pattern match both upper and lower
- case letters in the subject. It is equivalent to Perl's /i option, and
- it can be changed within a pattern by a (?i) option setting. If
- PCRE2_UTF is set, Unicode properties are used for all characters with
- more than one other case, and for all characters whose code points are
- greater than U+007f. For lower valued characters with only one other
- case, a lookup table is used for speed. When PCRE2_UTF is not set, a
+ If this bit is set, letters in the pattern match both upper and lower
+ case letters in the subject. It is equivalent to Perl's /i option, and
+ it can be changed within a pattern by a (?i) option setting. If
+ PCRE2_UTF is set, Unicode properties are used for all characters with
+ more than one other case, and for all characters whose code points are
+ greater than U+007f. For lower valued characters with only one other
+ case, a lookup table is used for speed. When PCRE2_UTF is not set, a
lookup table is used for all code points less than 256, and higher code
- points (available only in 16-bit or 32-bit mode) are treated as not
+ points (available only in 16-bit or 32-bit mode) are treated as not
having another case.
PCRE2_DOLLAR_ENDONLY
- If this bit is set, a dollar metacharacter in the pattern matches only
- at the end of the subject string. Without this option, a dollar also
- matches immediately before a newline at the end of the string (but not
- before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored
- if PCRE2_MULTILINE is set. There is no equivalent to this option in
+ If this bit is set, a dollar metacharacter in the pattern matches only
+ at the end of the subject string. Without this option, a dollar also
+ matches immediately before a newline at the end of the string (but not
+ before any other newlines). The PCRE2_DOLLAR_ENDONLY option is ignored
+ if PCRE2_MULTILINE is set. There is no equivalent to this option in
Perl, and no way to set it within a pattern.
PCRE2_DOTALL
- If this bit is set, a dot metacharacter in the pattern matches any
- character, including one that indicates a newline. However, it only
+ If this bit is set, a dot metacharacter in the pattern matches any
+ character, including one that indicates a newline. However, it only
ever matches one character, even if newlines are coded as CRLF. Without
this option, a dot does not match when the current position in the sub-
- ject is at a newline. This option is equivalent to Perl's /s option,
+ ject is at a newline. This option is equivalent to Perl's /s option,
and it can be changed within a pattern by a (?s) option setting. A neg-
ative class such as [^a] always matches newline characters, independent
of the setting of this option.
PCRE2_DUPNAMES
- If this bit is set, names used to identify capturing subpatterns need
+ If this bit is set, names used to identify capturing subpatterns need
not be unique. This can be helpful for certain types of pattern when it
- is known that only one instance of the named subpattern can ever be
- matched. There are more details of named subpatterns below; see also
+ is known that only one instance of the named subpattern can ever be
+ matched. There are more details of named subpatterns below; see also
the pcre2pattern documentation.
PCRE2_ENDANCHORED
- If this bit is set, the end of any pattern match must be right at the
+ If this bit is set, the end of any pattern match must be right at the
end of the string being searched (the "subject string"). If the pattern
match succeeds by reaching (*ACCEPT), but does not reach the end of the
- subject, the match fails at the current starting point. For unanchored
- patterns, a new match is then tried at the next starting point. How-
+ subject, the match fails at the current starting point. For unanchored
+ patterns, a new match is then tried at the next starting point. How-
ever, if the match succeeds by reaching the end of the pattern, but not
- the end of the subject, backtracking occurs and an alternative match
+ the end of the subject, backtracking occurs and an alternative match
may be found. Consider these two patterns:
.(*ACCEPT)|..
.|..
- If matched against "abc" with PCRE2_ENDANCHORED set, the first matches
- "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED
- can also be achieved by appropriate constructs in the pattern itself,
+ If matched against "abc" with PCRE2_ENDANCHORED set, the first matches
+ "c" whereas the second matches "bc". The effect of PCRE2_ENDANCHORED
+ can also be achieved by appropriate constructs in the pattern itself,
which is the only way to do it in Perl.
For DFA matching with pcre2_dfa_match(), PCRE2_ENDANCHORED applies only
- to the first (that is, the longest) matched string. Other parallel
- matches, which are necessarily substrings of the first one, must obvi-
+ to the first (that is, the longest) matched string. Other parallel
+ matches, which are necessarily substrings of the first one, must obvi-
ously end before the end of the subject.
PCRE2_EXTENDED
- If this bit is set, most white space characters in the pattern are
- totally ignored except when escaped or inside a character class. How-
- ever, white space is not allowed within sequences such as (?> that
+ If this bit is set, most white space characters in the pattern are
+ totally ignored except when escaped or inside a character class. How-
+ ever, white space is not allowed within sequences such as (?> that
introduce various parenthesized subpatterns, nor within numerical quan-
- tifiers such as {1,3}. Ignorable white space is permitted between an
- item and a following quantifier and between a quantifier and a follow-
+ tifiers such as {1,3}. Ignorable white space is permitted between an
+ item and a following quantifier and between a quantifier and a follow-
ing + that indicates possessiveness.
- PCRE2_EXTENDED also causes characters between an unescaped # outside a
- character class and the next newline, inclusive, to be ignored, which
+ PCRE2_EXTENDED also causes characters between an unescaped # outside a
+ character class and the next newline, inclusive, to be ignored, which
makes it possible to include comments inside complicated patterns. Note
- that the end of this type of comment is a literal newline sequence in
+ that the end of this type of comment is a literal newline sequence in
the pattern; escape sequences that happen to represent a newline do not
- count. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be
+ count. PCRE2_EXTENDED is equivalent to Perl's /x option, and it can be
changed within a pattern by a (?x) option setting.
Which characters are interpreted as newlines can be specified by a set-
- ting in the compile context that is passed to pcre2_compile() or by a
- special sequence at the start of the pattern, as described in the sec-
- tion entitled "Newline conventions" in the pcre2pattern documentation.
+ ting in the compile context that is passed to pcre2_compile() or by a
+ special sequence at the start of the pattern, as described in the sec-
+ tion entitled "Newline conventions" in the pcre2pattern documentation.
A default is defined when PCRE2 is built.
PCRE2_EXTENDED_MORE
- This option has the effect of PCRE2_EXTENDED, but, in addition,
- unescaped space and horizontal tab characters are ignored inside a
- character class. PCRE2_EXTENDED_MORE is equivalent to Perl's 5.26 /xx
- option, and it can be changed within a pattern by a (?xx) option set-
+ This option has the effect of PCRE2_EXTENDED, but, in addition,
+ unescaped space and horizontal tab characters are ignored inside a
+ character class. PCRE2_EXTENDED_MORE is equivalent to Perl 5.26's /xx
+ option, and it can be changed within a pattern by a (?xx) option set-
ting.
PCRE2_FIRSTLINE
If this option is set, the start of an unanchored pattern match must be
- before or at the first newline in the subject string, though the
- matched text may continue over the newline. See also PCRE2_USE_OFF-
- SET_LIMIT, which provides a more general limiting facility. If
- PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the
- first line and also within the offset limit. In other words, whichever
+ before or at the first newline in the subject string, though the
+ matched text may continue over the newline. See also PCRE2_USE_OFF-
+ SET_LIMIT, which provides a more general limiting facility. If
+ PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the
+ first line and also within the offset limit. In other words, whichever
limit comes first is used.
PCRE2_MATCH_UNSET_BACKREF
- If this option is set, a back reference to an unset subpattern group
- matches an empty string (by default this causes the current matching
- alternative to fail). A pattern such as (\1)(a) succeeds when this
- option is set (assuming it can find an "a" in the subject), whereas it
- fails by default, for Perl compatibility. Setting this option makes
+ If this option is set, a back reference to an unset subpattern group
+ matches an empty string (by default this causes the current matching
+ alternative to fail). A pattern such as (\1)(a) succeeds when this
+ option is set (assuming it can find an "a" in the subject), whereas it
+ fails by default, for Perl compatibility. Setting this option makes
PCRE2 behave more like ECMAscript (aka JavaScript).
PCRE2_MULTILINE
- By default, for the purposes of matching "start of line" and "end of
- line", PCRE2 treats the subject string as consisting of a single line
- of characters, even if it actually contains newlines. The "start of
- line" metacharacter (^) matches only at the start of the string, and
- the "end of line" metacharacter ($) matches only at the end of the
+ By default, for the purposes of matching "start of line" and "end of
+ line", PCRE2 treats the subject string as consisting of a single line
+ of characters, even if it actually contains newlines. The "start of
+ line" metacharacter (^) matches only at the start of the string, and
+ the "end of line" metacharacter ($) matches only at the end of the
string, or before a terminating newline (except when PCRE2_DOL-
- LAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set,
+ LAR_ENDONLY is set). Note, however, that unless PCRE2_DOTALL is set,
the "any character" metacharacter (.) does not match at a newline. This
behaviour (for ^, $, and dot) is the same as Perl.
- When PCRE2_MULTILINE it is set, the "start of line" and "end of line"
- constructs match immediately following or immediately before internal
- newlines in the subject string, respectively, as well as at the very
- start and end. This is equivalent to Perl's /m option, and it can be
+ When PCRE2_MULTILINE is set, the "start of line" and "end of line"
+ constructs match immediately following or immediately before internal
+ newlines in the subject string, respectively, as well as at the very
+ start and end. This is equivalent to Perl's /m option, and it can be
changed within a pattern by a (?m) option setting. Note that the "start
of line" metacharacter does not match after a newline at the end of the
- subject, for compatibility with Perl. However, you can change this by
- setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
- subject string, or no occurrences of ^ or $ in a pattern, setting
+ subject, for compatibility with Perl. However, you can change this by
+ setting the PCRE2_ALT_CIRCUMFLEX option. If there are no newlines in a
+ subject string, or no occurrences of ^ or $ in a pattern, setting
PCRE2_MULTILINE has no effect.
PCRE2_NEVER_BACKSLASH_C
- This option locks out the use of \C in the pattern that is being com-
- piled. This escape can cause unpredictable behaviour in UTF-8 or
- UTF-16 modes, because it may leave the current matching point in the
- middle of a multi-code-unit character. This option may be useful in
- applications that process patterns from external sources. Note that
+ This option locks out the use of \C in the pattern that is being com-
+ piled. This escape can cause unpredictable behaviour in UTF-8 or
+ UTF-16 modes, because it may leave the current matching point in the
+ middle of a multi-code-unit character. This option may be useful in
+ applications that process patterns from external sources. Note that
there is also a build-time option that permanently locks out the use of
\C.
PCRE2_NEVER_UCP
- This option locks out the use of Unicode properties for handling \B,
+ This option locks out the use of Unicode properties for handling \B,
\b, \D, \d, \S, \s, \W, \w, and some of the POSIX character classes, as
- described for the PCRE2_UCP option below. In particular, it prevents
- the creator of the pattern from enabling this facility by starting the
- pattern with (*UCP). This option may be useful in applications that
+ described for the PCRE2_UCP option below. In particular, it prevents
+ the creator of the pattern from enabling this facility by starting the
+ pattern with (*UCP). This option may be useful in applications that
process patterns from external sources. The option combination PCRE_UCP
and PCRE_NEVER_UCP causes an error.
PCRE2_NEVER_UTF
- This option locks out interpretation of the pattern as UTF-8, UTF-16,
+ This option locks out interpretation of the pattern as UTF-8, UTF-16,
or UTF-32, depending on which library is in use. In particular, it pre-
- vents the creator of the pattern from switching to UTF interpretation
- by starting the pattern with (*UTF). This option may be useful in
- applications that process patterns from external sources. The combina-
+ vents the creator of the pattern from switching to UTF interpretation
+ by starting the pattern with (*UTF). This option may be useful in
+ applications that process patterns from external sources. The combina-
tion of PCRE2_UTF and PCRE2_NEVER_UTF causes an error.
PCRE2_NO_AUTO_CAPTURE
If this option is set, it disables the use of numbered capturing paren-
- theses in the pattern. Any opening parenthesis that is not followed by
- ? behaves as if it were followed by ?: but named parentheses can still
+ theses in the pattern. Any opening parenthesis that is not followed by
+ ? behaves as if it were followed by ?: but named parentheses can still
be used for capturing (and they acquire numbers in the usual way). This
- is the same as Perl's /n option. Note that, when this option is set,
+ is the same as Perl's /n option. Note that, when this option is set,
references to capturing groups (back references or recursion/subroutine
- calls) may only refer to named groups, though the reference can be by
+ calls) may only refer to named groups, though the reference can be by
name or by number.
PCRE2_NO_AUTO_POSSESS
If this option is set, it disables "auto-possessification", which is an
- optimization that, for example, turns a+b into a++b in order to avoid
- backtracks into a+ that can never be successful. However, if callouts
- are in use, auto-possessification means that some callouts are never
+ optimization that, for example, turns a+b into a++b in order to avoid
+ backtracks into a+ that can never be successful. However, if callouts
+ are in use, auto-possessification means that some callouts are never
taken. You can set this option if you want the matching functions to do
- a full unoptimized search and run all the callouts, but it is mainly
+ a full unoptimized search and run all the callouts, but it is mainly
provided for testing purposes.
PCRE2_NO_DOTSTAR_ANCHOR
If this option is set, it disables an optimization that is applied when
- .* is the first significant item in a top-level branch of a pattern,
- and all the other branches also start with .* or with \A or \G or ^.
- The optimization is automatically disabled for .* if it is inside an
- atomic group or a capturing group that is the subject of a back refer-
- ence, or if the pattern contains (*PRUNE) or (*SKIP). When the opti-
- mization is not disabled, such a pattern is automatically anchored if
+ .* is the first significant item in a top-level branch of a pattern,
+ and all the other branches also start with .* or with \A or \G or ^.
+ The optimization is automatically disabled for .* if it is inside an
+ atomic group or a capturing group that is the subject of a back refer-
+ ence, or if the pattern contains (*PRUNE) or (*SKIP). When the opti-
+ mization is not disabled, such a pattern is automatically anchored if
PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILINE is not set
- for any ^ items. Otherwise, the fact that any match must start either
- at the start of the subject or following a newline is remembered. Like
+ for any ^ items. Otherwise, the fact that any match must start either
+ at the start of the subject or following a newline is remembered. Like
other optimizations, this can cause callouts to be skipped.
PCRE2_NO_START_OPTIMIZE
- This is an option whose main effect is at matching time. It does not
+ This is an option whose main effect is at matching time. It does not
change what pcre2_compile() generates, but it does affect the output of
the JIT compiler.
- There are a number of optimizations that may occur at the start of a
- match, in order to speed up the process. For example, if it is known
- that an unanchored match must start with a specific code unit value,
- the matching code searches the subject for that value, and fails imme-
- diately if it cannot find it, without actually running the main match-
- ing function. This means that a special item such as (*COMMIT) at the
- start of a pattern is not considered until after a suitable starting
- point for the match has been found. Also, when callouts or (*MARK)
- items are in use, these "start-up" optimizations can cause them to be
- skipped if the pattern is never actually used. The start-up optimiza-
- tions are in effect a pre-scan of the subject that takes place before
+ There are a number of optimizations that may occur at the start of a
+ match, in order to speed up the process. For example, if it is known
+ that an unanchored match must start with a specific code unit value,
+ the matching code searches the subject for that value, and fails imme-
+ diately if it cannot find it, without actually running the main match-
+ ing function. This means that a special item such as (*COMMIT) at the
+ start of a pattern is not considered until after a suitable starting
+ point for the match has been found. Also, when callouts or (*MARK)
+ items are in use, these "start-up" optimizations can cause them to be
+ skipped if the pattern is never actually used. The start-up optimiza-
+ tions are in effect a pre-scan of the subject that takes place before
the pattern is run.
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizations,
- possibly causing performance to suffer, but ensuring that in cases
- where the result is "no match", the callouts do occur, and that items
+ possibly causing performance to suffer, but ensuring that in cases
+ where the result is "no match", the callouts do occur, and that items
such as (*COMMIT) and (*MARK) are considered at every possible starting
position in the subject string.
- Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching
+ Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a matching
operation. Consider the pattern
(*COMMIT)ABC
- When this is compiled, PCRE2 records the fact that a match must start
- with the character "A". Suppose the subject string is "DEFABC". The
- start-up optimization scans along the subject, finds "A" and runs the
- first match attempt from there. The (*COMMIT) item means that the pat-
- tern must match the current starting position, which in this case, it
- does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
- set, the initial scan along the subject string does not happen. The
- first match attempt is run starting from "D" and when this fails,
- (*COMMIT) prevents any further matches being tried, so the overall
+ When this is compiled, PCRE2 records the fact that a match must start
+ with the character "A". Suppose the subject string is "DEFABC". The
+ start-up optimization scans along the subject, finds "A" and runs the
+ first match attempt from there. The (*COMMIT) item means that the pat-
+ tern must match the current starting position, which in this case, it
+ does. However, if the same match is run with PCRE2_NO_START_OPTIMIZE
+ set, the initial scan along the subject string does not happen. The
+ first match attempt is run starting from "D" and when this fails,
+ (*COMMIT) prevents any further matches being tried, so the overall
result is "no match".
- There are also other start-up optimizations. For example, a minimum
+ There are also other start-up optimizations. For example, a minimum
length for the subject may be recorded. Consider the pattern
(*MARK:A)(X|Y)
- The minimum length for a match is one character. If the subject is
+ The minimum length for a match is one character. If the subject is
"ABC", there will be attempts to match "ABC", "BC", and "C". An attempt
to match an empty string at the end of the subject does not take place,
- because PCRE2 knows that the subject is now too short, and so the
- (*MARK) is never encountered. In this case, the optimization does not
+ because PCRE2 knows that the subject is now too short, and so the
+ (*MARK) is never encountered. In this case, the optimization does not
affect the overall match result, which is still "no match", but it does
affect the auxiliary information that is returned.
PCRE2_NO_UTF_CHECK
- When PCRE2_UTF is set, the validity of the pattern as a UTF string is
- automatically checked. There are discussions about the validity of
- UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode
- document. If an invalid UTF sequence is found, pcre2_compile() returns
+ When PCRE2_UTF is set, the validity of the pattern as a UTF string is
+ automatically checked. There are discussions about the validity of
+ UTF-8 strings, UTF-16 strings, and UTF-32 strings in the pcre2unicode
+ document. If an invalid UTF sequence is found, pcre2_compile() returns
a negative error code.
- If you know that your pattern is a valid UTF string, and you want to
- skip this check for performance reasons, you can set the
- PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an
+ If you know that your pattern is a valid UTF string, and you want to
+ skip this check for performance reasons, you can set the
+ PCRE2_NO_UTF_CHECK option. When it is set, the effect of passing an
invalid UTF string as a pattern is undefined. It may cause your program
to crash or loop.
Note that this option can also be passed to pcre2_match() and
- pcre_dfa_match(), to suppress UTF validity checking of the subject
+ pcre2_dfa_match(), to suppress UTF validity checking of the subject
string.
Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis-
- able the error that is given if an escape sequence for an invalid Uni-
- code code point is encountered in the pattern. In particular, the so-
- called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you
- want to allow escape sequences such as \x{d800} you can set the
- PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the
- section entitled "Extra compile options" below. However, this is pos-
+ able the error that is given if an escape sequence for an invalid Uni-
+ code code point is encountered in the pattern. In particular, the so-
+ called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you
+ want to allow escape sequences such as \x{d800} you can set the
+ PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra option, as described in the
+ section entitled "Extra compile options" below. However, this is pos-
sible only in UTF-8 and UTF-32 modes, because these values are not rep-
resentable in UTF-16.
PCRE2_UCP
This option changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, \W,
- \w, and some of the POSIX character classes. By default, only ASCII
- characters are recognized, but if PCRE2_UCP is set, Unicode properties
- are used instead to classify characters. More details are given in the
+ \w, and some of the POSIX character classes. By default, only ASCII
+ characters are recognized, but if PCRE2_UCP is set, Unicode properties
+ are used instead to classify characters. More details are given in the
section on generic character types in the pcre2pattern page. If you set
- PCRE2_UCP, matching one of the items it affects takes much longer. The
- option is available only if PCRE2 has been compiled with Unicode sup-
+ PCRE2_UCP, matching one of the items it affects takes much longer. The
+ option is available only if PCRE2 has been compiled with Unicode sup-
port (which is the default).
PCRE2_UNGREEDY
- This option inverts the "greediness" of the quantifiers so that they
- are not greedy by default, but become greedy if followed by "?". It is
- not compatible with Perl. It can also be set by a (?U) option setting
+ This option inverts the "greediness" of the quantifiers so that they
+ are not greedy by default, but become greedy if followed by "?". It is
+ not compatible with Perl. It can also be set by a (?U) option setting
within the pattern.
PCRE2_USE_OFFSET_LIMIT
This option must be set for pcre2_compile() if pcre2_set_offset_limit()
- is going to be used to set a non-default offset limit in a match con-
- text for matches that use this pattern. An error is generated if an
- offset limit is set without this option. For more details, see the
- description of pcre2_set_offset_limit() in the section that describes
+ is going to be used to set a non-default offset limit in a match con-
+ text for matches that use this pattern. An error is generated if an
+ offset limit is set without this option. For more details, see the
+ description of pcre2_set_offset_limit() in the section that describes
match contexts. See also the PCRE2_FIRSTLINE option above.
PCRE2_UTF
- This option causes PCRE2 to regard both the pattern and the subject
- strings that are subsequently processed as strings of UTF characters
- instead of single-code-unit strings. It is available when PCRE2 is
- built to include Unicode support (which is the default). If Unicode
- support is not available, the use of this option provokes an error.
- Details of how PCRE2_UTF changes the behaviour of PCRE2 are given in
+ This option causes PCRE2 to regard both the pattern and the subject
+ strings that are subsequently processed as strings of UTF characters
+ instead of single-code-unit strings. It is available when PCRE2 is
+ built to include Unicode support (which is the default). If Unicode
+ support is not available, the use of this option provokes an error.
+ Details of how PCRE2_UTF changes the behaviour of PCRE2 are given in
the pcre2unicode page.
Extra compile options
- Unlike the main compile-time options, the extra options are not saved
+ Unlike the main compile-time options, the extra options are not saved
with the compiled pattern. The option bits that can be set in a compile
- context by calling the pcre2_set_compile_extra_options() function are
+ context by calling the pcre2_set_compile_extra_options() function are
as follows:
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
- This option applies when compiling a pattern in UTF-8 or UTF-32 mode.
- It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode
+ This option applies when compiling a pattern in UTF-8 or UTF-32 mode.
+ It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode
"surrogate" code points in the range 0xd800 to 0xdfff are used in pairs
- in UTF-16 to encode code points with values in the range 0x10000 to
- 0x10ffff. The surrogates cannot therefore be represented in UTF-16.
+ in UTF-16 to encode code points with values in the range 0x10000 to
+ 0x10ffff. The surrogates cannot therefore be represented in UTF-16.
They can be represented in UTF-8 and UTF-32, but are defined as invalid
- code points, and cause errors if encountered in a UTF-8 or UTF-32
+ code points, and cause errors if encountered in a UTF-8 or UTF-32
string that is being checked for validity by PCRE2.
- These values also cause errors if encountered in escape sequences such
+ These values also cause errors if encountered in escape sequences such
as \x{d912} within a pattern. However, it seems that some applications,
- when using PCRE2 to check for unwanted characters in UTF-8 strings,
- explicitly test for the surrogates using escape sequences. The
- PCRE2_NO_UTF_CHECK option does not disable the error that occurs,
- because it applies only to the testing of input strings for UTF valid-
+ when using PCRE2 to check for unwanted characters in UTF-8 strings,
+ explicitly test for the surrogates using escape sequences. The
+ PCRE2_NO_UTF_CHECK option does not disable the error that occurs,
+ because it applies only to the testing of input strings for UTF valid-
ity.
- If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro-
- gate code point values in UTF-8 and UTF-32 patterns no longer provoke
- errors and are incorporated in the compiled pattern. However, they can
- only match subject characters if the matching function is called with
+ If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surro-
+ gate code point values in UTF-8 and UTF-32 patterns no longer provoke
+ errors and are incorporated in the compiled pattern. However, they can
+ only match subject characters if the matching function is called with
PCRE2_NO_UTF_CHECK set.
COMPILATION ERROR CODES
- There are nearly 100 positive error codes that pcre2_compile() may
- return (via errorcode) if it finds an error in the pattern. There are
- also some negative error codes that are used for invalid UTF strings.
+ There are nearly 100 positive error codes that pcre2_compile() may
+ return (via errorcode) if it finds an error in the pattern. There are
+ also some negative error codes that are used for invalid UTF strings.
These are the same as given by pcre2_match() and pcre2_dfa_match(), and
- are described in the pcre2unicode page. The pcre2_get_error_message()
- function (see "Obtaining a textual error message" below) can be called
+ are described in the pcre2unicode page. The pcre2_get_error_message()
+ function (see "Obtaining a textual error message" below) can be called
to obtain a textual error message from any error code.
@@ -1717,53 +1719,53 @@ JUST-IN-TIME (JIT) COMPILATION
void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack);
- These functions provide support for JIT compilation, which, if the
- just-in-time compiler is available, further processes a compiled pat-
+ These functions provide support for JIT compilation, which, if the
+ just-in-time compiler is available, further processes a compiled pat-
tern into machine code that executes much faster than the pcre2_match()
- interpretive matching function. Full details are given in the pcre2jit
+ interpretive matching function. Full details are given in the pcre2jit
documentation.
- JIT compilation is a heavyweight optimization. It can take some time
- for patterns to be analyzed, and for one-off matches and simple pat-
- terns the benefit of faster execution might be offset by a much slower
- compilation time. Most (but not all) patterns can be optimized by the
+ JIT compilation is a heavyweight optimization. It can take some time
+ for patterns to be analyzed, and for one-off matches and simple pat-
+ terns the benefit of faster execution might be offset by a much slower
+ compilation time. Most (but not all) patterns can be optimized by the
JIT compiler.
LOCALE SUPPORT
- PCRE2 handles caseless matching, and determines whether characters are
- letters, digits, or whatever, by reference to a set of tables, indexed
- by character code point. This applies only to characters whose code
- points are less than 256. By default, higher-valued code points never
- match escapes such as \w or \d. However, if PCRE2 is built with Uni-
+ PCRE2 handles caseless matching, and determines whether characters are
+ letters, digits, or whatever, by reference to a set of tables, indexed
+ by character code point. This applies only to characters whose code
+ points are less than 256. By default, higher-valued code points never
+ match escapes such as \w or \d. However, if PCRE2 is built with Uni-
code support, all characters can be tested with \p and \P, or, alterna-
- tively, the PCRE2_UCP option can be set when a pattern is compiled;
- this causes \w and friends to use Unicode property support instead of
+ tively, the PCRE2_UCP option can be set when a pattern is compiled;
+ this causes \w and friends to use Unicode property support instead of
the built-in tables.
- The use of locales with Unicode is discouraged. If you are handling
- characters with code points greater than 128, you should either use
+ The use of locales with Unicode is discouraged. If you are handling
+ characters with code points greater than 128, you should either use
Unicode support, or use locales, but not try to mix the two.
- PCRE2 contains an internal set of character tables that are used by
- default. These are sufficient for many applications. Normally, the
+ PCRE2 contains an internal set of character tables that are used by
+ default. These are sufficient for many applications. Normally, the
internal tables recognize only ASCII characters. However, when PCRE2 is
built, it is possible to cause the internal tables to be rebuilt in the
default "C" locale of the local system, which may cause them to be dif-
ferent.
- The internal tables can be overridden by tables supplied by the appli-
- cation that calls PCRE2. These may be created in a different locale
- from the default. As more and more applications change to using Uni-
+ The internal tables can be overridden by tables supplied by the appli-
+ cation that calls PCRE2. These may be created in a different locale
+ from the default. As more and more applications change to using Uni-
code, the need for this locale support is expected to die away.
- External tables are built by calling the pcre2_maketables() function,
- in the relevant locale. The result can be passed to pcre2_compile() as
- often as necessary, by creating a compile context and calling
- pcre2_set_character_tables() to set the tables pointer therein. For
- example, to build and use tables that are appropriate for the French
- locale (where accented characters with values greater than 128 are
+ External tables are built by calling the pcre2_maketables() function,
+ in the relevant locale. The result can be passed to pcre2_compile() as
+ often as necessary, by creating a compile context and calling
+ pcre2_set_character_tables() to set the tables pointer therein. For
+ example, to build and use tables that are appropriate for the French
+ locale (where accented characters with values greater than 128 are
treated as letters), the following code could be used:
setlocale(LC_CTYPE, "fr_FR");
@@ -1772,15 +1774,15 @@ LOCALE SUPPORT
pcre2_set_character_tables(ccontext, tables);
re = pcre2_compile(..., ccontext);
- The locale name "fr_FR" is used on Linux and other Unix-like systems;
- if you are using Windows, the name for the French locale is "french".
- It is the caller's responsibility to ensure that the memory containing
+ The locale name "fr_FR" is used on Linux and other Unix-like systems;
+ if you are using Windows, the name for the French locale is "french".
+ It is the caller's responsibility to ensure that the memory containing
the tables remains available for as long as it is needed.
The pointer that is passed (via the compile context) to pcre2_compile()
- is saved with the compiled pattern, and the same tables are used by
- pcre2_match() and pcre_dfa_match(). Thus, for any single pattern, com-
- pilation and matching both happen in the same locale, but different
+ is saved with the compiled pattern, and the same tables are used by
+ pcre2_match() and pcre2_dfa_match(). Thus, for any single pattern, com-
+ pilation and matching both happen in the same locale, but different
patterns can be processed in different locales.
@@ -1788,13 +1790,13 @@ INFORMATION ABOUT A COMPILED PATTERN
int pcre2_pattern_info(const pcre2 *code, uint32_t what, void *where);
- The pcre2_pattern_info() function returns general information about a
+ The pcre2_pattern_info() function returns general information about a
compiled pattern. For information about callouts, see the next section.
- The first argument for pcre2_pattern_info() is a pointer to the com-
+ The first argument for pcre2_pattern_info() is a pointer to the com-
piled pattern. The second argument specifies which piece of information
- is required, and the third argument is a pointer to a variable to
- receive the data. If the third argument is NULL, the first argument is
- ignored, and the function returns the size in bytes of the variable
+ is required, and the third argument is a pointer to a variable to
+ receive the data. If the third argument is NULL, the first argument is
+ ignored, and the function returns the size in bytes of the variable
that is required for the information requested. Otherwise, the yield of
the function is zero for success, or one of the following negative num-
bers:
@@ -1804,9 +1806,9 @@ INFORMATION ABOUT A COMPILED PATTERN
PCRE2_ERROR_BADOPTION the value of what was invalid
PCRE2_ERROR_UNSET the requested field is not set
- The "magic number" is placed at the start of each compiled pattern as
- an simple check against passing an arbitrary memory pointer. Here is a
- typical call of pcre2_pattern_info(), to obtain the length of the com-
+ The "magic number" is placed at the start of each compiled pattern as
+ a simple check against passing an arbitrary memory pointer. Here is a
+ typical call of pcre2_pattern_info(), to obtain the length of the com-
piled pattern:
int rc;
@@ -1823,19 +1825,19 @@ INFORMATION ABOUT A COMPILED PATTERN
PCRE2_INFO_ARGOPTIONS
Return a copy of the pattern's options. The third argument should point
- to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the
- options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP-
- TIONS returns the compile options as modified by any top-level (*XXX)
+ to a uint32_t variable. PCRE2_INFO_ARGOPTIONS returns exactly the
+ options that were passed to pcre2_compile(), whereas PCRE2_INFO_ALLOP-
+ TIONS returns the compile options as modified by any top-level (*XXX)
option settings such as (*UTF) at the start of the pattern itself.
- For example, if the pattern /(*UTF)abc/ is compiled with the
- PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is
- PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can
- change within a pattern do not affect the result of PCRE2_INFO_ALLOP-
+ For example, if the pattern /(*UTF)abc/ is compiled with the
+ PCRE2_EXTENDED option, the result for PCRE2_INFO_ALLOPTIONS is
+ PCRE2_EXTENDED and PCRE2_UTF. Option settings such as (?i) that can
+ change within a pattern do not affect the result of PCRE2_INFO_ALLOP-
TIONS, even if they appear right at the start of the pattern. (This was
different in some earlier releases.)
- A pattern compiled without PCRE2_ANCHORED is automatically anchored by
+ A pattern compiled without PCRE2_ANCHORED is automatically anchored by
PCRE2 if the first significant item in every top-level branch is one of
the following:
@@ -1844,7 +1846,7 @@ INFORMATION ABOUT A COMPILED PATTERN
\G always
.* sometimes - see below
- When .* is the first significant item, anchoring is possible only when
+ When .* is the first significant item, anchoring is possible only when
all the following are true:
.* is not in an atomic group
@@ -1854,178 +1856,178 @@ INFORMATION ABOUT A COMPILED PATTERN
Neither (*PRUNE) nor (*SKIP) appears in the pattern
PCRE2_NO_DOTSTAR_ANCHOR is not set
- For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in
+ For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in
the options returned for PCRE2_INFO_ALLOPTIONS.
PCRE2_INFO_BACKREFMAX
- Return the number of the highest back reference in the pattern. The
- third argument should point to an uint32_t variable. Named subpatterns
- acquire numbers as well as names, and these count towards the highest
- back reference. Back references such as \4 or \g{12} match the cap-
- tured characters of the given group, but in addition, the check that a
+ Return the number of the highest back reference in the pattern. The
+ third argument should point to an uint32_t variable. Named subpatterns
+ acquire numbers as well as names, and these count towards the highest
+ back reference. Back references such as \4 or \g{12} match the cap-
+ tured characters of the given group, but in addition, the check that a
capturing group is set in a conditional subpattern such as (?(3)a|b) is
- also a back reference. Zero is returned if there are no back refer-
+ also a back reference. Zero is returned if there are no back refer-
ences.
PCRE2_INFO_BSR
The output is a uint32_t whose value indicates what character sequences
the \R escape sequence matches. A value of PCRE2_BSR_UNICODE means that
- \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANY-
+ \R matches any Unicode line ending sequence; a value of PCRE2_BSR_ANY-
CRLF means that \R matches only CR, LF, or CRLF.
PCRE2_INFO_CAPTURECOUNT
- Return the highest capturing subpattern number in the pattern. In pat-
+ Return the highest capturing subpattern number in the pattern. In pat-
terns where (?| is not used, this is also the total number of capturing
subpatterns. The third argument should point to an uint32_t variable.
PCRE2_INFO_DEPTHLIMIT
- If the pattern set a backtracking depth limit by including an item of
- the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The
- third argument should point to an unsigned 32-bit integer. If no such
- value has been set, the call to pcre2_pattern_info() returns the error
+ If the pattern set a backtracking depth limit by including an item of
+ the form (*LIMIT_DEPTH=nnnn) at the start, the value is returned. The
+ third argument should point to an unsigned 32-bit integer. If no such
+ value has been set, the call to pcre2_pattern_info() returns the error
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
- ing if it is less than the limit set or defaulted by the caller of the
+ ing if it is less than the limit set or defaulted by the caller of the
match function.
PCRE2_INFO_FIRSTBITMAP
- In the absence of a single first code unit for a non-anchored pattern,
- pcre2_compile() may construct a 256-bit table that defines a fixed set
- of values for the first code unit in any match. For example, a pattern
- that starts with [abc] results in a table with three bits set. When
- code unit values greater than 255 are supported, the flag bit for 255
- means "any code unit of value 255 or above". If such a table was con-
- structed, a pointer to it is returned. Otherwise NULL is returned. The
+ In the absence of a single first code unit for a non-anchored pattern,
+ pcre2_compile() may construct a 256-bit table that defines a fixed set
+ of values for the first code unit in any match. For example, a pattern
+ that starts with [abc] results in a table with three bits set. When
+ code unit values greater than 255 are supported, the flag bit for 255
+ means "any code unit of value 255 or above". If such a table was con-
+ structed, a pointer to it is returned. Otherwise NULL is returned. The
third argument should point to an const uint8_t * variable.
PCRE2_INFO_FIRSTCODETYPE
Return information about the first code unit of any matched string, for
- a non-anchored pattern. The third argument should point to an uint32_t
- variable. If there is a fixed first value, for example, the letter "c"
- from a pattern such as (cat|cow|coyote), 1 is returned, and the value
- can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed
- first value, but it is known that a match can occur only at the start
- of the subject or following a newline in the subject, 2 is returned.
+ a non-anchored pattern. The third argument should point to an uint32_t
+ variable. If there is a fixed first value, for example, the letter "c"
+ from a pattern such as (cat|cow|coyote), 1 is returned, and the value
+ can be retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed
+ first value, but it is known that a match can occur only at the start
+ of the subject or following a newline in the subject, 2 is returned.
Otherwise, and for anchored patterns, 0 is returned.
PCRE2_INFO_FIRSTCODEUNIT
- Return the value of the first code unit of any matched string for a
- pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
- The third argument should point to an uint32_t variable. In the 8-bit
- library, the value is always less than 256. In the 16-bit library the
- value can be up to 0xffff. In the 32-bit library in UTF-32 mode the
+ Return the value of the first code unit of any matched string for a
+ pattern where PCRE2_INFO_FIRSTCODETYPE returns 1; otherwise return 0.
+ The third argument should point to an uint32_t variable. In the 8-bit
+ library, the value is always less than 256. In the 16-bit library the
+ value can be up to 0xffff. In the 32-bit library in UTF-32 mode the
value can be up to 0x10ffff, and up to 0xffffffff when not using UTF-32
mode.
PCRE2_INFO_FRAMESIZE
Return the size (in bytes) of the data frames that are used to remember
- backtracking positions when the pattern is processed by pcre2_match()
- without the use of JIT. The third argument should point to an size_t
+ backtracking positions when the pattern is processed by pcre2_match()
+ without the use of JIT. The third argument should point to a size_t
variable. The frame size depends on the number of capturing parentheses
- in the pattern. Each additional capturing group adds two PCRE2_SIZE
+ in the pattern. Each additional capturing group adds two PCRE2_SIZE
variables.
PCRE2_INFO_HASBACKSLASHC
- Return 1 if the pattern contains any instances of \C, otherwise 0. The
+ Return 1 if the pattern contains any instances of \C, otherwise 0. The
third argument should point to an uint32_t variable.
PCRE2_INFO_HASCRORLF
- Return 1 if the pattern contains any explicit matches for CR or LF
+ Return 1 if the pattern contains any explicit matches for CR or LF
characters, otherwise 0. The third argument should point to an uint32_t
- variable. An explicit match is either a literal CR or LF character, or
- \r or \n or one of the equivalent hexadecimal or octal escape
+ variable. An explicit match is either a literal CR or LF character, or
+ \r or \n or one of the equivalent hexadecimal or octal escape
sequences.
PCRE2_INFO_HEAPLIMIT
If the pattern set a heap memory limit by including an item of the form
(*LIMIT_HEAP=nnnn) at the start, the value is returned. The third argu-
- ment should point to an unsigned 32-bit integer. If no such value has
- been set, the call to pcre2_pattern_info() returns the error
+ ment should point to an unsigned 32-bit integer. If no such value has
+ been set, the call to pcre2_pattern_info() returns the error
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
- ing if it is less than the limit set or defaulted by the caller of the
+ ing if it is less than the limit set or defaulted by the caller of the
match function.
PCRE2_INFO_JCHANGED
- Return 1 if the (?J) or (?-J) option setting is used in the pattern,
- otherwise 0. The third argument should point to an uint32_t variable.
- (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec-
+ Return 1 if the (?J) or (?-J) option setting is used in the pattern,
+ otherwise 0. The third argument should point to a uint32_t variable.
+ (?J) and (?-J) set and unset the local PCRE2_DUPNAMES option, respec-
tively.
PCRE2_INFO_JITSIZE
- If the compiled pattern was successfully processed by pcre2_jit_com-
- pile(), return the size of the JIT compiled code, otherwise return
+ If the compiled pattern was successfully processed by pcre2_jit_com-
+ pile(), return the size of the JIT compiled code, otherwise return
zero. The third argument should point to a size_t variable.
PCRE2_INFO_LASTCODETYPE
- Returns 1 if there is a rightmost literal code unit that must exist in
- any matched string, other than at its start. The third argument should
- point to an uint32_t variable. If there is no such value, 0 is
- returned. When 1 is returned, the code unit value itself can be
- retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last
- literal value is recorded only if it follows something of variable
- length. For example, for the pattern /^a\d+z\d+/ the returned value is
- 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/
+ Return 1 if there is a rightmost literal code unit that must exist in
+ any matched string, other than at its start. The third argument should
+ point to a uint32_t variable. If there is no such value, 0 is
+ returned. When 1 is returned, the code unit value itself can be
+ retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last
+ literal value is recorded only if it follows something of variable
+ length. For example, for the pattern /^a\d+z\d+/ the returned value is
+ 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/
the returned value is 0.
PCRE2_INFO_LASTCODEUNIT
- Return the value of the rightmost literal code unit that must exist in
- any matched string, other than at its start, for a pattern where
+ Return the value of the rightmost literal code unit that must exist in
+ any matched string, other than at its start, for a pattern where
PCRE2_INFO_LASTCODETYPE returns 1. Otherwise, return 0. The third argu-
ment should point to an uint32_t variable.
PCRE2_INFO_MATCHEMPTY
- Return 1 if the pattern might match an empty string, otherwise 0. The
- third argument should point to an uint32_t variable. When a pattern
+ Return 1 if the pattern might match an empty string, otherwise 0. The
+ third argument should point to a uint32_t variable. When a pattern
contains recursive subroutine calls it is not always possible to deter-
- mine whether or not it can match an empty string. PCRE2 takes a cau-
+ mine whether or not it can match an empty string. PCRE2 takes a cau-
tious approach and returns 1 in such cases.
PCRE2_INFO_MATCHLIMIT
- If the pattern set a match limit by including an item of the form
- (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third
- argument should point to an unsigned 32-bit integer. If no such value
- has been set, the call to pcre2_pattern_info() returns the error
+ If the pattern set a match limit by including an item of the form
+ (*LIMIT_MATCH=nnnn) at the start, the value is returned. The third
+ argument should point to an unsigned 32-bit integer. If no such value
+ has been set, the call to pcre2_pattern_info() returns the error
PCRE2_ERROR_UNSET. Note that this limit will only be used during match-
- ing if it is less than the limit set or defaulted by the caller of the
+ ing if it is less than the limit set or defaulted by the caller of the
match function.
PCRE2_INFO_MAXLOOKBEHIND
Return the number of characters (not code units) in the longest lookbe-
- hind assertion in the pattern. The third argument should point to an
- unsigned 32-bit integer. This information is useful when doing multi-
- segment matching using the partial matching facilities. Note that the
+ hind assertion in the pattern. The third argument should point to an
+ unsigned 32-bit integer. This information is useful when doing multi-
+ segment matching using the partial matching facilities. Note that the
simple assertions \b and \B require a one-character lookbehind. \A also
- registers a one-character lookbehind, though it does not actually
- inspect the previous character. This is to ensure that at least one
- character from the old segment is retained when a new segment is pro-
+ registers a one-character lookbehind, though it does not actually
+ inspect the previous character. This is to ensure that at least one
+ character from the old segment is retained when a new segment is pro-
cessed. Otherwise, if there are no lookbehinds in the pattern, \A might
match incorrectly at the start of a second or subsequent segment.
PCRE2_INFO_MINLENGTH
- If a minimum length for matching subject strings was computed, its
- value is returned. Otherwise the returned value is 0. The value is a
- number of characters, which in UTF mode may be different from the num-
- ber of code units. The third argument should point to an uint32_t
- variable. The value is a lower bound to the length of any matching
- string. There may not be any strings of that length that do actually
+ If a minimum length for matching subject strings was computed, its
+ value is returned. Otherwise the returned value is 0. The value is a
+ number of characters, which in UTF mode may be different from the num-
+ ber of code units. The third argument should point to a uint32_t
+ variable. The value is a lower bound to the length of any matching
+ string. There may not be any strings of that length that do actually
match, but every string that does match is at least that long.
PCRE2_INFO_NAMECOUNT
@@ -2033,50 +2035,50 @@ INFORMATION ABOUT A COMPILED PATTERN
PCRE2_INFO_NAMETABLE
PCRE2 supports the use of named as well as numbered capturing parenthe-
- ses. The names are just an additional way of identifying the parenthe-
+ ses. The names are just an additional way of identifying the parenthe-
ses, which still acquire numbers. Several convenience functions such as
- pcre2_substring_get_byname() are provided for extracting captured sub-
- strings by name. It is also possible to extract the data directly, by
- first converting the name to a number in order to access the correct
- pointers in the output vector (described with pcre2_match() below). To
- do the conversion, you need to use the name-to-number map, which is
+ pcre2_substring_get_byname() are provided for extracting captured sub-
+ strings by name. It is also possible to extract the data directly, by
+ first converting the name to a number in order to access the correct
+ pointers in the output vector (described with pcre2_match() below). To
+ do the conversion, you need to use the name-to-number map, which is
described by these three values.
- The map consists of a number of fixed-size entries. PCRE2_INFO_NAME-
- COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
- the size of each entry in code units; both of these return a uint32_t
+ The map consists of a number of fixed-size entries. PCRE2_INFO_NAME-
+ COUNT gives the number of entries, and PCRE2_INFO_NAMEENTRYSIZE gives
+ the size of each entry in code units; both of these return a uint32_t
value. The entry size depends on the length of the longest name.
PCRE2_INFO_NAMETABLE returns a pointer to the first entry of the table.
- This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit
- library, the first two bytes of each entry are the number of the cap-
+ This is a PCRE2_SPTR pointer to a block of code units. In the 8-bit
+ library, the first two bytes of each entry are the number of the cap-
turing parenthesis, most significant byte first. In the 16-bit library,
- the pointer points to 16-bit code units, the first of which contains
- the parenthesis number. In the 32-bit library, the pointer points to
- 32-bit code units, the first of which contains the parenthesis number.
+ the pointer points to 16-bit code units, the first of which contains
+ the parenthesis number. In the 32-bit library, the pointer points to
+ 32-bit code units, the first of which contains the parenthesis number.
The rest of the entry is the corresponding name, zero terminated.
- The names are in alphabetical order. If (?| is used to create multiple
- groups with the same number, as described in the section on duplicate
- subpattern numbers in the pcre2pattern page, the groups may be given
- the same name, but there is only one entry in the table. Different
+ The names are in alphabetical order. If (?| is used to create multiple
+ groups with the same number, as described in the section on duplicate
+ subpattern numbers in the pcre2pattern page, the groups may be given
+ the same name, but there is only one entry in the table. Different
names for groups of the same number are not permitted.
- Duplicate names for subpatterns with different numbers are permitted,
- but only if PCRE2_DUPNAMES is set. They appear in the table in the
- order in which they were found in the pattern. In the absence of (?|
- this is the order of increasing number; when (?| is used this is not
+ Duplicate names for subpatterns with different numbers are permitted,
+ but only if PCRE2_DUPNAMES is set. They appear in the table in the
+ order in which they were found in the pattern. In the absence of (?|
+ this is the order of increasing number; when (?| is used this is not
necessarily the case because later subpatterns may have lower numbers.
- As a simple example of the name/number table, consider the following
- pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED
+ As a simple example of the name/number table, consider the following
+ pattern after compilation by the 8-bit library (assume PCRE2_EXTENDED
is set, so white space - including newlines - is ignored):
(?