File tidies, version updates, etc. for 10.21-RC1
This commit is contained in:
parent
293da188aa
commit
dffd559601
|
@ -258,7 +258,7 @@ ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
||||||
|
|
||||||
IF(PCRE2_NEVER_BACKSLASH_C)
|
IF(PCRE2_NEVER_BACKSLASH_C)
|
||||||
SET(NEVER_BACKSLASH_C 1)
|
SET(NEVER_BACKSLASH_C 1)
|
||||||
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||||
|
|
||||||
IF(PCRE2_SUPPORT_UNICODE)
|
IF(PCRE2_SUPPORT_UNICODE)
|
||||||
SET(SUPPORT_UNICODE 1)
|
SET(SUPPORT_UNICODE 1)
|
||||||
|
@ -400,7 +400,7 @@ SET(PCRE2_SOURCES
|
||||||
src/pcre2_context.c
|
src/pcre2_context.c
|
||||||
src/pcre2_dfa_match.c
|
src/pcre2_dfa_match.c
|
||||||
src/pcre2_error.c
|
src/pcre2_error.c
|
||||||
src/pcre2_find_bracket.c
|
src/pcre2_find_bracket.c
|
||||||
src/pcre2_jit_compile.c
|
src/pcre2_jit_compile.c
|
||||||
src/pcre2_maketables.c
|
src/pcre2_maketables.c
|
||||||
src/pcre2_match.c
|
src/pcre2_match.c
|
||||||
|
|
86
ChangeLog
86
ChangeLog
|
@ -268,18 +268,18 @@ size of patterns that they are prepared to handle.
|
||||||
|
|
||||||
78. (*NO_AUTO_POSSESS) was not working.
|
78. (*NO_AUTO_POSSESS) was not working.
|
||||||
|
|
||||||
79. Adding group information caching improves the speed of compiling when
|
79. Adding group information caching improves the speed of compiling when
|
||||||
checking whether a group has a fixed length and/or could match an empty string,
|
checking whether a group has a fixed length and/or could match an empty string,
|
||||||
especially when recursion or subroutine calls are involved. However, this
|
especially when recursion or subroutine calls are involved. However, this
|
||||||
cannot be used when (?| is present in the pattern because the same number may
|
cannot be used when (?| is present in the pattern because the same number may
|
||||||
be used for groups of different sizes. To catch runaway patterns in this
|
be used for groups of different sizes. To catch runaway patterns in this
|
||||||
situation, counts have been introduced to the functions that scan for empty
|
situation, counts have been introduced to the functions that scan for empty
|
||||||
branches or compute fixed lengths.
|
branches or compute fixed lengths.
|
||||||
|
|
||||||
80. Allow for the possibility of the size of the nest_save structure not being
|
80. Allow for the possibility of the size of the nest_save structure not being
|
||||||
a factor of the size of the compiling workspace (it currently is).
|
a factor of the size of the compiling workspace (it currently is).
|
||||||
|
|
||||||
81. Check for integer overflow in minimum length calculation and cap it at
|
81. Check for integer overflow in minimum length calculation and cap it at
|
||||||
65535.
|
65535.
|
||||||
|
|
||||||
82. Small optimizations in code for finding the minimum matching length.
|
82. Small optimizations in code for finding the minimum matching length.
|
||||||
|
@ -290,72 +290,72 @@ a factor of the size of the compiling workspace (it currently is).
|
||||||
|
|
||||||
85. Check for too many replacements (more than INT_MAX) in pcre2_substitute().
|
85. Check for too many replacements (more than INT_MAX) in pcre2_substitute().
|
||||||
|
|
||||||
86. Avoid the possibility of computing with an out-of-bounds pointer (though
|
86. Avoid the possibility of computing with an out-of-bounds pointer (though
|
||||||
not dereferencing it) while handling lookbehind assertions.
|
not dereferencing it) while handling lookbehind assertions.
|
||||||
|
|
||||||
87. Failure to get memory for the match data in regcomp() is now given as a
|
87. Failure to get memory for the match data in regcomp() is now given as a
|
||||||
regcomp() error instead of waiting for regexec() to pick it up.
|
regcomp() error instead of waiting for regexec() to pick it up.
|
||||||
|
|
||||||
88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid
|
88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid
|
||||||
newline sequence.
|
newline sequence.
|
||||||
|
|
||||||
89. Paranoid check in regcomp() for bad error code from pcre2_compile().
|
89. Paranoid check in regcomp() for bad error code from pcre2_compile().
|
||||||
|
|
||||||
90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well
|
90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well
|
||||||
as for link size 2.
|
as for link size 2.
|
||||||
|
|
||||||
91. Document that JIT has a limit on pattern size, and give more information
|
91. Document that JIT has a limit on pattern size, and give more information
|
||||||
about JIT compile failures in pcre2test.
|
about JIT compile failures in pcre2test.
|
||||||
|
|
||||||
92. Implement PCRE2_INFO_HASBACKSLASHC.
|
92. Implement PCRE2_INFO_HASBACKSLASHC.
|
||||||
|
|
||||||
93. Re-arrange valgrind support code in pcre2test to avoid spurious reports
|
93. Re-arrange valgrind support code in pcre2test to avoid spurious reports
|
||||||
with JIT (possibly caused by SSE2?).
|
with JIT (possibly caused by SSE2?).
|
||||||
|
|
||||||
94. Support offset_limit in JIT.
|
94. Support offset_limit in JIT.
|
||||||
|
|
||||||
95. A sequence such as [[:punct:]b], that is, a POSIX character class followed
|
95. A sequence such as [[:punct:]b], that is, a POSIX character class followed
|
||||||
by a single ASCII character in a class item, was incorrectly compiled in UCP
|
by a single ASCII character in a class item, was incorrectly compiled in UCP
|
||||||
mode. The POSIX class got lost, but only if the single character followed it.
|
mode. The POSIX class got lost, but only if the single character followed it.
|
||||||
|
|
||||||
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
||||||
that should not have been matched.
|
that should not have been matched.
|
||||||
|
|
||||||
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
|
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
|
||||||
characters with code points greater than 255 are in the class. When a Unicode
|
characters with code points greater than 255 are in the class. When a Unicode
|
||||||
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
|
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
|
||||||
turned into Unicode properties), wide characters were not correctly handled,
|
turned into Unicode properties), wide characters were not correctly handled,
|
||||||
and could fail to match.
|
and could fail to match.
|
||||||
|
|
||||||
98. In pcre2test, make the "startoffset" modifier a synonym of "offset",
|
98. In pcre2test, make the "startoffset" modifier a synonym of "offset",
|
||||||
because it sets the "startoffset" parameter for pcre2_match().
|
because it sets the "startoffset" parameter for pcre2_match().
|
||||||
|
|
||||||
99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between
|
99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between
|
||||||
an item and its qualifier (for example, A(?#comment)?B) pcre2_compile()
|
an item and its qualifier (for example, A(?#comment)?B) pcre2_compile()
|
||||||
misbehaved. This bug was found by the LLVM fuzzer.
|
misbehaved. This bug was found by the LLVM fuzzer.
|
||||||
|
|
||||||
100. The error for an invalid UTF pattern string always gave the code unit
|
100. The error for an invalid UTF pattern string always gave the code unit
|
||||||
offset as zero instead of where the invalidity was found.
|
offset as zero instead of where the invalidity was found.
|
||||||
|
|
||||||
101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not
|
101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not
|
||||||
working correctly in UCP mode.
|
working correctly in UCP mode.
|
||||||
|
|
||||||
102. Similar to 99 above, if an isolated \E was present between an item and its
|
102. Similar to 99 above, if an isolated \E was present between an item and its
|
||||||
qualifier when PCRE2_AUTO_CALLOUT was set, pcre2_compile() misbehaved. This bug
|
qualifier when PCRE2_AUTO_CALLOUT was set, pcre2_compile() misbehaved. This bug
|
||||||
was found by the LLVM fuzzer.
|
was found by the LLVM fuzzer.
|
||||||
|
|
||||||
103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND
|
103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND
|
||||||
was set when the pmatch argument was NULL. It now returns REG_INVARG.
|
was set when the pmatch argument was NULL. It now returns REG_INVARG.
|
||||||
|
|
||||||
104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep.
|
104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep.
|
||||||
|
|
||||||
105. An empty \Q\E sequence between an item and its qualifier caused
|
105. An empty \Q\E sequence between an item and its qualifier caused
|
||||||
pcre2_compile() to misbehave when auto callouts were enabled. This bug
|
pcre2_compile() to misbehave when auto callouts were enabled. This bug
|
||||||
was found by the LLVM fuzzer.
|
was found by the LLVM fuzzer.
|
||||||
|
|
||||||
106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or
|
106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or
|
||||||
other verb "name" ended with whitespace immediately before the closing
|
other verb "name" ended with whitespace immediately before the closing
|
||||||
parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when
|
parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when
|
||||||
both those options were set.
|
both those options were set.
|
||||||
|
|
||||||
107. In a number of places pcre2_compile() was not handling NULL characters
|
107. In a number of places pcre2_compile() was not handling NULL characters
|
||||||
|
@ -363,27 +363,27 @@ correctly, and pcre2test with the "bincode" modifier was not always correctly
|
||||||
displaying fields containing NULLS:
|
displaying fields containing NULLS:
|
||||||
|
|
||||||
(a) Within /x extended #-comments
|
(a) Within /x extended #-comments
|
||||||
(b) Within the "name" part of (*MARK) and other *verbs
|
(b) Within the "name" part of (*MARK) and other *verbs
|
||||||
(c) Within the text argument of a callout
|
(c) Within the text argument of a callout
|
||||||
|
|
||||||
108. If a pattern that was compiled with PCRE2_EXTENDED started with white
|
108. If a pattern that was compiled with PCRE2_EXTENDED started with white
|
||||||
space or a #-type comment that was followed by (?-x), which turns off
|
space or a #-type comment that was followed by (?-x), which turns off
|
||||||
PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again,
|
PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again,
|
||||||
pcre2_compile() assumed that (?-x) applied to the whole pattern and
|
pcre2_compile() assumed that (?-x) applied to the whole pattern and
|
||||||
consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix
|
consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix
|
||||||
for this bug means that a setting of any of the (?imsxU) options at the start
|
for this bug means that a setting of any of the (?imsxU) options at the start
|
||||||
of a pattern is no longer transferred to the options that are returned by
|
of a pattern is no longer transferred to the options that are returned by
|
||||||
PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have
|
PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have
|
||||||
changed when the effects of those options were all moved to compile time.
|
changed when the effects of those options were all moved to compile time.
|
||||||
|
|
||||||
109. An escaped closing parenthesis in the "name" part of a (*verb) when
|
109. An escaped closing parenthesis in the "name" part of a (*verb) when
|
||||||
PCRE2_ALT_VERBNAMES was set caused pcre2_compile() to malfunction. This bug
|
PCRE2_ALT_VERBNAMES was set caused pcre2_compile() to malfunction. This bug
|
||||||
was found by the LLVM fuzzer.
|
was found by the LLVM fuzzer.
|
||||||
|
|
||||||
110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it
|
110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it
|
||||||
possible to test it.
|
possible to test it.
|
||||||
|
|
||||||
111. "Harden" pcre2test against ridiculously large values in modifiers and
|
111. "Harden" pcre2test against ridiculously large values in modifiers and
|
||||||
command line arguments.
|
command line arguments.
|
||||||
|
|
||||||
112. Implemented PCRE2_SUBSTITUTE_UNKNOWN_UNSET and PCRE2_SUBSTITUTE_OVERFLOW_
|
112. Implemented PCRE2_SUBSTITUTE_UNKNOWN_UNSET and PCRE2_SUBSTITUTE_OVERFLOW_
|
||||||
|
|
41
NEWS
41
NEWS
|
@ -1,6 +1,47 @@
|
||||||
News about PCRE2 releases
|
News about PCRE2 releases
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
Version 10.21 15-December-2015
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
1. Many bugs have been fixed. A large number of them were provoked only by very
|
||||||
|
strange pattern input, and were discovered by fuzzers. Some others were
|
||||||
|
discovered by code auditing. See ChangeLog for details.
|
||||||
|
|
||||||
|
2. The Unicode tables have been updated to Unicode version 8.0.0.
|
||||||
|
|
||||||
|
3. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
|
||||||
|
class, where both values are literal letters in the same case, omit the
|
||||||
|
non-letter EBCDIC code points within the range.
|
||||||
|
|
||||||
|
4. There have been a number of enhancements to the pcre2_substitute() function,
|
||||||
|
giving more flexibility to replacement facilities. It is now also possible to
|
||||||
|
cause the function to return the needed buffer size if the one given is too
|
||||||
|
small.
|
||||||
|
|
||||||
|
5. The PCRE2_ALT_VERBNAMES option causes the "name" parts of special verbs such
|
||||||
|
as (*THEN:name) to be processed for backslashes and to take note of
|
||||||
|
PCRE2_EXTENDED.
|
||||||
|
|
||||||
|
6. PCRE2_INFO_HASBACKSLASHC makes it possible for a client to find out if a
|
||||||
|
pattern uses \C, and --enable-never-backslash-C makes it possible to compile a version
|
||||||
|
of PCRE2 in which the use of \C is always forbidden.
|
||||||
|
|
||||||
|
7. A limit to the length of pattern that can be handled can now be set by
|
||||||
|
calling pcre2_set_max_pattern_length().
|
||||||
|
|
||||||
|
8. When matching an unanchored pattern, a match can be required to begin within
|
||||||
|
a given number of code units after the start of the subject by calling
|
||||||
|
pcre2_set_offset_limit().
|
||||||
|
|
||||||
|
9. The pcre2test program has been extended to test new facilities, and it can
|
||||||
|
now run the tests when LF on its own is not a valid newline sequence.
|
||||||
|
|
||||||
|
10. The RunTest script has also been updated to enable more tests to be run.
|
||||||
|
|
||||||
|
11. There have been some minor performance enhancements.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ can skip ahead to the CMake section.
|
||||||
pcre2_context.c
|
pcre2_context.c
|
||||||
pcre2_dfa_match.c
|
pcre2_dfa_match.c
|
||||||
pcre2_error.c
|
pcre2_error.c
|
||||||
pcre2_find_bracket.c
|
pcre2_find_bracket.c
|
||||||
pcre2_jit_compile.c
|
pcre2_jit_compile.c
|
||||||
pcre2_maketables.c
|
pcre2_maketables.c
|
||||||
pcre2_match.c
|
pcre2_match.c
|
||||||
|
|
14
README
14
README
|
@ -219,13 +219,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
to be the end of a line (see above). However, the caller of PCRE2 can
|
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||||
|
|
||||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||||
command. When \C is allowed by the library, individual applications can lock
|
command. When \C is allowed by the library, individual applications can lock
|
||||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||||
|
|
||||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||||
pattern. This limits the amount of system stack that a pattern uses when it
|
pattern. This limits the amount of system stack that a pattern uses when it
|
||||||
|
@ -731,7 +731,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_context.c )
|
src/pcre2_context.c )
|
||||||
src/pcre2_dfa_match.c )
|
src/pcre2_dfa_match.c )
|
||||||
src/pcre2_error.c )
|
src/pcre2_error.c )
|
||||||
src/pcre2_find_bracket.c )
|
src/pcre2_find_bracket.c )
|
||||||
src/pcre2_jit_compile.c )
|
src/pcre2_jit_compile.c )
|
||||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
|
|
@ -82,7 +82,7 @@ utf8=$?
|
||||||
nl=`$pcre2test -C newline`
|
nl=`$pcre2test -C newline`
|
||||||
if [ "$nl" != "LF" -a "$nl" != "ANY" -a "$nl" != "ANYCRLF" ]; then
|
if [ "$nl" != "LF" -a "$nl" != "ANY" -a "$nl" != "ANYCRLF" ]; then
|
||||||
pcre2grep="$pcre2grep -N LF"
|
pcre2grep="$pcre2grep -N LF"
|
||||||
echo "Default newline setting forced to LF"
|
echo "Default newline setting forced to LF"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ------ Function to run and check a special pcre2grep arguments test -------
|
# ------ Function to run and check a special pcre2grep arguments test -------
|
||||||
|
|
20
RunTest
20
RunTest
|
@ -406,7 +406,7 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
||||||
jitopt=-jit
|
jitopt=-jit
|
||||||
if [ "$valgrind" != "" ] ; then
|
if [ "$valgrind" != "" ] ; then
|
||||||
vjs="--suppressions=$testdata/valgrind-jit.supp"
|
vjs="--suppressions=$testdata/valgrind-jit.supp"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# If no specific tests were requested, select all. Those that are not
|
# If no specific tests were requested, select all. Those that are not
|
||||||
|
@ -439,10 +439,10 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
||||||
do17=yes
|
do17=yes
|
||||||
do18=yes
|
do18=yes
|
||||||
do19=yes
|
do19=yes
|
||||||
do20=yes
|
do20=yes
|
||||||
do21=yes
|
do21=yes
|
||||||
do22=yes
|
do22=yes
|
||||||
do23=yes
|
do23=yes
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||||
|
@ -720,7 +720,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
checkresult $? 13 ""
|
checkresult $? 13 ""
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Tests for DFA UTF and UCP features. Output is different for the different widths.
|
# Tests for DFA UTF and UCP features. Output is different for the different widths.
|
||||||
|
|
||||||
if [ $do14 = yes ] ; then
|
if [ $do14 = yes ] ; then
|
||||||
|
@ -730,7 +730,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
else
|
else
|
||||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
|
$sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
|
||||||
checkresult $? 14-$bits ""
|
checkresult $? 14-$bits ""
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Test non-JIT match and recursion limits
|
# Test non-JIT match and recursion limits
|
||||||
|
@ -798,7 +798,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
|
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
|
||||||
checkresult $? 20 ""
|
checkresult $? 20 ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# \C tests without UTF - DFA matching is supported
|
# \C tests without UTF - DFA matching is supported
|
||||||
|
|
||||||
if [ "$do21" = yes ] ; then
|
if [ "$do21" = yes ] ; then
|
||||||
|
@ -814,7 +814,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
||||||
|
|
||||||
if [ "$do22" = yes ] ; then
|
if [ "$do22" = yes ] ; then
|
||||||
echo $title22
|
echo $title22
|
||||||
if [ $supportBSC -eq 0 ] ; then
|
if [ $supportBSC -eq 0 ] ; then
|
||||||
|
@ -830,7 +830,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Test when \C is disabled
|
# Test when \C is disabled
|
||||||
|
|
||||||
if [ "$do23" = yes ] ; then
|
if [ "$do23" = yes ] ; then
|
||||||
echo $title23
|
echo $title23
|
||||||
if [ $supportBSC -ne 0 ] ; then
|
if [ $supportBSC -ne 0 ] ; then
|
||||||
|
|
20
configure.ac
20
configure.ac
|
@ -11,16 +11,16 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||||
m4_define(pcre2_major, [10])
|
m4_define(pcre2_major, [10])
|
||||||
m4_define(pcre2_minor, [21])
|
m4_define(pcre2_minor, [21])
|
||||||
m4_define(pcre2_prerelease, [-RC1])
|
m4_define(pcre2_prerelease, [-RC1])
|
||||||
m4_define(pcre2_date, [2015-07-06])
|
m4_define(pcre2_date, [2015-12-15])
|
||||||
|
|
||||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||||
# 50 lines of this file. Please update that if the variables above are moved.
|
# 50 lines of this file. Please update that if the variables above are moved.
|
||||||
|
|
||||||
# Libtool shared library interface versions (current:revision:age)
|
# Libtool shared library interface versions (current:revision:age)
|
||||||
m4_define(libpcre2_8_version, [2:0:2])
|
m4_define(libpcre2_8_version, [3:0:3])
|
||||||
m4_define(libpcre2_16_version, [2:0:2])
|
m4_define(libpcre2_16_version, [3:0:3])
|
||||||
m4_define(libpcre2_32_version, [2:0:2])
|
m4_define(libpcre2_32_version, [3:0:3])
|
||||||
m4_define(libpcre2_posix_version, [0:0:0])
|
m4_define(libpcre2_posix_version, [0:1:0])
|
||||||
|
|
||||||
AC_PREREQ(2.57)
|
AC_PREREQ(2.57)
|
||||||
AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
|
AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
|
||||||
|
@ -189,12 +189,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
|
||||||
AS_HELP_STRING([--enable-bsr-anycrlf],
|
AS_HELP_STRING([--enable-bsr-anycrlf],
|
||||||
[\R matches only CR, LF, CRLF by default]),
|
[\R matches only CR, LF, CRLF by default]),
|
||||||
, enable_bsr_anycrlf=no)
|
, enable_bsr_anycrlf=no)
|
||||||
|
|
||||||
# Handle --enable-never-backslash-C
|
# Handle --enable-never-backslash-C
|
||||||
AC_ARG_ENABLE(never-backslash-C,
|
AC_ARG_ENABLE(never-backslash-C,
|
||||||
AS_HELP_STRING([--enable-never-backslash-C],
|
AS_HELP_STRING([--enable-never-backslash-C],
|
||||||
[use of \C causes an error]),
|
[use of \C causes an error]),
|
||||||
, enable_never_backslash_C=no)
|
, enable_never_backslash_C=no)
|
||||||
|
|
||||||
# Handle --enable-ebcdic
|
# Handle --enable-ebcdic
|
||||||
AC_ARG_ENABLE(ebcdic,
|
AC_ARG_ENABLE(ebcdic,
|
||||||
|
@ -348,7 +348,7 @@ if test "x$enable_ebcdic" = "xyes"; then
|
||||||
AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time])
|
AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time])
|
||||||
fi
|
fi
|
||||||
if test "x$enable_pcre2_16" = "xyes" -o "x$enable_pcre2_32" = "xyes"; then
|
if test "x$enable_pcre2_16" = "xyes" -o "x$enable_pcre2_32" = "xyes"; then
|
||||||
AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library])
|
AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library])
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
@ -617,7 +617,7 @@ fi
|
||||||
if test "$enable_never_backslash_C" = "yes"; then
|
if test "$enable_never_backslash_C" = "yes"; then
|
||||||
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
||||||
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||||
The value of LINK_SIZE determines the number of bytes used to store
|
The value of LINK_SIZE determines the number of bytes used to store
|
||||||
|
@ -896,7 +896,7 @@ $PACKAGE-$VERSION configuration summary:
|
||||||
Enable Unicode support .......... : ${enable_unicode}
|
Enable Unicode support .......... : ${enable_unicode}
|
||||||
Newline char/sequence ........... : ${enable_newline}
|
Newline char/sequence ........... : ${enable_newline}
|
||||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||||
\C is disabled .................. : ${enable_never_backslash_C}
|
\C is disabled .................. : ${enable_never_backslash_C}
|
||||||
EBCDIC coding ................... : ${enable_ebcdic}
|
EBCDIC coding ................... : ${enable_ebcdic}
|
||||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||||
|
|
|
@ -97,7 +97,7 @@ can skip ahead to the CMake section.
|
||||||
pcre2_context.c
|
pcre2_context.c
|
||||||
pcre2_dfa_match.c
|
pcre2_dfa_match.c
|
||||||
pcre2_error.c
|
pcre2_error.c
|
||||||
pcre2_find_bracket.c
|
pcre2_find_bracket.c
|
||||||
pcre2_jit_compile.c
|
pcre2_jit_compile.c
|
||||||
pcre2_maketables.c
|
pcre2_maketables.c
|
||||||
pcre2_match.c
|
pcre2_match.c
|
||||||
|
|
|
@ -219,13 +219,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
to be the end of a line (see above). However, the caller of PCRE2 can
|
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||||
|
|
||||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||||
command. When \C is allowed by the library, individual applications can lock
|
command. When \C is allowed by the library, individual applications can lock
|
||||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||||
|
|
||||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||||
pattern. This limits the amount of system stack that a pattern uses when it
|
pattern. This limits the amount of system stack that a pattern uses when it
|
||||||
|
@ -731,7 +731,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_context.c )
|
src/pcre2_context.c )
|
||||||
src/pcre2_dfa_match.c )
|
src/pcre2_dfa_match.c )
|
||||||
src/pcre2_error.c )
|
src/pcre2_error.c )
|
||||||
src/pcre2_find_bracket.c )
|
src/pcre2_find_bracket.c )
|
||||||
src/pcre2_jit_compile.c )
|
src/pcre2_jit_compile.c )
|
||||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
|
|
@ -126,9 +126,9 @@ running redundant checks.
|
||||||
<P>
|
<P>
|
||||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||||
problems, because it may leave the current matching point in the middle of a
|
problems, because it may leave the current matching point in the middle of a
|
||||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||||
application to lock out the use of \C, causing a compile-time error if it is
|
application to lock out the use of \C, causing a compile-time error if it is
|
||||||
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||||
disabled.
|
disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -42,19 +42,20 @@ request are as follows:
|
||||||
PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only
|
PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only
|
||||||
PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns
|
PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns
|
||||||
PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL
|
PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL
|
||||||
PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1
|
|
||||||
PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information
|
PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information
|
||||||
0 nothing set
|
0 nothing set
|
||||||
1 first code unit is set
|
1 first code unit is set
|
||||||
2 start of string or after newline
|
2 start of string or after newline
|
||||||
|
PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1
|
||||||
|
PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \C
|
||||||
PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches
|
PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches
|
||||||
exist in the pattern
|
exist in the pattern
|
||||||
PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
|
PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
|
||||||
PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0
|
PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0
|
||||||
PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1
|
|
||||||
PCRE2_INFO_LASTCODETYPE Type of must-be-present information
|
PCRE2_INFO_LASTCODETYPE Type of must-be-present information
|
||||||
0 nothing set
|
0 nothing set
|
||||||
1 code unit is set
|
1 code unit is set
|
||||||
|
PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1
|
||||||
PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an
|
PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an
|
||||||
empty string, 0 otherwise
|
empty string, 0 otherwise
|
||||||
PCRE2_INFO_MATCHLIMIT Match limit if set,
|
PCRE2_INFO_MATCHLIMIT Match limit if set,
|
||||||
|
@ -62,8 +63,8 @@ request are as follows:
|
||||||
PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest
|
PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest
|
||||||
lookbehind assertion
|
lookbehind assertion
|
||||||
PCRE2_INFO_MINLENGTH Lower bound length of matching strings
|
PCRE2_INFO_MINLENGTH Lower bound length of matching strings
|
||||||
PCRE2_INFO_NAMEENTRYSIZE Size of name table entries
|
|
||||||
PCRE2_INFO_NAMECOUNT Number of named subpatterns
|
PCRE2_INFO_NAMECOUNT Number of named subpatterns
|
||||||
|
PCRE2_INFO_NAMEENTRYSIZE Size of name table entries
|
||||||
PCRE2_INFO_NAMETABLE Pointer to name table
|
PCRE2_INFO_NAMETABLE Pointer to name table
|
||||||
PCRE2_INFO_NEWLINE Code for the newline sequence:
|
PCRE2_INFO_NEWLINE Code for the newline sequence:
|
||||||
PCRE2_NEWLINE_CR
|
PCRE2_NEWLINE_CR
|
||||||
|
|
|
@ -26,7 +26,7 @@ SYNOPSIS
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
This function sets, in a compile context, the maximum length (in code units) of
|
This function sets, in a compile context, the maximum length (in code units) of
|
||||||
the pattern that can be compiled. The result is always zero.
|
the pattern that can be compiled. The result is always zero.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -70,6 +70,9 @@ The options are:
|
||||||
PCRE2_UTF was set at compile time)
|
PCRE2_UTF was set at compile time)
|
||||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||||
|
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||||
|
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||||
</pre>
|
</pre>
|
||||||
The function returns the number of substitutions, which may be zero if there
|
The function returns the number of substitutions, which may be zero if there
|
||||||
were no matches. The result can be greater than one only when
|
were no matches. The result can be greater than one only when
|
||||||
|
|
|
@ -618,7 +618,7 @@ of the following compile-time parameters:
|
||||||
PCRE2's character tables
|
PCRE2's character tables
|
||||||
The newline character sequence
|
The newline character sequence
|
||||||
The compile time nested parentheses limit
|
The compile time nested parentheses limit
|
||||||
The maximum length of the pattern string
|
The maximum length of the pattern string
|
||||||
An external function for stack checking
|
An external function for stack checking
|
||||||
</pre>
|
</pre>
|
||||||
A compile context is also required if you are using custom memory management.
|
A compile context is also required if you are using custom memory management.
|
||||||
|
@ -661,10 +661,10 @@ in the current locale.
|
||||||
<b> PCRE2_SIZE <i>value</i>);</b>
|
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
This sets a maximum length, in code units, for the pattern string that is to be
|
This sets a maximum length, in code units, for the pattern string that is to be
|
||||||
compiled. If the pattern is longer, an error is generated. This facility is
|
compiled. If the pattern is longer, an error is generated. This facility is
|
||||||
provided so that applications that accept patterns from external sources can
|
provided so that applications that accept patterns from external sources can
|
||||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||||
can hold, which is effectively unlimited.
|
can hold, which is effectively unlimited.
|
||||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
|
@ -716,8 +716,8 @@ of the following match-time parameters:
|
||||||
<pre>
|
<pre>
|
||||||
A callout function
|
A callout function
|
||||||
The offset limit for matching an unanchored pattern
|
The offset limit for matching an unanchored pattern
|
||||||
The limit for calling <i>match()</i>
|
The limit for calling <b>match()</b> (see below)
|
||||||
The limit for calling <i>match()</i> recursively
|
The limit for calling <b>match()</b> recursively
|
||||||
</pre>
|
</pre>
|
||||||
A match context is also required if you are using custom memory management.
|
A match context is also required if you are using custom memory management.
|
||||||
If none of these apply, just pass NULL as the context argument of
|
If none of these apply, just pass NULL as the context argument of
|
||||||
|
@ -771,7 +771,9 @@ PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
|
||||||
<P>
|
<P>
|
||||||
The offset limit facility can be used to track progress when searching large
|
The offset limit facility can be used to track progress when searching large
|
||||||
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
||||||
start within the first line of the subject.
|
start within the first line of the subject. If this is set with an offset
|
||||||
|
limit, a match must occur in the first line and also within the offset limit.
|
||||||
|
In other words, whichever limit comes first is used.
|
||||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||||
<b> uint32_t <i>value</i>);</b>
|
<b> uint32_t <i>value</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -1212,7 +1214,9 @@ built.
|
||||||
If this option is set, an unanchored pattern is required to match before or at
|
If this option is set, an unanchored pattern is required to match before or at
|
||||||
the first newline in the subject string, though the matched text may continue
|
the first newline in the subject string, though the matched text may continue
|
||||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||||
general limiting facility.
|
general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a
|
||||||
|
match must occur in the first line and also within the offset limit. In other
|
||||||
|
words, whichever limit comes first is used.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_MATCH_UNSET_BACKREF
|
PCRE2_MATCH_UNSET_BACKREF
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -1251,7 +1255,7 @@ This option locks out the use of \C in the pattern that is being compiled.
|
||||||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||||
it may leave the current matching point in the middle of a multi-code-unit
|
it may leave the current matching point in the middle of a multi-code-unit
|
||||||
character. This option may be useful in applications that process patterns from
|
character. This option may be useful in applications that process patterns from
|
||||||
external sources. Note that there is also a build-time option that permanently
|
external sources. Note that there is also a build-time option that permanently
|
||||||
locks out the use of \C.
|
locks out the use of \C.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NEVER_UCP
|
PCRE2_NEVER_UCP
|
||||||
|
@ -1563,11 +1567,10 @@ are as follows:
|
||||||
Return a copy of the pattern's options. The third argument should point to a
|
Return a copy of the pattern's options. The third argument should point to a
|
||||||
<b>uint32_t</b> variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
<b>uint32_t</b> variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
||||||
were passed to <b>pcre2_compile()</b>, whereas PCRE2_INFO_ALLOPTIONS returns
|
were passed to <b>pcre2_compile()</b>, whereas PCRE2_INFO_ALLOPTIONS returns
|
||||||
the compile options as modified by any top-level option settings at the start
|
the compile options as modified by any top-level option settings such as (*UTF)
|
||||||
of the pattern itself. In other words, they are the options that will be in
|
at the start of the pattern itself. For example, if the pattern /(*UTF)abc/ is
|
||||||
force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is
|
compiled with the PCRE2_EXTENDED option, the result is PCRE2_EXTENDED and
|
||||||
compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS,
|
PCRE2_UTF.
|
||||||
PCRE2_MULTILINE, and PCRE2_EXTENDED.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if
|
A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if
|
||||||
|
@ -1609,18 +1612,27 @@ matches only CR, LF, or CRLF.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_CAPTURECOUNT
|
PCRE2_INFO_CAPTURECOUNT
|
||||||
</pre>
|
</pre>
|
||||||
Return the number of capturing subpatterns in the pattern. The third argument
|
Return the highest capturing subpattern number in the pattern. In patterns
|
||||||
should point to an <b>uint32_t</b> variable.
|
where (?| is not used, this is also the total number of capturing subpatterns.
|
||||||
|
The third argument should point to an <b>uint32_t</b> variable.
|
||||||
|
<pre>
|
||||||
|
PCRE2_INFO_FIRSTBITMAP
|
||||||
|
</pre>
|
||||||
|
In the absence of a single first code unit for a non-anchored pattern,
|
||||||
|
<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of
|
||||||
|
values for the first code unit in any match. For example, a pattern that starts
|
||||||
|
with [abc] results in a table with three bits set. When code unit values
|
||||||
|
greater than 255 are supported, the flag bit for 255 means "any code unit of
|
||||||
|
value 255 or above". If such a table was constructed, a pointer to it is
|
||||||
|
returned. Otherwise NULL is returned. The third argument should point to an
|
||||||
|
<b>const uint8_t *</b> variable.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_FIRSTCODETYPE
|
PCRE2_INFO_FIRSTCODETYPE
|
||||||
</pre>
|
</pre>
|
||||||
Return information about the first code unit of any matched string, for a
|
Return information about the first code unit of any matched string, for a
|
||||||
non-anchored pattern. The third argument should point to an <b>uint32_t</b>
|
non-anchored pattern. The third argument should point to an <b>uint32_t</b>
|
||||||
variable.
|
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||||
</P>
|
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||||
<P>
|
|
||||||
If there is a fixed first value, for example, the letter "c" from a pattern
|
|
||||||
such as (cat|cow|coyote), 1 is returned, and the character value can be
|
|
||||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||||
it is known that a match can occur only at the start of the subject or
|
it is known that a match can occur only at the start of the subject or
|
||||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||||
|
@ -1635,16 +1647,10 @@ value is always less than 256. In the 16-bit library the value can be up to
|
||||||
0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff,
|
0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff,
|
||||||
and up to 0xffffffff when not using UTF-32 mode.
|
and up to 0xffffffff when not using UTF-32 mode.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_FIRSTBITMAP
|
PCRE2_INFO_HASBACKSLASHC
|
||||||
</pre>
|
</pre>
|
||||||
In the absence of a single first code unit for a non-anchored pattern,
|
Return 1 if the pattern contains any instances of \C, otherwise 0. The third
|
||||||
<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of
|
argument should point to an <b>uint32_t</b> variable.
|
||||||
values for the first code unit in any match. For example, a pattern that starts
|
|
||||||
with [abc] results in a table with three bits set. When code unit values
|
|
||||||
greater than 255 are supported, the flag bit for 255 means "any code unit of
|
|
||||||
value 255 or above". If such a table was constructed, a pointer to it is
|
|
||||||
returned. Otherwise NULL is returned. The third argument should point to an
|
|
||||||
<b>const uint8_t *</b> variable.
|
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_HASCRORLF
|
PCRE2_INFO_HASCRORLF
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -1670,13 +1676,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||||
matched string, other than at its start. The third argument should point to an
|
matched string, other than at its start. The third argument should point to an
|
||||||
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
||||||
returned, the code unit value itself can be retrieved using
|
returned, the code unit value itself can be retrieved using
|
||||||
PCRE2_INFO_LASTCODEUNIT.
|
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||||
</P>
|
recorded only if it follows something of variable length. For example, for the
|
||||||
<P>
|
pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from
|
||||||
For anchored patterns, a last literal value is recorded only if it follows
|
PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
|
||||||
something of variable length. For example, for the pattern /^a\d+z\d+/ the
|
|
||||||
returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for
|
|
||||||
/^a\dz\d/ the returned value is 0.
|
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_LASTCODEUNIT
|
PCRE2_INFO_LASTCODEUNIT
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -1687,8 +1690,11 @@ value, 0 is returned.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_MATCHEMPTY
|
PCRE2_INFO_MATCHEMPTY
|
||||||
</pre>
|
</pre>
|
||||||
Return 1 if the pattern can match an empty string, otherwise 0. The third
|
Return 1 if the pattern might match an empty string, otherwise 0. The third
|
||||||
argument should point to an <b>uint32_t</b> variable.
|
argument should point to an <b>uint32_t</b> variable. When a pattern contains
|
||||||
|
recursive subroutine calls it is not always possible to determine whether or
|
||||||
|
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
||||||
|
in such cases.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_INFO_MATCHLIMIT
|
PCRE2_INFO_MATCHLIMIT
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2142,8 +2148,13 @@ documentation.
|
||||||
When PCRE2 is built, a default newline convention is set; this is usually the
|
When PCRE2 is built, a default newline convention is set; this is usually the
|
||||||
standard convention for the operating system. The default can be overridden in
|
standard convention for the operating system. The default can be overridden in
|
||||||
a
|
a
|
||||||
<a href="#compilecontext">compile context.</a>
|
<a href="#compilecontext">compile context</a>
|
||||||
During matching, the newline choice affects the behaviour of the dot,
|
by calling <b>pcre2_set_newline()</b>. It can also be overridden by starting a
|
||||||
|
pattern string with, for example, (*CRLF), as described in the
|
||||||
|
<a href="pcre2pattern.html#newlines">section on newline conventions</a>
|
||||||
|
in the
|
||||||
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||||
|
page. During matching, the newline choice affects the behaviour of the dot,
|
||||||
circumflex, and dollar metacharacters. It may also alter the way the match
|
circumflex, and dollar metacharacters. It may also alter the way the match
|
||||||
starting position is advanced after a match failure for an unanchored pattern.
|
starting position is advanced after a match failure for an unanchored pattern.
|
||||||
</P>
|
</P>
|
||||||
|
@ -2191,19 +2202,20 @@ function can be used to find out how many capturing subpatterns there are in a
|
||||||
compiled pattern.
|
compiled pattern.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A successful match returns the overall matched string and any captured
|
You can use auxiliary functions for accessing captured substrings
|
||||||
substrings to the caller via a vector of PCRE2_SIZE values. This is called the
|
|
||||||
<b>ovector</b>, and is contained within the
|
|
||||||
<a href="#matchdatablock">match data block.</a>
|
|
||||||
You can obtain direct access to the ovector by calling
|
|
||||||
<b>pcre2_get_ovector_pointer()</b> to find its address, and
|
|
||||||
<b>pcre2_get_ovector_count()</b> to find the number of pairs of values it
|
|
||||||
contains. Alternatively, you can use the auxiliary functions for accessing
|
|
||||||
captured substrings
|
|
||||||
<a href="#extractbynumber">by number</a>
|
<a href="#extractbynumber">by number</a>
|
||||||
or
|
or
|
||||||
<a href="#extractbyname">by name</a>
|
<a href="#extractbyname">by name,</a>
|
||||||
(see below).
|
as described in sections below.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Alternatively, you can make direct use of the vector of PCRE2_SIZE values,
|
||||||
|
called the <b>ovector</b>, which contains the offsets of captured strings. It is
|
||||||
|
part of the
|
||||||
|
<a href="#matchdatablock">match data block.</a>
|
||||||
|
The function <b>pcre2_get_ovector_pointer()</b> returns the address of the
|
||||||
|
ovector, and <b>pcre2_get_ovector_count()</b> returns the number of pairs of
|
||||||
|
values it contains.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Within the ovector, the first in each pair of values is set to the offset of
|
Within the ovector, the first in each pair of values is set to the offset of
|
||||||
|
@ -2292,7 +2304,13 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
|
||||||
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
||||||
<b>pcre2_get_mark()</b> can be called. It returns a pointer to the
|
<b>pcre2_get_mark()</b> can be called. It returns a pointer to the
|
||||||
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
||||||
returned. After a successful match, the (*MARK) name that is returned is the
|
returned. The length of the (*MARK) name (excluding the terminating zero) is
|
||||||
|
stored in the code unit that precedes the name. You should use this instead of
|
||||||
|
relying on the terminating zero if the (*MARK) name might contain a binary
|
||||||
|
zero.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
After a successful match, the (*MARK) name that is returned is the
|
||||||
last one encountered on the matching path through the pattern. After a "no
|
last one encountered on the matching path through the pattern. After a "no
|
||||||
match" or a partial match, the last encountered (*MARK) name is returned. For
|
match" or a partial match, the last encountered (*MARK) name is returned. For
|
||||||
example, consider this pattern:
|
example, consider this pattern:
|
||||||
|
@ -2313,7 +2331,7 @@ escape sequence. After a partial match, however, this value is always the same
|
||||||
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain
|
After a UTF check failure, <b>pcre2_get_startchar()</b> can be used to obtain
|
||||||
the code unit offset of the invalid UTF character. Details are given in the
|
the code unit offset of the invalid UTF character. Details are given in the
|
||||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||||
page.
|
page.
|
||||||
|
@ -2636,7 +2654,7 @@ same number causes an error at compile time.
|
||||||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||||
which a \K item in a lookahead in the pattern causes the match to end before
|
which a \K item in a lookahead in the pattern causes the match to end before
|
||||||
it starts are not supported, and give rise to an error return.
|
it starts are not supported, and give rise to an error return.
|
||||||
</P>
|
</P>
|
||||||
|
@ -2650,12 +2668,21 @@ allocate memory for the compiled code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||||
length, in code units, of the output buffer. If the function is successful,
|
length, in code units, of the output buffer. If the function is successful, the
|
||||||
the value is updated to contain the length of the new string, excluding the
|
value is updated to contain the length of the new string, excluding the
|
||||||
trailing zero that is automatically added. If the function is not successful,
|
trailing zero that is automatically added.
|
||||||
the value is set to PCRE2_UNSET for general errors (such as output buffer too
|
</P>
|
||||||
small). For syntax errors in the replacement string, the value is set to the
|
<P>
|
||||||
offset in the replacement string where the error was detected.
|
If the function is not successful, the value set via <i>outlengthptr</i> depends
|
||||||
|
on the type of error. For syntax errors in the replacement string, the value is
|
||||||
|
the offset in the replacement string where the error was detected. For other
|
||||||
|
errors, the value is PCRE2_UNSET by default. This includes the case of the
|
||||||
|
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set
|
||||||
|
(see below), in which case the value is the minimum length needed, including
|
||||||
|
space for the trailing zero. Note that in order to compute the required length,
|
||||||
|
<b>pcre2_substitute()</b> has to simulate all the matching and copying, instead
|
||||||
|
of giving an error return as soon as the buffer overflows. Note also that the
|
||||||
|
length is in code units, not bytes.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||||
|
@ -2682,15 +2709,53 @@ simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
||||||
apple lemon
|
apple lemon
|
||||||
2: pear orange
|
2: pear orange
|
||||||
</pre>
|
</pre>
|
||||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||||
function to iterate over the subject string, replacing every matching
|
options can be set in the <i>options</i> argument.
|
||||||
substring. If this is not set, only the first matching substring is replaced.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
|
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
|
||||||
to be applied to the replacement string. Without this option, only the dollar
|
replacing every matching substring. If this is not set, only the first matching
|
||||||
character is special, and only the group insertion forms listed above are
|
substring is replaced. If any matched substring has zero length, after the
|
||||||
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
substitution has happened, an attempt to find a non-empty match at the same
|
||||||
|
position is performed. If this is not successful, the current position is
|
||||||
|
advanced by one character except when CRLF is a valid newline sequence and the
|
||||||
|
next two characters are CR, LF. In this case, the current position is advanced
|
||||||
|
by two characters.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||||
|
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||||
|
this option is set, however, <b>pcre2_substitute()</b> continues to go through
|
||||||
|
the motions of matching and substituting (without, of course, writing anything)
|
||||||
|
in order to compute the size of buffer that is needed. This value is passed
|
||||||
|
back via the <i>outlengthptr</i> variable, with the result of the function still
|
||||||
|
being PCRE2_ERROR_NOMEMORY.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||||
|
is needed for a given substitution. However, this does mean that the entire
|
||||||
|
operation is carried out twice. Depending on the application, it may be more
|
||||||
|
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||||
|
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
||||||
|
not appear in the pattern to be treated as unset groups. This option should be
|
||||||
|
used with care, because it means that a typo in a group name or number no
|
||||||
|
longer causes the PCRE2_ERROR_NOSUBSTRING error.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
|
||||||
|
groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty
|
||||||
|
strings when inserted as described above. If this option is not set, an attempt
|
||||||
|
to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does
|
||||||
|
not influence the extended substitution syntax described below.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
|
||||||
|
replacement string. Without this option, only the dollar character is special,
|
||||||
|
and only the group insertion forms listed above are valid. When
|
||||||
|
PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Firstly, backslash in a replacement string is interpreted as an escape
|
Firstly, backslash in a replacement string is interpreted as an escape
|
||||||
|
@ -2740,22 +2805,46 @@ string remains in force afterwards, as shown in this <b>pcre2test</b> example:
|
||||||
somebody
|
somebody
|
||||||
1: HELLO
|
1: HELLO
|
||||||
</pre>
|
</pre>
|
||||||
If successful, the function returns the number of replacements that were made.
|
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
||||||
This may be zero if no matches were found, and is never greater than 1 unless
|
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
groups in the extended syntax forms to be treated as unset.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If successful, <b>pcre2_substitute()</b> returns the number of replacements that
|
||||||
|
were made. This may be zero if no matches were found, and is never greater than
|
||||||
|
1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In the event of an error, a negative error code is returned. Except for
|
In the event of an error, a negative error code is returned. Except for
|
||||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||||
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
|
are passed straight back.
|
||||||
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
|
</P>
|
||||||
errors in the replacement string, with more particular errors being
|
<P>
|
||||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
|
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||||
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found),
|
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
||||||
PCRE2_BADSUBSTITUTION (syntax error in extended group substitution), and
|
</P>
|
||||||
PCRE2_BADSUBPATTERN (the pattern match ended before it started). As for all
|
<P>
|
||||||
PCRE2 errors, a text message that describes the error can be obtained by
|
PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an
|
||||||
calling <b>pcre2_get_error_message()</b>.
|
unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple
|
||||||
|
(non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
||||||
|
needed is returned via <i>outlengthptr</i>. Note that this does not happen by
|
||||||
|
default.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||||
|
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||||
|
(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket
|
||||||
|
not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
|
||||||
|
substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
|
||||||
|
started, which can happen if \K is used in an assertion).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
As for all PCRE2 errors, a text message that describes the error can be
|
||||||
|
obtained by calling <b>pcre2_get_error_message()</b>.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -2796,11 +2885,11 @@ function returns the length of each entry in code units. In both cases,
|
||||||
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The format of the name table is described above in the section entitled
|
The format of the name table is described
|
||||||
<i>Information about a pattern</i>
|
<a href="#infoaboutpattern">above</a>
|
||||||
<a href="#infoaboutpattern">above.</a>
|
in the section entitled <i>Information about a pattern</i>. Given all the
|
||||||
Given all the relevant entries for the name, you can extract each of their
|
relevant entries for the name, you can extract each of their numbers, and hence
|
||||||
numbers, and hence the captured data.
|
the captured data.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC36" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
<br><a name="SEC36" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -3032,7 +3121,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 05 November 2015
|
Last updated: 16 December 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -86,6 +86,13 @@ results. The returned value from <b>pcre2_jit_compile()</b> is zero on success,
|
||||||
or a negative error code.
|
or a negative error code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||||
|
of machine stack that it uses. The exact rules are not documented because they
|
||||||
|
may change at any time, in particular, when new optimizations are introduced.
|
||||||
|
If a pattern is too big, a call to <b>pcre2_jit_compile()</b> returns
|
||||||
|
PCRE2_ERROR_NOMEMORY.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
|
PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
|
||||||
matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or
|
matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or
|
||||||
PCRE2_PARTIAL_SOFT options of <b>pcre2_match()</b>, you should set one or both
|
PCRE2_PARTIAL_SOFT options of <b>pcre2_match()</b>, you should set one or both
|
||||||
|
@ -425,7 +432,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 July 2015
|
Last updated: 14 November 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -32,8 +32,8 @@ However, the speed of execution is slower. In the 32-bit library, the internal
|
||||||
linkage size is always 4.
|
linkage size is always 4.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The maximum length of a source pattern string is essentially unlimited; it is
|
The maximum length of a source pattern string is essentially unlimited; it is
|
||||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||||
calls <b>pcre2_compile()</b> can specify a smaller limit.
|
calls <b>pcre2_compile()</b> can specify a smaller limit.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -669,8 +669,8 @@ This is an example of an "atomic group", details of which are given
|
||||||
This particular group matches either the two-character sequence CR followed by
|
This particular group matches either the two-character sequence CR followed by
|
||||||
LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
|
LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
|
||||||
U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next
|
U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next
|
||||||
line, U+0085). The two-character sequence is treated as a single unit that
|
line, U+0085). Because this is an atomic group, the two-character sequence is
|
||||||
cannot be split.
|
treated as a single unit that cannot be split.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In other modes, two additional characters whose codepoints are greater than 255
|
In other modes, two additional characters whose codepoints are greater than 255
|
||||||
|
@ -1186,6 +1186,16 @@ when the <i>startoffset</i> argument of <b>pcre2_match()</b> is non-zero. The
|
||||||
PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.
|
PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
When the newline convention (see
|
||||||
|
<a href="#newlines">"Newline conventions"</a>
|
||||||
|
below) recognizes the two-character sequence CRLF as a newline, this is
|
||||||
|
preferred, even if the single characters CR and LF are also recognized as
|
||||||
|
newlines. For example, if the newline convention is "any", a multiline mode
|
||||||
|
circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after
|
||||||
|
CR, even though CR on its own is a valid newline. (It also matches at the very
|
||||||
|
start of the string, of course.)
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
Note that the sequences \A, \Z, and \z can be used to match the start and
|
Note that the sequences \A, \Z, and \z can be used to match the start and
|
||||||
end of the subject in both modes, and if all branches of a pattern start with
|
end of the subject in both modes, and if all branches of a pattern start with
|
||||||
\A it is always anchored, whether or not PCRE2_MULTILINE is set.
|
\A it is always anchored, whether or not PCRE2_MULTILINE is set.
|
||||||
|
@ -1236,7 +1246,7 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
||||||
with a malformed UTF character. This has undefined results, because PCRE2
|
with a malformed UTF character. This has undefined results, because PCRE2
|
||||||
assumes that it is matching character by character in a valid UTF string (by
|
assumes that it is matching character by character in a valid UTF string (by
|
||||||
default it checks the subject string's validity at the start of processing
|
default it checks the subject string's validity at the start of processing
|
||||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
An application can lock out the use of \C by setting the
|
An application can lock out the use of \C by setting the
|
||||||
|
@ -1247,9 +1257,9 @@ build PCRE2 with the use of \C permanently disabled.
|
||||||
PCRE2 does not allow \C to appear in lookbehind assertions
|
PCRE2 does not allow \C to appear in lookbehind assertions
|
||||||
<a href="#lookbehind">(described below)</a>
|
<a href="#lookbehind">(described below)</a>
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind. Neither the alternative matching function
|
the lookbehind. Neither the alternative matching function
|
||||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||||
former gives a match-time error; the latter fails to optimize and so the match
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
is always run using the interpreter.
|
is always run using the interpreter.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -1341,11 +1351,11 @@ example [\000-\037]. Ranges can include any characters that are valid for the
|
||||||
current mode.
|
current mode.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There is a special case in EBCDIC environments for ranges whose end points are
|
There is a special case in EBCDIC environments for ranges whose end points are
|
||||||
both specified as literal letters in the same case. For compatibility with
|
both specified as literal letters in the same case. For compatibility with
|
||||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||||
example, [h-k] matches only four characters, even though the codes for h and k
|
example, [h-k] matches only four characters, even though the codes for h and k
|
||||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||||
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
||||||
are included.
|
are included.
|
||||||
</P>
|
</P>
|
||||||
|
@ -1672,6 +1682,10 @@ first one in the pattern with the given number. The following pattern matches
|
||||||
<pre>
|
<pre>
|
||||||
/(?|(abc)|(def))(?1)/
|
/(?|(abc)|(def))(?1)/
|
||||||
</pre>
|
</pre>
|
||||||
|
A relative reference such as (?-1) is no different: it is just a convenient way
|
||||||
|
of computing an absolute group number.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
If a
|
If a
|
||||||
<a href="#conditions">condition test</a>
|
<a href="#conditions">condition test</a>
|
||||||
for a subpattern's having matched refers to a non-unique number, the test is
|
for a subpattern's having matched refers to a non-unique number, the test is
|
||||||
|
@ -2512,7 +2526,7 @@ For example:
|
||||||
(?(VERSION>=10.4)yes|no)
|
(?(VERSION>=10.4)yes|no)
|
||||||
</pre>
|
</pre>
|
||||||
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
||||||
"no" otherwise. The fractional part of the version number may not contain more
|
"no" otherwise. The fractional part of the version number may not contain more
|
||||||
than two digits.
|
than two digits.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -2626,6 +2640,21 @@ parentheses preceding the recursion. In other words, a negative number counts
|
||||||
capturing parentheses leftwards from the point at which it is encountered.
|
capturing parentheses leftwards from the point at which it is encountered.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
Be aware, however, that if
|
||||||
|
<a href="#dupsubpatternnumber">duplicate subpattern numbers</a>
|
||||||
|
are in use, relative references refer to the earliest subpattern with the
|
||||||
|
appropriate number. Consider, for example:
|
||||||
|
<pre>
|
||||||
|
(?|(a)|(b)) (c) (?-2)
|
||||||
|
</pre>
|
||||||
|
The first two capturing groups (a) and (b) are both numbered 1, and group (c)
|
||||||
|
is number 2. When the reference (?-2) is encountered, the second most recently
|
||||||
|
opened parenthesis has the number 1, but it is the first such group (the (a)
|
||||||
|
group) to which the recursion refers. This would be the same if an absolute
|
||||||
|
reference (?1) was used. In other words, relative references are just a
|
||||||
|
shorthand for computing a group number.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
It is also possible to refer to subsequently opened parentheses, by writing
|
It is also possible to refer to subsequently opened parentheses, by writing
|
||||||
references such as (?+2). However, these cannot be recursive because the
|
references such as (?+2). However, these cannot be recursive because the
|
||||||
reference is not inside the parentheses that are referenced. They are always
|
reference is not inside the parentheses that are referenced. They are always
|
||||||
|
@ -2929,13 +2958,13 @@ depending on whether or not a name is present.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
By default, for compatibility with Perl, a name is any sequence of characters
|
By default, for compatibility with Perl, a name is any sequence of characters
|
||||||
that does not include a closing parenthesis. The name is not processed in
|
that does not include a closing parenthesis. The name is not processed in
|
||||||
any way, and it is not possible to include a closing parenthesis in the name.
|
any way, and it is not possible to include a closing parenthesis in the name.
|
||||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||||
the name. A closing parenthesis can be included in a name either as \) or
|
the name. A closing parenthesis can be included in a name either as \) or
|
||||||
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||||
of the pattern.
|
of the pattern.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -3359,7 +3388,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 01 November 2015
|
Last updated: 13 November 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -200,7 +200,7 @@ Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and
|
||||||
PCRE2_DOLLAR_ENDONLY when calling <b>pcre2_compile()</b> directly, but there is
|
PCRE2_DOLLAR_ENDONLY when calling <b>pcre2_compile()</b> directly, but there is
|
||||||
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
||||||
the POSIX API, passing REG_NEWLINE to PCRE2's <b>regcomp()</b> function
|
the POSIX API, passing REG_NEWLINE to PCRE2's <b>regcomp()</b> function
|
||||||
causes PCRE2_MULTILINE to be passed to <b>pcre2_compile()</b>, and REG_DOTALL
|
causes PCRE2_MULTILINE to be passed to <b>pcre2_compile()</b>, and REG_DOTALL
|
||||||
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
|
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
|
||||||
|
@ -235,7 +235,8 @@ to have a terminating NUL located at <i>string</i> + <i>pmatch[0].rm_eo</i>
|
||||||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||||
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
|
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
|
||||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||||
how it is matched.
|
how it is matched. Setting REG_STARTEND and passing <i>pmatch</i> as NULL are
|
||||||
|
mutually exclusive; the error REG_INVARG is returned.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||||
|
@ -289,7 +290,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 30 October 2015
|
Last updated: 29 November 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -154,13 +154,13 @@ on a system with different endianness.
|
||||||
<P>
|
<P>
|
||||||
Decoded patterns can be used for matching in the usual way, and must be freed
|
Decoded patterns can be used for matching in the usual way, and must be freed
|
||||||
by calling <b>pcre2_code_free()</b>. However, be aware that there is a potential
|
by calling <b>pcre2_code_free()</b>. However, be aware that there is a potential
|
||||||
race issue if you are using multiple patterns that were decoded from a single
|
race issue if you are using multiple patterns that were decoded from a single
|
||||||
byte stream in a multithreaded application. A single copy of the character
|
byte stream in a multithreaded application. A single copy of the character
|
||||||
tables is used by all the decoded patterns and a reference count is used to
|
tables is used by all the decoded patterns and a reference count is used to
|
||||||
arrange for its memory to be automatically freed when the last pattern is
|
arrange for its memory to be automatically freed when the last pattern is
|
||||||
freed, but there is no locking on this reference count. Therefore, if you want
|
freed, but there is no locking on this reference count. Therefore, if you want
|
||||||
to call <b>pcre2_code_free()</b> for these patterns in different threads, you
|
to call <b>pcre2_code_free()</b> for these patterns in different threads, you
|
||||||
must arrange your own locking, and ensure that <b>pcre2_code_free()</b> cannot
|
must arrange your own locking, and ensure that <b>pcre2_code_free()</b> cannot
|
||||||
be called by two threads at the same time.
|
be called by two threads at the same time.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -113,7 +113,7 @@ it matches a literal "u".
|
||||||
</pre>
|
</pre>
|
||||||
\C is dangerous because it may leave the current matching point in the middle
|
\C is dangerous because it may leave the current matching point in the middle
|
||||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
||||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||||
with the use of \C permanently disabled.
|
with the use of \C permanently disabled.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -486,7 +486,7 @@ the start of a modifier list. For example:
|
||||||
<pre>
|
<pre>
|
||||||
abc\=notbol,notempty
|
abc\=notbol,notempty
|
||||||
</pre>
|
</pre>
|
||||||
If the subject string is empty and \= is followed by whitespace, the line is
|
If the subject string is empty and \= is followed by whitespace, the line is
|
||||||
treated as a comment line, and is not used for matching. For example:
|
treated as a comment line, and is not used for matching. For example:
|
||||||
<pre>
|
<pre>
|
||||||
\= This is a comment.
|
\= This is a comment.
|
||||||
|
@ -538,7 +538,7 @@ for a description of their effects.
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
ucp set PCRE2_UCP
|
ucp set PCRE2_UCP
|
||||||
ungreedy set PCRE2_UNGREEDY
|
ungreedy set PCRE2_UNGREEDY
|
||||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
</pre>
|
</pre>
|
||||||
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
||||||
|
@ -564,7 +564,7 @@ about the pattern:
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
max_pattern_length=<n> set the maximum pattern length
|
max_pattern_length=<n> set the maximum pattern length
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
|
@ -649,9 +649,9 @@ by the item that follows it in the pattern.
|
||||||
Passing a NULL context
|
Passing a NULL context
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||||
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||||
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -675,9 +675,9 @@ Generating long repetitive patterns
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Some tests use long patterns that are very repetitive. Instead of creating a
|
Some tests use long patterns that are very repetitive. Instead of creating a
|
||||||
very long input line for such a pattern, you can use a special repetition
|
very long input line for such a pattern, you can use a special repetition
|
||||||
feature, similar to the one described for subject lines above. If the
|
feature, similar to the one described for subject lines above. If the
|
||||||
<b>expand</b> modifier is present on a pattern, parts of the pattern that have
|
<b>expand</b> modifier is present on a pattern, parts of the pattern that have
|
||||||
the form
|
the form
|
||||||
<pre>
|
<pre>
|
||||||
\[<characters>]{<count>}
|
\[<characters>]{<count>}
|
||||||
|
@ -689,13 +689,13 @@ by decimal digits and "}" is found later in the pattern. If not, the characters
|
||||||
remain in the pattern unaltered.
|
remain in the pattern unaltered.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If part of an expanded pattern looks like an expansion, but is really part of
|
If part of an expanded pattern looks like an expansion, but is really part of
|
||||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||||
the quantifier. For example, \[AB]{6000,6000} is not recognized as an
|
the quantifier. For example, \[AB]{6000,6000} is not recognized as an
|
||||||
expansion item.
|
expansion item.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the <b>info</b> modifier is set on an expanded pattern, the result of the
|
If the <b>info</b> modifier is set on an expanded pattern, the result of the
|
||||||
expansion is included in the information that is output.
|
expansion is included in the information that is output.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -812,9 +812,9 @@ suite.
|
||||||
Limiting the pattern length
|
Limiting the pattern length
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>max_pattern_length</b> modifier sets a limit, in code units, to the
|
The <b>max_pattern_length</b> modifier sets a limit, in code units, to the
|
||||||
length of pattern that <b>pcre2_compile()</b> will accept. Breaching the limit
|
length of pattern that <b>pcre2_compile()</b> will accept. Breaching the limit
|
||||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||||
variable can hold (essentially unlimited).
|
variable can hold (essentially unlimited).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -836,13 +836,13 @@ modifiers set options for the <b>regcomp()</b> function:
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
utf REG_UTF8 )
|
utf REG_UTF8 )
|
||||||
</pre>
|
</pre>
|
||||||
The <b>regerror_buffsize</b> modifier specifies a size for the error buffer that
|
The <b>regerror_buffsize</b> modifier specifies a size for the error buffer that
|
||||||
is passed to <b>regerror()</b> in the event of a compilation error. For example:
|
is passed to <b>regerror()</b> in the event of a compilation error. For example:
|
||||||
<pre>
|
<pre>
|
||||||
/abc/posix,regerror_buffsize=20
|
/abc/posix,regerror_buffsize=20
|
||||||
</pre>
|
</pre>
|
||||||
This provides a means of testing the behaviour of <b>regerror()</b> when the
|
This provides a means of testing the behaviour of <b>regerror()</b> when the
|
||||||
buffer is too small for the error message. If this modifier has not been set, a
|
buffer is too small for the error message. If this modifier has not been set, a
|
||||||
large buffer is used.
|
large buffer is used.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -892,14 +892,18 @@ are applied to every subject line that is processed with that pattern. They may
|
||||||
not appear in <b>#pattern</b> commands. These modifiers do not affect the
|
not appear in <b>#pattern</b> commands. These modifiers do not affect the
|
||||||
compilation process.
|
compilation process.
|
||||||
<pre>
|
<pre>
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
/g global global matching
|
/g global global matching
|
||||||
mark show mark values
|
mark show mark values
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
</pre>
|
</pre>
|
||||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||||
defaults, set them in a <b>#subject</b> command.
|
defaults, set them in a <b>#subject</b> command.
|
||||||
|
@ -964,33 +968,38 @@ information. Some of them may also be specified on a pattern line (see above),
|
||||||
in which case they apply to every subject line that is matched against that
|
in which case they apply to every subject line that is matched against that
|
||||||
pattern.
|
pattern.
|
||||||
<pre>
|
<pre>
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text (non-JIT only)
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
callout_data=<n> set a value to pass via callouts
|
callout_data=<n> set a value to pass via callouts
|
||||||
callout_fail=<n>[:<m>] control callout failure
|
callout_fail=<n>[:<m>] control callout failure
|
||||||
callout_none do not supply a callout function
|
callout_none do not supply a callout function
|
||||||
copy=<number or name> copy captured substring
|
copy=<number or name> copy captured substring
|
||||||
dfa use <b>pcre2_dfa_match()</b>
|
dfa use <b>pcre2_dfa_match()</b>
|
||||||
find_limits find match and recursion limits
|
find_limits find match and recursion limits
|
||||||
get=<number or name> extract captured substring
|
get=<number or name> extract captured substring
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=<n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
null_context match with a NULL context
|
null_context match with a NULL context
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
offset_limit=<n> set offset limit
|
offset_limit=<n> set offset limit
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
zero_terminate pass the subject as zero-terminated
|
startoffset=<n> same as offset=<n>
|
||||||
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
zero_terminate pass the subject as zero-terminated
|
||||||
</pre>
|
</pre>
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
</P>
|
</P>
|
||||||
|
@ -1129,19 +1138,34 @@ Testing the substitution function
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
|
If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
|
||||||
called instead of one of the matching functions. Unlike subject strings,
|
called instead of one of the matching functions. Note that replacement strings
|
||||||
<b>pcre2test</b> does not process replacement strings for escape sequences. In
|
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||||
UTF mode, a replacement string is checked to see if it is a valid UTF-8 string.
|
not thought to be an issue in a test program.
|
||||||
If so, it is correctly converted to a UTF string of the appropriate code unit
|
|
||||||
width. If it is not a valid UTF-8 string, the individual code units are copied
|
|
||||||
directly. This provides a means of passing an invalid UTF-8 string for testing
|
|
||||||
purposes.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the <b>global</b> modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
Unlike subject strings, <b>pcre2test</b> does not process replacement strings
|
||||||
<b>pcre2_substitute()</b>. After a successful substitution, the modified string
|
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
||||||
is output, preceded by the number of replacements. This may be zero if there
|
is a valid UTF-8 string. If so, it is correctly converted to a UTF string of
|
||||||
were no matches. Here is a simple example of a substitution test:
|
the appropriate code unit width. If it is not a valid UTF-8 string, the
|
||||||
|
individual code units are copied directly. This provides a means of passing an
|
||||||
|
invalid UTF-8 string for testing purposes.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The following modifiers set options (in addition to the normal match options)
|
||||||
|
for <b>pcre2_substitute()</b>:
|
||||||
|
<pre>
|
||||||
|
global PCRE2_SUBSTITUTE_GLOBAL
|
||||||
|
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
|
||||||
|
</PRE>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
After a successful substitution, the modified string is output, preceded by the
|
||||||
|
number of replacements. This may be zero if there were no matches. Here is a
|
||||||
|
simple example of a substitution test:
|
||||||
<pre>
|
<pre>
|
||||||
/abc/replace=xxx
|
/abc/replace=xxx
|
||||||
=abc=abc=
|
=abc=abc=
|
||||||
|
@ -1149,12 +1173,12 @@ were no matches. Here is a simple example of a substitution test:
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
</pre>
|
</pre>
|
||||||
Subject and replacement strings should be kept relatively short for
|
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||||
substitution tests, as fixed-size buffers are used. To make it easy to test for
|
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||||
buffer overflow, if the replacement string starts with a number in square
|
easy to test for buffer overflow, if the replacement string starts with a
|
||||||
brackets, that number is passed to <b>pcre2_substitute()</b> as the size of the
|
number in square brackets, that number is passed to <b>pcre2_substitute()</b> as
|
||||||
output buffer, with the replacement string starting at the next character. Here
|
the size of the output buffer, with the replacement string starting at the next
|
||||||
is an example that tests the edge case:
|
character. Here is an example that tests the edge case:
|
||||||
<pre>
|
<pre>
|
||||||
/abc/
|
/abc/
|
||||||
123abc123\=replace=[10]XYZ
|
123abc123\=replace=[10]XYZ
|
||||||
|
@ -1162,6 +1186,19 @@ is an example that tests the edge case:
|
||||||
123abc123\=replace=[9]XYZ
|
123abc123\=replace=[9]XYZ
|
||||||
Failed: error -47: no more memory
|
Failed: error -47: no more memory
|
||||||
</pre>
|
</pre>
|
||||||
|
The default action of <b>pcre2_substitute()</b> is to return
|
||||||
|
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||||
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||||
|
<b>substitute_overflow_length</b> modifier), <b>pcre2_substitute()</b> continues
|
||||||
|
to go through the motions of matching and substituting, in order to compute the
|
||||||
|
size of buffer that is required. When this happens, <b>pcre2test</b> shows the
|
||||||
|
required buffer length (which includes space for the trailing zero) as part of
|
||||||
|
the error message. For example:
|
||||||
|
<pre>
|
||||||
|
/abc/substitute_overflow_length
|
||||||
|
123abc123\=replace=[9]XYZ
|
||||||
|
Failed: error -47: no more memory: 10 code units are needed
|
||||||
|
</pre>
|
||||||
A replacement string is ignored with POSIX and DFA matching. Specifying partial
|
A replacement string is ignored with POSIX and DFA matching. Specifying partial
|
||||||
matching provokes an error return ("bad option value") from
|
matching provokes an error return ("bad option value") from
|
||||||
<b>pcre2_substitute()</b>.
|
<b>pcre2_substitute()</b>.
|
||||||
|
@ -1236,10 +1273,10 @@ matching starts. Its value is a number of code units, not characters.
|
||||||
Setting an offset limit
|
Setting an offset limit
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||||
cannot be found starting at or before this offset in the subject, a "no match"
|
cannot be found starting at or before this offset in the subject, a "no match"
|
||||||
return is given. The data value is a number of code units, not characters. When
|
return is given. The data value is a number of code units, not characters. When
|
||||||
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||||
for the pattern; if not, an error is generated.
|
for the pattern; if not, an error is generated.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -1281,8 +1318,8 @@ Passing a NULL context
|
||||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||||
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
||||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||||
functions behave correctly in this case (they use default values). This
|
functions behave correctly in this case (they use default values). This
|
||||||
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||||
substitution function.
|
substitution function.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||||
|
@ -1623,7 +1660,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 05 November 2015
|
Last updated: 12 December 2015
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2015 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -127,8 +127,8 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
||||||
strings to be in host byte order.
|
strings to be in host byte order.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A UTF string is checked before any other processing takes place. In the case of
|
A UTF string is checked before any other processing takes place. In the case of
|
||||||
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||||
offset, the check is applied only to that part of the subject that could be
|
offset, the check is applied only to that part of the subject that could be
|
||||||
inspected during matching, and there is a check that the starting offset points
|
inspected during matching, and there is a check that the starting offset points
|
||||||
to the first code unit of a character or to the end of the subject. If there
|
to the first code unit of a character or to the end of the subject. If there
|
||||||
|
|
|
@ -118,9 +118,9 @@ running redundant checks.
|
||||||
.P
|
.P
|
||||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||||
problems, because it may leave the current matching point in the middle of a
|
problems, because it may leave the current matching point in the middle of a
|
||||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||||
application to lock out the use of \eC, causing a compile-time error if it is
|
application to lock out the use of \eC, causing a compile-time error if it is
|
||||||
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||||
disabled.
|
disabled.
|
||||||
.P
|
.P
|
||||||
Another way that performance can be hit is by running a pattern that has a very
|
Another way that performance can be hit is by running a pattern that has a very
|
||||||
|
|
1825
doc/pcre2.txt
1825
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
This function sets, in a compile context, the maximum length (in code units) of
|
This function sets, in a compile context, the maximum length (in code units) of
|
||||||
the pattern that can be compiled. The result is always zero.
|
the pattern that can be compiled. The result is always zero.
|
||||||
.P
|
.P
|
||||||
There is a complete description of the PCRE2 native API in the
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
|
|
@ -58,9 +58,9 @@ The options are:
|
||||||
PCRE2_UTF was set at compile time)
|
PCRE2_UTF was set at compile time)
|
||||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||||
.sp
|
.sp
|
||||||
The function returns the number of substitutions, which may be zero if there
|
The function returns the number of substitutions, which may be zero if there
|
||||||
were no matches. The result can be greater than one only when
|
were no matches. The result can be greater than one only when
|
||||||
|
|
142
doc/pcre2api.3
142
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "12 December 2015" "PCRE2 10.21"
|
.TH PCRE2API 3 "16 December 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -570,7 +570,7 @@ of the following compile-time parameters:
|
||||||
PCRE2's character tables
|
PCRE2's character tables
|
||||||
The newline character sequence
|
The newline character sequence
|
||||||
The compile time nested parentheses limit
|
The compile time nested parentheses limit
|
||||||
The maximum length of the pattern string
|
The maximum length of the pattern string
|
||||||
An external function for stack checking
|
An external function for stack checking
|
||||||
.sp
|
.sp
|
||||||
A compile context is also required if you are using custom memory management.
|
A compile context is also required if you are using custom memory management.
|
||||||
|
@ -618,10 +618,10 @@ in the current locale.
|
||||||
.B " PCRE2_SIZE \fIvalue\fP);"
|
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||||
.fi
|
.fi
|
||||||
.sp
|
.sp
|
||||||
This sets a maximum length, in code units, for the pattern string that is to be
|
This sets a maximum length, in code units, for the pattern string that is to be
|
||||||
compiled. If the pattern is longer, an error is generated. This facility is
|
compiled. If the pattern is longer, an error is generated. This facility is
|
||||||
provided so that applications that accept patterns from external sources can
|
provided so that applications that accept patterns from external sources can
|
||||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||||
can hold, which is effectively unlimited.
|
can hold, which is effectively unlimited.
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
|
@ -678,8 +678,8 @@ of the following match-time parameters:
|
||||||
.sp
|
.sp
|
||||||
A callout function
|
A callout function
|
||||||
The offset limit for matching an unanchored pattern
|
The offset limit for matching an unanchored pattern
|
||||||
The limit for calling \fImatch()\fP
|
The limit for calling \fBmatch()\fP (see below)
|
||||||
The limit for calling \fImatch()\fP recursively
|
The limit for calling \fBmatch()\fP recursively
|
||||||
.sp
|
.sp
|
||||||
A match context is also required if you are using custom memory management.
|
A match context is also required if you are using custom memory management.
|
||||||
If none of these apply, just pass NULL as the context argument of
|
If none of these apply, just pass NULL as the context argument of
|
||||||
|
@ -736,7 +736,7 @@ PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
|
||||||
.P
|
.P
|
||||||
The offset limit facility can be used to track progress when searching large
|
The offset limit facility can be used to track progress when searching large
|
||||||
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
||||||
start within the first line of the subject. If this is set with an offset
|
start within the first line of the subject. If this is set with an offset
|
||||||
limit, a match must occur in the first line and also within the offset limit.
|
limit, a match must occur in the first line and also within the offset limit.
|
||||||
In other words, whichever limit comes first is used.
|
In other words, whichever limit comes first is used.
|
||||||
.sp
|
.sp
|
||||||
|
@ -1228,7 +1228,7 @@ This option locks out the use of \eC in the pattern that is being compiled.
|
||||||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||||
it may leave the current matching point in the middle of a multi-code-unit
|
it may leave the current matching point in the middle of a multi-code-unit
|
||||||
character. This option may be useful in applications that process patterns from
|
character. This option may be useful in applications that process patterns from
|
||||||
external sources. Note that there is also a build-time option that permanently
|
external sources. Note that there is also a build-time option that permanently
|
||||||
locks out the use of \eC.
|
locks out the use of \eC.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_NEVER_UCP
|
PCRE2_NEVER_UCP
|
||||||
|
@ -1565,7 +1565,7 @@ are as follows:
|
||||||
Return a copy of the pattern's options. The third argument should point to a
|
Return a copy of the pattern's options. The third argument should point to a
|
||||||
\fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
\fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
||||||
were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns
|
were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns
|
||||||
the compile options as modified by any top-level option settings such as (*UTF)
|
the compile options as modified by any top-level option settings such as (*UTF)
|
||||||
at the start of the pattern itself. For example, if the pattern /(*UTF)abc/ is
|
at the start of the pattern itself. For example, if the pattern /(*UTF)abc/ is
|
||||||
compiled with the PCRE2_EXTENDED option, the result is PCRE2_EXTENDED and
|
compiled with the PCRE2_EXTENDED option, the result is PCRE2_EXTENDED and
|
||||||
PCRE2_UTF.
|
PCRE2_UTF.
|
||||||
|
@ -1611,8 +1611,9 @@ matches only CR, LF, or CRLF.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_INFO_CAPTURECOUNT
|
PCRE2_INFO_CAPTURECOUNT
|
||||||
.sp
|
.sp
|
||||||
Return the number of capturing subpatterns in the pattern. The third argument
|
Return the highest capturing subpattern number in the pattern. In patterns
|
||||||
should point to an \fBuint32_t\fP variable.
|
where (?| is not used, this is also the total number of capturing subpatterns.
|
||||||
|
The third argument should point to an \fBuint32_t\fP variable.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_INFO_FIRSTBITMAP
|
PCRE2_INFO_FIRSTBITMAP
|
||||||
.sp
|
.sp
|
||||||
|
@ -1629,10 +1630,8 @@ returned. Otherwise NULL is returned. The third argument should point to an
|
||||||
.sp
|
.sp
|
||||||
Return information about the first code unit of any matched string, for a
|
Return information about the first code unit of any matched string, for a
|
||||||
non-anchored pattern. The third argument should point to an \fBuint32_t\fP
|
non-anchored pattern. The third argument should point to an \fBuint32_t\fP
|
||||||
variable.
|
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||||
.P
|
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||||
If there is a fixed first value, for example, the letter "c" from a pattern
|
|
||||||
such as (cat|cow|coyote), 1 is returned, and the character value can be
|
|
||||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||||
it is known that a match can occur only at the start of the subject or
|
it is known that a match can occur only at the start of the subject or
|
||||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||||
|
@ -1676,12 +1675,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any
|
||||||
matched string, other than at its start. The third argument should point to an
|
matched string, other than at its start. The third argument should point to an
|
||||||
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
||||||
returned, the code unit value itself can be retrieved using
|
returned, the code unit value itself can be retrieved using
|
||||||
PCRE2_INFO_LASTCODEUNIT.
|
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||||
.P
|
recorded only if it follows something of variable length. For example, for the
|
||||||
For anchored patterns, a last literal value is recorded only if it follows
|
pattern /^a\ed+z\ed+/ the returned value is 1 (with "z" returned from
|
||||||
something of variable length. For example, for the pattern /^a\ed+z\ed+/ the
|
PCRE2_INFO_LASTCODEUNIT), but for /^a\edz\ed/ the returned value is 0.
|
||||||
returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for
|
|
||||||
/^a\edz\ed/ the returned value is 0.
|
|
||||||
.sp
|
.sp
|
||||||
PCRE2_INFO_LASTCODEUNIT
|
PCRE2_INFO_LASTCODEUNIT
|
||||||
.sp
|
.sp
|
||||||
|
@ -1693,9 +1690,9 @@ value, 0 is returned.
|
||||||
PCRE2_INFO_MATCHEMPTY
|
PCRE2_INFO_MATCHEMPTY
|
||||||
.sp
|
.sp
|
||||||
Return 1 if the pattern might match an empty string, otherwise 0. The third
|
Return 1 if the pattern might match an empty string, otherwise 0. The third
|
||||||
argument should point to an \fBuint32_t\fP variable. When a pattern contains
|
argument should point to an \fBuint32_t\fP variable. When a pattern contains
|
||||||
recursive subroutine calls it is not always possible to determine whether or
|
recursive subroutine calls it is not always possible to determine whether or
|
||||||
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
||||||
in such cases.
|
in such cases.
|
||||||
.sp
|
.sp
|
||||||
PCRE2_INFO_MATCHLIMIT
|
PCRE2_INFO_MATCHLIMIT
|
||||||
|
@ -2181,9 +2178,19 @@ standard convention for the operating system. The default can be overridden in
|
||||||
a
|
a
|
||||||
.\" HTML <a href="#compilecontext">
|
.\" HTML <a href="#compilecontext">
|
||||||
.\" </a>
|
.\" </a>
|
||||||
compile context.
|
compile context
|
||||||
.\"
|
.\"
|
||||||
During matching, the newline choice affects the behaviour of the dot,
|
by calling \fBpcre2_set_newline()\fP. It can also be overridden by starting a
|
||||||
|
pattern string with, for example, (*CRLF), as described in the
|
||||||
|
.\" HTML <a href="pcre2pattern.html#newlines">
|
||||||
|
.\" </a>
|
||||||
|
section on newline conventions
|
||||||
|
.\"
|
||||||
|
in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2pattern\fP
|
||||||
|
.\"
|
||||||
|
page. During matching, the newline choice affects the behaviour of the dot,
|
||||||
circumflex, and dollar metacharacters. It may also alter the way the match
|
circumflex, and dollar metacharacters. It may also alter the way the match
|
||||||
starting position is advanced after a match failure for an unanchored pattern.
|
starting position is advanced after a match failure for an unanchored pattern.
|
||||||
.P
|
.P
|
||||||
|
@ -2229,18 +2236,7 @@ that do not cause substrings to be captured. The \fBpcre2_pattern_info()\fP
|
||||||
function can be used to find out how many capturing subpatterns there are in a
|
function can be used to find out how many capturing subpatterns there are in a
|
||||||
compiled pattern.
|
compiled pattern.
|
||||||
.P
|
.P
|
||||||
A successful match returns the overall matched string and any captured
|
You can use auxiliary functions for accessing captured substrings
|
||||||
substrings to the caller via a vector of PCRE2_SIZE values. This is called the
|
|
||||||
\fBovector\fP, and is contained within the
|
|
||||||
.\" HTML <a href="#matchdatablock">
|
|
||||||
.\" </a>
|
|
||||||
match data block.
|
|
||||||
.\"
|
|
||||||
You can obtain direct access to the ovector by calling
|
|
||||||
\fBpcre2_get_ovector_pointer()\fP to find its address, and
|
|
||||||
\fBpcre2_get_ovector_count()\fP to find the number of pairs of values it
|
|
||||||
contains. Alternatively, you can use the auxiliary functions for accessing
|
|
||||||
captured substrings
|
|
||||||
.\" HTML <a href="#extractbynumber">
|
.\" HTML <a href="#extractbynumber">
|
||||||
.\" </a>
|
.\" </a>
|
||||||
by number
|
by number
|
||||||
|
@ -2248,9 +2244,20 @@ by number
|
||||||
or
|
or
|
||||||
.\" HTML <a href="#extractbyname">
|
.\" HTML <a href="#extractbyname">
|
||||||
.\" </a>
|
.\" </a>
|
||||||
by name
|
by name,
|
||||||
.\"
|
.\"
|
||||||
(see below).
|
as described in sections below.
|
||||||
|
.P
|
||||||
|
Alternatively, you can make direct use of the vector of PCRE2_SIZE values,
|
||||||
|
called the \fBovector\fP, which contains the offsets of captured strings. It is
|
||||||
|
part of the
|
||||||
|
.\" HTML <a href="#matchdatablock">
|
||||||
|
.\" </a>
|
||||||
|
match data block.
|
||||||
|
.\"
|
||||||
|
The function \fBpcre2_get_ovector_pointer()\fP returns the address of the
|
||||||
|
ovector, and \fBpcre2_get_ovector_count()\fP returns the number of pairs of
|
||||||
|
values it contains.
|
||||||
.P
|
.P
|
||||||
Within the ovector, the first in each pair of values is set to the offset of
|
Within the ovector, the first in each pair of values is set to the offset of
|
||||||
the first code unit of a substring, and the second is set to the offset of the
|
the first code unit of a substring, and the second is set to the offset of the
|
||||||
|
@ -2334,7 +2341,12 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
|
||||||
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
||||||
\fBpcre2_get_mark()\fP can be called. It returns a pointer to the
|
\fBpcre2_get_mark()\fP can be called. It returns a pointer to the
|
||||||
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
||||||
returned. After a successful match, the (*MARK) name that is returned is the
|
returned. The length of the (*MARK) name (excluding the terminating zero) is
|
||||||
|
stored in the code unit that precedes the name. You should use this instead of
|
||||||
|
relying on the terminating zero if the (*MARK) name might contain a binary
|
||||||
|
zero.
|
||||||
|
.P
|
||||||
|
After a successful match, the (*MARK) name that is returned is the
|
||||||
last one encountered on the matching path through the pattern. After a "no
|
last one encountered on the matching path through the pattern. After a "no
|
||||||
match" or a partial match, the last encountered (*MARK) name is returned. For
|
match" or a partial match, the last encountered (*MARK) name is returned. For
|
||||||
example, consider this pattern:
|
example, consider this pattern:
|
||||||
|
@ -2353,7 +2365,7 @@ different to the value of \fIovector[0]\fP if the pattern contains the \eK
|
||||||
escape sequence. After a partial match, however, this value is always the same
|
escape sequence. After a partial match, however, this value is always the same
|
||||||
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
||||||
.P
|
.P
|
||||||
After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain
|
After a UTF check failure, \fBpcre2_get_startchar()\fP can be used to obtain
|
||||||
the code unit offset of the invalid UTF character. Details are given in the
|
the code unit offset of the invalid UTF character. Details are given in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2unicode\fP
|
\fBpcre2unicode\fP
|
||||||
|
@ -2692,7 +2704,7 @@ same number causes an error at compile time.
|
||||||
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
||||||
string in \fIoutputbuffer\fP, replacing the part that was matched with the
|
string in \fIoutputbuffer\fP, replacing the part that was matched with the
|
||||||
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
|
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
|
||||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||||
which a \eK item in a lookahead in the pattern causes the match to end before
|
which a \eK item in a lookahead in the pattern causes the match to end before
|
||||||
it starts are not supported, and give rise to an error return.
|
it starts are not supported, and give rise to an error return.
|
||||||
.P
|
.P
|
||||||
|
@ -2706,7 +2718,7 @@ allocate memory for the compiled code.
|
||||||
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
||||||
length, in code units, of the output buffer. If the function is successful, the
|
length, in code units, of the output buffer. If the function is successful, the
|
||||||
value is updated to contain the length of the new string, excluding the
|
value is updated to contain the length of the new string, excluding the
|
||||||
trailing zero that is automatically added.
|
trailing zero that is automatically added.
|
||||||
.P
|
.P
|
||||||
If the function is not successful, the value set via \fIoutlengthptr\fP depends
|
If the function is not successful, the value set via \fIoutlengthptr\fP depends
|
||||||
on the type of error. For syntax errors in the replacement string, the value is
|
on the type of error. For syntax errors in the replacement string, the value is
|
||||||
|
@ -2754,7 +2766,7 @@ advanced by one character except when CRLF is a valid newline sequence and the
|
||||||
next two characters are CR, LF. In this case, the current position is advanced
|
next two characters are CR, LF. In this case, the current position is advanced
|
||||||
by two characters.
|
by two characters.
|
||||||
.P
|
.P
|
||||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||||
this option is set, however, \fBpcre2_substitute()\fP continues to go through
|
this option is set, however, \fBpcre2_substitute()\fP continues to go through
|
||||||
the motions of matching and substituting (without, of course, writing anything)
|
the motions of matching and substituting (without, of course, writing anything)
|
||||||
|
@ -2762,15 +2774,15 @@ in order to compute the size of buffer that is needed. This value is passed
|
||||||
back via the \fIoutlengthptr\fP variable, with the result of the function still
|
back via the \fIoutlengthptr\fP variable, with the result of the function still
|
||||||
being PCRE2_ERROR_NOMEMORY.
|
being PCRE2_ERROR_NOMEMORY.
|
||||||
.P
|
.P
|
||||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||||
is needed for a given substitution. However, this does mean that the entire
|
is needed for a given substitution. However, this does mean that the entire
|
||||||
operation is carried out twice. Depending on the application, it may be more
|
operation is carried out twice. Depending on the application, it may be more
|
||||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||||
.P
|
.P
|
||||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
||||||
not appear in the pattern to be treated as unset groups. This option should be
|
not appear in the pattern to be treated as unset groups. This option should be
|
||||||
used with care, because it means that a typo in a group name or number no
|
used with care, because it means that a typo in a group name or number no
|
||||||
longer causes the PCRE2_ERROR_NOSUBSTRING error.
|
longer causes the PCRE2_ERROR_NOSUBSTRING error.
|
||||||
.P
|
.P
|
||||||
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
|
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
|
||||||
|
@ -2828,8 +2840,8 @@ string remains in force afterwards, as shown in this \fBpcre2test\fP example:
|
||||||
somebody
|
somebody
|
||||||
1: HELLO
|
1: HELLO
|
||||||
.sp
|
.sp
|
||||||
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
||||||
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||||
groups in the extended syntax forms to be treated as unset.
|
groups in the extended syntax forms to be treated as unset.
|
||||||
.P
|
.P
|
||||||
If successful, \fBpcre2_substitute()\fP returns the number of replacements that
|
If successful, \fBpcre2_substitute()\fP returns the number of replacements that
|
||||||
|
@ -2838,7 +2850,7 @@ were made. This may be zero if no matches were found, and is never greater than
|
||||||
.P
|
.P
|
||||||
In the event of an error, a negative error code is returned. Except for
|
In the event of an error, a negative error code is returned. Except for
|
||||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP
|
PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP
|
||||||
are passed straight back.
|
are passed straight back.
|
||||||
.P
|
.P
|
||||||
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||||
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
||||||
|
@ -2849,7 +2861,7 @@ unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple
|
||||||
.P
|
.P
|
||||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
|
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
|
||||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
||||||
needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
||||||
default.
|
default.
|
||||||
.P
|
.P
|
||||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||||
|
@ -2857,7 +2869,7 @@ replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||||
(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket
|
(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket
|
||||||
not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
|
not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
|
||||||
substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
|
substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
|
||||||
started, which can happen if \eK is used in an assertion).
|
started, which can happen if \eK is used in an assertion).
|
||||||
.P
|
.P
|
||||||
As for all PCRE2 errors, a text message that describes the error can be
|
As for all PCRE2 errors, a text message that describes the error can be
|
||||||
obtained by calling \fBpcre2_get_error_message()\fP.
|
obtained by calling \fBpcre2_get_error_message()\fP.
|
||||||
|
@ -2901,14 +2913,14 @@ first and last entries in the name-to-number table for the given name, and the
|
||||||
function returns the length of each entry in code units. In both cases,
|
function returns the length of each entry in code units. In both cases,
|
||||||
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
||||||
.P
|
.P
|
||||||
The format of the name table is described above in the section entitled
|
The format of the name table is described
|
||||||
\fIInformation about a pattern\fP
|
|
||||||
.\" HTML <a href="#infoaboutpattern">
|
.\" HTML <a href="#infoaboutpattern">
|
||||||
.\" </a>
|
.\" </a>
|
||||||
above.
|
above
|
||||||
.\"
|
.\"
|
||||||
Given all the relevant entries for the name, you can extract each of their
|
in the section entitled \fIInformation about a pattern\fP. Given all the
|
||||||
numbers, and hence the captured data.
|
relevant entries for the name, you can extract each of their numbers, and hence
|
||||||
|
the captured data.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "FINDING ALL POSSIBLE MATCHES AT ONE POSITION"
|
.SH "FINDING ALL POSSIBLE MATCHES AT ONE POSITION"
|
||||||
|
@ -3154,6 +3166,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 21 December 2015
|
Last updated: 16 December 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -20,8 +20,8 @@ documentation for details. In these cases the limit is substantially larger.
|
||||||
However, the speed of execution is slower. In the 32-bit library, the internal
|
However, the speed of execution is slower. In the 32-bit library, the internal
|
||||||
linkage size is always 4.
|
linkage size is always 4.
|
||||||
.P
|
.P
|
||||||
The maximum length of a source pattern string is essentially unlimited; it is
|
The maximum length of a source pattern string is essentially unlimited; it is
|
||||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||||
calls \fBpcre2_compile()\fP can specify a smaller limit.
|
calls \fBpcre2_compile()\fP can specify a smaller limit.
|
||||||
.P
|
.P
|
||||||
The maximum length (in code units) of a subject string is one less than the
|
The maximum length (in code units) of a subject string is one less than the
|
||||||
|
|
|
@ -1188,11 +1188,11 @@ When the newline convention (see
|
||||||
.\" </a>
|
.\" </a>
|
||||||
"Newline conventions"
|
"Newline conventions"
|
||||||
.\"
|
.\"
|
||||||
below) recognizes the two-character sequence CRLF as a newline, this is
|
below) recognizes the two-character sequence CRLF as a newline, this is
|
||||||
preferred, even if the single characters CR and LF are also recognized as
|
preferred, even if the single characters CR and LF are also recognized as
|
||||||
newlines. For example, if the newline convention is "any", a multiline mode
|
newlines. For example, if the newline convention is "any", a multiline mode
|
||||||
circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after
|
circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after
|
||||||
CR, even though CR on its own is a valid newline. (It also matches at the very
|
CR, even though CR on its own is a valid newline. (It also matches at the very
|
||||||
start of the string, of course.)
|
start of the string, of course.)
|
||||||
.P
|
.P
|
||||||
Note that the sequences \eA, \eZ, and \ez can be used to match the start and
|
Note that the sequences \eA, \eZ, and \ez can be used to match the start and
|
||||||
|
@ -1245,7 +1245,7 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
||||||
with a malformed UTF character. This has undefined results, because PCRE2
|
with a malformed UTF character. This has undefined results, because PCRE2
|
||||||
assumes that it is matching character by character in a valid UTF string (by
|
assumes that it is matching character by character in a valid UTF string (by
|
||||||
default it checks the subject string's validity at the start of processing
|
default it checks the subject string's validity at the start of processing
|
||||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||||
.P
|
.P
|
||||||
An application can lock out the use of \eC by setting the
|
An application can lock out the use of \eC by setting the
|
||||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||||
|
@ -1257,9 +1257,9 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
||||||
(described below)
|
(described below)
|
||||||
.\"
|
.\"
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind. Neither the alternative matching function
|
the lookbehind. Neither the alternative matching function
|
||||||
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
||||||
former gives a match-time error; the latter fails to optimize and so the match
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
is always run using the interpreter.
|
is always run using the interpreter.
|
||||||
.P
|
.P
|
||||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||||
|
@ -1347,11 +1347,11 @@ inclusive. They can also be used for code points specified numerically, for
|
||||||
example [\e000-\e037]. Ranges can include any characters that are valid for the
|
example [\e000-\e037]. Ranges can include any characters that are valid for the
|
||||||
current mode.
|
current mode.
|
||||||
.P
|
.P
|
||||||
There is a special case in EBCDIC environments for ranges whose end points are
|
There is a special case in EBCDIC environments for ranges whose end points are
|
||||||
both specified as literal letters in the same case. For compatibility with
|
both specified as literal letters in the same case. For compatibility with
|
||||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||||
example, [h-k] matches only four characters, even though the codes for h and k
|
example, [h-k] matches only four characters, even though the codes for h and k
|
||||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||||
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||||
are included.
|
are included.
|
||||||
.P
|
.P
|
||||||
|
@ -1683,7 +1683,7 @@ first one in the pattern with the given number. The following pattern matches
|
||||||
.sp
|
.sp
|
||||||
/(?|(abc)|(def))(?1)/
|
/(?|(abc)|(def))(?1)/
|
||||||
.sp
|
.sp
|
||||||
A relative reference such as (?-1) is no different: it is just a convenient way
|
A relative reference such as (?-1) is no different: it is just a convenient way
|
||||||
of computing an absolute group number.
|
of computing an absolute group number.
|
||||||
.P
|
.P
|
||||||
If a
|
If a
|
||||||
|
@ -2549,7 +2549,7 @@ For example:
|
||||||
(?(VERSION>=10.4)yes|no)
|
(?(VERSION>=10.4)yes|no)
|
||||||
.sp
|
.sp
|
||||||
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
||||||
"no" otherwise. The fractional part of the version number may not contain more
|
"no" otherwise. The fractional part of the version number may not contain more
|
||||||
than two digits.
|
than two digits.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -2667,21 +2667,21 @@ pattern above you can write (?-2) to refer to the second most recently opened
|
||||||
parentheses preceding the recursion. In other words, a negative number counts
|
parentheses preceding the recursion. In other words, a negative number counts
|
||||||
capturing parentheses leftwards from the point at which it is encountered.
|
capturing parentheses leftwards from the point at which it is encountered.
|
||||||
.P
|
.P
|
||||||
Be aware however, that if
|
Be aware however, that if
|
||||||
.\" HTML <a href="#dupsubpatternnumber">
|
.\" HTML <a href="#dupsubpatternnumber">
|
||||||
.\" </a>
|
.\" </a>
|
||||||
duplicate subpattern numbers
|
duplicate subpattern numbers
|
||||||
.\"
|
.\"
|
||||||
are in use, relative references refer to the earliest subpattern with the
|
are in use, relative references refer to the earliest subpattern with the
|
||||||
appropriate number. Consider, for example:
|
appropriate number. Consider, for example:
|
||||||
.sp
|
.sp
|
||||||
(?|(a)|(b)) (c) (?-2)
|
(?|(a)|(b)) (c) (?-2)
|
||||||
.sp
|
.sp
|
||||||
The first two capturing groups (a) and (b) are both numbered 1, and group (c)
|
The first two capturing groups (a) and (b) are both numbered 1, and group (c)
|
||||||
is number 2. When the reference (?-2) is encountered, the second most recently
|
is number 2. When the reference (?-2) is encountered, the second most recently
|
||||||
opened parentheses has the number 1, but it is the first such group (the (a)
|
opened parentheses has the number 1, but it is the first such group (the (a)
|
||||||
group) to which the recursion refers. This would be the same if an absolute
|
group) to which the recursion refers. This would be the same if an absolute
|
||||||
reference (?1) was used. In other words, relative references are just a
|
reference (?1) was used. In other words, relative references are just a
|
||||||
shorthand for computing a group number.
|
shorthand for computing a group number.
|
||||||
.P
|
.P
|
||||||
It is also possible to refer to subsequently opened parentheses, by writing
|
It is also possible to refer to subsequently opened parentheses, by writing
|
||||||
|
@ -2988,13 +2988,13 @@ parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
||||||
depending on whether or not a name is present.
|
depending on whether or not a name is present.
|
||||||
.P
|
.P
|
||||||
By default, for compatibility with Perl, a name is any sequence of characters
|
By default, for compatibility with Perl, a name is any sequence of characters
|
||||||
that does not include a closing parenthesis. The name is not processed in
|
that does not include a closing parenthesis. The name is not processed in
|
||||||
any way, and it is not possible to include a closing parenthesis in the name.
|
any way, and it is not possible to include a closing parenthesis in the name.
|
||||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||||
the name. A closing parenthesis can be included in a name either as \e) or
|
the name. A closing parenthesis can be included in a name either as \e) or
|
||||||
between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||||
of the pattern.
|
of the pattern.
|
||||||
.P
|
.P
|
||||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||||
|
|
|
@ -174,7 +174,7 @@ Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and
|
||||||
PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is
|
PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is
|
||||||
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
||||||
the POSIX API, passing REG_NEWLINE to PCRE2's \fBregcomp()\fP function
|
the POSIX API, passing REG_NEWLINE to PCRE2's \fBregcomp()\fP function
|
||||||
causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL
|
causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL
|
||||||
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -211,7 +211,7 @@ to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
|
||||||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||||
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
|
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
|
||||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||||
how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
|
how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
|
||||||
mutually exclusive; the error REG_INVARG is returned.
|
mutually exclusive; the error REG_INVARG is returned.
|
||||||
.P
|
.P
|
||||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||||
|
|
|
@ -140,13 +140,13 @@ on a system with different endianness.
|
||||||
.P
|
.P
|
||||||
Decoded patterns can be used for matching in the usual way, and must be freed
|
Decoded patterns can be used for matching in the usual way, and must be freed
|
||||||
by calling \fBpcre2_code_free()\fP. However, be aware that there is a potential
|
by calling \fBpcre2_code_free()\fP. However, be aware that there is a potential
|
||||||
race issue if you are using multiple patterns that were decoded from a single
|
race issue if you are using multiple patterns that were decoded from a single
|
||||||
byte stream in a multithreaded application. A single copy of the character
|
byte stream in a multithreaded application. A single copy of the character
|
||||||
tables is used by all the decoded patterns and a reference count is used to
|
tables is used by all the decoded patterns and a reference count is used to
|
||||||
arrange for its memory to be automatically freed when the last pattern is
|
arrange for its memory to be automatically freed when the last pattern is
|
||||||
freed, but there is no locking on this reference count. Therefore, if you want
|
freed, but there is no locking on this reference count. Therefore, if you want
|
||||||
to call \fBpcre2_code_free()\fP for these patterns in different threads, you
|
to call \fBpcre2_code_free()\fP for these patterns in different threads, you
|
||||||
must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot
|
must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot
|
||||||
be called by two threads at the same time.
|
be called by two threads at the same time.
|
||||||
.P
|
.P
|
||||||
If a pattern was processed by \fBpcre2_jit_compile()\fP before being
|
If a pattern was processed by \fBpcre2_jit_compile()\fP before being
|
||||||
|
|
|
@ -83,7 +83,7 @@ it matches a literal "u".
|
||||||
.sp
|
.sp
|
||||||
\eC is dangerous because it may leave the current matching point in the middle
|
\eC is dangerous because it may leave the current matching point in the middle
|
||||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
||||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||||
with the use of \eC permanently disabled.
|
with the use of \eC permanently disabled.
|
||||||
.P
|
.P
|
||||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||||
|
|
|
@ -444,12 +444,12 @@ the start of a modifier list. For example:
|
||||||
.sp
|
.sp
|
||||||
abc\e=notbol,notempty
|
abc\e=notbol,notempty
|
||||||
.sp
|
.sp
|
||||||
If the subject string is empty and \e= is followed by whitespace, the line is
|
If the subject string is empty and \e= is followed by whitespace, the line is
|
||||||
treated as a comment line, and is not used for matching. For example:
|
treated as a comment line, and is not used for matching. For example:
|
||||||
.sp
|
.sp
|
||||||
\e= This is a comment.
|
\e= This is a comment.
|
||||||
abc\e= This is an invalid modifier list.
|
abc\e= This is an invalid modifier list.
|
||||||
.sp
|
.sp
|
||||||
A backslash followed by any other non-alphanumeric character just escapes that
|
A backslash followed by any other non-alphanumeric character just escapes that
|
||||||
character. A backslash followed by anything else causes an error. However, if
|
character. A backslash followed by anything else causes an error. However, if
|
||||||
the very last character in the line is a backslash (and there is no modifier
|
the very last character in the line is a backslash (and there is no modifier
|
||||||
|
@ -501,7 +501,7 @@ for a description of their effects.
|
||||||
no_utf_check set PCRE2_NO_UTF_CHECK
|
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||||
ucp set PCRE2_UCP
|
ucp set PCRE2_UCP
|
||||||
ungreedy set PCRE2_UNGREEDY
|
ungreedy set PCRE2_UNGREEDY
|
||||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
.sp
|
.sp
|
||||||
As well as turning on the PCRE2_UTF option, the \fButf\fP modifier causes all
|
As well as turning on the PCRE2_UTF option, the \fButf\fP modifier causes all
|
||||||
|
@ -528,7 +528,7 @@ about the pattern:
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
locale=<name> use this locale
|
locale=<name> use this locale
|
||||||
max_pattern_length=<n> set the maximum pattern length
|
max_pattern_length=<n> set the maximum pattern length
|
||||||
memory show memory used
|
memory show memory used
|
||||||
newline=<type> set newline type
|
newline=<type> set newline type
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
|
@ -608,9 +608,9 @@ by the item that follows it in the pattern.
|
||||||
.SS "Passing a NULL context"
|
.SS "Passing a NULL context"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If
|
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If
|
||||||
the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||||
testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses
|
testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -634,9 +634,9 @@ actual length of the pattern is passed.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
Some tests use long patterns that are very repetitive. Instead of creating a
|
Some tests use long patterns that are very repetitive. Instead of creating a
|
||||||
very long input line for such a pattern, you can use a special repetition
|
very long input line for such a pattern, you can use a special repetition
|
||||||
feature, similar to the one described for subject lines above. If the
|
feature, similar to the one described for subject lines above. If the
|
||||||
\fBexpand\fP modifier is present on a pattern, parts of the pattern that have
|
\fBexpand\fP modifier is present on a pattern, parts of the pattern that have
|
||||||
the form
|
the form
|
||||||
.sp
|
.sp
|
||||||
\e[<characters>]{<count>}
|
\e[<characters>]{<count>}
|
||||||
|
@ -647,12 +647,12 @@ cannot be nested. An initial "\e[" sequence is recognized only if "]{" followed
|
||||||
by decimal digits and "}" is found later in the pattern. If not, the characters
|
by decimal digits and "}" is found later in the pattern. If not, the characters
|
||||||
remain in the pattern unaltered.
|
remain in the pattern unaltered.
|
||||||
.P
|
.P
|
||||||
If part of an expanded pattern looks like an expansion, but is really part of
|
If part of an expanded pattern looks like an expansion, but is really part of
|
||||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||||
the quantifier. For example, \e[AB]{6000,6000} is not recognized as an
|
the quantifier. For example, \e[AB]{6000,6000} is not recognized as an
|
||||||
expansion item.
|
expansion item.
|
||||||
.P
|
.P
|
||||||
If the \fBinfo\fP modifier is set on an expanded pattern, the result of the
|
If the \fBinfo\fP modifier is set on an expanded pattern, the result of the
|
||||||
expansion is included in the information that is output.
|
expansion is included in the information that is output.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -771,9 +771,9 @@ suite.
|
||||||
.SS "Limiting the pattern length"
|
.SS "Limiting the pattern length"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the
|
The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the
|
||||||
length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit
|
length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit
|
||||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||||
variable can hold (essentially unlimited).
|
variable can hold (essentially unlimited).
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -797,13 +797,13 @@ modifiers set options for the \fBregcomp()\fP function:
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
utf REG_UTF8 )
|
utf REG_UTF8 )
|
||||||
.sp
|
.sp
|
||||||
The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that
|
The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that
|
||||||
is passed to \fBregerror()\fP in the event of a compilation error. For example:
|
is passed to \fBregerror()\fP in the event of a compilation error. For example:
|
||||||
.sp
|
.sp
|
||||||
/abc/posix,regerror_buffsize=20
|
/abc/posix,regerror_buffsize=20
|
||||||
.sp
|
.sp
|
||||||
This provides a means of testing the behaviour of \fBregerror()\fP when the
|
This provides a means of testing the behaviour of \fBregerror()\fP when the
|
||||||
buffer is too small for the error message. If this modifier has not been set, a
|
buffer is too small for the error message. If this modifier has not been set, a
|
||||||
large buffer is used.
|
large buffer is used.
|
||||||
.P
|
.P
|
||||||
The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described
|
The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described
|
||||||
|
@ -863,9 +863,9 @@ compilation process.
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
.sp
|
.sp
|
||||||
These modifiers may not appear in a \fB#pattern\fP command. If you want them as
|
These modifiers may not appear in a \fB#pattern\fP command. If you want them as
|
||||||
defaults, set them in a \fB#subject\fP command.
|
defaults, set them in a \fB#subject\fP command.
|
||||||
|
@ -956,7 +956,7 @@ pattern.
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=<n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
null_context match with a NULL context
|
null_context match with a NULL context
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
offset_limit=<n> set offset limit
|
offset_limit=<n> set offset limit
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
|
@ -965,9 +965,9 @@ pattern.
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
startoffset=<n> same as offset=<n>
|
startoffset=<n> same as offset=<n>
|
||||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
zero_terminate pass the subject as zero-terminated
|
zero_terminate pass the subject as zero-terminated
|
||||||
.sp
|
.sp
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
@ -1102,7 +1102,7 @@ by name.
|
||||||
If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is
|
If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is
|
||||||
called instead of one of the matching functions. Note that replacement strings
|
called instead of one of the matching functions. Note that replacement strings
|
||||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||||
not thought to be an issue in a test program.
|
not thought to be an issue in a test program.
|
||||||
.P
|
.P
|
||||||
Unlike subject strings, \fBpcre2test\fP does not process replacement strings
|
Unlike subject strings, \fBpcre2test\fP does not process replacement strings
|
||||||
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
||||||
|
@ -1119,7 +1119,7 @@ for \fBpcre2_substitute()\fP:
|
||||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
.sp
|
.sp
|
||||||
.P
|
.P
|
||||||
After a successful substitution, the modified string is output, preceded by the
|
After a successful substitution, the modified string is output, preceded by the
|
||||||
number of replacements. This may be zero if there were no matches. Here is a
|
number of replacements. This may be zero if there were no matches. Here is a
|
||||||
|
@ -1230,10 +1230,10 @@ matching starts. Its value is a number of code units, not characters.
|
||||||
.SS "Setting an offset limit"
|
.SS "Setting an offset limit"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match
|
The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match
|
||||||
cannot be found starting at or before this offset in the subject, a "no match"
|
cannot be found starting at or before this offset in the subject, a "no match"
|
||||||
return is given. The data value is a number of code units, not characters. When
|
return is given. The data value is a number of code units, not characters. When
|
||||||
this modifier is used, the \fBuse_offset_limit\fP modifier must have been set
|
this modifier is used, the \fBuse_offset_limit\fP modifier must have been set
|
||||||
for the pattern; if not, an error is generated.
|
for the pattern; if not, an error is generated.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -1273,8 +1273,8 @@ passing the replacement string as zero-terminated.
|
||||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||||
\fBpcre2_dfa_match()\fP or \fBpcre2_jit_match()\fP. If the \fBnull_context\fP
|
\fBpcre2_dfa_match()\fP or \fBpcre2_jit_match()\fP. If the \fBnull_context\fP
|
||||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||||
functions behave correctly in this case (they use default values). This
|
functions behave correctly in this case (they use default values). This
|
||||||
modifier cannot be used with the \fBfind_limits\fP modifier or when testing the
|
modifier cannot be used with the \fBfind_limits\fP modifier or when testing the
|
||||||
substitution function.
|
substitution function.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
|
|
@ -797,14 +797,18 @@ PATTERN MODIFIERS
|
||||||
with that pattern. They may not appear in #pattern commands. These mod-
|
with that pattern. They may not appear in #pattern commands. These mod-
|
||||||
ifiers do not affect the compilation process.
|
ifiers do not affect the compilation process.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text
|
allusedtext show all consulted text
|
||||||
/g global global matching
|
/g global global matching
|
||||||
mark show mark values
|
mark show mark values
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
|
||||||
These modifiers may not appear in a #pattern command. If you want them
|
These modifiers may not appear in a #pattern command. If you want them
|
||||||
as defaults, set them in a #subject command.
|
as defaults, set them in a #subject command.
|
||||||
|
@ -860,33 +864,38 @@ SUBJECT MODIFIERS
|
||||||
line (see above), in which case they apply to every subject line that
|
line (see above), in which case they apply to every subject line that
|
||||||
is matched against that pattern.
|
is matched against that pattern.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
allaftertext show text after captures
|
allaftertext show text after captures
|
||||||
allcaptures show all captures
|
allcaptures show all captures
|
||||||
allusedtext show all consulted text (non-JIT only)
|
allusedtext show all consulted text (non-JIT only)
|
||||||
altglobal alternative global matching
|
altglobal alternative global matching
|
||||||
callout_capture show captures at callout time
|
callout_capture show captures at callout time
|
||||||
callout_data=<n> set a value to pass via callouts
|
callout_data=<n> set a value to pass via callouts
|
||||||
callout_fail=<n>[:<m>] control callout failure
|
callout_fail=<n>[:<m>] control callout failure
|
||||||
callout_none do not supply a callout function
|
callout_none do not supply a callout function
|
||||||
copy=<number or name> copy captured substring
|
copy=<number or name> copy captured substring
|
||||||
dfa use pcre2_dfa_match()
|
dfa use pcre2_dfa_match()
|
||||||
find_limits find match and recursion limits
|
find_limits find match and recursion limits
|
||||||
get=<number or name> extract captured substring
|
get=<number or name> extract captured substring
|
||||||
getall extract all captured substrings
|
getall extract all captured substrings
|
||||||
/g global global matching
|
/g global global matching
|
||||||
jitstack=<n> set size of JIT stack
|
jitstack=<n> set size of JIT stack
|
||||||
mark show mark values
|
mark show mark values
|
||||||
match_limit=<n> set a match limit
|
match_limit=<n> set a match limit
|
||||||
memory show memory usage
|
memory show memory usage
|
||||||
null_context match with a NULL context
|
null_context match with a NULL context
|
||||||
offset=<n> set starting offset
|
offset=<n> set starting offset
|
||||||
offset_limit=<n> set offset limit
|
offset_limit=<n> set offset limit
|
||||||
ovector=<n> set size of output vector
|
ovector=<n> set size of output vector
|
||||||
recursion_limit=<n> set a recursion limit
|
recursion_limit=<n> set a recursion limit
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show startchar when relevant
|
startchar show startchar when relevant
|
||||||
zero_terminate pass the subject as zero-terminated
|
startoffset=<n> same as offset=<n>
|
||||||
|
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
zero_terminate pass the subject as zero-terminated
|
||||||
|
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
|
||||||
|
@ -1011,19 +1020,30 @@ SUBJECT MODIFIERS
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
|
|
||||||
If the replace modifier is set, the pcre2_substitute() function is
|
If the replace modifier is set, the pcre2_substitute() function is
|
||||||
called instead of one of the matching functions. Unlike subject
|
called instead of one of the matching functions. Note that replacement
|
||||||
strings, pcre2test does not process replacement strings for escape
|
strings cannot contain commas, because a comma signifies the end of a
|
||||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
modifier. This is not thought to be an issue in a test program.
|
||||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
|
||||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
|
||||||
the individual code units are copied directly. This provides a means of
|
|
||||||
passing an invalid UTF-8 string for testing purposes.
|
|
||||||
|
|
||||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
Unlike subject strings, pcre2test does not process replacement strings
|
||||||
pcre2_substitute(). After a successful substitution, the modified
|
for escape sequences. In UTF mode, a replacement string is checked to
|
||||||
string is output, preceded by the number of replacements. This may be
|
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||||
zero if there were no matches. Here is a simple example of a substitu-
|
a UTF string of the appropriate code unit width. If it is not a valid
|
||||||
tion test:
|
UTF-8 string, the individual code units are copied directly. This pro-
|
||||||
|
vides a means of passing an invalid UTF-8 string for testing purposes.
|
||||||
|
|
||||||
|
The following modifiers set options (in addition to the normal match
|
||||||
|
options) for pcre2_substitute():
|
||||||
|
|
||||||
|
global PCRE2_SUBSTITUTE_GLOBAL
|
||||||
|
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||||
|
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||||
|
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
|
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
|
||||||
|
|
||||||
|
After a successful substitution, the modified string is output, pre-
|
||||||
|
ceded by the number of replacements. This may be zero if there were no
|
||||||
|
matches. Here is a simple example of a substitution test:
|
||||||
|
|
||||||
/abc/replace=xxx
|
/abc/replace=xxx
|
||||||
=abc=abc=
|
=abc=abc=
|
||||||
|
@ -1031,12 +1051,13 @@ SUBJECT MODIFIERS
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
|
|
||||||
Subject and replacement strings should be kept relatively short for
|
Subject and replacement strings should be kept relatively short (fewer
|
||||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||||
test for buffer overflow, if the replacement string starts with a num-
|
used. To make it easy to test for buffer overflow, if the replacement
|
||||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
string starts with a number in square brackets, that number is passed
|
||||||
the size of the output buffer, with the replacement string starting at
|
to pcre2_substitute() as the size of the output buffer, with the
|
||||||
the next character. Here is an example that tests the edge case:
|
replacement string starting at the next character. Here is an example
|
||||||
|
that tests the edge case:
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
123abc123\=replace=[10]XYZ
|
123abc123\=replace=[10]XYZ
|
||||||
|
@ -1044,6 +1065,19 @@ SUBJECT MODIFIERS
|
||||||
123abc123\=replace=[9]XYZ
|
123abc123\=replace=[9]XYZ
|
||||||
Failed: error -47: no more memory
|
Failed: error -47: no more memory
|
||||||
|
|
||||||
|
The default action of pcre2_substitute() is to return
|
||||||
|
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
||||||
|
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
||||||
|
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
||||||
|
through the motions of matching and substituting, in order to compute
|
||||||
|
the size of buffer that is required. When this happens, pcre2test shows
|
||||||
|
the required buffer length (which includes space for the trailing zero)
|
||||||
|
as part of the error message. For example:
|
||||||
|
|
||||||
|
/abc/substitute_overflow_length
|
||||||
|
123abc123\=replace=[9]XYZ
|
||||||
|
Failed: error -47: no more memory: 10 code units are needed
|
||||||
|
|
||||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||||
partial matching provokes an error return ("bad option value") from
|
partial matching provokes an error return ("bad option value") from
|
||||||
pcre2_substitute().
|
pcre2_substitute().
|
||||||
|
@ -1471,5 +1505,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 05 November 2015
|
Last updated: 12 December 2015
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
|
|
|
@ -118,8 +118,8 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
||||||
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
||||||
strings to be in host byte order.
|
strings to be in host byte order.
|
||||||
.P
|
.P
|
||||||
A UTF string is checked before any other processing takes place. In the case of
|
A UTF string is checked before any other processing takes place. In the case of
|
||||||
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
|
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
|
||||||
offset, the check is applied only to that part of the subject that could be
|
offset, the check is applied only to that part of the subject that could be
|
||||||
inspected during matching, and there is a check that the starting offset points
|
inspected during matching, and there is a check that the starting offset points
|
||||||
to the first code unit of a character or to the end of the subject. If there
|
to the first code unit of a character or to the end of the subject. If there
|
||||||
|
|
|
@ -211,7 +211,7 @@ for (;;)
|
||||||
|
|
||||||
last if ($_ eq "");
|
last if ($_ eq "");
|
||||||
next if $_ =~ /^\\=(?:\s|$)/; # Comment line
|
next if $_ =~ /^\\=(?:\s|$)/; # Comment line
|
||||||
|
|
||||||
$x = eval "\"$_\""; # To get escapes processed
|
$x = eval "\"$_\""; # To get escapes processed
|
||||||
|
|
||||||
# Empty array for holding results, ensure $REGERROR and $REGMARK are
|
# Empty array for holding results, ensure $REGERROR and $REGMARK are
|
||||||
|
|
|
@ -44,7 +44,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define PCRE2_MAJOR 10
|
#define PCRE2_MAJOR 10
|
||||||
#define PCRE2_MINOR 21
|
#define PCRE2_MINOR 21
|
||||||
#define PCRE2_PRERELEASE -RC1
|
#define PCRE2_PRERELEASE -RC1
|
||||||
#define PCRE2_DATE 2015-07-06
|
#define PCRE2_DATE 2015-12-15
|
||||||
|
|
||||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||||
imported have to be identified as such. When building PCRE2, the appropriate
|
imported have to be identified as such. When building PCRE2, the appropriate
|
||||||
|
|
|
@ -466,7 +466,7 @@ if (*first_op == OP_REVERSE)
|
||||||
/* In byte-mode we can do this quickly. */
|
/* In byte-mode we can do this quickly. */
|
||||||
|
|
||||||
{
|
{
|
||||||
size_t current_offset = (size_t)(current_subject - start_subject);
|
size_t current_offset = (size_t)(current_subject - start_subject);
|
||||||
gone_back = (current_offset < max_back)? current_offset : max_back;
|
gone_back = (current_offset < max_back)? current_offset : max_back;
|
||||||
current_subject -= gone_back;
|
current_subject -= gone_back;
|
||||||
}
|
}
|
||||||
|
|
|
@ -251,7 +251,7 @@ static const char match_error_texts[] =
|
||||||
"bad substitution in replacement string\0"
|
"bad substitution in replacement string\0"
|
||||||
/* 60 */
|
/* 60 */
|
||||||
"match with end before start is not supported\0"
|
"match with end before start is not supported\0"
|
||||||
"too many replacements (more than INT_MAX)\0"
|
"too many replacements (more than INT_MAX)\0"
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -562,7 +562,7 @@ typedef struct pcre2_real_compile_context {
|
||||||
int (*stack_guard)(uint32_t, void *);
|
int (*stack_guard)(uint32_t, void *);
|
||||||
void *stack_guard_data;
|
void *stack_guard_data;
|
||||||
const uint8_t *tables;
|
const uint8_t *tables;
|
||||||
PCRE2_SIZE max_pattern_length;
|
PCRE2_SIZE max_pattern_length;
|
||||||
uint16_t bsr_convention;
|
uint16_t bsr_convention;
|
||||||
uint16_t newline_convention;
|
uint16_t newline_convention;
|
||||||
uint32_t parens_nest_limit;
|
uint32_t parens_nest_limit;
|
||||||
|
@ -581,7 +581,7 @@ typedef struct pcre2_real_match_context {
|
||||||
#endif
|
#endif
|
||||||
int (*callout)(pcre2_callout_block *, void *);
|
int (*callout)(pcre2_callout_block *, void *);
|
||||||
void *callout_data;
|
void *callout_data;
|
||||||
PCRE2_SIZE offset_limit;
|
PCRE2_SIZE offset_limit;
|
||||||
uint32_t match_limit;
|
uint32_t match_limit;
|
||||||
uint32_t recursion_limit;
|
uint32_t recursion_limit;
|
||||||
} pcre2_real_match_context;
|
} pcre2_real_match_context;
|
||||||
|
@ -592,7 +592,7 @@ copying the size from possibly unaligned memory into a variable of the same
|
||||||
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
||||||
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
||||||
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
||||||
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||||
here.) */
|
here.) */
|
||||||
|
|
||||||
#undef CODE_BLOCKSIZE_TYPE
|
#undef CODE_BLOCKSIZE_TYPE
|
||||||
|
@ -660,7 +660,7 @@ typedef struct recurse_check {
|
||||||
typedef struct recurse_cache {
|
typedef struct recurse_cache {
|
||||||
PCRE2_SPTR group;
|
PCRE2_SPTR group;
|
||||||
int recno;
|
int recno;
|
||||||
} recurse_cache;
|
} recurse_cache;
|
||||||
|
|
||||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||||
branches, for testing for left recursion while compiling. */
|
branches, for testing for left recursion while compiling. */
|
||||||
|
@ -693,7 +693,7 @@ typedef struct compile_block {
|
||||||
PCRE2_SPTR start_code; /* The start of the compiled code */
|
PCRE2_SPTR start_code; /* The start of the compiled code */
|
||||||
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
||||||
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
||||||
PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
|
PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
|
||||||
PCRE2_UCHAR *name_table; /* The name/number table */
|
PCRE2_UCHAR *name_table; /* The name/number table */
|
||||||
size_t workspace_size; /* Size of workspace */
|
size_t workspace_size; /* Size of workspace */
|
||||||
uint16_t names_found; /* Number of entries so far */
|
uint16_t names_found; /* Number of entries so far */
|
||||||
|
@ -717,7 +717,7 @@ typedef struct compile_block {
|
||||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||||
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
||||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||||
BOOL check_lookbehind; /* Lookbehinds need later checking */
|
BOOL check_lookbehind; /* Lookbehinds need later checking */
|
||||||
BOOL dupnames; /* Duplicate names exist */
|
BOOL dupnames; /* Duplicate names exist */
|
||||||
BOOL iscondassert; /* Next assert is a condition */
|
BOOL iscondassert; /* Next assert is a condition */
|
||||||
|
|
|
@ -2409,7 +2409,7 @@ for (;;)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Match a single code unit, even in UTF-8 mode. This opcode really does
|
/* Match a single code unit, even in UTF-8 mode. This opcode really does
|
||||||
match any code unit, even newline. (It really should be called ANYCODEUNIT,
|
match any code unit, even newline. (It really should be called ANYCODEUNIT,
|
||||||
of course - the byte name is from pre-16 bit days.) */
|
of course - the byte name is from pre-16 bit days.) */
|
||||||
|
|
||||||
case OP_ANYBYTE:
|
case OP_ANYBYTE:
|
||||||
|
|
|
@ -77,7 +77,7 @@ if (where == NULL) /* Requests field length */
|
||||||
case PCRE2_INFO_CAPTURECOUNT:
|
case PCRE2_INFO_CAPTURECOUNT:
|
||||||
case PCRE2_INFO_FIRSTCODETYPE:
|
case PCRE2_INFO_FIRSTCODETYPE:
|
||||||
case PCRE2_INFO_FIRSTCODEUNIT:
|
case PCRE2_INFO_FIRSTCODEUNIT:
|
||||||
case PCRE2_INFO_HASBACKSLASHC:
|
case PCRE2_INFO_HASBACKSLASHC:
|
||||||
case PCRE2_INFO_HASCRORLF:
|
case PCRE2_INFO_HASCRORLF:
|
||||||
case PCRE2_INFO_JCHANGED:
|
case PCRE2_INFO_JCHANGED:
|
||||||
case PCRE2_INFO_LASTCODETYPE:
|
case PCRE2_INFO_LASTCODETYPE:
|
||||||
|
|
|
@ -190,13 +190,13 @@ return 0;
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* These take no account of UTF as they always print each individual code unit.
|
/* These take no account of UTF as they always print each individual code unit.
|
||||||
The string is zero-terminated for print_custring(); the length is given for
|
The string is zero-terminated for print_custring(); the length is given for
|
||||||
print_custring_bylen().
|
print_custring_bylen().
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
f file to write to
|
f file to write to
|
||||||
ptr point to the string
|
ptr point to the string
|
||||||
len length for print_custring_bylen()
|
len length for print_custring_bylen()
|
||||||
|
|
||||||
Returns: nothing
|
Returns: nothing
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -1546,7 +1546,7 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
|
||||||
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
|
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Find the minimum length of subject string. If it can match an empty string,
|
/* Find the minimum length of subject string. If it can match an empty string,
|
||||||
the minimum length is already known. */
|
the minimum length is already known. */
|
||||||
|
|
||||||
if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
|
if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
|
||||||
|
@ -1555,19 +1555,19 @@ if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
|
||||||
{
|
{
|
||||||
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
|
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
|
||||||
break; /* Leave minlength unchanged (will be zero) */
|
break; /* Leave minlength unchanged (will be zero) */
|
||||||
|
|
||||||
case -2:
|
case -2:
|
||||||
return 2; /* missing capturing bracket */
|
return 2; /* missing capturing bracket */
|
||||||
|
|
||||||
case -3:
|
case -3:
|
||||||
return 3; /* unrecognized opcode */
|
return 3; /* unrecognized opcode */
|
||||||
|
|
||||||
default:
|
default:
|
||||||
if (min > UINT16_MAX) min = UINT16_MAX;
|
if (min > UINT16_MAX) min = UINT16_MAX;
|
||||||
re->minlength = min;
|
re->minlength = min;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -136,7 +136,7 @@ for (p = string; length > 0; p++)
|
||||||
register uint32_t ab, d;
|
register uint32_t ab, d;
|
||||||
|
|
||||||
c = *p;
|
c = *p;
|
||||||
length--;
|
length--;
|
||||||
|
|
||||||
if (c < 128) continue; /* ASCII character */
|
if (c < 128) continue; /* ASCII character */
|
||||||
|
|
||||||
|
@ -329,7 +329,7 @@ PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
|
||||||
for (p = string; length > 0; p++)
|
for (p = string; length > 0; p++)
|
||||||
{
|
{
|
||||||
c = *p;
|
c = *p;
|
||||||
length--;
|
length--;
|
||||||
|
|
||||||
if ((c & 0xf800) != 0xd800)
|
if ((c & 0xf800) != 0xd800)
|
||||||
{
|
{
|
||||||
|
|
|
@ -285,7 +285,7 @@ start location rather than being passed as a PCRE2 "starting offset". */
|
||||||
|
|
||||||
if ((eflags & REG_STARTEND) != 0)
|
if ((eflags & REG_STARTEND) != 0)
|
||||||
{
|
{
|
||||||
if (pmatch == NULL) return REG_INVARG;
|
if (pmatch == NULL) return REG_INVARG;
|
||||||
so = pmatch[0].rm_so;
|
so = pmatch[0].rm_so;
|
||||||
eo = pmatch[0].rm_eo;
|
eo = pmatch[0].rm_eo;
|
||||||
}
|
}
|
||||||
|
|
|
@ -6033,14 +6033,14 @@ if (dat_datctl.replacement[0] != 0)
|
||||||
|
|
||||||
if (rc < 0)
|
if (rc < 0)
|
||||||
{
|
{
|
||||||
PCRE2_SIZE msize;
|
PCRE2_SIZE msize;
|
||||||
fprintf(outfile, "Failed: error %d", rc);
|
fprintf(outfile, "Failed: error %d", rc);
|
||||||
if (rc != PCRE2_ERROR_NOMEMORY && nsize != PCRE2_UNSET)
|
if (rc != PCRE2_ERROR_NOMEMORY && nsize != PCRE2_UNSET)
|
||||||
fprintf(outfile, " at offset %ld in replacement", (long int)nsize);
|
fprintf(outfile, " at offset %ld in replacement", (long int)nsize);
|
||||||
fprintf(outfile, ": ");
|
fprintf(outfile, ": ");
|
||||||
PCRE2_GET_ERROR_MESSAGE(msize, rc, pbuffer);
|
PCRE2_GET_ERROR_MESSAGE(msize, rc, pbuffer);
|
||||||
PCHARSV(CASTVAR(void *, pbuffer), 0, msize, FALSE, outfile);
|
PCHARSV(CASTVAR(void *, pbuffer), 0, msize, FALSE, outfile);
|
||||||
if (rc == PCRE2_ERROR_NOMEMORY &&
|
if (rc == PCRE2_ERROR_NOMEMORY &&
|
||||||
(xoptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)
|
(xoptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)
|
||||||
fprintf(outfile, ": %ld code units are needed", (long int)nsize);
|
fprintf(outfile, ": %ld code units are needed", (long int)nsize);
|
||||||
}
|
}
|
||||||
|
@ -6405,7 +6405,7 @@ else for (gmatched = 0;; gmatched++)
|
||||||
TESTFLD(match_data, mark, !=, NULL))
|
TESTFLD(match_data, mark, !=, NULL))
|
||||||
{
|
{
|
||||||
fprintf(outfile, ", mark=");
|
fprintf(outfile, ", mark=");
|
||||||
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf,
|
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf,
|
||||||
outfile);
|
outfile);
|
||||||
rubriclength += 7;
|
rubriclength += 7;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue