File tidies, version updates, etc. for 10.21-RC1
This commit is contained in:
parent
293da188aa
commit
dffd559601
|
@ -258,7 +258,7 @@ ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
|
|||
|
||||
IF(PCRE2_NEVER_BACKSLASH_C)
|
||||
SET(NEVER_BACKSLASH_C 1)
|
||||
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||
ENDIF(PCRE2_NEVER_BACKSLASH_C)
|
||||
|
||||
IF(PCRE2_SUPPORT_UNICODE)
|
||||
SET(SUPPORT_UNICODE 1)
|
||||
|
@ -400,7 +400,7 @@ SET(PCRE2_SOURCES
|
|||
src/pcre2_context.c
|
||||
src/pcre2_dfa_match.c
|
||||
src/pcre2_error.c
|
||||
src/pcre2_find_bracket.c
|
||||
src/pcre2_find_bracket.c
|
||||
src/pcre2_jit_compile.c
|
||||
src/pcre2_maketables.c
|
||||
src/pcre2_match.c
|
||||
|
|
86
ChangeLog
86
ChangeLog
|
@ -268,18 +268,18 @@ size of patterns that they are prepared to handle.
|
|||
|
||||
78. (*NO_AUTO_POSSESS) was not working.
|
||||
|
||||
79. Adding group information caching improves the speed of compiling when
|
||||
checking whether a group has a fixed length and/or could match an empty string,
|
||||
especially when recursion or subroutine calls are involved. However, this
|
||||
cannot be used when (?| is present in the pattern because the same number may
|
||||
be used for groups of different sizes. To catch runaway patterns in this
|
||||
situation, counts have been introduced to the functions that scan for empty
|
||||
79. Adding group information caching improves the speed of compiling when
|
||||
checking whether a group has a fixed length and/or could match an empty string,
|
||||
especially when recursion or subroutine calls are involved. However, this
|
||||
cannot be used when (?| is present in the pattern because the same number may
|
||||
be used for groups of different sizes. To catch runaway patterns in this
|
||||
situation, counts have been introduced to the functions that scan for empty
|
||||
branches or compute fixed lengths.
|
||||
|
||||
80. Allow for the possibility of the size of the nest_save structure not being
|
||||
a factor of the size of the compiling workspace (it currently is).
|
||||
|
||||
81. Check for integer overflow in minimum length calculation and cap it at
|
||||
81. Check for integer overflow in minimum length calculation and cap it at
|
||||
65535.
|
||||
|
||||
82. Small optimizations in code for finding the minimum matching length.
|
||||
|
@ -290,72 +290,72 @@ a factor of the size of the compiling workspace (it currently is).
|
|||
|
||||
85. Check for too many replacements (more than INT_MAX) in pcre2_substitute().
|
||||
|
||||
86. Avoid the possibility of computing with an out-of-bounds pointer (though
|
||||
86. Avoid the possibility of computing with an out-of-bounds pointer (though
|
||||
not dereferencing it) while handling lookbehind assertions.
|
||||
|
||||
87. Failure to get memory for the match data in regcomp() is now given as a
|
||||
87. Failure to get memory for the match data in regcomp() is now given as a
|
||||
regcomp() error instead of waiting for regexec() to pick it up.
|
||||
|
||||
88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid
|
||||
88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid
|
||||
newline sequence.
|
||||
|
||||
89. Paranoid check in regcomp() for bad error code from pcre2_compile().
|
||||
|
||||
90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well
|
||||
90. Run test 8 (internal offsets and code sizes) for link sizes 3 and 4 as well
|
||||
as for link size 2.
|
||||
|
||||
91. Document that JIT has a limit on pattern size, and give more information
|
||||
91. Document that JIT has a limit on pattern size, and give more information
|
||||
about JIT compile failures in pcre2test.
|
||||
|
||||
92. Implement PCRE2_INFO_HASBACKSLASHC.
|
||||
|
||||
93. Re-arrange valgrind support code in pcre2test to avoid spurious reports
|
||||
93. Re-arrange valgrind support code in pcre2test to avoid spurious reports
|
||||
with JIT (possibly caused by SSE2?).
|
||||
|
||||
94. Support offset_limit in JIT.
|
||||
|
||||
95. A sequence such as [[:punct:]b] that is, a POSIX character class followed
|
||||
by a single ASCII character in a class item, was incorrectly compiled in UCP
|
||||
95. A sequence such as [[:punct:]b] that is, a POSIX character class followed
|
||||
by a single ASCII character in a class item, was incorrectly compiled in UCP
|
||||
mode. The POSIX class got lost, but only if the single character followed it.
|
||||
|
||||
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
||||
96. [:punct:] in UCP mode was matching some characters in the range 128-255
|
||||
that should not have been matched.
|
||||
|
||||
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
|
||||
characters with code points greater than 255 are in the class. When a Unicode
|
||||
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
|
||||
turned into Unicode properties), wide characters were not correctly handled,
|
||||
97. If [:^ascii:] or [:^xdigit:] are present in a non-negated class, all
|
||||
characters with code points greater than 255 are in the class. When a Unicode
|
||||
property was also in the class (if PCRE2_UCP is set, escapes such as \w are
|
||||
turned into Unicode properties), wide characters were not correctly handled,
|
||||
and could fail to match.
|
||||
|
||||
98. In pcre2test, make the "startoffset" modifier a synonym of "offset",
|
||||
98. In pcre2test, make the "startoffset" modifier a synonym of "offset",
|
||||
because it sets the "startoffset" parameter for pcre2_match().
|
||||
|
||||
99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between
|
||||
an item and its qualifier (for example, A(?#comment)?B) pcre2_compile()
|
||||
99. If PCRE2_AUTO_CALLOUT was set on a pattern that had a (?# comment between
|
||||
an item and its qualifier (for example, A(?#comment)?B) pcre2_compile()
|
||||
misbehaved. This bug was found by the LLVM fuzzer.
|
||||
|
||||
100. The error for an invalid UTF pattern string always gave the code unit
|
||||
100. The error for an invalid UTF pattern string always gave the code unit
|
||||
offset as zero instead of where the invalidity was found.
|
||||
|
||||
101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not
|
||||
101. Further to 97 above, negated classes such as [^[:^ascii:]\d] were also not
|
||||
working correctly in UCP mode.
|
||||
|
||||
102. Similar to 99 above, if an isolated \E was present between an item and its
|
||||
102. Similar to 99 above, if an isolated \E was present between an item and its
|
||||
qualifier when PCRE2_AUTO_CALLOUT was set, pcre2_compile() misbehaved. This bug
|
||||
was found by the LLVM fuzzer.
|
||||
|
||||
103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND
|
||||
103. The POSIX wrapper function regexec() crashed if the option REG_STARTEND
|
||||
was set when the pmatch argument was NULL. It now returns REG_INVARG.
|
||||
|
||||
104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep.
|
||||
|
||||
105. An empty \Q\E sequence between an item and its qualifier caused
|
||||
105. An empty \Q\E sequence between an item and its qualifier caused
|
||||
pcre2_compile() to misbehave when auto callouts were enabled. This bug
|
||||
was found by the LLVM fuzzer.
|
||||
|
||||
106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or
|
||||
other verb "name" ended with whitespace immediately before the closing
|
||||
parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when
|
||||
106. If both PCRE2_ALT_VERBNAMES and PCRE2_EXTENDED were set, and a (*MARK) or
|
||||
other verb "name" ended with whitespace immediately before the closing
|
||||
parenthesis, pcre2_compile() misbehaved. Example: /(*:abc )/, but only when
|
||||
both those options were set.
|
||||
|
||||
107. In a number of places pcre2_compile() was not handling NUL characters
|
||||
|
@ -363,27 +363,27 @@ correctly, and pcre2test with the "bincode" modifier was not always correctly
|
|||
displaying fields containing NULs:
|
||||
|
||||
(a) Within /x extended #-comments
|
||||
(b) Within the "name" part of (*MARK) and other *verbs
|
||||
(c) Within the text argument of a callout
|
||||
|
||||
108. If a pattern that was compiled with PCRE2_EXTENDED started with white
|
||||
space or a #-type comment that was followed by (?-x), which turns off
|
||||
(b) Within the "name" part of (*MARK) and other *verbs
|
||||
(c) Within the text argument of a callout
|
||||
|
||||
108. If a pattern that was compiled with PCRE2_EXTENDED started with white
|
||||
space or a #-type comment that was followed by (?-x), which turns off
|
||||
PCRE2_EXTENDED, and there was no subsequent (?x) to turn it on again,
|
||||
pcre2_compile() assumed that (?-x) applied to the whole pattern and
|
||||
consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix
|
||||
for this bug means that a setting of any of the (?imsxU) options at the start
|
||||
of a pattern is no longer transferred to the options that are returned by
|
||||
PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have
|
||||
consequently mis-compiled it. This bug was found by the LLVM fuzzer. The fix
|
||||
for this bug means that a setting of any of the (?imsxU) options at the start
|
||||
of a pattern is no longer transferred to the options that are returned by
|
||||
PCRE2_INFO_ALLOPTIONS. In fact, this was an anachronism that should have
|
||||
changed when the effects of those options were all moved to compile time.
|
||||
|
||||
109. An escaped closing parenthesis in the "name" part of a (*verb) when
|
||||
109. An escaped closing parenthesis in the "name" part of a (*verb) when
|
||||
PCRE2_ALT_VERBNAMES was set caused pcre2_compile() to malfunction. This bug
|
||||
was found by the LLVM fuzzer.
|
||||
|
||||
110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it
|
||||
110. Implemented PCRE2_SUBSTITUTE_UNSET_EMPTY, and updated pcre2test to make it
|
||||
possible to test it.
|
||||
|
||||
111. "Harden" pcre2test against ridiculously large values in modifiers and
|
||||
111. "Harden" pcre2test against ridiculously large values in modifiers and
|
||||
command line arguments.
|
||||
|
||||
112. Implemented PCRE2_SUBSTITUTE_UNKNOWN_UNSET and PCRE2_SUBSTITUTE_OVERFLOW_
|
||||
|
|
41
NEWS
41
NEWS
|
@ -1,6 +1,47 @@
|
|||
News about PCRE2 releases
|
||||
-------------------------
|
||||
|
||||
Version 10.21 15-December-2015
|
||||
------------------------------
|
||||
|
||||
1. Many bugs have been fixed. A large number of them were provoked only by very
|
||||
strange pattern input, and were discovered by fuzzers. Some others were
|
||||
discovered by code auditing. See ChangeLog for details.
|
||||
|
||||
2. The Unicode tables have been updated to Unicode version 8.0.0.
|
||||
|
||||
3. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
|
||||
class, where both values are literal letters in the same case, omit the
|
||||
non-letter EBCDIC code points within the range.
|
||||
|
||||
4. There have been a number of enhancements to the pcre2_substitute() function,
|
||||
giving more flexibility to replacement facilities. It is now also possible to
|
||||
cause the function to return the needed buffer size if the one given is too
|
||||
small.
|
||||
|
||||
5. The PCRE2_ALT_VERBNAMES option causes the "name" parts of special verbs such
|
||||
as (*THEN:name) to be processed for backslashes and to take note of
|
||||
PCRE2_EXTENDED.
|
||||
|
||||
6. PCRE2_INFO_HASBACKSLASHC makes it possible for a client to find out if a
|
||||
pattern uses \C, and --enable-never-backslash-C makes it possible to compile a version
|
||||
of PCRE2 in which the use of \C is always forbidden.
|
||||
|
||||
7. A limit to the length of pattern that can be handled can now be set by
|
||||
calling pcre2_set_max_pattern_length().
|
||||
|
||||
8. When matching an unanchored pattern, a match can be required to begin within
|
||||
a given number of code units after the start of the subject by calling
|
||||
pcre2_set_offset_limit().
|
||||
|
||||
9. The pcre2test program has been extended to test new facilities, and it can
|
||||
now run the tests when LF on its own is not a valid newline sequence.
|
||||
|
||||
10. The RunTest script has also been updated to enable more tests to be run.
|
||||
|
||||
11. There have been some minor performance enhancements.
|
||||
|
||||
|
||||
Version 10.20 30-June-2015
|
||||
--------------------------
|
||||
|
||||
|
|
|
@ -97,7 +97,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_context.c
|
||||
pcre2_dfa_match.c
|
||||
pcre2_error.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_jit_compile.c
|
||||
pcre2_maketables.c
|
||||
pcre2_match.c
|
||||
|
|
14
README
14
README
|
@ -219,13 +219,13 @@ library. They are also documented in the pcre2build man page.
|
|||
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
|
@ -731,7 +731,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
|
|
@ -82,7 +82,7 @@ utf8=$?
|
|||
nl=`$pcre2test -C newline`
|
||||
if [ "$nl" != "LF" -a "$nl" != "ANY" -a "$nl" != "ANYCRLF" ]; then
|
||||
pcre2grep="$pcre2grep -N LF"
|
||||
echo "Default newline setting forced to LF"
|
||||
echo "Default newline setting forced to LF"
|
||||
fi
|
||||
|
||||
# ------ Function to run and check a special pcre2grep arguments test -------
|
||||
|
|
20
RunTest
20
RunTest
|
@ -406,7 +406,7 @@ if [ $jit -ne 0 -a "$nojit" != "yes" ] ; then
|
|||
jitopt=-jit
|
||||
if [ "$valgrind" != "" ] ; then
|
||||
vjs="--suppressions=$testdata/valgrind-jit.supp"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# If no specific tests were requested, select all. Those that are not
|
||||
|
@ -439,10 +439,10 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
|
|||
do17=yes
|
||||
do18=yes
|
||||
do19=yes
|
||||
do20=yes
|
||||
do21=yes
|
||||
do22=yes
|
||||
do23=yes
|
||||
do20=yes
|
||||
do21=yes
|
||||
do22=yes
|
||||
do23=yes
|
||||
fi
|
||||
|
||||
# Handle any explicit skips at this stage, so that an argument list may consist
|
||||
|
@ -720,7 +720,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
checkresult $? 13 ""
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
# Tests for DFA UTF and UCP features. Output is different for the different widths.
|
||||
|
||||
if [ $do14 = yes ] ; then
|
||||
|
@ -730,7 +730,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
else
|
||||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput14 testtry
|
||||
checkresult $? 14-$bits ""
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Test non-JIT match and recursion limits
|
||||
|
@ -798,7 +798,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
$sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput20 testtry
|
||||
checkresult $? 20 ""
|
||||
fi
|
||||
|
||||
|
||||
# \C tests without UTF - DFA matching is supported
|
||||
|
||||
if [ "$do21" = yes ] ; then
|
||||
|
@ -814,7 +814,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
|
||||
# \C tests with UTF - DFA matching is not supported for \C in UTF mode
|
||||
|
||||
|
||||
if [ "$do22" = yes ] ; then
|
||||
echo $title22
|
||||
if [ $supportBSC -eq 0 ] ; then
|
||||
|
@ -830,7 +830,7 @@ for bmode in "$test8" "$test16" "$test32"; do
|
|||
fi
|
||||
|
||||
# Test when \C is disabled
|
||||
|
||||
|
||||
if [ "$do23" = yes ] ; then
|
||||
echo $title23
|
||||
if [ $supportBSC -ne 0 ] ; then
|
||||
|
|
20
configure.ac
20
configure.ac
|
@ -11,16 +11,16 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
|
|||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [21])
|
||||
m4_define(pcre2_prerelease, [-RC1])
|
||||
m4_define(pcre2_date, [2015-07-06])
|
||||
m4_define(pcre2_date, [2015-12-15])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
||||
# Libtool shared library interface versions (current:revision:age)
|
||||
m4_define(libpcre2_8_version, [2:0:2])
|
||||
m4_define(libpcre2_16_version, [2:0:2])
|
||||
m4_define(libpcre2_32_version, [2:0:2])
|
||||
m4_define(libpcre2_posix_version, [0:0:0])
|
||||
m4_define(libpcre2_8_version, [3:0:3])
|
||||
m4_define(libpcre2_16_version, [3:0:3])
|
||||
m4_define(libpcre2_32_version, [3:0:3])
|
||||
m4_define(libpcre2_posix_version, [0:1:0])
|
||||
|
||||
AC_PREREQ(2.57)
|
||||
AC_INIT(PCRE2, pcre2_major.pcre2_minor[]pcre2_prerelease, , pcre2)
|
||||
|
@ -189,12 +189,12 @@ AC_ARG_ENABLE(bsr-anycrlf,
|
|||
AS_HELP_STRING([--enable-bsr-anycrlf],
|
||||
[\R matches only CR, LF, CRLF by default]),
|
||||
, enable_bsr_anycrlf=no)
|
||||
|
||||
|
||||
# Handle --enable-never-backslash-C
|
||||
AC_ARG_ENABLE(never-backslash-C,
|
||||
AS_HELP_STRING([--enable-never-backslash-C],
|
||||
[use of \C causes an error]),
|
||||
, enable_never_backslash_C=no)
|
||||
, enable_never_backslash_C=no)
|
||||
|
||||
# Handle --enable-ebcdic
|
||||
AC_ARG_ENABLE(ebcdic,
|
||||
|
@ -348,7 +348,7 @@ if test "x$enable_ebcdic" = "xyes"; then
|
|||
AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time])
|
||||
fi
|
||||
if test "x$enable_pcre2_16" = "xyes" -o "x$enable_pcre2_32" = "xyes"; then
|
||||
AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library])
|
||||
AC_MSG_ERROR([EBCDIC support is available only for the 8-bit library])
|
||||
fi
|
||||
fi
|
||||
|
||||
|
@ -617,7 +617,7 @@ fi
|
|||
if test "$enable_never_backslash_C" = "yes"; then
|
||||
AC_DEFINE([NEVER_BACKSLASH_C], [], [
|
||||
Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns.])
|
||||
fi
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
|
||||
The value of LINK_SIZE determines the number of bytes used to store
|
||||
|
@ -896,7 +896,7 @@ $PACKAGE-$VERSION configuration summary:
|
|||
Enable Unicode support .......... : ${enable_unicode}
|
||||
Newline char/sequence ........... : ${enable_newline}
|
||||
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
|
||||
\C is disabled .................. : ${enable_never_backslash_C}
|
||||
\C is disabled .................. : ${enable_never_backslash_C}
|
||||
EBCDIC coding ................... : ${enable_ebcdic}
|
||||
EBCDIC code for NL .............. : ${ebcdic_nl_code}
|
||||
Rebuild char tables ............. : ${enable_rebuild_chartables}
|
||||
|
|
|
@ -97,7 +97,7 @@ can skip ahead to the CMake section.
|
|||
pcre2_context.c
|
||||
pcre2_dfa_match.c
|
||||
pcre2_error.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_find_bracket.c
|
||||
pcre2_jit_compile.c
|
||||
pcre2_maketables.c
|
||||
pcre2_match.c
|
||||
|
|
|
@ -219,13 +219,13 @@ library. They are also documented in the pcre2build man page.
|
|||
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
|
||||
. In a pattern, the escape sequence \C matches a single code unit, even in a
|
||||
UTF mode. This can be dangerous because it breaks up multi-code-unit
|
||||
characters. You can build PCRE2 with the use of \C permanently locked out by
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
adding --enable-never-backslash-C (note the upper case C) to the "configure"
|
||||
command. When \C is allowed by the library, individual applications can lock
|
||||
it out by calling pcre2_compile() with the PCRE2_NEVER_BACKSLASH_C option.
|
||||
|
||||
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||
pattern. This limits the amount of system stack that a pattern uses when it
|
||||
|
@ -731,7 +731,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_find_bracket.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
|
|
@ -126,9 +126,9 @@ running redundant checks.
|
|||
<P>
|
||||
The use of the \C escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \C, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||
encountered. It is also possible to build PCRE2 with the use of \C permanently
|
||||
disabled.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -42,19 +42,20 @@ request are as follows:
|
|||
PCRE2_BSR_ANYCRLF: CR, LF, or CRLF only
|
||||
PCRE2_INFO_CAPTURECOUNT Number of capturing subpatterns
|
||||
PCRE2_INFO_FIRSTBITMAP Bitmap of first code units, or NULL
|
||||
PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1
|
||||
PCRE2_INFO_FIRSTCODETYPE Type of start-of-match information
|
||||
0 nothing set
|
||||
1 first code unit is set
|
||||
2 start of string or after newline
|
||||
PCRE2_INFO_FIRSTCODEUNIT First code unit when type is 1
|
||||
PCRE2_INFO_HASBACKSLASHC Return 1 if pattern contains \C
|
||||
PCRE2_INFO_HASCRORLF Return 1 if explicit CR or LF matches
|
||||
exist in the pattern
|
||||
PCRE2_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
|
||||
PCRE2_INFO_JITSIZE Size of JIT compiled code, or 0
|
||||
PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1
|
||||
PCRE2_INFO_LASTCODETYPE Type of must-be-present information
|
||||
0 nothing set
|
||||
1 code unit is set
|
||||
PCRE2_INFO_LASTCODEUNIT Last code unit when type is 1
|
||||
PCRE2_INFO_MATCHEMPTY 1 if the pattern can match an
|
||||
empty string, 0 otherwise
|
||||
PCRE2_INFO_MATCHLIMIT Match limit if set,
|
||||
|
@ -62,8 +63,8 @@ request are as follows:
|
|||
PCRE2_INFO_MAXLOOKBEHIND Length (in characters) of the longest
|
||||
lookbehind assertion
|
||||
PCRE2_INFO_MINLENGTH Lower bound length of matching strings
|
||||
PCRE2_INFO_NAMEENTRYSIZE Size of name table entries
|
||||
PCRE2_INFO_NAMECOUNT Number of named subpatterns
|
||||
PCRE2_INFO_NAMEENTRYSIZE Size of name table entries
|
||||
PCRE2_INFO_NAMETABLE Pointer to name table
|
||||
PCRE2_INFO_NEWLINE Code for the newline sequence:
|
||||
PCRE2_NEWLINE_CR
|
||||
|
|
|
@ -26,7 +26,7 @@ SYNOPSIS
|
|||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets, in a compile context, the maximum length (in code units) of
|
||||
This function sets, in a compile context, the maximum length (in code units) of
|
||||
the pattern that can be compiled. The result is always zero.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -70,6 +70,9 @@ The options are:
|
|||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
</pre>
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
|
|
|
@ -618,7 +618,7 @@ of the following compile-time parameters:
|
|||
PCRE2's character tables
|
||||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The maximum length of the pattern string
|
||||
An external function for stack checking
|
||||
</pre>
|
||||
A compile context is also required if you are using custom memory management.
|
||||
|
@ -661,10 +661,10 @@ in the current locale.
|
|||
<b> PCRE2_SIZE <i>value</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
This sets a maximum length, in code units, for the pattern string that is to be
|
||||
compiled. If the pattern is longer, an error is generated. This facility is
|
||||
provided so that applications that accept patterns from external sources can
|
||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||
This sets a maximum length, in code units, for the pattern string that is to be
|
||||
compiled. If the pattern is longer, an error is generated. This facility is
|
||||
provided so that applications that accept patterns from external sources can
|
||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||
can hold, which is effectively unlimited.
|
||||
<b>int pcre2_set_newline(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
|
@ -716,8 +716,8 @@ of the following match-time parameters:
|
|||
<pre>
|
||||
A callout function
|
||||
The offset limit for matching an unanchored pattern
|
||||
The limit for calling <i>match()</i>
|
||||
The limit for calling <i>match()</i> recursively
|
||||
The limit for calling <b>match()</b> (see below)
|
||||
The limit for calling <b>match()</b> recursively
|
||||
</pre>
|
||||
A match context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -771,7 +771,9 @@ PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
|
|||
<P>
|
||||
The offset limit facility can be used to track progress when searching large
|
||||
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
||||
start within the first line of the subject.
|
||||
start within the first line of the subject. If this is set with an offset
|
||||
limit, a match must occur in the first line and also within the offset limit.
|
||||
In other words, whichever limit comes first is used.
|
||||
<b>int pcre2_set_match_limit(pcre2_match_context *<i>mcontext</i>,</b>
|
||||
<b> uint32_t <i>value</i>);</b>
|
||||
<br>
|
||||
|
@ -1212,7 +1214,9 @@ built.
|
|||
If this option is set, an unanchored pattern is required to match before or at
|
||||
the first newline in the subject string, though the matched text may continue
|
||||
over the newline. See also PCRE2_USE_OFFSET_LIMIT, which provides a more
|
||||
general limiting facility.
|
||||
general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a
|
||||
match must occur in the first line and also within the offset limit. In other
|
||||
words, whichever limit comes first is used.
|
||||
<pre>
|
||||
PCRE2_MATCH_UNSET_BACKREF
|
||||
</pre>
|
||||
|
@ -1251,7 +1255,7 @@ This option locks out the use of \C in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \C.
|
||||
<pre>
|
||||
PCRE2_NEVER_UCP
|
||||
|
@ -1563,11 +1567,10 @@ are as follows:
|
|||
Return a copy of the pattern's options. The third argument should point to a
|
||||
<b>uint32_t</b> variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
||||
were passed to <b>pcre2_compile()</b>, whereas PCRE2_INFO_ALLOPTIONS returns
|
||||
the compile options as modified by any top-level option settings at the start
|
||||
of the pattern itself. In other words, they are the options that will be in
|
||||
force when matching starts. For example, if the pattern /(?im)abc(?-i)d/ is
|
||||
compiled with the PCRE2_EXTENDED option, the result is PCRE2_CASELESS,
|
||||
PCRE2_MULTILINE, and PCRE2_EXTENDED.
|
||||
the compile options as modified by any top-level option settings such as (*UTF)
|
||||
at the start of the pattern itself. For example, if the pattern /(*UTF)abc/ is
|
||||
compiled with the PCRE2_EXTENDED option, the result is PCRE2_EXTENDED and
|
||||
PCRE2_UTF.
|
||||
</P>
|
||||
<P>
|
||||
A pattern compiled without PCRE2_ANCHORED is automatically anchored by PCRE2 if
|
||||
|
@ -1609,18 +1612,27 @@ matches only CR, LF, or CRLF.
|
|||
<pre>
|
||||
PCRE2_INFO_CAPTURECOUNT
|
||||
</pre>
|
||||
Return the number of capturing subpatterns in the pattern. The third argument
|
||||
should point to an <b>uint32_t</b> variable.
|
||||
Return the highest capturing subpattern number in the pattern. In patterns
|
||||
where (?| is not used, this is also the total number of capturing subpatterns.
|
||||
The third argument should point to an <b>uint32_t</b> variable.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
</pre>
|
||||
In the absence of a single first code unit for a non-anchored pattern,
|
||||
<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of
|
||||
values for the first code unit in any match. For example, a pattern that starts
|
||||
with [abc] results in a table with three bits set. When code unit values
|
||||
greater than 255 are supported, the flag bit for 255 means "any code unit of
|
||||
value 255 or above". If such a table was constructed, a pointer to it is
|
||||
returned. Otherwise NULL is returned. The third argument should point to an
|
||||
<b>const uint8_t *</b> variable.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTCODETYPE
|
||||
</pre>
|
||||
Return information about the first code unit of any matched string, for a
|
||||
non-anchored pattern. The third argument should point to an <b>uint32_t</b>
|
||||
variable.
|
||||
</P>
|
||||
<P>
|
||||
If there is a fixed first value, for example, the letter "c" from a pattern
|
||||
such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||
it is known that a match can occur only at the start of the subject or
|
||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||
|
@ -1635,16 +1647,10 @@ value is always less than 256. In the 16-bit library the value can be up to
|
|||
0xffff. In the 32-bit library in UTF-32 mode the value can be up to 0x10ffff,
|
||||
and up to 0xffffffff when not using UTF-32 mode.
|
||||
<pre>
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
PCRE2_INFO_HASBACKSLASHC
|
||||
</pre>
|
||||
In the absence of a single first code unit for a non-anchored pattern,
|
||||
<b>pcre2_compile()</b> may construct a 256-bit table that defines a fixed set of
|
||||
values for the first code unit in any match. For example, a pattern that starts
|
||||
with [abc] results in a table with three bits set. When code unit values
|
||||
greater than 255 are supported, the flag bit for 255 means "any code unit of
|
||||
value 255 or above". If such a table was constructed, a pointer to it is
|
||||
returned. Otherwise NULL is returned. The third argument should point to an
|
||||
<b>const uint8_t *</b> variable.
|
||||
Return 1 if the pattern contains any instances of \C, otherwise 0. The third
|
||||
argument should point to an <b>uint32_t</b> variable.
|
||||
<pre>
|
||||
PCRE2_INFO_HASCRORLF
|
||||
</pre>
|
||||
|
@ -1670,13 +1676,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any
|
|||
matched string, other than at its start. The third argument should point to an
|
||||
<b>uint32_t</b> variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT.
|
||||
</P>
|
||||
<P>
|
||||
For anchored patterns, a last literal value is recorded only if it follows
|
||||
something of variable length. For example, for the pattern /^a\d+z\d+/ the
|
||||
returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for
|
||||
/^a\dz\d/ the returned value is 0.
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
recorded only if it follows something of variable length. For example, for the
|
||||
pattern /^a\d+z\d+/ the returned value is 1 (with "z" returned from
|
||||
PCRE2_INFO_LASTCODEUNIT), but for /^a\dz\d/ the returned value is 0.
|
||||
<pre>
|
||||
PCRE2_INFO_LASTCODEUNIT
|
||||
</pre>
|
||||
|
@ -1687,8 +1690,11 @@ value, 0 is returned.
|
|||
<pre>
|
||||
PCRE2_INFO_MATCHEMPTY
|
||||
</pre>
|
||||
Return 1 if the pattern can match an empty string, otherwise 0. The third
|
||||
argument should point to an <b>uint32_t</b> variable.
|
||||
Return 1 if the pattern might match an empty string, otherwise 0. The third
|
||||
argument should point to an <b>uint32_t</b> variable. When a pattern contains
|
||||
recursive subroutine calls it is not always possible to determine whether or
|
||||
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
||||
in such cases.
|
||||
<pre>
|
||||
PCRE2_INFO_MATCHLIMIT
|
||||
</pre>
|
||||
|
@ -2142,8 +2148,13 @@ documentation.
|
|||
When PCRE2 is built, a default newline convention is set; this is usually the
|
||||
standard convention for the operating system. The default can be overridden in
|
||||
a
|
||||
<a href="#compilecontext">compile context.</a>
|
||||
During matching, the newline choice affects the behaviour of the dot,
|
||||
<a href="#compilecontext">compile context</a>
|
||||
by calling <b>pcre2_set_newline()</b>. It can also be overridden by starting a
|
||||
pattern string with, for example, (*CRLF), as described in the
|
||||
<a href="pcre2pattern.html#newlines">section on newline conventions</a>
|
||||
in the
|
||||
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||
page. During matching, the newline choice affects the behaviour of the dot,
|
||||
circumflex, and dollar metacharacters. It may also alter the way the match
|
||||
starting position is advanced after a match failure for an unanchored pattern.
|
||||
</P>
|
||||
|
@ -2191,19 +2202,20 @@ function can be used to find out how many capturing subpatterns there are in a
|
|||
compiled pattern.
|
||||
</P>
|
||||
<P>
|
||||
A successful match returns the overall matched string and any captured
|
||||
substrings to the caller via a vector of PCRE2_SIZE values. This is called the
|
||||
<b>ovector</b>, and is contained within the
|
||||
<a href="#matchdatablock">match data block.</a>
|
||||
You can obtain direct access to the ovector by calling
|
||||
<b>pcre2_get_ovector_pointer()</b> to find its address, and
|
||||
<b>pcre2_get_ovector_count()</b> to find the number of pairs of values it
|
||||
contains. Alternatively, you can use the auxiliary functions for accessing
|
||||
captured substrings
|
||||
You can use auxiliary functions for accessing captured substrings
|
||||
<a href="#extractbynumber">by number</a>
|
||||
or
|
||||
<a href="#extractbyname">by name</a>
|
||||
(see below).
|
||||
<a href="#extractbyname">by name,</a>
|
||||
as described in sections below.
|
||||
</P>
|
||||
<P>
|
||||
Alternatively, you can make direct use of the vector of PCRE2_SIZE values,
|
||||
called the <b>ovector</b>, which contains the offsets of captured strings. It is
|
||||
part of the
|
||||
<a href="#matchdatablock">match data block.</a>
|
||||
The function <b>pcre2_get_ovector_pointer()</b> returns the address of the
|
||||
ovector, and <b>pcre2_get_ovector_count()</b> returns the number of pairs of
|
||||
values it contains.
|
||||
</P>
|
||||
<P>
|
||||
Within the ovector, the first in each pair of values is set to the offset of
|
||||
|
@ -2292,7 +2304,13 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
|
|||
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
||||
<b>pcre2_get_mark()</b> can be called. It returns a pointer to the
|
||||
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
||||
returned. After a successful match, the (*MARK) name that is returned is the
|
||||
returned. The length of the (*MARK) name (excluding the terminating zero) is
|
||||
stored in the code unit that precedes the name. You should use this instead of
|
||||
relying on the terminating zero if the (*MARK) name might contain a binary
|
||||
zero.
|
||||
</P>
|
||||
<P>
|
||||
After a successful match, the (*MARK) name that is returned is the
|
||||
last one encountered on the matching path through the pattern. After a "no
|
||||
match" or a partial match, the last encountered (*MARK) name is returned. For
|
||||
example, consider this pattern:
|
||||
|
@ -2313,7 +2331,7 @@ escape sequence. After a partial match, however, this value is always the same
|
|||
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
||||
</P>
|
||||
<P>
|
||||
After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain
|
||||
After a UTF check failure, <b>pcre2_get_startchar()</b> can be used to obtain
|
||||
the code unit offset of the invalid UTF character. Details are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
|
@ -2636,7 +2654,7 @@ same number causes an error at compile time.
|
|||
This function calls <b>pcre2_match()</b> and then makes a copy of the subject
|
||||
string in <i>outputbuffer</i>, replacing the part that was matched with the
|
||||
<i>replacement</i> string, whose length is supplied in <b>rlength</b>. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
which a \K item in a lookahead in the pattern causes the match to end before
|
||||
it starts are not supported, and give rise to an error return.
|
||||
</P>
|
||||
|
@ -2650,12 +2668,21 @@ allocate memory for the compiled code.
|
|||
</P>
|
||||
<P>
|
||||
The <i>outlengthptr</i> argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful,
|
||||
the value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added. If the function is not successful,
|
||||
the value is set to PCRE2_UNSET for general errors (such as output buffer too
|
||||
small). For syntax errors in the replacement string, the value is set to the
|
||||
offset in the replacement string where the error was detected.
|
||||
length, in code units, of the output buffer. If the function is successful, the
|
||||
value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added.
|
||||
</P>
|
||||
<P>
|
||||
If the function is not successful, the value set via <i>outlengthptr</i> depends
|
||||
on the type of error. For syntax errors in the replacement string, the value is
|
||||
the offset in the replacement string where the error was detected. For other
|
||||
errors, the value is PCRE2_UNSET by default. This includes the case of the
|
||||
output buffer being too small, unless PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set
|
||||
(see below), in which case the value is the minimum length needed, including
|
||||
space for the trailing zero. Note that in order to compute the required length,
|
||||
<b>pcre2_substitute()</b> has to simulate all the matching and copying, instead
|
||||
of giving an error return as soon as the buffer overflows. Note also that the
|
||||
length is in code units, not bytes.
|
||||
</P>
|
||||
<P>
|
||||
In the replacement string, which is interpreted as a UTF string in UTF mode,
|
||||
|
@ -2682,15 +2709,53 @@ simultaneous substitutions, as this <b>pcre2test</b> example shows:
|
|||
apple lemon
|
||||
2: pear orange
|
||||
</pre>
|
||||
There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the
|
||||
function to iterate over the subject string, replacing every matching
|
||||
substring. If this is not set, only the first matching substring is replaced.
|
||||
As well as the usual options for <b>pcre2_match()</b>, a number of additional
|
||||
options can be set in the <i>options</i> argument.
|
||||
</P>
|
||||
<P>
|
||||
A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing
|
||||
to be applied to the replacement string. Without this option, only the dollar
|
||||
character is special, and only the group insertion forms listed above are
|
||||
valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject string,
|
||||
replacing every matching substring. If this is not set, only the first matching
|
||||
substring is replaced. If any matched substring has zero length, after the
|
||||
substitution has happened, an attempt to find a non-empty match at the same
|
||||
position is performed. If this is not successful, the current position is
|
||||
advanced by one character except when CRLF is a valid newline sequence and the
|
||||
next two characters are CR, LF. In this case, the current position is advanced
|
||||
by two characters.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, <b>pcre2_substitute()</b> continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
in order to compute the size of buffer that is needed. This value is passed
|
||||
back via the <i>outlengthptr</i> variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for a given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
||||
not appear in the pattern to be treated as unset groups. This option should be
|
||||
used with care, because it means that a typo in a group name or number no
|
||||
longer causes the PCRE2_ERROR_NOSUBSTRING error.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
|
||||
groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty
|
||||
strings when inserted as described above. If this option is not set, an attempt
|
||||
to insert an unset group causes the PCRE2_ERROR_UNSET error. This option does
|
||||
not influence the extended substitution syntax described below.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the
|
||||
replacement string. Without this option, only the dollar character is special,
|
||||
and only the group insertion forms listed above are valid. When
|
||||
PCRE2_SUBSTITUTE_EXTENDED is set, two things change:
|
||||
</P>
|
||||
<P>
|
||||
Firstly, backslash in a replacement string is interpreted as an escape
|
||||
|
@ -2740,22 +2805,46 @@ string remains in force afterwards, as shown in this <b>pcre2test</b> example:
|
|||
somebody
|
||||
1: HELLO
|
||||
</pre>
|
||||
If successful, the function returns the number of replacements that were made.
|
||||
This may be zero if no matches were found, and is never greater than 1 unless
|
||||
PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
||||
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||
groups in the extended syntax forms to be treated as unset.
|
||||
</P>
|
||||
<P>
|
||||
If successful, <b>pcre2_substitute()</b> returns the number of replacements that
|
||||
were made. This may be zero if no matches were found, and is never greater than
|
||||
1 unless PCRE2_SUBSTITUTE_GLOBAL is set.
|
||||
</P>
|
||||
<P>
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from <b>pcre2_match()</b>
|
||||
are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer
|
||||
is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax
|
||||
errors in the replacement string, with more particular errors being
|
||||
PCRE2_ERROR_BADREPESCAPE (invalid escape sequence),
|
||||
PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found),
|
||||
PCRE2_BADSUBSTITUTION (syntax error in extended group substitution), and
|
||||
PCRE2_BADSUBPATTERN (the pattern match ended before it started). As for all
|
||||
PCRE2 errors, a text message that describes the error can be obtained by
|
||||
calling <b>pcre2_get_error_message()</b>.
|
||||
are passed straight back.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_UNSET is returned for an unset substring insertion (including an
|
||||
unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple
|
||||
(non-extended) syntax is used and PCRE2_SUBSTITUTE_UNSET_EMPTY is not set.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
||||
needed is returned via <i>outlengthptr</i>. Note that this does not happen by
|
||||
default.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
||||
(invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket
|
||||
not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
|
||||
substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
|
||||
started, which can happen if \K is used in an assertion).
|
||||
</P>
|
||||
<P>
|
||||
As for all PCRE2 errors, a text message that describes the error can be
|
||||
obtained by calling <b>pcre2_get_error_message()</b>.
|
||||
</P>
|
||||
<br><a name="SEC35" href="#TOC1">DUPLICATE SUBPATTERN NAMES</a><br>
|
||||
<P>
|
||||
|
@ -2796,11 +2885,11 @@ function returns the length of each entry in code units. In both cases,
|
|||
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
||||
</P>
|
||||
<P>
|
||||
The format of the name table is described above in the section entitled
|
||||
<i>Information about a pattern</i>
|
||||
<a href="#infoaboutpattern">above.</a>
|
||||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data.
|
||||
The format of the name table is described
|
||||
<a href="#infoaboutpattern">above</a>
|
||||
in the section entitled <i>Information about a pattern</i>. Given all the
|
||||
relevant entries for the name, you can extract each of their numbers, and hence
|
||||
the captured data.
|
||||
</P>
|
||||
<br><a name="SEC36" href="#TOC1">FINDING ALL POSSIBLE MATCHES AT ONE POSITION</a><br>
|
||||
<P>
|
||||
|
@ -3032,7 +3121,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 05 November 2015
|
||||
Last updated: 16 December 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -86,6 +86,13 @@ results. The returned value from <b>pcre2_jit_compile()</b> is zero on success,
|
|||
or a negative error code.
|
||||
</P>
|
||||
<P>
|
||||
There is a limit to the size of pattern that JIT supports, imposed by the size
|
||||
of machine stack that it uses. The exact rules are not documented because they
|
||||
may change at any time, in particular, when new optimizations are introduced.
|
||||
If a pattern is too big, a call to <b>pcre2_jit_compile()</b> returns
|
||||
PCRE2_ERROR_NOMEMORY.
|
||||
</P>
|
||||
<P>
|
||||
PCRE2_JIT_COMPLETE requests the JIT compiler to generate code for complete
|
||||
matches. If you want to run partial matches using the PCRE2_PARTIAL_HARD or
|
||||
PCRE2_PARTIAL_SOFT options of <b>pcre2_match()</b>, you should set one or both
|
||||
|
@ -425,7 +432,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC13" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 28 July 2015
|
||||
Last updated: 14 November 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -32,8 +32,8 @@ However, the speed of execution is slower. In the 32-bit library, the internal
|
|||
linkage size is always 4.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a source pattern string is essentially unlimited; it is
|
||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||
The maximum length of a source pattern string is essentially unlimited; it is
|
||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||
calls <b>pcre2_compile()</b> can specify a smaller limit.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -669,8 +669,8 @@ This is an example of an "atomic group", details of which are given
|
|||
This particular group matches either the two-character sequence CR followed by
|
||||
LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab,
|
||||
U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next
|
||||
line, U+0085). The two-character sequence is treated as a single unit that
|
||||
cannot be split.
|
||||
line, U+0085). Because this is an atomic group, the two-character sequence is
|
||||
treated as a single unit that cannot be split.
|
||||
</P>
|
||||
<P>
|
||||
In other modes, two additional characters whose codepoints are greater than 255
|
||||
|
@ -1186,6 +1186,16 @@ when the <i>startoffset</i> argument of <b>pcre2_match()</b> is non-zero. The
|
|||
PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set.
|
||||
</P>
|
||||
<P>
|
||||
When the newline convention (see
|
||||
<a href="#newlines">"Newline conventions"</a>
|
||||
below) recognizes the two-character sequence CRLF as a newline, this is
|
||||
preferred, even if the single characters CR and LF are also recognized as
|
||||
newlines. For example, if the newline convention is "any", a multiline mode
|
||||
circumflex matches before "xyz" in the string "abc\r\nxyz" rather than after
|
||||
CR, even though CR on its own is a valid newline. (It also matches at the very
|
||||
start of the string, of course.)
|
||||
</P>
|
||||
<P>
|
||||
Note that the sequences \A, \Z, and \z can be used to match the start and
|
||||
end of the subject in both modes, and if all branches of a pattern start with
|
||||
\A it is always anchored, whether or not PCRE2_MULTILINE is set.
|
||||
|
@ -1236,7 +1246,7 @@ with \C in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
</P>
|
||||
<P>
|
||||
An application can lock out the use of \C by setting the
|
||||
|
@ -1247,9 +1257,9 @@ build PCRE2 with the use of \C permanently disabled.
|
|||
PCRE2 does not allow \C to appear in lookbehind assertions
|
||||
<a href="#lookbehind">(described below)</a>
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -1341,11 +1351,11 @@ example [\000-\037]. Ranges can include any characters that are valid for the
|
|||
current mode.
|
||||
</P>
|
||||
<P>
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\x88-\x92] or [h-\x92], all code points
|
||||
are included.
|
||||
</P>
|
||||
|
@ -1672,6 +1682,10 @@ first one in the pattern with the given number. The following pattern matches
|
|||
<pre>
|
||||
/(?|(abc)|(def))(?1)/
|
||||
</pre>
|
||||
A relative reference such as (?-1) is no different: it is just a convenient way
|
||||
of computing an absolute group number.
|
||||
</P>
|
||||
<P>
|
||||
If a
|
||||
<a href="#conditions">condition test</a>
|
||||
for a subpattern's having matched refers to a non-unique number, the test is
|
||||
|
@ -2512,7 +2526,7 @@ For example:
|
|||
(?(VERSION>=10.4)yes|no)
|
||||
</pre>
|
||||
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
||||
"no" otherwise. The fractional part of the version number may not contain more
|
||||
"no" otherwise. The fractional part of the version number may not contain more
|
||||
than two digits.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -2626,6 +2640,21 @@ parentheses preceding the recursion. In other words, a negative number counts
|
|||
capturing parentheses leftwards from the point at which it is encountered.
|
||||
</P>
|
||||
<P>
|
||||
Be aware, however, that if
|
||||
<a href="#dupsubpatternnumber">duplicate subpattern numbers</a>
|
||||
are in use, relative references refer to the earliest subpattern with the
|
||||
appropriate number. Consider, for example:
|
||||
<pre>
|
||||
(?|(a)|(b)) (c) (?-2)
|
||||
</pre>
|
||||
The first two capturing groups (a) and (b) are both numbered 1, and group (c)
|
||||
is number 2. When the reference (?-2) is encountered, the second most recently
|
||||
opened pair of parentheses has the number 1, but it is the first such group (the (a)
|
||||
group) to which the recursion refers. This would be the same if an absolute
|
||||
reference (?1) was used. In other words, relative references are just a
|
||||
shorthand for computing a group number.
|
||||
</P>
|
||||
<P>
|
||||
It is also possible to refer to subsequently opened parentheses, by writing
|
||||
references such as (?+2). However, these cannot be recursive because the
|
||||
reference is not inside the parentheses that are referenced. They are always
|
||||
|
@ -2929,13 +2958,13 @@ depending on whether or not a name is present.
|
|||
</P>
|
||||
<P>
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name. A closing parenthesis can be included in a name either as \) or
|
||||
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name. A closing parenthesis can be included in a name either as \) or
|
||||
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||
of the pattern.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -3359,7 +3388,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 01 November 2015
|
||||
Last updated: 13 November 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -200,7 +200,7 @@ Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and
|
|||
PCRE2_DOLLAR_ENDONLY when calling <b>pcre2_compile()</b> directly, but there is
|
||||
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
||||
the POSIX API, passing REG_NEWLINE to PCRE2's <b>regcomp()</b> function
|
||||
causes PCRE2_MULTILINE to be passed to <b>pcre2_compile()</b>, and REG_DOTALL
|
||||
causes PCRE2_MULTILINE to be passed to <b>pcre2_compile()</b>, and REG_DOTALL
|
||||
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
||||
</P>
|
||||
<br><a name="SEC5" href="#TOC1">MATCHING A PATTERN</a><br>
|
||||
|
@ -235,7 +235,8 @@ to have a terminating NUL located at <i>string</i> + <i>pmatch[0].rm_eo</i>
|
|||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||
intended to be portable to other systems. Note that a non-zero <i>rm_so</i> does
|
||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||
how it is matched.
|
||||
how it is matched. Setting REG_STARTEND and passing <i>pmatch</i> as NULL are
|
||||
mutually exclusive; the error REG_INVARG is returned.
|
||||
</P>
|
||||
<P>
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
|
@ -289,7 +290,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 30 October 2015
|
||||
Last updated: 29 November 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -154,13 +154,13 @@ on a system with different endianness.
|
|||
<P>
|
||||
Decoded patterns can be used for matching in the usual way, and must be freed
|
||||
by calling <b>pcre2_code_free()</b>. However, be aware that there is a potential
|
||||
race issue if you are using multiple patterns that were decoded from a single
|
||||
race issue if you are using multiple patterns that were decoded from a single
|
||||
byte stream in a multithreaded application. A single copy of the character
|
||||
tables is used by all the decoded patterns and a reference count is used to
|
||||
arrange for its memory to be automatically freed when the last pattern is
|
||||
freed, but there is no locking on this reference count. Therefore, if you want
|
||||
to call <b>pcre2_code_free()</b> for these patterns in different threads, you
|
||||
must arrange your own locking, and ensure that <b>pcre2_code_free()</b> cannot
|
||||
must arrange your own locking, and ensure that <b>pcre2_code_free()</b> cannot
|
||||
be called by two threads at the same time.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -113,7 +113,7 @@ it matches a literal "u".
|
|||
</pre>
|
||||
\C is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \C by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \C permanently disabled.
|
||||
</P>
|
||||
<P>
|
||||
|
|
|
@ -486,7 +486,7 @@ the start of a modifier list. For example:
|
|||
<pre>
|
||||
abc\=notbol,notempty
|
||||
</pre>
|
||||
If the subject string is empty and \= is followed by whitespace, the line is
|
||||
If the subject string is empty and \= is followed by whitespace, the line is
|
||||
treated as a comment line, and is not used for matching. For example:
|
||||
<pre>
|
||||
\= This is a comment.
|
||||
|
@ -538,7 +538,7 @@ for a description of their effects.
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
</pre>
|
||||
As well as turning on the PCRE2_UTF option, the <b>utf</b> modifier causes all
|
||||
|
@ -564,7 +564,7 @@ about the pattern:
|
|||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
locale=<name> use this locale
|
||||
max_pattern_length=<n> set the maximum pattern length
|
||||
max_pattern_length=<n> set the maximum pattern length
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
|
@ -649,9 +649,9 @@ by the item that follows it in the pattern.
|
|||
Passing a NULL context
|
||||
</b><br>
|
||||
<P>
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_compile()</b>. If
|
||||
the <b>null_context</b> modifier is set, however, NULL is passed. This is for
|
||||
testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -675,9 +675,9 @@ Generating long repetitive patterns
|
|||
</b><br>
|
||||
<P>
|
||||
Some tests use long patterns that are very repetitive. Instead of creating a
|
||||
very long input line for such a pattern, you can use a special repetition
|
||||
feature, similar to the one described for subject lines above. If the
|
||||
<b>expand</b> modifier is present on a pattern, parts of the pattern that have
|
||||
very long input line for such a pattern, you can use a special repetition
|
||||
feature, similar to the one described for subject lines above. If the
|
||||
<b>expand</b> modifier is present on a pattern, parts of the pattern that have
|
||||
the form
|
||||
<pre>
|
||||
\[<characters>]{<count>}
|
||||
|
@ -689,13 +689,13 @@ by decimal digits and "}" is found later in the pattern. If not, the characters
|
|||
remain in the pattern unaltered.
|
||||
</P>
|
||||
<P>
|
||||
If part of an expanded pattern looks like an expansion, but is really part of
|
||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||
the quantifier. For example, \[AB]{6000,6000} is not recognized as an
|
||||
If part of an expanded pattern looks like an expansion, but is really part of
|
||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||
the quantifier. For example, \[AB]{6000,6000} is not recognized as an
|
||||
expansion item.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>info</b> modifier is set on an expanded pattern, the result of the
|
||||
If the <b>info</b> modifier is set on an expanded pattern, the result of the
|
||||
expansion is included in the information that is output.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -812,9 +812,9 @@ suite.
|
|||
Limiting the pattern length
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>max_pattern_length</b> modifier sets a limit, in code units, to the
|
||||
length of pattern that <b>pcre2_compile()</b> will accept. Breaching the limit
|
||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||
The <b>max_pattern_length</b> modifier sets a limit, in code units, to the
|
||||
length of pattern that <b>pcre2_compile()</b> will accept. Breaching the limit
|
||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||
variable can hold (essentially unlimited).
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -836,13 +836,13 @@ modifiers set options for the <b>regcomp()</b> function:
|
|||
ucp REG_UCP ) the POSIX standard
|
||||
utf REG_UTF8 )
|
||||
</pre>
|
||||
The <b>regerror_buffsize</b> modifier specifies a size for the error buffer that
|
||||
The <b>regerror_buffsize</b> modifier specifies a size for the error buffer that
|
||||
is passed to <b>regerror()</b> in the event of a compilation error. For example:
|
||||
<pre>
|
||||
/abc/posix,regerror_buffsize=20
|
||||
</pre>
|
||||
This provides a means of testing the behaviour of <b>regerror()</b> when the
|
||||
buffer is too small for the error message. If this modifier has not been set, a
|
||||
This provides a means of testing the behaviour of <b>regerror()</b> when the
|
||||
buffer is too small for the error message. If this modifier has not been set, a
|
||||
large buffer is used.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -892,14 +892,18 @@ are applied to every subject line that is processed with that pattern. They may
|
|||
not appear in <b>#pattern</b> commands. These modifiers do not affect the
|
||||
compilation process.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
</pre>
|
||||
These modifiers may not appear in a <b>#pattern</b> command. If you want them as
|
||||
defaults, set them in a <b>#subject</b> command.
|
||||
|
@ -964,33 +968,38 @@ information. Some of them may also be specified on a pattern line (see above),
|
|||
in which case they apply to every subject line that is matched against that
|
||||
pattern.
|
||||
<pre>
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
callout_none do not supply a callout function
|
||||
copy=<number or name> copy captured substring
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and recursion limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
callout_none do not supply a callout function
|
||||
copy=<number or name> copy captured substring
|
||||
dfa use <b>pcre2_dfa_match()</b>
|
||||
find_limits find match and recursion limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
</pre>
|
||||
The effects of these modifiers are described in the following sections.
|
||||
</P>
|
||||
|
@ -1129,19 +1138,34 @@ Testing the substitution function
|
|||
</b><br>
|
||||
<P>
|
||||
If the <b>replace</b> modifier is set, the <b>pcre2_substitute()</b> function is
|
||||
called instead of one of the matching functions. Unlike subject strings,
|
||||
<b>pcre2test</b> does not process replacement strings for escape sequences. In
|
||||
UTF mode, a replacement string is checked to see if it is a valid UTF-8 string.
|
||||
If so, it is correctly converted to a UTF string of the appropriate code unit
|
||||
width. If it is not a valid UTF-8 string, the individual code units are copied
|
||||
directly. This provides a means of passing an invalid UTF-8 string for testing
|
||||
purposes.
|
||||
called instead of one of the matching functions. Note that replacement strings
|
||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||
not thought to be an issue in a test program.
|
||||
</P>
|
||||
<P>
|
||||
If the <b>global</b> modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
<b>pcre2_substitute()</b>. After a successful substitution, the modified string
|
||||
is output, preceded by the number of replacements. This may be zero if there
|
||||
were no matches. Here is a simple example of a substitution test:
|
||||
Unlike subject strings, <b>pcre2test</b> does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
||||
is a valid UTF-8 string. If so, it is correctly converted to a UTF string of
|
||||
the appropriate code unit width. If it is not a valid UTF-8 string, the
|
||||
individual code units are copied directly. This provides a means of passing an
|
||||
invalid UTF-8 string for testing purposes.
|
||||
</P>
|
||||
<P>
|
||||
The following modifiers set options (in addition to the normal match options)
|
||||
for <b>pcre2_substitute()</b>:
|
||||
<pre>
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
After a successful substitution, the modified string is output, preceded by the
|
||||
number of replacements. This may be zero if there were no matches. Here is a
|
||||
simple example of a substitution test:
|
||||
<pre>
|
||||
/abc/replace=xxx
|
||||
=abc=abc=
|
||||
|
@ -1149,12 +1173,12 @@ were no matches. Here is a simple example of a substitution test:
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
</pre>
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to test for
|
||||
buffer overflow, if the replacement string starts with a number in square
|
||||
brackets, that number is passed to <b>pcre2_substitute()</b> as the size of the
|
||||
output buffer, with the replacement string starting at the next character. Here
|
||||
is an example that tests the edge case:
|
||||
Subject and replacement strings should be kept relatively short (fewer than 256
|
||||
characters) for substitution tests, as fixed-size buffers are used. To make it
|
||||
easy to test for buffer overflow, if the replacement string starts with a
|
||||
number in square brackets, that number is passed to <b>pcre2_substitute()</b> as
|
||||
the size of the output buffer, with the replacement string starting at the next
|
||||
character. Here is an example that tests the edge case:
|
||||
<pre>
|
||||
/abc/
|
||||
123abc123\=replace=[10]XYZ
|
||||
|
@ -1162,6 +1186,19 @@ is an example that tests the edge case:
|
|||
123abc123\=replace=[9]XYZ
|
||||
Failed: error -47: no more memory
|
||||
</pre>
|
||||
The default action of <b>pcre2_substitute()</b> is to return
|
||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the
|
||||
<b>substitute_overflow_length</b> modifier), <b>pcre2_substitute()</b> continues
|
||||
to go through the motions of matching and substituting, in order to compute the
|
||||
size of buffer that is required. When this happens, <b>pcre2test</b> shows the
|
||||
required buffer length (which includes space for the trailing zero) as part of
|
||||
the error message. For example:
|
||||
<pre>
|
||||
/abc/substitute_overflow_length
|
||||
123abc123\=replace=[9]XYZ
|
||||
Failed: error -47: no more memory: 10 code units are needed
|
||||
</pre>
|
||||
A replacement string is ignored with POSIX and DFA matching. Specifying partial
|
||||
matching provokes an error return ("bad option value") from
|
||||
<b>pcre2_substitute()</b>.
|
||||
|
@ -1236,10 +1273,10 @@ matching starts. Its value is a number of code units, not characters.
|
|||
Setting an offset limit
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||
The <b>offset_limit</b> modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the <b>use_offset_limit</b> modifier must have been set
|
||||
for the pattern; if not, an error is generated.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -1281,8 +1318,8 @@ Passing a NULL context
|
|||
Normally, <b>pcre2test</b> passes a context block to <b>pcre2_match()</b>,
|
||||
<b>pcre2_dfa_match()</b> or <b>pcre2_jit_match()</b>. If the <b>null_context</b>
|
||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the <b>find_limits</b> modifier or when testing the
|
||||
substitution function.
|
||||
</P>
|
||||
<br><a name="SEC12" href="#TOC1">THE ALTERNATIVE MATCHING FUNCTION</a><br>
|
||||
|
@ -1623,7 +1660,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 05 November 2015
|
||||
Last updated: 12 December 2015
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -127,8 +127,8 @@ as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
|||
strings to be in host byte order.
|
||||
</P>
|
||||
<P>
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
<b>pcre2_match()</b> and <b>pcre2_dfa_match()</b> calls with a non-zero starting
|
||||
offset, the check is applied only to that part of the subject that could be
|
||||
inspected during matching, and there is a check that the starting offset points
|
||||
to the first code unit of a character or to the end of the subject. If there
|
||||
|
|
|
@ -118,9 +118,9 @@ running redundant checks.
|
|||
.P
|
||||
The use of the \eC escape sequence in a UTF-8 or UTF-16 pattern can lead to
|
||||
problems, because it may leave the current matching point in the middle of a
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
multi-code-unit character. The PCRE2_NEVER_BACKSLASH_C option can be used by an
|
||||
application to lock out the use of \eC, causing a compile-time error if it is
|
||||
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||
encountered. It is also possible to build PCRE2 with the use of \eC permanently
|
||||
disabled.
|
||||
.P
|
||||
Another way that performance can be hit is by running a pattern that has a very
|
||||
|
|
1825
doc/pcre2.txt
1825
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function sets, in a compile context, the maximum length (in code units) of
|
||||
This function sets, in a compile context, the maximum length (in code units) of
|
||||
the pattern that can be compiled. The result is always zero.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -58,9 +58,9 @@ The options are:
|
|||
PCRE2_UTF was set at compile time)
|
||||
PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing
|
||||
PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string
|
||||
.sp
|
||||
The function returns the number of substitutions, which may be zero if there
|
||||
were no matches. The result can be greater than one only when
|
||||
|
|
142
doc/pcre2api.3
142
doc/pcre2api.3
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "12 December 2015" "PCRE2 10.21"
|
||||
.TH PCRE2API 3 "16 December 2015" "PCRE2 10.21"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -570,7 +570,7 @@ of the following compile-time parameters:
|
|||
PCRE2's character tables
|
||||
The newline character sequence
|
||||
The compile time nested parentheses limit
|
||||
The maximum length of the pattern string
|
||||
The maximum length of the pattern string
|
||||
An external function for stack checking
|
||||
.sp
|
||||
A compile context is also required if you are using custom memory management.
|
||||
|
@ -618,10 +618,10 @@ in the current locale.
|
|||
.B " PCRE2_SIZE \fIvalue\fP);"
|
||||
.fi
|
||||
.sp
|
||||
This sets a maximum length, in code units, for the pattern string that is to be
|
||||
compiled. If the pattern is longer, an error is generated. This facility is
|
||||
provided so that applications that accept patterns from external sources can
|
||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||
This sets a maximum length, in code units, for the pattern string that is to be
|
||||
compiled. If the pattern is longer, an error is generated. This facility is
|
||||
provided so that applications that accept patterns from external sources can
|
||||
limit their size. The default is the largest number that a PCRE2_SIZE variable
|
||||
can hold, which is effectively unlimited.
|
||||
.sp
|
||||
.nf
|
||||
|
@ -678,8 +678,8 @@ of the following match-time parameters:
|
|||
.sp
|
||||
A callout function
|
||||
The offset limit for matching an unanchored pattern
|
||||
The limit for calling \fImatch()\fP
|
||||
The limit for calling \fImatch()\fP recursively
|
||||
The limit for calling \fBmatch()\fP (see below)
|
||||
The limit for calling \fBmatch()\fP recursively
|
||||
.sp
|
||||
A match context is also required if you are using custom memory management.
|
||||
If none of these apply, just pass NULL as the context argument of
|
||||
|
@ -736,7 +736,7 @@ PCRE2_USE_OFFSET_LIMIT is not set, an error is generated.
|
|||
.P
|
||||
The offset limit facility can be used to track progress when searching large
|
||||
subject strings. See also the PCRE2_FIRSTLINE option, which requires a match to
|
||||
start within the first line of the subject. If this is set with an offset
|
||||
start within the first line of the subject. If this is set with an offset
|
||||
limit, a match must occur in the first line and also within the offset limit.
|
||||
In other words, whichever limit comes first is used.
|
||||
.sp
|
||||
|
@ -1228,7 +1228,7 @@ This option locks out the use of \eC in the pattern that is being compiled.
|
|||
This escape can cause unpredictable behaviour in UTF-8 or UTF-16 modes, because
|
||||
it may leave the current matching point in the middle of a multi-code-unit
|
||||
character. This option may be useful in applications that process patterns from
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
external sources. Note that there is also a build-time option that permanently
|
||||
locks out the use of \eC.
|
||||
.sp
|
||||
PCRE2_NEVER_UCP
|
||||
|
@ -1565,7 +1565,7 @@ are as follows:
|
|||
Return a copy of the pattern's options. The third argument should point to a
|
||||
\fBuint32_t\fP variable. PCRE2_INFO_ARGOPTIONS returns exactly the options that
|
||||
were passed to \fBpcre2_compile()\fP, whereas PCRE2_INFO_ALLOPTIONS returns
|
||||
the compile options as modified by any top-level option settings such as (*UTF)
|
||||
the compile options as modified by any top-level option settings such as (*UTF)
|
||||
at the start of the pattern itself. For example, if the pattern /(*UTF)abc/ is
|
||||
compiled with the PCRE2_EXTENDED option, the result is PCRE2_EXTENDED and
|
||||
PCRE2_UTF.
|
||||
|
@ -1611,8 +1611,9 @@ matches only CR, LF, or CRLF.
|
|||
.sp
|
||||
PCRE2_INFO_CAPTURECOUNT
|
||||
.sp
|
||||
Return the number of capturing subpatterns in the pattern. The third argument
|
||||
should point to an \fBuint32_t\fP variable.
|
||||
Return the highest capturing subpattern number in the pattern. In patterns
|
||||
where (?| is not used, this is also the total number of capturing subpatterns.
|
||||
The third argument should point to an \fBuint32_t\fP variable.
|
||||
.sp
|
||||
PCRE2_INFO_FIRSTBITMAP
|
||||
.sp
|
||||
|
@ -1629,10 +1630,8 @@ returned. Otherwise NULL is returned. The third argument should point to an
|
|||
.sp
|
||||
Return information about the first code unit of any matched string, for a
|
||||
non-anchored pattern. The third argument should point to an \fBuint32_t\fP
|
||||
variable.
|
||||
.P
|
||||
If there is a fixed first value, for example, the letter "c" from a pattern
|
||||
such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
variable. If there is a fixed first value, for example, the letter "c" from a
|
||||
pattern such as (cat|cow|coyote), 1 is returned, and the character value can be
|
||||
retrieved using PCRE2_INFO_FIRSTCODEUNIT. If there is no fixed first value, but
|
||||
it is known that a match can occur only at the start of the subject or
|
||||
following a newline in the subject, 2 is returned. Otherwise, and for anchored
|
||||
|
@ -1676,12 +1675,10 @@ Returns 1 if there is a rightmost literal code unit that must exist in any
|
|||
matched string, other than at its start. The third argument should point to an
|
||||
\fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is
|
||||
returned, the code unit value itself can be retrieved using
|
||||
PCRE2_INFO_LASTCODEUNIT.
|
||||
.P
|
||||
For anchored patterns, a last literal value is recorded only if it follows
|
||||
something of variable length. For example, for the pattern /^a\ed+z\ed+/ the
|
||||
returned value is 1 (with "z" returned from PCRE2_INFO_LASTCODEUNIT), but for
|
||||
/^a\edz\ed/ the returned value is 0.
|
||||
PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is
|
||||
recorded only if it follows something of variable length. For example, for the
|
||||
pattern /^a\ed+z\ed+/ the returned value is 1 (with "z" returned from
|
||||
PCRE2_INFO_LASTCODEUNIT), but for /^a\edz\ed/ the returned value is 0.
|
||||
.sp
|
||||
PCRE2_INFO_LASTCODEUNIT
|
||||
.sp
|
||||
|
@ -1693,9 +1690,9 @@ value, 0 is returned.
|
|||
PCRE2_INFO_MATCHEMPTY
|
||||
.sp
|
||||
Return 1 if the pattern might match an empty string, otherwise 0. The third
|
||||
argument should point to an \fBuint32_t\fP variable. When a pattern contains
|
||||
recursive subroutine calls it is not always possible to determine whether or
|
||||
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
||||
argument should point to an \fBuint32_t\fP variable. When a pattern contains
|
||||
recursive subroutine calls it is not always possible to determine whether or
|
||||
not it can match an empty string. PCRE2 takes a cautious approach and returns 1
|
||||
in such cases.
|
||||
.sp
|
||||
PCRE2_INFO_MATCHLIMIT
|
||||
|
@ -2181,9 +2178,19 @@ standard convention for the operating system. The default can be overridden in
|
|||
a
|
||||
.\" HTML <a href="#compilecontext">
|
||||
.\" </a>
|
||||
compile context.
|
||||
compile context
|
||||
.\"
|
||||
During matching, the newline choice affects the behaviour of the dot,
|
||||
by calling \fBpcre2_set_newline()\fP. It can also be overridden by starting a
|
||||
pattern string with, for example, (*CRLF), as described in the
|
||||
.\" HTML <a href="pcre2pattern.html#newlines">
|
||||
.\" </a>
|
||||
section on newline conventions
|
||||
.\"
|
||||
in the
|
||||
.\" HREF
|
||||
\fBpcre2pattern\fP
|
||||
.\"
|
||||
page. During matching, the newline choice affects the behaviour of the dot,
|
||||
circumflex, and dollar metacharacters. It may also alter the way the match
|
||||
starting position is advanced after a match failure for an unanchored pattern.
|
||||
.P
|
||||
|
@ -2229,18 +2236,7 @@ that do not cause substrings to be captured. The \fBpcre2_pattern_info()\fP
|
|||
function can be used to find out how many capturing subpatterns there are in a
|
||||
compiled pattern.
|
||||
.P
|
||||
A successful match returns the overall matched string and any captured
|
||||
substrings to the caller via a vector of PCRE2_SIZE values. This is called the
|
||||
\fBovector\fP, and is contained within the
|
||||
.\" HTML <a href="#matchdatablock">
|
||||
.\" </a>
|
||||
match data block.
|
||||
.\"
|
||||
You can obtain direct access to the ovector by calling
|
||||
\fBpcre2_get_ovector_pointer()\fP to find its address, and
|
||||
\fBpcre2_get_ovector_count()\fP to find the number of pairs of values it
|
||||
contains. Alternatively, you can use the auxiliary functions for accessing
|
||||
captured substrings
|
||||
You can use auxiliary functions for accessing captured substrings
|
||||
.\" HTML <a href="#extractbynumber">
|
||||
.\" </a>
|
||||
by number
|
||||
|
@ -2248,9 +2244,20 @@ by number
|
|||
or
|
||||
.\" HTML <a href="#extractbyname">
|
||||
.\" </a>
|
||||
by name
|
||||
by name,
|
||||
.\"
|
||||
(see below).
|
||||
as described in sections below.
|
||||
.P
|
||||
Alternatively, you can make direct use of the vector of PCRE2_SIZE values,
|
||||
called the \fBovector\fP, which contains the offsets of captured strings. It is
|
||||
part of the
|
||||
.\" HTML <a href="#matchdatablock">
|
||||
.\" </a>
|
||||
match data block.
|
||||
.\"
|
||||
The function \fBpcre2_get_ovector_pointer()\fP returns the address of the
|
||||
ovector, and \fBpcre2_get_ovector_count()\fP returns the number of pairs of
|
||||
values it contains.
|
||||
.P
|
||||
Within the ovector, the first in each pair of values is set to the offset of
|
||||
the first code unit of a substring, and the second is set to the offset of the
|
||||
|
@ -2334,7 +2341,12 @@ After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
|
|||
to match (PCRE2_ERROR_NOMATCH), a (*MARK) name may be available, and
|
||||
\fBpcre2_get_mark()\fP can be called. It returns a pointer to the
|
||||
zero-terminated name, which is within the compiled pattern. Otherwise NULL is
|
||||
returned. After a successful match, the (*MARK) name that is returned is the
|
||||
returned. The length of the (*MARK) name (excluding the terminating zero) is
|
||||
stored in the code unit that precedes the name. You should use this instead of
|
||||
relying on the terminating zero if the (*MARK) name might contain a binary
|
||||
zero.
|
||||
.P
|
||||
After a successful match, the (*MARK) name that is returned is the
|
||||
last one encountered on the matching path through the pattern. After a "no
|
||||
match" or a partial match, the last encountered (*MARK) name is returned. For
|
||||
example, consider this pattern:
|
||||
|
@ -2353,7 +2365,7 @@ different to the value of \fIovector[0]\fP if the pattern contains the \eK
|
|||
escape sequence. After a partial match, however, this value is always the same
|
||||
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
||||
.P
|
||||
After a UTF check failure, \fBpcre2_get_startchar()\fB can be used to obtain
|
||||
After a UTF check failure, \fBpcre2_get_startchar()\fP can be used to obtain
|
||||
the code unit offset of the invalid UTF character. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
|
@ -2692,7 +2704,7 @@ same number causes an error at compile time.
|
|||
This function calls \fBpcre2_match()\fP and then makes a copy of the subject
|
||||
string in \fIoutputbuffer\fP, replacing the part that was matched with the
|
||||
\fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in
|
||||
which a \eK item in a lookahead in the pattern causes the match to end before
|
||||
it starts are not supported, and give rise to an error return.
|
||||
.P
|
||||
|
@ -2706,7 +2718,7 @@ allocate memory for the compiled code.
|
|||
The \fIoutlengthptr\fP argument must point to a variable that contains the
|
||||
length, in code units, of the output buffer. If the function is successful, the
|
||||
value is updated to contain the length of the new string, excluding the
|
||||
trailing zero that is automatically added.
|
||||
trailing zero that is automatically added.
|
||||
.P
|
||||
If the function is not successful, the value set via \fIoutlengthptr\fP depends
|
||||
on the type of error. For syntax errors in the replacement string, the value is
|
||||
|
@ -2754,7 +2766,7 @@ advanced by one character except when CRLF is a valid newline sequence and the
|
|||
next two characters are CR, LF. In this case, the current position is advanced
|
||||
by two characters.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
|
||||
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
|
||||
this option is set, however, \fBpcre2_substitute()\fP continues to go through
|
||||
the motions of matching and substituting (without, of course, writing anything)
|
||||
|
@ -2762,15 +2774,15 @@ in order to compute the size of buffer that is needed. This value is passed
|
|||
back via the \fIoutlengthptr\fP variable, with the result of the function still
|
||||
being PCRE2_ERROR_NOMEMORY.
|
||||
.P
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
Passing a buffer size of zero is a permitted way of finding out how much memory
|
||||
is needed for a given substitution. However, this does mean that the entire
|
||||
operation is carried out twice. Depending on the application, it may be more
|
||||
efficient to allocate a large buffer and free the excess afterwards, instead of
|
||||
using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
||||
not appear in the pattern to be treated as unset groups. This option should be
|
||||
used with care, because it means that a typo in a group name or number no
|
||||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capturing groups that do
|
||||
not appear in the pattern to be treated as unset groups. This option should be
|
||||
used with care, because it means that a typo in a group name or number no
|
||||
longer causes the PCRE2_ERROR_NOSUBSTRING error.
|
||||
.P
|
||||
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capturing groups (including unknown
|
||||
|
@ -2828,8 +2840,8 @@ string remains in force afterwards, as shown in this \fBpcre2test\fP example:
|
|||
somebody
|
||||
1: HELLO
|
||||
.sp
|
||||
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
||||
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||
The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
|
||||
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
|
||||
groups in the extended syntax forms to be treated as unset.
|
||||
.P
|
||||
If successful, \fBpcre2_substitute()\fP returns the number of replacements that
|
||||
|
@ -2838,7 +2850,7 @@ were made. This may be zero if no matches were found, and is never greater than
|
|||
.P
|
||||
In the event of an error, a negative error code is returned. Except for
|
||||
PCRE2_ERROR_NOMATCH (which is never returned), errors from \fBpcre2_match()\fP
|
||||
are passed straight back.
|
||||
are passed straight back.
|
||||
.P
|
||||
PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring insertion,
|
||||
unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set.
|
||||
|
@ -2849,7 +2861,7 @@ unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) when the simple
|
|||
.P
|
||||
PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the
|
||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size of buffer that is
|
||||
needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
||||
needed is returned via \fIoutlengthptr\fP. Note that this does not happen by
|
||||
default.
|
||||
.P
|
||||
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the
|
||||
|
@ -2857,7 +2869,7 @@ replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE
|
|||
(invalid escape sequence), PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket
|
||||
not found), PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group
|
||||
substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it
|
||||
started, which can happen if \eK is used in an assertion).
|
||||
started, which can happen if \eK is used in an assertion).
|
||||
.P
|
||||
As for all PCRE2 errors, a text message that describes the error can be
|
||||
obtained by calling \fBpcre2_get_error_message()\fP.
|
||||
|
@ -2901,14 +2913,14 @@ first and last entries in the name-to-number table for the given name, and the
|
|||
function returns the length of each entry in code units. In both cases,
|
||||
PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name.
|
||||
.P
|
||||
The format of the name table is described above in the section entitled
|
||||
\fIInformation about a pattern\fP
|
||||
The format of the name table is described
|
||||
.\" HTML <a href="#infoaboutpattern">
|
||||
.\" </a>
|
||||
above.
|
||||
above
|
||||
.\"
|
||||
Given all the relevant entries for the name, you can extract each of their
|
||||
numbers, and hence the captured data.
|
||||
in the section entitled \fIInformation about a pattern\fP. Given all the
|
||||
relevant entries for the name, you can extract each of their numbers, and hence
|
||||
the captured data.
|
||||
.
|
||||
.
|
||||
.SH "FINDING ALL POSSIBLE MATCHES AT ONE POSITION"
|
||||
|
@ -3154,6 +3166,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 21 December 2015
|
||||
Last updated: 16 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -20,8 +20,8 @@ documentation for details. In these cases the limit is substantially larger.
|
|||
However, the speed of execution is slower. In the 32-bit library, the internal
|
||||
linkage size is always 4.
|
||||
.P
|
||||
The maximum length of a source pattern string is essentially unlimited; it is
|
||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||
The maximum length of a source pattern string is essentially unlimited; it is
|
||||
the largest number a PCRE2_SIZE variable can hold. However, the program that
|
||||
calls \fBpcre2_compile()\fP can specify a smaller limit.
|
||||
.P
|
||||
The maximum length (in code units) of a subject string is one less than the
|
||||
|
|
|
@ -1188,11 +1188,11 @@ When the newline convention (see
|
|||
.\" </a>
|
||||
"Newline conventions"
|
||||
.\"
|
||||
below) recognizes the two-character sequence CRLF as a newline, this is
|
||||
preferred, even if the single characters CR and LF are also recognized as
|
||||
below) recognizes the two-character sequence CRLF as a newline, this is
|
||||
preferred, even if the single characters CR and LF are also recognized as
|
||||
newlines. For example, if the newline convention is "any", a multiline mode
|
||||
circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after
|
||||
CR, even though CR on its own is a valid newline. (It also matches at the very
|
||||
CR, even though CR on its own is a valid newline. (It also matches at the very
|
||||
start of the string, of course.)
|
||||
.P
|
||||
Note that the sequences \eA, \eZ, and \ez can be used to match the start and
|
||||
|
@ -1245,7 +1245,7 @@ with \eC in UTF-8 or UTF-16 mode means that the rest of the string may start
|
|||
with a malformed UTF character. This has undefined results, because PCRE2
|
||||
assumes that it is matching character by character in a valid UTF string (by
|
||||
default it checks the subject string's validity at the start of processing
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
unless the PCRE2_NO_UTF_CHECK option is used).
|
||||
.P
|
||||
An application can lock out the use of \eC by setting the
|
||||
PCRE2_NEVER_BACKSLASH_C option when compiling a pattern. It is also possible to
|
||||
|
@ -1257,9 +1257,9 @@ PCRE2 does not allow \eC to appear in lookbehind assertions
|
|||
(described below)
|
||||
.\"
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind. Neither the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP not the JIT optimizer support \eC in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
the lookbehind. Neither the alternative matching function
|
||||
\fBpcre2_dfa_match()\fP nor the JIT optimizer support \eC in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
.P
|
||||
In general, the \eC escape sequence is best avoided. However, one way of using
|
||||
|
@ -1347,11 +1347,11 @@ inclusive. They can also be used for code points specified numerically, for
|
|||
example [\e000-\e037]. Ranges can include any characters that are valid for the
|
||||
current mode.
|
||||
.P
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
There is a special case in EBCDIC environments for ranges whose end points are
|
||||
both specified as literal letters in the same case. For compatibility with
|
||||
Perl, EBCDIC code points within the range that are not letters are omitted. For
|
||||
example, [h-k] matches only four characters, even though the codes for h and k
|
||||
are 0x88 and 0x92, a range of 11 code points. However, if the range is
|
||||
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
|
||||
are included.
|
||||
.P
|
||||
|
@ -1683,7 +1683,7 @@ first one in the pattern with the given number. The following pattern matches
|
|||
.sp
|
||||
/(?|(abc)|(def))(?1)/
|
||||
.sp
|
||||
A relative reference such as (?-1) is no different: it is just a convenient way
|
||||
A relative reference such as (?-1) is no different: it is just a convenient way
|
||||
of computing an absolute group number.
|
||||
.P
|
||||
If a
|
||||
|
@ -2549,7 +2549,7 @@ For example:
|
|||
(?(VERSION>=10.4)yes|no)
|
||||
.sp
|
||||
This pattern matches "yes" if the PCRE2 version is greater than or equal to 10.4, or
|
||||
"no" otherwise. The fractional part of the version number may not contain more
|
||||
"no" otherwise. The fractional part of the version number may not contain more
|
||||
than two digits.
|
||||
.
|
||||
.
|
||||
|
@ -2667,21 +2667,21 @@ pattern above you can write (?-2) to refer to the second most recently opened
|
|||
parentheses preceding the recursion. In other words, a negative number counts
|
||||
capturing parentheses leftwards from the point at which it is encountered.
|
||||
.P
|
||||
Be aware however, that if
|
||||
Be aware, however, that if
|
||||
.\" HTML <a href="#dupsubpatternnumber">
|
||||
.\" </a>
|
||||
duplicate subpattern numbers
|
||||
.\"
|
||||
are in use, relative references refer to the earliest subpattern with the
|
||||
are in use, relative references refer to the earliest subpattern with the
|
||||
appropriate number. Consider, for example:
|
||||
.sp
|
||||
(?|(a)|(b)) (c) (?-2)
|
||||
.sp
|
||||
The first two capturing groups (a) and (b) are both numbered 1, and group (c)
|
||||
is number 2. When the reference (?-2) is encountered, the second most recently
|
||||
opened parentheses has the number 1, but it is the first such group (the (a)
|
||||
group) to which the recursion refers. This would be the same if an absolute
|
||||
reference (?1) was used. In other words, relative references are just a
|
||||
opened parentheses has the number 1, but it is the first such group (the (a)
|
||||
group) to which the recursion refers. This would be the same if an absolute
|
||||
reference (?1) was used. In other words, relative references are just a
|
||||
shorthand for computing a group number.
|
||||
.P
|
||||
It is also possible to refer to subsequently opened parentheses, by writing
|
||||
|
@ -2988,13 +2988,13 @@ parenthesis followed by an asterisk. They are generally of the form (*VERB) or
|
|||
depending on whether or not a name is present.
|
||||
.P
|
||||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name. A closing parenthesis can be included in a name either as \e) or
|
||||
between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name. A closing parenthesis can be included in a name either as \e) or
|
||||
between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||
of the pattern.
|
||||
.P
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
|
|
|
@ -174,7 +174,7 @@ Default POSIX newline handling can be obtained by setting PCRE2_DOTALL and
|
|||
PCRE2_DOLLAR_ENDONLY when calling \fBpcre2_compile()\fP directly, but there is
|
||||
no way to make PCRE2 behave exactly as for the REG_NEWLINE action. When using
|
||||
the POSIX API, passing REG_NEWLINE to PCRE2's \fBregcomp()\fP function
|
||||
causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL
|
||||
causes PCRE2_MULTILINE to be passed to \fBpcre2_compile()\fP, and REG_DOTALL
|
||||
passes PCRE2_DOTALL. There is no way to pass PCRE2_DOLLAR_ENDONLY.
|
||||
.
|
||||
.
|
||||
|
@ -211,7 +211,7 @@ to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
|
|||
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
|
||||
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
|
||||
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
|
||||
how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
|
||||
how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
|
||||
mutually exclusive; the error REG_INVARG is returned.
|
||||
.P
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
|
|
|
@ -140,13 +140,13 @@ on a system with different endianness.
|
|||
.P
|
||||
Decoded patterns can be used for matching in the usual way, and must be freed
|
||||
by calling \fBpcre2_code_free()\fP. However, be aware that there is a potential
|
||||
race issue if you are using multiple patterns that were decoded from a single
|
||||
race issue if you are using multiple patterns that were decoded from a single
|
||||
byte stream in a multithreaded application. A single copy of the character
|
||||
tables is used by all the decoded patterns and a reference count is used to
|
||||
arrange for its memory to be automatically freed when the last pattern is
|
||||
freed, but there is no locking on this reference count. Therefore, if you want
|
||||
to call \fBpcre2_code_free()\fP for these patterns in different threads, you
|
||||
must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot
|
||||
must arrange your own locking, and ensure that \fBpcre2_code_free()\fP cannot
|
||||
be called by two threads at the same time.
|
||||
.P
|
||||
If a pattern was processed by \fBpcre2_jit_compile()\fP before being
|
||||
|
|
|
@ -83,7 +83,7 @@ it matches a literal "u".
|
|||
.sp
|
||||
\eC is dangerous because it may leave the current matching point in the middle
|
||||
of a UTF-8 or UTF-16 character. The application can lock out the use of \eC by
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
setting the PCRE2_NEVER_BACKSLASH_C option. It is also possible to build PCRE2
|
||||
with the use of \eC permanently disabled.
|
||||
.P
|
||||
By default, \ed, \es, and \ew match only ASCII characters, even in UTF-8 mode
|
||||
|
|
|
@ -444,12 +444,12 @@ the start of a modifier list. For example:
|
|||
.sp
|
||||
abc\e=notbol,notempty
|
||||
.sp
|
||||
If the subject string is empty and \e= is followed by whitespace, the line is
|
||||
If the subject string is empty and \e= is followed by whitespace, the line is
|
||||
treated as a comment line, and is not used for matching. For example:
|
||||
.sp
|
||||
\e= This is a comment.
|
||||
abc\e= This is an invalid modifier list.
|
||||
.sp
|
||||
.sp
|
||||
A backslash followed by any other non-alphanumeric character just escapes that
|
||||
character. A backslash followed by anything else causes an error. However, if
|
||||
the very last character in the line is a backslash (and there is no modifier
|
||||
|
@ -501,7 +501,7 @@ for a description of their effects.
|
|||
no_utf_check set PCRE2_NO_UTF_CHECK
|
||||
ucp set PCRE2_UCP
|
||||
ungreedy set PCRE2_UNGREEDY
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
use_offset_limit set PCRE2_USE_OFFSET_LIMIT
|
||||
utf set PCRE2_UTF
|
||||
.sp
|
||||
As well as turning on the PCRE2_UTF option, the \fButf\fP modifier causes all
|
||||
|
@ -528,7 +528,7 @@ about the pattern:
|
|||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
locale=<name> use this locale
|
||||
max_pattern_length=<n> set the maximum pattern length
|
||||
max_pattern_length=<n> set the maximum pattern length
|
||||
memory show memory used
|
||||
newline=<type> set newline type
|
||||
null_context compile with a NULL context
|
||||
|
@ -608,9 +608,9 @@ by the item that follows it in the pattern.
|
|||
.SS "Passing a NULL context"
|
||||
.rs
|
||||
.sp
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If
|
||||
the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses
|
||||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_compile()\fP. If
|
||||
the \fBnull_context\fP modifier is set, however, NULL is passed. This is for
|
||||
testing that \fBpcre2_compile()\fP behaves correctly in this case (it uses
|
||||
default values).
|
||||
.
|
||||
.
|
||||
|
@ -634,9 +634,9 @@ actual length of the pattern is passed.
|
|||
.rs
|
||||
.sp
|
||||
Some tests use long patterns that are very repetitive. Instead of creating a
|
||||
very long input line for such a pattern, you can use a special repetition
|
||||
feature, similar to the one described for subject lines above. If the
|
||||
\fBexpand\fP modifier is present on a pattern, parts of the pattern that have
|
||||
very long input line for such a pattern, you can use a special repetition
|
||||
feature, similar to the one described for subject lines above. If the
|
||||
\fBexpand\fP modifier is present on a pattern, parts of the pattern that have
|
||||
the form
|
||||
.sp
|
||||
\e[<characters>]{<count>}
|
||||
|
@ -647,12 +647,12 @@ cannot be nested. An initial "\e[" sequence is recognized only if "]{" followed
|
|||
by decimal digits and "}" is found later in the pattern. If not, the characters
|
||||
remain in the pattern unaltered.
|
||||
.P
|
||||
If part of an expanded pattern looks like an expansion, but is really part of
|
||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||
the quantifier. For example, \e[AB]{6000,6000} is not recognized as an
|
||||
If part of an expanded pattern looks like an expansion, but is really part of
|
||||
the actual pattern, unwanted expansion can be avoided by giving two values in
|
||||
the quantifier. For example, \e[AB]{6000,6000} is not recognized as an
|
||||
expansion item.
|
||||
.P
|
||||
If the \fBinfo\fP modifier is set on an expanded pattern, the result of the
|
||||
If the \fBinfo\fP modifier is set on an expanded pattern, the result of the
|
||||
expansion is included in the information that is output.
|
||||
.
|
||||
.
|
||||
|
@ -771,9 +771,9 @@ suite.
|
|||
.SS "Limiting the pattern length"
|
||||
.rs
|
||||
.sp
|
||||
The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the
|
||||
length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit
|
||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||
The \fBmax_pattern_length\fP modifier sets a limit, in code units, to the
|
||||
length of pattern that \fBpcre2_compile()\fP will accept. Breaching the limit
|
||||
causes a compilation error. The default is the largest number a PCRE2_SIZE
|
||||
variable can hold (essentially unlimited).
|
||||
.
|
||||
.
|
||||
|
@ -797,13 +797,13 @@ modifiers set options for the \fBregcomp()\fP function:
|
|||
ucp REG_UCP ) the POSIX standard
|
||||
utf REG_UTF8 )
|
||||
.sp
|
||||
The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that
|
||||
The \fBregerror_buffsize\fP modifier specifies a size for the error buffer that
|
||||
is passed to \fBregerror()\fP in the event of a compilation error. For example:
|
||||
.sp
|
||||
/abc/posix,regerror_buffsize=20
|
||||
.sp
|
||||
This provides a means of testing the behaviour of \fBregerror()\fP when the
|
||||
buffer is too small for the error message. If this modifier has not been set, a
|
||||
This provides a means of testing the behaviour of \fBregerror()\fP when the
|
||||
buffer is too small for the error message. If this modifier has not been set, a
|
||||
large buffer is used.
|
||||
.P
|
||||
The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described
|
||||
|
@ -863,9 +863,9 @@ compilation process.
|
|||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
.sp
|
||||
These modifiers may not appear in a \fB#pattern\fP command. If you want them as
|
||||
defaults, set them in a \fB#subject\fP command.
|
||||
|
@ -956,7 +956,7 @@ pattern.
|
|||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
|
@ -965,9 +965,9 @@ pattern.
|
|||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
.sp
|
||||
The effects of these modifiers are described in the following sections.
|
||||
|
@ -1102,7 +1102,7 @@ by name.
|
|||
If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is
|
||||
called instead of one of the matching functions. Note that replacement strings
|
||||
cannot contain commas, because a comma signifies the end of a modifier. This is
|
||||
not thought to be an issue in a test program.
|
||||
not thought to be an issue in a test program.
|
||||
.P
|
||||
Unlike subject strings, \fBpcre2test\fP does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to see if it
|
||||
|
@ -1119,7 +1119,7 @@ for \fBpcre2_substitute()\fP:
|
|||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
.sp
|
||||
.sp
|
||||
.P
|
||||
After a successful substitution, the modified string is output, preceded by the
|
||||
number of replacements. This may be zero if there were no matches. Here is a
|
||||
|
@ -1230,10 +1230,10 @@ matching starts. Its value is a number of code units, not characters.
|
|||
.SS "Setting an offset limit"
|
||||
.rs
|
||||
.sp
|
||||
The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the \fBuse_offset_limit\fP modifier must have been set
|
||||
The \fBoffset_limit\fP modifier sets a limit for unanchored matches. If a match
|
||||
cannot be found starting at or before this offset in the subject, a "no match"
|
||||
return is given. The data value is a number of code units, not characters. When
|
||||
this modifier is used, the \fBuse_offset_limit\fP modifier must have been set
|
||||
for the pattern; if not, an error is generated.
|
||||
.
|
||||
.
|
||||
|
@ -1273,8 +1273,8 @@ passing the replacement string as zero-terminated.
|
|||
Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP,
|
||||
\fBpcre2_dfa_match()\fP or \fBpcre2_jit_match()\fP. If the \fBnull_context\fP
|
||||
modifier is set, however, NULL is passed. This is for testing that the matching
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the \fBfind_limits\fP modifier or when testing the
|
||||
functions behave correctly in this case (they use default values). This
|
||||
modifier cannot be used with the \fBfind_limits\fP modifier or when testing the
|
||||
substitution function.
|
||||
.
|
||||
.
|
||||
|
|
|
@ -797,14 +797,18 @@ PATTERN MODIFIERS
|
|||
with that pattern. They may not appear in #pattern commands. These mod-
|
||||
ifiers do not affect the compilation process.
|
||||
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text
|
||||
/g global global matching
|
||||
mark show mark values
|
||||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
@ -860,33 +864,38 @@ SUBJECT MODIFIERS
|
|||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
callout_none do not supply a callout function
|
||||
copy=<number or name> copy captured substring
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and recursion limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
aftertext show text after match
|
||||
allaftertext show text after captures
|
||||
allcaptures show all captures
|
||||
allusedtext show all consulted text (non-JIT only)
|
||||
altglobal alternative global matching
|
||||
callout_capture show captures at callout time
|
||||
callout_data=<n> set a value to pass via callouts
|
||||
callout_fail=<n>[:<m>] control callout failure
|
||||
callout_none do not supply a callout function
|
||||
copy=<number or name> copy captured substring
|
||||
dfa use pcre2_dfa_match()
|
||||
find_limits find match and recursion limits
|
||||
get=<number or name> extract captured substring
|
||||
getall extract all captured substrings
|
||||
/g global global matching
|
||||
jitstack=<n> set size of JIT stack
|
||||
mark show mark values
|
||||
match_limit=<n> set a match limit
|
||||
memory show memory usage
|
||||
null_context match with a NULL context
|
||||
offset=<n> set starting offset
|
||||
offset_limit=<n> set offset limit
|
||||
ovector=<n> set size of output vector
|
||||
recursion_limit=<n> set a recursion limit
|
||||
replace=<string> specify a replacement string
|
||||
startchar show startchar when relevant
|
||||
startoffset=<n> same as offset=<n>
|
||||
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
|
||||
The effects of these modifiers are described in the following sections.
|
||||
|
||||
|
@ -1011,19 +1020,30 @@ SUBJECT MODIFIERS
|
|||
Testing the substitution function
|
||||
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Unlike subject
|
||||
strings, pcre2test does not process replacement strings for escape
|
||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||
the individual code units are copied directly. This provides a means of
|
||||
passing an invalid UTF-8 string for testing purposes.
|
||||
called instead of one of the matching functions. Note that replacement
|
||||
strings cannot contain commas, because a comma signifies the end of a
|
||||
modifier. This is not thought to be an issue in a test program.
|
||||
|
||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
pcre2_substitute(). After a successful substitution, the modified
|
||||
string is output, preceded by the number of replacements. This may be
|
||||
zero if there were no matches. Here is a simple example of a substitu-
|
||||
tion test:
|
||||
Unlike subject strings, pcre2test does not process replacement strings
|
||||
for escape sequences. In UTF mode, a replacement string is checked to
|
||||
see if it is a valid UTF-8 string. If so, it is correctly converted to
|
||||
a UTF string of the appropriate code unit width. If it is not a valid
|
||||
UTF-8 string, the individual code units are copied directly. This pro-
|
||||
vides a means of passing an invalid UTF-8 string for testing purposes.
|
||||
|
||||
The following modifiers set options (in addition to the normal match
|
||||
options) for pcre2_substitute():
|
||||
|
||||
global PCRE2_SUBSTITUTE_GLOBAL
|
||||
substitute_extended PCRE2_SUBSTITUTE_EXTENDED
|
||||
substitute_overflow_length PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
|
||||
substitute_unknown_unset PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
|
||||
After a successful substitution, the modified string is output, pre-
|
||||
ceded by the number of replacements. This may be zero if there were no
|
||||
matches. Here is a simple example of a substitution test:
|
||||
|
||||
/abc/replace=xxx
|
||||
=abc=abc=
|
||||
|
@ -1031,12 +1051,13 @@ SUBJECT MODIFIERS
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||
test for buffer overflow, if the replacement string starts with a num-
|
||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||
the size of the output buffer, with the replacement string starting at
|
||||
the next character. Here is an example that tests the edge case:
|
||||
Subject and replacement strings should be kept relatively short (fewer
|
||||
than 256 characters) for substitution tests, as fixed-size buffers are
|
||||
used. To make it easy to test for buffer overflow, if the replacement
|
||||
string starts with a number in square brackets, that number is passed
|
||||
to pcre2_substitute() as the size of the output buffer, with the
|
||||
replacement string starting at the next character. Here is an example
|
||||
that tests the edge case:
|
||||
|
||||
/abc/
|
||||
123abc123\=replace=[10]XYZ
|
||||
|
@ -1044,6 +1065,19 @@ SUBJECT MODIFIERS
|
|||
123abc123\=replace=[9]XYZ
|
||||
Failed: error -47: no more memory
|
||||
|
||||
The default action of pcre2_substitute() is to return
|
||||
PCRE2_ERROR_NOMEMORY when the output buffer is too small. However, if
|
||||
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set (by using the sub-
|
||||
stitute_overflow_length modifier), pcre2_substitute() continues to go
|
||||
through the motions of matching and substituting, in order to compute
|
||||
the size of buffer that is required. When this happens, pcre2test shows
|
||||
the required buffer length (which includes space for the trailing zero)
|
||||
as part of the error message. For example:
|
||||
|
||||
/abc/substitute_overflow_length
|
||||
123abc123\=replace=[9]XYZ
|
||||
Failed: error -47: no more memory: 10 code units are needed
|
||||
|
||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
pcre2_substitute().
|
||||
|
@ -1471,5 +1505,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 05 November 2015
|
||||
Last updated: 12 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
|
|
|
@ -118,8 +118,8 @@ UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
|||
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
|
||||
strings to be in host byte order.
|
||||
.P
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
|
||||
A UTF string is checked before any other processing takes place. In the case of
|
||||
\fBpcre2_match()\fP and \fBpcre2_dfa_match()\fP calls with a non-zero starting
|
||||
offset, the check is applied only to that part of the subject that could be
|
||||
inspected during matching, and there is a check that the starting offset points
|
||||
to the first code unit of a character or to the end of the subject. If there
|
||||
|
|
|
@ -211,7 +211,7 @@ for (;;)
|
|||
|
||||
last if ($_ eq "");
|
||||
next if $_ =~ /^\\=(?:\s|$)/; # Comment line
|
||||
|
||||
|
||||
$x = eval "\"$_\""; # To get escapes processed
|
||||
|
||||
# Empty array for holding results, ensure $REGERROR and $REGMARK are
|
||||
|
|
|
@ -44,7 +44,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 21
|
||||
#define PCRE2_PRERELEASE -RC1
|
||||
#define PCRE2_DATE 2015-07-06
|
||||
#define PCRE2_DATE 2015-12-15
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
|
|
|
@ -466,7 +466,7 @@ if (*first_op == OP_REVERSE)
|
|||
/* In byte-mode we can do this quickly. */
|
||||
|
||||
{
|
||||
size_t current_offset = (size_t)(current_subject - start_subject);
|
||||
size_t current_offset = (size_t)(current_subject - start_subject);
|
||||
gone_back = (current_offset < max_back)? current_offset : max_back;
|
||||
current_subject -= gone_back;
|
||||
}
|
||||
|
|
|
@ -251,7 +251,7 @@ static const char match_error_texts[] =
|
|||
"bad substitution in replacement string\0"
|
||||
/* 60 */
|
||||
"match with end before start is not supported\0"
|
||||
"too many replacements (more than INT_MAX)\0"
|
||||
"too many replacements (more than INT_MAX)\0"
|
||||
;
|
||||
|
||||
|
||||
|
|
|
@ -562,7 +562,7 @@ typedef struct pcre2_real_compile_context {
|
|||
int (*stack_guard)(uint32_t, void *);
|
||||
void *stack_guard_data;
|
||||
const uint8_t *tables;
|
||||
PCRE2_SIZE max_pattern_length;
|
||||
PCRE2_SIZE max_pattern_length;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
|
@ -581,7 +581,7 @@ typedef struct pcre2_real_match_context {
|
|||
#endif
|
||||
int (*callout)(pcre2_callout_block *, void *);
|
||||
void *callout_data;
|
||||
PCRE2_SIZE offset_limit;
|
||||
PCRE2_SIZE offset_limit;
|
||||
uint32_t match_limit;
|
||||
uint32_t recursion_limit;
|
||||
} pcre2_real_match_context;
|
||||
|
@ -592,7 +592,7 @@ copying the size from possibly unaligned memory into a variable of the same
|
|||
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
||||
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
||||
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
||||
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||
here.) */
|
||||
|
||||
#undef CODE_BLOCKSIZE_TYPE
|
||||
|
@ -660,7 +660,7 @@ typedef struct recurse_check {
|
|||
typedef struct recurse_cache {
|
||||
PCRE2_SPTR group;
|
||||
int recno;
|
||||
} recurse_cache;
|
||||
} recurse_cache;
|
||||
|
||||
/* Structure for maintaining a chain of pointers to the currently incomplete
|
||||
branches, for testing for left recursion while compiling. */
|
||||
|
@ -693,7 +693,7 @@ typedef struct compile_block {
|
|||
PCRE2_SPTR start_code; /* The start of the compiled code */
|
||||
PCRE2_SPTR start_pattern; /* The start of the pattern */
|
||||
PCRE2_SPTR end_pattern; /* The end of the pattern */
|
||||
PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
|
||||
PCRE2_SPTR nestptr[2]; /* Pointer(s) saved for string substitution */
|
||||
PCRE2_UCHAR *name_table; /* The name/number table */
|
||||
size_t workspace_size; /* Size of workspace */
|
||||
uint16_t names_found; /* Number of entries so far */
|
||||
|
@ -717,7 +717,7 @@ typedef struct compile_block {
|
|||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||
BOOL check_lookbehind; /* Lookbehinds need later checking */
|
||||
BOOL dupnames; /* Duplicate names exist */
|
||||
BOOL iscondassert; /* Next assert is a condition */
|
||||
|
|
|
@ -2409,7 +2409,7 @@ for (;;)
|
|||
break;
|
||||
|
||||
/* Match a single code unit, even in UTF-8 mode. This opcode really does
|
||||
match any code unit, even newline. (It really should be called ANYCODEUNIT,
|
||||
match any code unit, even newline. (It really should be called ANYCODEUNIT,
|
||||
of course - the byte name is from pre-16 bit days.) */
|
||||
|
||||
case OP_ANYBYTE:
|
||||
|
|
|
@ -77,7 +77,7 @@ if (where == NULL) /* Requests field length */
|
|||
case PCRE2_INFO_CAPTURECOUNT:
|
||||
case PCRE2_INFO_FIRSTCODETYPE:
|
||||
case PCRE2_INFO_FIRSTCODEUNIT:
|
||||
case PCRE2_INFO_HASBACKSLASHC:
|
||||
case PCRE2_INFO_HASBACKSLASHC:
|
||||
case PCRE2_INFO_HASCRORLF:
|
||||
case PCRE2_INFO_JCHANGED:
|
||||
case PCRE2_INFO_LASTCODETYPE:
|
||||
|
|
|
@ -190,13 +190,13 @@ return 0;
|
|||
*************************************************/
|
||||
|
||||
/* These take no account of UTF as they always print each individual code unit.
|
||||
The string is zero-terminated for print_custring(); the length is given for
|
||||
The string is zero-terminated for print_custring(); the length is given for
|
||||
print_custring_bylen().
|
||||
|
||||
Arguments:
|
||||
f file to write to
|
||||
ptr point to the string
|
||||
len length for print_custring_bylen()
|
||||
len length for print_custring_bylen()
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
|
|
@ -1546,7 +1546,7 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
|
|||
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
|
||||
}
|
||||
|
||||
/* Find the minimum length of subject string. If it can match an empty string,
|
||||
/* Find the minimum length of subject string. If it can match an empty string,
|
||||
the minimum length is already known. */
|
||||
|
||||
if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
|
||||
|
@ -1555,19 +1555,19 @@ if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
|
|||
{
|
||||
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */
|
||||
break; /* Leave minlength unchanged (will be zero) */
|
||||
|
||||
|
||||
case -2:
|
||||
return 2; /* missing capturing bracket */
|
||||
|
||||
|
||||
case -3:
|
||||
return 3; /* unrecognized opcode */
|
||||
|
||||
|
||||
default:
|
||||
if (min > UINT16_MAX) min = UINT16_MAX;
|
||||
re->minlength = min;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -136,7 +136,7 @@ for (p = string; length > 0; p++)
|
|||
register uint32_t ab, d;
|
||||
|
||||
c = *p;
|
||||
length--;
|
||||
length--;
|
||||
|
||||
if (c < 128) continue; /* ASCII character */
|
||||
|
||||
|
@ -329,7 +329,7 @@ PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
|
|||
for (p = string; length > 0; p++)
|
||||
{
|
||||
c = *p;
|
||||
length--;
|
||||
length--;
|
||||
|
||||
if ((c & 0xf800) != 0xd800)
|
||||
{
|
||||
|
|
|
@ -285,7 +285,7 @@ start location rather than being passed as a PCRE2 "starting offset". */
|
|||
|
||||
if ((eflags & REG_STARTEND) != 0)
|
||||
{
|
||||
if (pmatch == NULL) return REG_INVARG;
|
||||
if (pmatch == NULL) return REG_INVARG;
|
||||
so = pmatch[0].rm_so;
|
||||
eo = pmatch[0].rm_eo;
|
||||
}
|
||||
|
|
|
@ -6033,14 +6033,14 @@ if (dat_datctl.replacement[0] != 0)
|
|||
|
||||
if (rc < 0)
|
||||
{
|
||||
PCRE2_SIZE msize;
|
||||
PCRE2_SIZE msize;
|
||||
fprintf(outfile, "Failed: error %d", rc);
|
||||
if (rc != PCRE2_ERROR_NOMEMORY && nsize != PCRE2_UNSET)
|
||||
fprintf(outfile, " at offset %ld in replacement", (long int)nsize);
|
||||
fprintf(outfile, ": ");
|
||||
PCRE2_GET_ERROR_MESSAGE(msize, rc, pbuffer);
|
||||
PCHARSV(CASTVAR(void *, pbuffer), 0, msize, FALSE, outfile);
|
||||
if (rc == PCRE2_ERROR_NOMEMORY &&
|
||||
if (rc == PCRE2_ERROR_NOMEMORY &&
|
||||
(xoptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)
|
||||
fprintf(outfile, ": %ld code units are needed", (long int)nsize);
|
||||
}
|
||||
|
@ -6405,7 +6405,7 @@ else for (gmatched = 0;; gmatched++)
|
|||
TESTFLD(match_data, mark, !=, NULL))
|
||||
{
|
||||
fprintf(outfile, ", mark=");
|
||||
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf,
|
||||
PCHARS(rubriclength, CASTFLD(void *, match_data, mark), 0, -1, utf,
|
||||
outfile);
|
||||
rubriclength += 7;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue