More file tidies for 10.33-RC1

This commit is contained in:
Philip.Hazel 2019-03-04 18:07:04 +00:00
parent 02ff543f9c
commit 7375089fa5
48 changed files with 394 additions and 383 deletions

View File

@ -88,7 +88,7 @@
PROJECT(PCRE2 C) PROJECT(PCRE2 C)
# Increased minimum to 2.8.0 to support newer add_test features. # Increased minimum to 2.8.0 to support newer add_test features.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0) CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0)
# Set policy CMP0026 to avoid warnings for the use of LOCATION in # Set policy CMP0026 to avoid warnings for the use of LOCATION in
@ -324,7 +324,7 @@ ENDIF(PCRE2_SUPPORT_VALGRIND)
IF(PCRE2_DISABLE_PERCENT_ZT) IF(PCRE2_DISABLE_PERCENT_ZT)
SET(DISABLE_PERCENT_ZT 1) SET(DISABLE_PERCENT_ZT 1)
ENDIF(PCRE2_DISABLE_PERCENT_ZT) ENDIF(PCRE2_DISABLE_PERCENT_ZT)
# This next one used to reference ${READLINE_LIBRARY}) # This next one used to reference ${READLINE_LIBRARY})
# but I was advised to add the NCURSES test as well, along with # but I was advised to add the NCURSES test as well, along with
@ -459,7 +459,7 @@ SET(PCRE2_SOURCES
src/pcre2_newline.c src/pcre2_newline.c
src/pcre2_ord2utf.c src/pcre2_ord2utf.c
src/pcre2_pattern_info.c src/pcre2_pattern_info.c
src/pcre2_script_run.c src/pcre2_script_run.c
src/pcre2_serialize.c src/pcre2_serialize.c
src/pcre2_string_utils.c src/pcre2_string_utils.c
src/pcre2_study.c src/pcre2_study.c
@ -651,10 +651,10 @@ IF(PCRE2_BUILD_TESTS)
# exes in Debug location tested by the RunTest and RunGrepTest shell scripts # exes in Debug location tested by the RunTest and RunGrepTest shell scripts
# via "make test" # via "make test"
# The commented out code below provokes a warning about future removal # The commented out code below provokes a warning about future removal
# of the facility, and requires policy CMP0026 to be set to "OLD". I have # of the facility, and requires policy CMP0026 to be set to "OLD". I have
# got fed-up with the warnings, but my plea for help on the mailing list # got fed-up with the warnings, but my plea for help on the mailing list
# produced no response. So, I've hacked. The new code below seems to work on # produced no response. So, I've hacked. The new code below seems to work on
# Linux. # Linux.
@ -857,9 +857,9 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Support Valgrind .................: ${PCRE2_SUPPORT_VALGRIND}") MESSAGE(STATUS " Support Valgrind .................: ${PCRE2_SUPPORT_VALGRIND}")
IF(PCRE2_DISABLE_PERCENT_ZT) IF(PCRE2_DISABLE_PERCENT_ZT)
MESSAGE(STATUS " Use %zu and %td ..................: OFF" ) MESSAGE(STATUS " Use %zu and %td ..................: OFF" )
ELSE(PCRE2_DISABLE_PERCENT_ZT) ELSE(PCRE2_DISABLE_PERCENT_ZT)
MESSAGE(STATUS " Use %zu and %td ..................: AUTO" ) MESSAGE(STATUS " Use %zu and %td ..................: AUTO" )
ENDIF(PCRE2_DISABLE_PERCENT_ZT) ENDIF(PCRE2_DISABLE_PERCENT_ZT)
IF(MINGW AND NOT PCRE2_STATIC) IF(MINGW AND NOT PCRE2_STATIC)
MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}") MESSAGE(STATUS " Non-standard dll names (prefix) . : ${NON_STANDARD_LIB_PREFIX}")

View File

@ -14,11 +14,11 @@ a greater than 1 fixed quantifier. This issue was found by Yunho Kim.
3. Added support for callouts from pcre2_substitute(). 3. Added support for callouts from pcre2_substitute().
4. The POSIX functions are now all called pcre2_regcomp() etc., with wrapper 4. The POSIX functions are now all called pcre2_regcomp() etc., with wrapper
functions that use the standard POSIX names. However, in pcre2posix.h the POSIX functions that use the standard POSIX names. However, in pcre2posix.h the POSIX
names are defined as macros. This should help avoid linking with the wrong names are defined as macros. This should help avoid linking with the wrong
library in some environments while still exporting the POSIX names for library in some environments while still exporting the POSIX names for
pre-existing programs that use them. (The Debian alternative names are also pre-existing programs that use them. (The Debian alternative names are also
defined as macros, but not documented.) defined as macros, but not documented.)
5. Fix an xclass matching issue in JIT. 5. Fix an xclass matching issue in JIT.
@ -33,29 +33,29 @@ new "is lower case letter" bit. At the same time, the now unused "is
hexadecimal digit" bit was removed. The default tables in hexadecimal digit" bit was removed. The default tables in
src/pcre2_chartables.c.dist are updated. src/pcre2_chartables.c.dist are updated.
8. Implement the new Perl "script run" features (*script_run:...) and 8. Implement the new Perl "script run" features (*script_run:...) and
(*atomic_script_run:...) aka (*sr:...) and (*asr:...). (*atomic_script_run:...) aka (*sr:...) and (*asr:...).
9. Fixed two typos in change 22 for 10.21, which added special handling for 9. Fixed two typos in change 22 for 10.21, which added special handling for
ranges such as a-z in EBCDIC environments. The original code probably never ranges such as a-z in EBCDIC environments. The original code probably never
worked, though there were no bug reports. worked, though there were no bug reports.
10. Implement PCRE2_COPY_MATCHED_SUBJECT for pcre2_match() (including JIT via 10. Implement PCRE2_COPY_MATCHED_SUBJECT for pcre2_match() (including JIT via
pcre2_match()) and pcre2_dfa_match(), but *not* the pcre2_jit_match() fast pcre2_match()) and pcre2_dfa_match(), but *not* the pcre2_jit_match() fast
path. Also, when a match fails, set the subject field in the match data to NULL path. Also, when a match fails, set the subject field in the match data to NULL
for tidiness - none of the substring extractors should reference this after for tidiness - none of the substring extractors should reference this after
match failure. match failure.
11. If a pattern started with a subroutine call that had a quantifier with a 11. If a pattern started with a subroutine call that had a quantifier with a
minimum of zero, an incorrect "match must start with this character" could be minimum of zero, an incorrect "match must start with this character" could be
recorded. Example: /(?&xxx)*ABC(?<xxx>XYZ)/ would (incorrectly) expect 'A' to recorded. Example: /(?&xxx)*ABC(?<xxx>XYZ)/ would (incorrectly) expect 'A' to
be the first character of a match. be the first character of a match.
12. The heap limit checking code in pcre2_dfa_match() could suffer from 12. The heap limit checking code in pcre2_dfa_match() could suffer from
overflow if the heap limit was set very large. This could cause incorrect "heap overflow if the heap limit was set very large. This could cause incorrect "heap
limit exceeded" errors. limit exceeded" errors.
13. Add "kibibytes" to the heap limit output from pcre2test -C to make the 13. Add "kibibytes" to the heap limit output from pcre2test -C to make the
units clear. units clear.
14. Add a call to pcre2_jit_free_unused_memory() in pcre2grep, for tidiness. 14. Add a call to pcre2_jit_free_unused_memory() in pcre2grep, for tidiness.
@ -71,33 +71,33 @@ inttypes.h. This supports environments that do not have stdint.h but do have
inttypes.h, which are known to exist. A note in the autotools documentation inttypes.h, which are known to exist. A note in the autotools documentation
says (November 2018) that there are none known that are the other way round. says (November 2018) that there are none known that are the other way round.
17. Added --disable-percent-zt to "configure" (and equivalent to CMake) to 17. Added --disable-percent-zt to "configure" (and equivalent to CMake) to
forcibly disable the use of %zu and %td in formatting strings because there is forcibly disable the use of %zu and %td in formatting strings because there is
at least one version of VMS that claims to be C99 but does not support these at least one version of VMS that claims to be C99 but does not support these
modifiers. modifiers.
18. Added --disable-pcre2grep-callout-fork, which restricts the callout support 18. Added --disable-pcre2grep-callout-fork, which restricts the callout support
in pcre2grep to the inbuilt echo facility. This may be useful in environments in pcre2grep to the inbuilt echo facility. This may be useful in environments
that do not support fork(). that do not support fork().
19. Fix two instances of <= 0 being applied to unsigned integers (the VMS 19. Fix two instances of <= 0 being applied to unsigned integers (the VMS
compiler complains). compiler complains).
20. Added "fork" support for VMS to pcre2grep, for running an external program 20. Added "fork" support for VMS to pcre2grep, for running an external program
via a string callout. via a string callout.
21. Improve MAP_JIT flag usage on MacOS. Patch by Rich Siegel. 21. Improve MAP_JIT flag usage on MacOS. Patch by Rich Siegel.
22. If a pattern started with (*MARK), (*COMMIT), (*PRUNE), (*SKIP), or (*THEN) 22. If a pattern started with (*MARK), (*COMMIT), (*PRUNE), (*SKIP), or (*THEN)
followed by ^ it was not recognized as anchored. followed by ^ it was not recognized as anchored.
23. The RunGrepTest script used to cut out the test of NUL characters for 23. The RunGrepTest script used to cut out the test of NUL characters for
Solaris and MacOS as printf and sed can't handle them. It seems that the *BSD Solaris and MacOS as printf and sed can't handle them. It seems that the *BSD
systems can't either. I've inverted the test so that only those OS that are systems can't either. I've inverted the test so that only those OS that are
known to work (currently only Linux) try to run this test. known to work (currently only Linux) try to run this test.
24. Some tests in RunGrepTest appended to testtrygrep from two different file 24. Some tests in RunGrepTest appended to testtrygrep from two different file
descriptors instead of redirecting stderr to stdout. This worked on Linux, but descriptors instead of redirecting stderr to stdout. This worked on Linux, but
it was reported not to on other systems, causing the tests to fail. it was reported not to on other systems, causing the tests to fail.
25. In the RunTest script, make the test for stack setting use the same value 25. In the RunTest script, make the test for stack setting use the same value
@ -105,27 +105,27 @@ for the stack as it needs for -bigstack.
26. Insert a cast in pcre2_dfa_match.c to suppress a compiler warning. 26. Insert a cast in pcre2_dfa_match.c to suppress a compiler warning.
26. With PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL set, escape sequences such as \s 26. With PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL set, escape sequences such as \s
which are valid in character classes, but not as the end of ranges, were being which are valid in character classes, but not as the end of ranges, were being
treated as literals. An example is [_-\s] (but not [\s-_] because that gave an treated as literals. An example is [_-\s] (but not [\s-_] because that gave an
error at the *start* of a range). Now an "invalid range" error is given error at the *start* of a range). Now an "invalid range" error is given
independently of PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. independently of PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL.
27. Related to 26 above, PCRE2_BAD_ESCAPE_IS_LITERAL was affecting known escape 27. Related to 26 above, PCRE2_BAD_ESCAPE_IS_LITERAL was affecting known escape
sequences such as \eX when they appeared invalidly in a character class. Now sequences such as \eX when they appeared invalidly in a character class. Now
the option applies only to unrecognized or malformed escape sequences. the option applies only to unrecognized or malformed escape sequences.
28. Fix word boundary in JIT compiler. Patch by Mike Munday. 28. Fix word boundary in JIT compiler. Patch by Mike Munday.
29. The pcre2_dfa_match() function was incorrectly handling conditional version 29. The pcre2_dfa_match() function was incorrectly handling conditional version
tests such as (?(VERSION>=0)...) when the version test was true. Incorrect tests such as (?(VERSION>=0)...) when the version test was true. Incorrect
processing or a crash could result. processing or a crash could result.
30. When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in group 30. When PCRE2_UTF is set, allow non-ASCII letters and decimal digits in group
names, as Perl does. There was a small bug in this new code, found by names, as Perl does. There was a small bug in this new code, found by
ClusterFuzz 12950, fixed before release. ClusterFuzz 12950, fixed before release.
31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh} 31. Implemented PCRE2_EXTRA_ALT_BSUX to support ECMAScript 6's \u{hhh}
construct. construct.
32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits 32. Compile \p{Any} to be the same as . in DOTALL mode, so that it benefits
@ -133,15 +133,15 @@ from auto-anchoring if \p{Any}* starts a pattern.
33. Compile invalid UTF check in JIT test when only pcre32 is enabled. 33. Compile invalid UTF check in JIT test when only pcre32 is enabled.
34. For some time now, CMake has been warning about the setting of policy 34. For some time now, CMake has been warning about the setting of policy
CMP0026 to "OLD" in CMakeLists.txt, and hinting that the feature might be CMP0026 to "OLD" in CMakeLists.txt, and hinting that the feature might be
removed in a future version. A request for CMake expertise on the list produced removed in a future version. A request for CMake expertise on the list produced
no result, so I have now hacked CMakeLists.txt along the lines of some changes no result, so I have now hacked CMakeLists.txt along the lines of some changes
I found on the Internet. The new code no longer needs the policy setting, and I found on the Internet. The new code no longer needs the policy setting, and
it appears to work fine on Linux. it appears to work fine on Linux.
35. Setting --enable-jit=auto for an out-of-tree build failed because the 35. Setting --enable-jit=auto for an out-of-tree build failed because the
source directory wasn't always in the search path for AC_TRY_COMPILE. Patch source directory wasn't always in the search path for AC_TRY_COMPILE. Patch
from Ross Burton. from Ross Burton.

2
NEWS
View File

@ -5,7 +5,7 @@ News about PCRE2 releases
Version 10.33-RC1 03-March-2019 Version 10.33-RC1 03-March-2019
------------------------------- -------------------------------
Yet more bugfixes, tidies, and a few enhancements, summarized here (see Yet more bugfixes, tidies, and a few enhancements, summarized here (see
ChangeLog for the full list): ChangeLog for the full list):
1. Callouts from pcre2_substitute() are now available. 1. Callouts from pcre2_substitute() are now available.

View File

@ -47,7 +47,7 @@ can skip ahead to the CMake section.
environment. In particular, you can alter the definition of the NEWLINE environment. In particular, you can alter the definition of the NEWLINE
macro to specify what character(s) you want to be interpreted as line macro to specify what character(s) you want to be interpreted as line
terminators by default. terminators by default.
When you subsequently compile any of the PCRE2 modules, you must specify When you subsequently compile any of the PCRE2 modules, you must specify
-DHAVE_CONFIG_H to your compiler so that src/config.h is included in the -DHAVE_CONFIG_H to your compiler so that src/config.h is included in the
sources. sources.
@ -61,7 +61,7 @@ can skip ahead to the CMake section.
configure/make world, this is handled automatically.) When upgrading to a configure/make world, this is handled automatically.) When upgrading to a
new release, you are strongly advised to review src/config.h.generic new release, you are strongly advised to review src/config.h.generic
before re-using what you had previously. before re-using what you had previously.
Note also that the src/config.h.generic file is created from a config.h Note also that the src/config.h.generic file is created from a config.h
that was generated by Autotools, which automatically includes settings of that was generated by Autotools, which automatically includes settings of
a number of macros that are not actually used by PCRE2 (for example, a number of macros that are not actually used by PCRE2 (for example,
@ -109,7 +109,7 @@ can skip ahead to the CMake section.
pcre2_newline.c pcre2_newline.c
pcre2_ord2utf.c pcre2_ord2utf.c
pcre2_pattern_info.c pcre2_pattern_info.c
pcre2_script_run.c pcre2_script_run.c
pcre2_serialize.c pcre2_serialize.c
pcre2_string_utils.c pcre2_string_utils.c
pcre2_study.c pcre2_study.c

24
README
View File

@ -53,7 +53,7 @@ The header file for the POSIX-style functions is called pcre2posix.h. The
official POSIX name is regex.h, but I did not want to risk possible problems official POSIX name is regex.h, but I did not want to risk possible problems
with existing files of that name by distributing it that way. To use PCRE2 with with existing files of that name by distributing it that way. To use PCRE2 with
an existing program that uses the POSIX API, pcre2posix.h will have to be an existing program that uses the POSIX API, pcre2posix.h will have to be
renamed or pointed at by a link (or the program modified, of course). See the renamed or pointed at by a link (or the program modified, of course). See the
pcre2posix documentation for more details. pcre2posix documentation for more details.
@ -311,10 +311,10 @@ library. They are also documented in the pcre2build man page.
. There is support for calling external programs during matching in the . There is support for calling external programs during matching in the
pcre2grep command, using PCRE2's callout facility with string arguments. This pcre2grep command, using PCRE2's callout facility with string arguments. This
support can be disabled by adding --disable-pcre2grep-callout to the support can be disabled by adding --disable-pcre2grep-callout to the
"configure" command. There are two kinds of callout: one that generates "configure" command. There are two kinds of callout: one that generates
output from inbuilt code, and another that calls an external program. The output from inbuilt code, and another that calls an external program. The
latter has special support for Windows and VMS; otherwise it assumes the latter has special support for Windows and VMS; otherwise it assumes the
existence of the fork() function. This facility can be disabled by adding existence of the fork() function. This facility can be disabled by adding
--disable-pcre2grep-callout-fork to the "configure" command. --disable-pcre2grep-callout-fork to the "configure" command.
. The pcre2grep program currently supports only 8-bit data files, and so . The pcre2grep program currently supports only 8-bit data files, and so
@ -344,7 +344,7 @@ library. They are also documented in the pcre2build man page.
The default is either 1048576 or the value of --with-pcre2grep-bufsize, The default is either 1048576 or the value of --with-pcre2grep-bufsize,
whichever is the larger. whichever is the larger.
. It is possible to compile pcre2test so that it links with the libreadline . It is possible to compile pcre2test so that it links with the libreadline
or libedit libraries, by specifying, respectively, or libedit libraries, by specifying, respectively,
@ -367,14 +367,14 @@ library. They are also documented in the pcre2build man page.
If you get error messages about missing functions tgetstr, tgetent, tputs, If you get error messages about missing functions tgetstr, tgetent, tputs,
tgetflag, or tgoto, this is the problem, and linking with the ncurses library tgetflag, or tgoto, this is the problem, and linking with the ncurses library
should fix it. should fix it.
. The C99 standard defines formatting modifiers z and t for size_t and . The C99 standard defines formatting modifiers z and t for size_t and
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
environments other than Microsoft Visual Studio when __STDC_VERSION__ is environments other than Microsoft Visual Studio when __STDC_VERSION__ is
defined and has a value greater than or equal to 199901L (indicating C99). defined and has a value greater than or equal to 199901L (indicating C99).
However, there is at least one environment that claims to be C99 but does not However, there is at least one environment that claims to be C99 but does not
support these modifiers. If --disable-percent-zt is specified, no use is made support these modifiers. If --disable-percent-zt is specified, no use is made
of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for
size_t values. size_t values.
. There is a special option called --enable-fuzz-support for use by people who . There is a special option called --enable-fuzz-support for use by people who
@ -790,7 +790,7 @@ The distribution should contain the files listed below.
src/pcre2_newline.c ) src/pcre2_newline.c )
src/pcre2_ord2utf.c ) src/pcre2_ord2utf.c )
src/pcre2_pattern_info.c ) src/pcre2_pattern_info.c )
src/pcre2_script_run.c ) src/pcre2_script_run.c )
src/pcre2_serialize.c ) src/pcre2_serialize.c )
src/pcre2_string_utils.c ) src/pcre2_string_utils.c )
src/pcre2_study.c ) src/pcre2_study.c )

View File

@ -753,7 +753,7 @@ if $valgrind $vjs $pcre2grep --help | $valgrind $vjs $pcre2grep -q 'callout scri
else else
$cf $srcdir/testdata/grepoutputC testtrygrep $cf $srcdir/testdata/grepoutputC testtrygrep
fi fi
if [ $? != 0 ] ; then exit 1; fi if [ $? != 0 ] ; then exit 1; fi
else else
echo "Script callouts are not supported" echo "Script callouts are not supported"

View File

@ -147,14 +147,14 @@ AC_ARG_ENABLE(jit,
if test "$enable_jit" = "auto"; then if test "$enable_jit" = "auto"; then
AC_LANG(C) AC_LANG(C)
SAVE_CPPFLAGS=$CPPFLAGS SAVE_CPPFLAGS=$CPPFLAGS
CPPFLAGS=-I$srcdir CPPFLAGS=-I$srcdir
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#define SLJIT_CONFIG_AUTO 1 #define SLJIT_CONFIG_AUTO 1
#include "src/sljit/sljitConfigInternal.h" #include "src/sljit/sljitConfigInternal.h"
#if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED) #if (defined SLJIT_CONFIG_UNSUPPORTED && SLJIT_CONFIG_UNSUPPORTED)
#error unsupported #error unsupported
#endif]])], enable_jit=yes, enable_jit=no) #endif]])], enable_jit=yes, enable_jit=no)
CPPFLAGS=$SAVE_CPPFLAGS CPPFLAGS=$SAVE_CPPFLAGS
echo checking for JIT support on this hardware... $enable_jit echo checking for JIT support on this hardware... $enable_jit
fi fi
@ -607,7 +607,7 @@ if test "$enable_percent_zt" = "no"; then
Define to any value to disable the use of the z and t modifiers in Define to any value to disable the use of the z and t modifiers in
formatting settings such as %zu or %td (this is rarely needed).]) formatting settings such as %zu or %td (this is rarely needed).])
else else
enable_percent_zt=auto enable_percent_zt=auto
fi fi
# Unless running under Windows, JIT support requires pthreads. # Unless running under Windows, JIT support requires pthreads.
@ -647,13 +647,13 @@ if test "$enable_pcre2grep_callout" = "yes"; then
fi fi
AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT_FORK], [], [ AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT_FORK], [], [
Define to any value to enable fork support in pcre2grep callout scripts. Define to any value to enable fork support in pcre2grep callout scripts.
This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also This will have no effect unless SUPPORT_PCRE2GREP_CALLOUT is also
defined.]) defined.])
fi fi
AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT], [], [ AC_DEFINE([SUPPORT_PCRE2GREP_CALLOUT], [], [
Define to any value to enable callout script support in pcre2grep.]) Define to any value to enable callout script support in pcre2grep.])
else else
enable_pcre2grep_callout_fork="no" enable_pcre2grep_callout_fork="no"
fi fi
if test "$enable_unicode" = "yes"; then if test "$enable_unicode" = "yes"; then
@ -1055,7 +1055,7 @@ $PACKAGE-$VERSION configuration summary:
Build static libs .................. : ${enable_static} Build static libs .................. : ${enable_static}
Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit} Use JIT in pcre2grep ............... : ${enable_pcre2grep_jit}
Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout} Enable callouts in pcre2grep ....... : ${enable_pcre2grep_callout}
Enable fork in pcre2grep callouts .. : ${enable_pcre2grep_callout_fork} Enable fork in pcre2grep callouts .. : ${enable_pcre2grep_callout_fork}
Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize} Initial buffer size for pcre2grep .. : ${with_pcre2grep_bufsize}
Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize} Maximum buffer size for pcre2grep .. : ${with_pcre2grep_max_bufsize}
Link pcre2grep with libz ........... : ${enable_pcre2grep_libz} Link pcre2grep with libz ........... : ${enable_pcre2grep_libz}

View File

@ -47,7 +47,7 @@ can skip ahead to the CMake section.
environment. In particular, you can alter the definition of the NEWLINE environment. In particular, you can alter the definition of the NEWLINE
macro to specify what character(s) you want to be interpreted as line macro to specify what character(s) you want to be interpreted as line
terminators by default. terminators by default.
When you subsequently compile any of the PCRE2 modules, you must specify When you subsequently compile any of the PCRE2 modules, you must specify
-DHAVE_CONFIG_H to your compiler so that src/config.h is included in the -DHAVE_CONFIG_H to your compiler so that src/config.h is included in the
sources. sources.
@ -61,7 +61,7 @@ can skip ahead to the CMake section.
configure/make world, this is handled automatically.) When upgrading to a configure/make world, this is handled automatically.) When upgrading to a
new release, you are strongly advised to review src/config.h.generic new release, you are strongly advised to review src/config.h.generic
before re-using what you had previously. before re-using what you had previously.
Note also that the src/config.h.generic file is created from a config.h Note also that the src/config.h.generic file is created from a config.h
that was generated by Autotools, which automatically includes settings of that was generated by Autotools, which automatically includes settings of
a number of macros that are not actually used by PCRE2 (for example, a number of macros that are not actually used by PCRE2 (for example,
@ -109,7 +109,7 @@ can skip ahead to the CMake section.
pcre2_newline.c pcre2_newline.c
pcre2_ord2utf.c pcre2_ord2utf.c
pcre2_pattern_info.c pcre2_pattern_info.c
pcre2_script_run.c pcre2_script_run.c
pcre2_serialize.c pcre2_serialize.c
pcre2_string_utils.c pcre2_string_utils.c
pcre2_study.c pcre2_study.c

View File

@ -53,7 +53,7 @@ The header file for the POSIX-style functions is called pcre2posix.h. The
official POSIX name is regex.h, but I did not want to risk possible problems official POSIX name is regex.h, but I did not want to risk possible problems
with existing files of that name by distributing it that way. To use PCRE2 with with existing files of that name by distributing it that way. To use PCRE2 with
an existing program that uses the POSIX API, pcre2posix.h will have to be an existing program that uses the POSIX API, pcre2posix.h will have to be
renamed or pointed at by a link (or the program modified, of course). See the renamed or pointed at by a link (or the program modified, of course). See the
pcre2posix documentation for more details. pcre2posix documentation for more details.
@ -311,7 +311,11 @@ library. They are also documented in the pcre2build man page.
. There is support for calling external programs during matching in the . There is support for calling external programs during matching in the
pcre2grep command, using PCRE2's callout facility with string arguments. This pcre2grep command, using PCRE2's callout facility with string arguments. This
support can be disabled by adding --disable-pcre2grep-callout to the support can be disabled by adding --disable-pcre2grep-callout to the
"configure" command. "configure" command. There are two kinds of callout: one that generates
output from inbuilt code, and another that calls an external program. The
latter has special support for Windows and VMS; otherwise it assumes the
existence of the fork() function. This facility can be disabled by adding
--disable-pcre2grep-callout-fork to the "configure" command.
. The pcre2grep program currently supports only 8-bit data files, and so . The pcre2grep program currently supports only 8-bit data files, and so
requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use
@ -363,14 +367,14 @@ library. They are also documented in the pcre2build man page.
If you get error messages about missing functions tgetstr, tgetent, tputs, If you get error messages about missing functions tgetstr, tgetent, tputs,
tgetflag, or tgoto, this is the problem, and linking with the ncurses library tgetflag, or tgoto, this is the problem, and linking with the ncurses library
should fix it. should fix it.
. The C99 standard defines formatting modifiers z and t for size_t and . The C99 standard defines formatting modifiers z and t for size_t and
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
environments other than Microsoft Visual Studio when __STDC_VERSION__ is environments other than Microsoft Visual Studio when __STDC_VERSION__ is
defined and has a value greater than or equal to 199901L (indicating C99). defined and has a value greater than or equal to 199901L (indicating C99).
However, there is at least one environment that claims to be C99 but does not However, there is at least one environment that claims to be C99 but does not
support these modifiers. If --disable-percent-zt is specified, no use is made support these modifiers. If --disable-percent-zt is specified, no use is made
of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for of the z or t modifiers. Instead of %td or %zu, %lu is used, with a cast for
size_t values. size_t values.
. There is a special option called --enable-fuzz-support for use by people who . There is a special option called --enable-fuzz-support for use by people who
@ -786,7 +790,7 @@ The distribution should contain the files listed below.
src/pcre2_newline.c ) src/pcre2_newline.c )
src/pcre2_ord2utf.c ) src/pcre2_ord2utf.c )
src/pcre2_pattern_info.c ) src/pcre2_pattern_info.c )
src/pcre2_script_run.c ) src/pcre2_script_run.c )
src/pcre2_serialize.c ) src/pcre2_serialize.c )
src/pcre2_string_utils.c ) src/pcre2_string_utils.c )
src/pcre2_study.c ) src/pcre2_study.c )
@ -886,4 +890,4 @@ The distribution should contain the files listed below.
Philip Hazel Philip Hazel
Email local part: ph10 Email local part: ph10
Email domain: cam.ac.uk Email domain: cam.ac.uk
Last updated: 29 January 2019 Last updated: 03 March 2019

View File

@ -52,7 +52,7 @@ characters. The options are:
<pre> <pre>
PCRE2_ANCHORED Match only at the first position PCRE2_ANCHORED Match only at the first position
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
On success, make a private subject copy On success, make a private subject copy
PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_ENDANCHORED Pattern can match only at end of subject
PCRE2_NOTBOL Subject is not the beginning of a line PCRE2_NOTBOL Subject is not the beginning of a line
PCRE2_NOTEOL Subject is not the end of a line PCRE2_NOTEOL Subject is not the end of a line

View File

@ -61,7 +61,7 @@ terminated by a binary zero code unit. The options are:
<pre> <pre>
PCRE2_ANCHORED Match only at the first position PCRE2_ANCHORED Match only at the first position
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
On success, make a private subject copy On success, make a private subject copy
PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_ENDANCHORED Pattern can match only at end of subject
PCRE2_NOTBOL Subject string is not the beginning of a line PCRE2_NOTBOL Subject string is not the beginning of a line
PCRE2_NOTEOL Subject string is not the end of a line PCRE2_NOTEOL Subject string is not the end of a line

View File

@ -31,7 +31,7 @@ using the memory freeing function from the general context or compiled pattern
with which it was created, or <b>free()</b> if that was not set. with which it was created, or <b>free()</b> if that was not set.
</P> </P>
<P> <P>
If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this
match data block, the copy of the subject that was remembered with the block is match data block, the copy of the subject that was remembered with the block is
also freed. also freed.
</P> </P>

View File

@ -31,7 +31,7 @@ housed in a compile context. It completely replaces all the bits. The extra
options are: options are:
<pre> <pre>
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines

View File

@ -1309,7 +1309,7 @@ be referenced by the substring extraction functions after a successful match.
After running a match, you must not free a compiled pattern or a subject string After running a match, you must not free a compiled pattern or a subject string
until after all operations on the until after all operations on the
<a href="#matchdatablock">match data block</a> <a href="#matchdatablock">match data block</a>
have taken place, unless, in the case of the subject string, you have used the have taken place, unless, in the case of the subject string, you have used the
PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled
"Option bits for <b>pcre2_match()</b>" "Option bits for <b>pcre2_match()</b>"
<a href="#matchoptions">below.</a> <a href="#matchoptions">below.</a>
@ -1437,8 +1437,8 @@ binary zero character followed by z).
ECMAscript 6 added additional functionality to \u. This can be accessed using ECMAscript 6 added additional functionality to \u. This can be accessed using
the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options" the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options"
<a href="#extracompileoptions">below).</a> <a href="#extracompileoptions">below).</a>
Note that this alternative escape handling applies only to patterns. Neither of Note that this alternative escape handling applies only to patterns. Neither of
these options affects the processing of replacement strings passed to these options affects the processing of replacement strings passed to
<b>pcre2_substitute()</b>. <b>pcre2_substitute()</b>.
<pre> <pre>
PCRE2_ALT_CIRCUMFLEX PCRE2_ALT_CIRCUMFLEX
@ -1875,10 +1875,10 @@ characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
<pre> <pre>
PCRE2_EXTRA_ALT_BSUX PCRE2_EXTRA_ALT_BSUX
</pre> </pre>
The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and \x in The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, and \x in
the way that ECMAscript (aka JavaScript) does. Additional functionality was the way that ECMAscript (aka JavaScript) does. Additional functionality was
defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of defined by ECMAscript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of
PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal PCRE2_ALT_BSUX, but in addition it recognizes \u{hhh..} as a hexadecimal
character code, where hhh.. is any number of hexadecimal digits. character code, where hhh.. is any number of hexadecimal digits.
<pre> <pre>
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
@ -1896,7 +1896,7 @@ If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to
<b>pcre2_compile()</b>, all unrecognized or malformed escape sequences are <b>pcre2_compile()</b>, all unrecognized or malformed escape sequences are
treated as single-character escapes. For example, \j is a literal "j" and treated as single-character escapes. For example, \j is a literal "j" and
\x{2z} is treated as the literal string "x{2z}". Setting this option means \x{2z} is treated as the literal string "x{2z}". Setting this option means
that typos in patterns may go undetected and have unexpected results. Also note that typos in patterns may go undetected and have unexpected results. Also note
that a sequence such as [\N{] is interpreted as a malformed attempt at that a sequence such as [\N{] is interpreted as a malformed attempt at
[\N{...}] and so is treated as [N{] whereas [\N] gives an error because an [\N{...}] and so is treated as [N{] whereas [\N] gives an error because an
unqualified \N is a valid escape sequence but is not supported in a character unqualified \N is a valid escape sequence but is not supported in a character
@ -1904,9 +1904,9 @@ class. To reiterate: this is a dangerous option. Use with great care.
<pre> <pre>
PCRE2_EXTRA_ESCAPED_CR_IS_LF PCRE2_EXTRA_ESCAPED_CR_IS_LF
</pre> </pre>
There are some legacy applications where the escape sequence \r in a pattern There are some legacy applications where the escape sequence \r in a pattern
is expected to match a newline. If this option is set, \r in a pattern is is expected to match a newline. If this option is set, \r in a pattern is
converted to \n so that it matches a LF (linefeed) instead of a CR (carriage converted to \n so that it matches a LF (linefeed) instead of a CR (carriage
return) character. The option does not affect a literal CR in the pattern, nor return) character. The option does not affect a literal CR in the pattern, nor
does it affect CR specified as an explicit code point such as \x{0D}. does it affect CR specified as an explicit code point such as \x{0D}.
<pre> <pre>
@ -2564,7 +2564,7 @@ Option bits for <b>pcre2_match()</b>
</b><br> </b><br>
<P> <P>
The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be The unused bits of the <i>options</i> argument for <b>pcre2_match()</b> must be
zero. The only bits that may be set are PCRE2_ANCHORED, zero. The only bits that may be set are PCRE2_ANCHORED,
PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL,
PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK,
PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
@ -2585,8 +2585,8 @@ matching.
<pre> <pre>
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
</pre> </pre>
By default, a pointer to the subject is remembered in the match data block so By default, a pointer to the subject is remembered in the match data block so
that, after a successful match, it can be referenced by the substring that, after a successful match, it can be referenced by the substring
extraction functions. This means that the subject's memory must not be freed extraction functions. This means that the subject's memory must not be freed
until all such operations are complete. For some applications where the until all such operations are complete. For some applications where the
lifetime of the subject string is not guaranteed, it may be necessary to make a lifetime of the subject string is not guaranteed, it may be necessary to make a
@ -2866,8 +2866,8 @@ undefined.
<P> <P>
After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function
<b>pcre2_get_mark()</b> can be called to access this name, which can be <b>pcre2_get_mark()</b> can be called to access this name, which can be
specified in the pattern by any of the backtracking control verbs, not just specified in the pattern by any of the backtracking control verbs, not just
(*MARK). The same function applies to all the verbs. It returns a pointer to (*MARK). The same function applies to all the verbs. It returns a pointer to
the zero-terminated name, which is within the compiled pattern. If no name is the zero-terminated name, which is within the compiled pattern. If no name is
available, NULL is returned. The length of the name (excluding the terminating available, NULL is returned. The length of the name (excluding the terminating
@ -3002,7 +3002,7 @@ The backtracking match limit was reached.
If a pattern contains many nested backtracking points, heap memory is used to If a pattern contains many nested backtracking points, heap memory is used to
remember them. This error is given when the memory allocation function (default remember them. This error is given when the memory allocation function (default
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
<pre> <pre>
PCRE2_ERROR_NULL PCRE2_ERROR_NULL
@ -3405,7 +3405,7 @@ capture groups and letters within \Q...\E quoted sequences.
<P> <P>
Note that case forcing sequences such as \U...\E do not nest. For example, Note that case forcing sequences such as \U...\E do not nest. For example,
the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no
effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do
not apply to replacement strings. not apply to replacement strings.
</P> </P>
<P> <P>
@ -3439,7 +3439,7 @@ substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
groups in the extended syntax forms to be treated as unset. groups in the extended syntax forms to be treated as unset.
</P> </P>
<P> <P>
If successful, <b>pcre2_substitute()</b> returns the number of successful If successful, <b>pcre2_substitute()</b> returns the number of successful
matches. This may be zero if no matches were found, and is never greater than 1 matches. This may be zero if no matches were found, and is never greater than 1
unless PCRE2_SUBSTITUTE_GLOBAL is set. unless PCRE2_SUBSTITUTE_GLOBAL is set.
</P> </P>
@ -3489,8 +3489,8 @@ Substitution callouts
<br> <br>
The <b>pcre2_set_substitution_callout()</b> function can be used to specify a The <b>pcre2_set_substitution_callout()</b> function can be used to specify a
callout function for <b>pcre2_substitute()</b>. This information is passed in callout function for <b>pcre2_substitute()</b>. This information is passed in
a match context. The callout function is called after each substitution has a match context. The callout function is called after each substitution has
been processed, but it can cause the replacement not to happen. The callout been processed, but it can cause the replacement not to happen. The callout
function is not called for simulated substitutions that happen as a result of function is not called for simulated substitutions that happen as a result of
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
</P> </P>
@ -3500,10 +3500,10 @@ block structure, which contains the following fields, not necessarily in this
order: order:
<pre> <pre>
uint32_t <i>version</i>; uint32_t <i>version</i>;
uint32_t <i>subscount</i>; uint32_t <i>subscount</i>;
PCRE2_SPTR <i>input</i>; PCRE2_SPTR <i>input</i>;
PCRE2_SPTR <i>output</i>; PCRE2_SPTR <i>output</i>;
PCRE2_SIZE <i>*ovector</i>; PCRE2_SIZE <i>*ovector</i>;
uint32_t <i>oveccount</i>; uint32_t <i>oveccount</i>;
PCRE2_SIZE <i>output_offsets[2]</i>; PCRE2_SIZE <i>output_offsets[2]</i>;
</pre> </pre>
@ -3517,9 +3517,9 @@ first callout, 2 for the second, and so on. The <i>input</i> and <i>output</i>
pointers are copies of the values passed to <b>pcre2_substitute()</b>. pointers are copies of the values passed to <b>pcre2_substitute()</b>.
</P> </P>
<P> <P>
The <i>ovector</i> field points to the ovector, which contains the result of the The <i>ovector</i> field points to the ovector, which contains the result of the
most recent match. The <i>oveccount</i> field contains the number of pairs that most recent match. The <i>oveccount</i> field contains the number of pairs that
are set in the ovector, and is always greater than zero. are set in the ovector, and is always greater than zero.
</P> </P>
<P> <P>
The <i>output_offsets</i> vector contains the offsets of the replacement in the The <i>output_offsets</i> vector contains the offsets of the replacement in the

View File

@ -376,12 +376,15 @@ environment.
</P> </P>
<br><a name="SEC14" href="#TOC1">PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS</a><br> <br><a name="SEC14" href="#TOC1">PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS</a><br>
<P> <P>
By default, on non-Windows systems, <b>pcre2grep</b> supports the use of By default <b>pcre2grep</b> supports the use of callouts with string arguments
callouts with string arguments within the patterns it is matching, in order to within the patterns it is matching. There are two kinds: one that generates
run external scripts. For details, see the output using local code, and another that calls an external program or script.
If --disable-pcre2grep-callout-fork is added to the <b>configure</b> command,
only the first kind of callout is supported; if --disable-pcre2grep-callout is
used, all callouts are completely ignored. For more details of <b>pcre2grep</b>
callouts, see the
<a href="pcre2grep.html"><b>pcre2grep</b></a> <a href="pcre2grep.html"><b>pcre2grep</b></a>
documentation. This support can be disabled by adding documentation.
--disable-pcre2grep-callout to the <b>configure</b> command.
</P> </P>
<br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br> <br><a name="SEC15" href="#TOC1">PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT</a><br>
<P> <P>
@ -526,14 +529,14 @@ documentation.
</P> </P>
<br><a name="SEC21" href="#TOC1">DISABLING THE Z AND T FORMATTING MODIFIERS</a><br> <br><a name="SEC21" href="#TOC1">DISABLING THE Z AND T FORMATTING MODIFIERS</a><br>
<P> <P>
The C99 standard defines formatting modifiers z and t for size_t and The C99 standard defines formatting modifiers z and t for size_t and
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers in
environments other than Microsoft Visual Studio when __STDC_VERSION__ is environments other than Microsoft Visual Studio when __STDC_VERSION__ is
defined and has a value greater than or equal to 199901L (indicating C99). defined and has a value greater than or equal to 199901L (indicating C99).
However, there is at least one environment that claims to be C99 but does not However, there is at least one environment that claims to be C99 but does not
support these modifiers. If support these modifiers. If
<pre> <pre>
--disable-percent-zt --disable-percent-zt
</pre> </pre>
is specified, no use is made of the z or t modifiers. Instead of %td or %zu, is specified, no use is made of the z or t modifiers. Instead of %td or %zu,
%lu is used, with a cast for size_t values. %lu is used, with a cast for size_t values.
@ -589,9 +592,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC26" href="#TOC1">REVISION</a><br> <br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 15 November 2018 Last updated: 03 March 2019
<br> <br>
Copyright &copy; 1997-2018 University of Cambridge. Copyright &copy; 1997-2019 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -48,7 +48,7 @@ When using the <b>pcre2_substitute()</b> function, an additional callout feature
is available. This does a callout after each change to the subject string and is available. This does a callout after each change to the subject string and
is described in the is described in the
<a href="pcre2api.html"><b>pcre2api</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
documentation; the rest of this document is concerned with callouts during documentation; the rest of this document is concerned with callouts during
pattern matching. pattern matching.
</P> </P>
<P> <P>

View File

@ -871,8 +871,8 @@ only callouts with string arguments are useful.
Calling external programs or scripts Calling external programs or scripts
</b><br> </b><br>
<P> <P>
This facility can be independently disabled when <b>pcre2grep</b> is built. It This facility can be independently disabled when <b>pcre2grep</b> is built. It
is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS, is supported for Windows, where a call to <b>_spawnvp()</b> is used, for VMS,
where <b>lib$spawn()</b> is used, and for any other Unix-like environment where where <b>lib$spawn()</b> is used, and for any other Unix-like environment where
<b>fork()</b> and <b>execv()</b> are available. <b>fork()</b> and <b>execv()</b> are available.
</P> </P>

View File

@ -418,13 +418,13 @@ two compile-time options. If PCRE2_ALT_BSUX is set, the sequence \x followed
by { is not recognized. Only if \x is followed by two hexadecimal digits is it by { is not recognized. Only if \x is followed by two hexadecimal digits is it
recognized as a character escape. Otherwise it is interpreted as a literal "x" recognized as a character escape. Otherwise it is interpreted as a literal "x"
character. In this mode, support for code points greater than 256 is provided character. In this mode, support for code points greater than 256 is provided
by \u, which must be followed by four hexadecimal digits; otherwise it is by \u, which must be followed by four hexadecimal digits; otherwise it is
interpreted as a literal "u" character. interpreted as a literal "u" character.
</P> </P>
<P> <P>
PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition, PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition,
\u{hhh..} is recognized as the character specified by hexadecimal code point. \u{hhh..} is recognized as the character specified by hexadecimal code point.
There may be any number of hexadecimal digits. This syntax is from ECMAScript There may be any number of hexadecimal digits. This syntax is from ECMAScript
6. 6.
</P> </P>
<P> <P>
@ -1194,7 +1194,7 @@ character. If any other of these assertions appears in a character class, an
A word boundary is a position in the subject string where the current character A word boundary is a position in the subject string where the current character
and the previous character do not both match \w or \W (i.e. one matches and the previous character do not both match \w or \W (i.e. one matches
\w and the other matches \W), or the start or end of the string if the \w and the other matches \W), or the start or end of the string if the
first or last character matches \w, respectively. When PCRE2 is built with first or last character matches \w, respectively. When PCRE2 is built with
Unicode support, the meanings of \w and \W can be changed by setting the Unicode support, the meanings of \w and \W can be changed by setting the
PCRE2_UCP option. When this is done, it also affects \b and \B. Neither PCRE2 PCRE2_UCP option. When this is done, it also affects \b and \B. Neither PCRE2
nor Perl has a separate "start of word" or "end of word" metasequence. However, nor Perl has a separate "start of word" or "end of word" metasequence. However,

View File

@ -50,13 +50,13 @@ expression 8-bit library. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries. See the and 32-bit libraries. See the
<a href="pcre2api.html"><b>pcre2api</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
documentation for a description of PCRE2's native API, which contains much documentation for a description of PCRE2's native API, which contains much
additional functionality. additional functionality.
</P> </P>
<P> <P>
The functions described here are wrapper functions that ultimately call the The functions described here are wrapper functions that ultimately call the
PCRE2 native API. Their prototypes are defined in the <b>pcre2posix.h</b> header PCRE2 native API. Their prototypes are defined in the <b>pcre2posix.h</b> header
file, and they all have unique names starting with <b>pcre2_</b>. However, the file, and they all have unique names starting with <b>pcre2_</b>. However, the
<b>pcre2posix.h</b> header also contains macro definitions that convert the <b>pcre2posix.h</b> header also contains macro definitions that convert the
standard POSIX names such as <b>regcomp()</b> into <b>pcre2_regcomp()</b> etc. This standard POSIX names such as <b>regcomp()</b> into <b>pcre2_regcomp()</b> etc. This
means that a program can use the usual POSIX names without running the risk of means that a program can use the usual POSIX names without running the risk of
accidentally linking with POSIX functions from a different library. accidentally linking with POSIX functions from a different library.
@ -68,7 +68,7 @@ application. Because the POSIX functions call the native ones, it is also
necessary to add <b>-lpcre2-8</b>. necessary to add <b>-lpcre2-8</b>.
</P> </P>
<P> <P>
Although they are not defined as prototypes in <b>pcre2posix.h</b>, the library Although they are not defined as prototypes in <b>pcre2posix.h</b>, the library
does contain functions with the POSIX names <b>regcomp()</b> etc. These simply does contain functions with the POSIX names <b>regcomp()</b> etc. These simply
pass their arguments to the PCRE2 functions. These functions are provided for pass their arguments to the PCRE2 functions. These functions are provided for
backwards compatibility with earlier versions of PCRE2, so that existing backwards compatibility with earlier versions of PCRE2, so that existing

View File

@ -58,7 +58,7 @@ documentation. This document contains a quick-reference summary of the syntax.
</P> </P>
<br><a name="SEC3" href="#TOC1">ESCAPED CHARACTERS</a><br> <br><a name="SEC3" href="#TOC1">ESCAPED CHARACTERS</a><br>
<P> <P>
This table applies to ASCII and Unicode environments. An unrecognized escape This table applies to ASCII and Unicode environments. An unrecognized escape
sequence causes an error. sequence causes an error.
<pre> <pre>
\a alarm, that is, the BEL character (hex 07) \a alarm, that is, the BEL character (hex 07)
@ -85,7 +85,7 @@ following are also recognized:
When \x is not followed by {, from zero to two hexadecimal digits are read, When \x is not followed by {, from zero to two hexadecimal digits are read,
but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be but in ALT_BSUX mode \x must be followed by two hexadecimal digits to be
recognized as a hexadecimal escape; otherwise it matches a literal "x". recognized as a hexadecimal escape; otherwise it matches a literal "x".
Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits Likewise, if \u (in ALT_BSUX mode) is not followed by four hexadecimal digits
or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it
matches a literal "u". matches a literal "u".
</P> </P>

View File

@ -606,10 +606,10 @@ for a description of the effects of these options.
/s dotall set PCRE2_DOTALL /s dotall set PCRE2_DOTALL
dupnames set PCRE2_DUPNAMES dupnames set PCRE2_DUPNAMES
endanchored set PCRE2_ENDANCHORED endanchored set PCRE2_ENDANCHORED
escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF
/x extended set PCRE2_EXTENDED /x extended set PCRE2_EXTENDED
/xx extended_more set PCRE2_EXTENDED_MORE /xx extended_more set PCRE2_EXTENDED_MORE
extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX
firstline set PCRE2_FIRSTLINE firstline set PCRE2_FIRSTLINE
literal set PCRE2_LITERAL literal set PCRE2_LITERAL
match_line set PCRE2_EXTRA_MATCH_LINE match_line set PCRE2_EXTRA_MATCH_LINE
@ -1043,7 +1043,7 @@ process.
aftertext show text after match aftertext show text after match
allaftertext show text after captures allaftertext show text after captures
allcaptures show all captures allcaptures show all captures
allvector show the entire ovector allvector show the entire ovector
allusedtext show all consulted text allusedtext show all consulted text
altglobal alternative global matching altglobal alternative global matching
/g global global matching /g global global matching
@ -1051,9 +1051,9 @@ process.
mark show mark values mark show mark values
replace=&#60;string&#62; specify a replacement string replace=&#60;string&#62; specify a replacement string
startchar show starting character when relevant startchar show starting character when relevant
substitute_callout use substitution callouts substitute_callout use substitution callouts
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
substitute_skip=&#60;n&#62; skip substitution number n substitute_skip=&#60;n&#62; skip substitution number n
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
substitute_stop=&#60;n&#62; skip substitution number n and greater substitute_stop=&#60;n&#62; skip substitution number n and greater
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
@ -1191,7 +1191,7 @@ pattern.
aftertext show text after match aftertext show text after match
allaftertext show text after captures allaftertext show text after captures
allcaptures show all captures allcaptures show all captures
allvector show the entire ovector allvector show the entire ovector
allusedtext show all consulted text (non-JIT only) allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching altglobal alternative global matching
callout_capture show captures at callout time callout_capture show captures at callout time
@ -1221,9 +1221,9 @@ pattern.
replace=&#60;string&#62; specify a replacement string replace=&#60;string&#62; specify a replacement string
startchar show startchar when relevant startchar show startchar when relevant
startoffset=&#60;n&#62; same as offset=&#60;n&#62; startoffset=&#60;n&#62; same as offset=&#60;n&#62;
substitute_callout use substitution callouts substitute_callout use substitution callouts
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
substitute_skip=&#60;n&#62; skip substitution number n substitute_skip=&#60;n&#62; skip substitution number n
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
substitute_stop=&#60;n&#62; skip substitution number n and greater substitute_stop=&#60;n&#62; skip substitution number n and greater
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
@ -1306,9 +1306,9 @@ result, and also for DFA matching, provides a means of checking that there are
no unexpected modifications to ovector fields. Before each match attempt, the no unexpected modifications to ovector fields. Before each match attempt, the
ovector is filled with a special value, and if this is found in both elements ovector is filled with a special value, and if this is found in both elements
of a capturing pair, "&#60;unchanged&#62;" is output. After a successful match, this of a capturing pair, "&#60;unchanged&#62;" is output. After a successful match, this
applies to all groups after the maximum capture group for the pattern. In other applies to all groups after the maximum capture group for the pattern. In other
cases it applies to the entire ovector. After a partial match, the first two cases it applies to the entire ovector. After a partial match, the first two
elements are the only ones that should be set. After a DFA match, the amount of elements are the only ones that should be set. After a DFA match, the amount of
ovector that is used depends on the number of matches that were found. ovector that is used depends on the number of matches that were found.
</P> </P>
<br><b> <br><b>
@ -1320,7 +1320,7 @@ functions, unless <b>callout_none</b> is specified. Its behaviour can be
controlled by various modifiers listed above whose names begin with controlled by various modifiers listed above whose names begin with
<b>callout_</b>. Details are given in the section entitled "Callouts" <b>callout_</b>. Details are given in the section entitled "Callouts"
<a href="#callouts">below.</a> <a href="#callouts">below.</a>
Testing callouts from <b>pcre2_substitute()</b> is described separately in Testing callouts from <b>pcre2_substitute()</b> is described separately in
"Testing the substitution function" "Testing the substitution function"
<a href="#substitution">below.</a> <a href="#substitution">below.</a>
</P> </P>
@ -1449,14 +1449,14 @@ matching provokes an error return ("bad option value") from
Testing substitute callouts Testing substitute callouts
</b><br> </b><br>
<P> <P>
If the <b>substitute_callout</b> modifier is set, a substitution callout If the <b>substitute_callout</b> modifier is set, a substitution callout
function is set up. When it is called (after each substitution), details of the function is set up. When it is called (after each substitution), details of the
input and output strings are output. For example: input and output strings are output. For example:
<pre> <pre>
/abc/g,replace=&#60;$0&#62;,substitute_callout /abc/g,replace=&#60;$0&#62;,substitute_callout
abcdefabcpqr abcdefabcpqr
1(1) Old 0 3 "abc" New 0 5 "&#60;abc&#62;" 1(1) Old 0 3 "abc" New 0 5 "&#60;abc&#62;"
2(1) Old 6 9 "abc" New 8 13 "&#60;abc&#62;" 2(1) Old 6 9 "abc" New 8 13 "&#60;abc&#62;"
2: &#60;abc&#62;def&#60;abc&#62;pqr 2: &#60;abc&#62;def&#60;abc&#62;pqr
</pre> </pre>
The first number on each callout line is the count of matches. The The first number on each callout line is the count of matches. The
@ -1466,11 +1466,11 @@ listed the offsets of the old substring, its contents, and the same for the
replacement. replacement.
</P> </P>
<P> <P>
By default, the substitution callout function returns zero, which accepts the By default, the substitution callout function returns zero, which accepts the
replacement and causes matching to continue if /g was used. Two further replacement and causes matching to continue if /g was used. Two further
modifiers can be used to test other return values. If <b>substitute_skip</b> is modifiers can be used to test other return values. If <b>substitute_skip</b> is
set to a value greater than zero the callout function returns +1 for the match set to a value greater than zero the callout function returns +1 for the match
of that number, and similarly <b>substitute_stop</b> returns -1. These cause the of that number, and similarly <b>substitute_stop</b> returns -1. These cause the
replacement to be rejected, and -1 causes no further matching to take place. If replacement to be rejected, and -1 causes no further matching to take place. If
either of them is set, <b>substitute_callout</b> is assumed. For example: either of them is set, <b>substitute_callout</b> is assumed. For example:
<pre> <pre>
@ -1483,7 +1483,7 @@ either of them are set, <b>substitute_callout</b> is assumed. For example:
1(1) Old 0 3 "abc" New 0 5 "&#60;abc&#62; STOPPED" 1(1) Old 0 3 "abc" New 0 5 "&#60;abc&#62; STOPPED"
1: abcdefabcpqr 1: abcdefabcpqr
</pre> </pre>
If both are set for the same number, stop takes precedence. Only a single skip If both are set for the same number, stop takes precedence. Only a single skip
or stop is supported, which is sufficient for testing that the feature works. or stop is supported, which is sufficient for testing that the feature works.
</P> </P>
<br><b> <br><b>

View File

@ -82,7 +82,7 @@ The escape sequence \C can be used to match a single code unit in a UTF mode,
but its use can lead to some strange effects because it breaks up multi-unit but its use can lead to some strange effects because it breaks up multi-unit
characters (see the description of \C in the characters (see the description of \C in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a> <a href="pcre2pattern.html"><b>pcre2pattern</b></a>
documentation). For this reason, there is a build-time option that disables documentation). For this reason, there is a build-time option that disables
support for \C completely. There is also a less draconian compile-time option support for \C completely. There is also a less draconian compile-time option
for locking out the use of \C when a pattern is compiled. for locking out the use of \C when a pattern is compiled.
</P> </P>
@ -144,14 +144,14 @@ scripts are commonly used together, and because some diacritical and other
marks are used with multiple scripts, it is not that simple. marks are used with multiple scripts, it is not that simple.
</P> </P>
<P> <P>
Every Unicode character has a Script property, mostly with a value Every Unicode character has a Script property, mostly with a value
corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There
are also three special values: are also three special values:
</P> </P>
<P> <P>
"Unknown" is used for code points that have not been assigned, and also for the "Unknown" is used for code points that have not been assigned, and also for the
surrogate code points. In the PCRE2 32-bit library, characters whose code surrogate code points. In the PCRE2 32-bit library, characters whose code
points are greater than the Unicode maximum (U+10FFFF), which are accessible points are greater than the Unicode maximum (U+10FFFF), which are accessible
only in non-UTF mode, are assigned the Unknown script. only in non-UTF mode, are assigned the Unknown script.
</P> </P>
<P> <P>
@ -165,20 +165,20 @@ previous character. These are considered to take on the script of the character
that they modify. that they modify.
</P> </P>
<P> <P>
Some Inherited characters are used with many scripts, but many of them are only Some Inherited characters are used with many scripts, but many of them are only
normally used with a small number of scripts. For example, U+102E0 (Coptic normally used with a small number of scripts. For example, U+102E0 (Coptic
Epact thousands mark) is used only with Arabic and Coptic. In order to make it Epact thousands mark) is used only with Arabic and Coptic. In order to make it
possible to check this, a Unicode property called Script Extension exists. Its possible to check this, a Unicode property called Script Extension exists. Its
value is a list of scripts that apply to the character. For the majority of value is a list of scripts that apply to the character. For the majority of
characters, the list contains just one script, the same one as the Script characters, the list contains just one script, the same one as the Script
property. However, for characters such as U+102E0 more than one Script is property. However, for characters such as U+102E0 more than one Script is
listed. There are also some Common characters that have a single, non-Common listed. There are also some Common characters that have a single, non-Common
script in their Script Extension list. script in their Script Extension list.
</P> </P>
<P> <P>
The next section describes the basic rules for deciding whether a given string The next section describes the basic rules for deciding whether a given string
of characters is a script run. Note, however, that there are some special cases of characters is a script run. Note, however, that there are some special cases
involving the Chinese Han script, and an additional constraint for decimal involving the Chinese Han script, and an additional constraint for decimal
digits. These are covered in subsequent sections. digits. These are covered in subsequent sections.
</P> </P>
<br><b> <br><b>
@ -201,17 +201,17 @@ all the sets of scripts must not be empty.
<P> <P>
A simple example is an Internet name such as "google.com". The letters are all A simple example is an Internet name such as "google.com". The letters are all
in the Latin script, and the dot is Common, so this string is a script run. in the Latin script, and the dot is Common, so this string is a script run.
However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a
string that looks the same, but with Cyrillic "o"s is not a script run. string that looks the same, but with Cyrillic "o"s is not a script run.
</P> </P>
<P> <P>
More interesting examples involve characters with more than one script in their More interesting examples involve characters with more than one script in their
Script Extension. Consider the following characters: Script Extension. Consider the following characters:
<pre> <pre>
U+060C Arabic comma U+060C Arabic comma
U+06D4 Arabic full stop U+06D4 Arabic full stop
</pre> </pre>
The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and
Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could
appear in script runs of either Arabic or Hanifi Rohingya. The first could also appear in script runs of either Arabic or Hanifi Rohingya. The first could also
appear in Syriac or Thaana script runs, but the second could not. appear in Syriac or Thaana script runs, but the second could not.
@ -220,8 +220,8 @@ appear in Syriac or Thaana script runs, but the second could not.
The Chinese Han script The Chinese Han script
</b><br> </b><br>
<P> <P>
The Chinese Han script is commonly used in conjunction with other scripts for The Chinese Han script is commonly used in conjunction with other scripts for
writing certain languages. Japanese uses the Hiragana and Katakana scripts writing certain languages. Japanese uses the Hiragana and Katakana scripts
together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo
and Han. These three combinations are treated as special cases when checking and Han. These three combinations are treated as special cases when checking
script runs and are, in effect, "virtual scripts". Thus, a script run may script runs and are, in effect, "virtual scripts". Thus, a script run may

View File

@ -180,8 +180,8 @@ REVISION
Last updated: 17 September 2018 Last updated: 17 September 2018
Copyright (c) 1997-2018 University of Cambridge. Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2API(3) Library Functions Manual PCRE2API(3) PCRE2API(3) Library Functions Manual PCRE2API(3)
@ -3681,8 +3681,8 @@ REVISION
Last updated: 14 February 2019 Last updated: 14 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3) PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3)
@ -4027,45 +4027,48 @@ USING EBCDIC CODE
PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS PCRE2GREP SUPPORT FOR EXTERNAL SCRIPTS
By default, on non-Windows systems, pcre2grep supports the use of call- By default pcre2grep supports the use of callouts with string arguments
outs with string arguments within the patterns it is matching, in order within the patterns it is matching. There are two kinds: one that gen-
to run external scripts. For details, see the pcre2grep documentation. erates output using local code, and another that calls an external pro-
This support can be disabled by adding --disable-pcre2grep-callout to gram or script. If --disable-pcre2grep-callout-fork is added to the
the configure command. configure command, only the first kind of callout is supported; if
--disable-pcre2grep-callout is used, all callouts are completely
ignored. For more details of pcre2grep callouts, see the pcre2grep doc-
umentation.
PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT
By default, pcre2grep reads all files as plain text. You can build it By default, pcre2grep reads all files as plain text. You can build it
so that it recognizes files whose names end in .gz or .bz2, and reads so that it recognizes files whose names end in .gz or .bz2, and reads
them with libz or libbz2, respectively, by adding one or both of them with libz or libbz2, respectively, by adding one or both of
--enable-pcre2grep-libz --enable-pcre2grep-libz
--enable-pcre2grep-libbz2 --enable-pcre2grep-libbz2
to the configure command. These options naturally require that the rel- to the configure command. These options naturally require that the rel-
evant libraries are installed on your system. Configuration will fail evant libraries are installed on your system. Configuration will fail
if they are not. if they are not.
PCRE2GREP BUFFER SIZE PCRE2GREP BUFFER SIZE
pcre2grep uses an internal buffer to hold a "window" on the file it is pcre2grep uses an internal buffer to hold a "window" on the file it is
scanning, in order to be able to output "before" and "after" lines when scanning, in order to be able to output "before" and "after" lines when
it finds a match. The default starting size of the buffer is 20KiB. The it finds a match. The default starting size of the buffer is 20KiB. The
buffer itself is three times this size, but because of the way it is buffer itself is three times this size, but because of the way it is
used for holding "before" lines, the longest line that is guaranteed to used for holding "before" lines, the longest line that is guaranteed to
be processable is the notional buffer size. If a longer line is encoun- be processable is the notional buffer size. If a longer line is encoun-
tered, pcre2grep automatically expands the buffer, up to a specified tered, pcre2grep automatically expands the buffer, up to a specified
maximum size, whose default is 1MiB or the starting size, whichever is maximum size, whose default is 1MiB or the starting size, whichever is
the larger. You can change the default parameter values by adding, for the larger. You can change the default parameter values by adding, for
example, example,
--with-pcre2grep-bufsize=51200 --with-pcre2grep-bufsize=51200
--with-pcre2grep-max-bufsize=2097152 --with-pcre2grep-max-bufsize=2097152
to the configure command. The caller of pcre2grep can override these to the configure command. The caller of pcre2grep can override these
values by using --buffer-size and --max-buffer-size on the command values by using --buffer-size and --max-buffer-size on the command
line. line.
@ -4076,26 +4079,26 @@ PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
--enable-pcre2test-libreadline --enable-pcre2test-libreadline
--enable-pcre2test-libedit --enable-pcre2test-libedit
to the configure command, pcre2test is linked with the libreadline to the configure command, pcre2test is linked with the libreadline
or libedit library, respectively, and when its input is from a terminal, or libedit library, respectively, and when its input is from a terminal,
it reads it using the readline() function. This provides line-editing it reads it using the readline() function. This provides line-editing
and history facilities. Note that libreadline is GPL-licensed, so if and history facilities. Note that libreadline is GPL-licensed, so if
you distribute a binary of pcre2test linked in this way, there may be you distribute a binary of pcre2test linked in this way, there may be
licensing issues. These can be avoided by linking instead with libedit, licensing issues. These can be avoided by linking instead with libedit,
which has a BSD licence. which has a BSD licence.
Setting --enable-pcre2test-libreadline causes the -lreadline option to Setting --enable-pcre2test-libreadline causes the -lreadline option to
be added to the pcre2test build. In many operating environments with a be added to the pcre2test build. In many operating environments with a
system-installed readline library this is sufficient. However, in some system-installed readline library this is sufficient. However, in some
environments (e.g. if an unmodified distribution version of readline is environments (e.g. if an unmodified distribution version of readline is
in use), some extra configuration may be necessary. The INSTALL file in use), some extra configuration may be necessary. The INSTALL file
for libreadline says this: for libreadline says this:
"Readline uses the termcap functions, but does not link with "Readline uses the termcap functions, but does not link with
the termcap or curses library itself, allowing applications the termcap or curses library itself, allowing applications
which link with readline the to choose an appropriate library." which link with readline the to choose an appropriate library."
If your environment has not been set up so that an appropriate library If your environment has not been set up so that an appropriate library
is automatically included, you may need to add something like is automatically included, you may need to add something like
LIBS="-lncurses" LIBS="-lncurses"
@ -4109,7 +4112,7 @@ INCLUDING DEBUGGING CODE
--enable-debug --enable-debug
to the configure command, additional debugging code is included in the to the configure command, additional debugging code is included in the
build. This feature is intended for use by the PCRE2 maintainers. build. This feature is intended for use by the PCRE2 maintainers.
@ -4119,15 +4122,15 @@ DEBUGGING WITH VALGRIND SUPPORT
--enable-valgrind --enable-valgrind
to the configure command, PCRE2 will use valgrind annotations to mark to the configure command, PCRE2 will use valgrind annotations to mark
certain memory regions as unaddressable. This allows it to detect certain memory regions as unaddressable. This allows it to detect
invalid memory accesses, and is mostly useful for debugging PCRE2 invalid memory accesses, and is mostly useful for debugging PCRE2
itself. itself.
CODE COVERAGE REPORTING CODE COVERAGE REPORTING
If your C compiler is gcc, you can build a version of PCRE2 that can If your C compiler is gcc, you can build a version of PCRE2 that can
generate a code coverage report for its test suite. To enable this, you generate a code coverage report for its test suite. To enable this, you
must install lcov version 1.6 or above. Then specify must install lcov version 1.6 or above. Then specify
@ -4136,20 +4139,20 @@ CODE COVERAGE REPORTING
to the configure command and build PCRE2 in the usual way. to the configure command and build PCRE2 in the usual way.
Note that using ccache (a caching C compiler) is incompatible with code Note that using ccache (a caching C compiler) is incompatible with code
coverage reporting. If you have configured ccache to run automatically coverage reporting. If you have configured ccache to run automatically
on your system, you must set the environment variable on your system, you must set the environment variable
CCACHE_DISABLE=1 CCACHE_DISABLE=1
before running make to build PCRE2, so that ccache is not used. before running make to build PCRE2, so that ccache is not used.
When --enable-coverage is used, the following additional targets are When --enable-coverage is used, the following additional targets are
added to the Makefile: added to the Makefile:
make coverage make coverage
This creates a fresh coverage report for the PCRE2 test suite. It is This creates a fresh coverage report for the PCRE2 test suite. It is
equivalent to running "make coverage-reset", "make coverage-baseline", equivalent to running "make coverage-reset", "make coverage-baseline",
"make check", and then "make coverage-report". "make check", and then "make coverage-report".
make coverage-reset make coverage-reset
@ -4166,28 +4169,28 @@ CODE COVERAGE REPORTING
make coverage-clean-report make coverage-clean-report
This removes the generated coverage report without cleaning the cover- This removes the generated coverage report without cleaning the cover-
age data itself. age data itself.
make coverage-clean-data make coverage-clean-data
This removes the captured coverage data without removing the coverage This removes the captured coverage data without removing the coverage
files created at compile time (*.gcno). files created at compile time (*.gcno).
make coverage-clean make coverage-clean
This cleans all coverage data including the generated coverage report. This cleans all coverage data including the generated coverage report.
For more information about code coverage, see the gcov and lcov docu- For more information about code coverage, see the gcov and lcov docu-
mentation. mentation.
DISABLING THE Z AND T FORMATTING MODIFIERS DISABLING THE Z AND T FORMATTING MODIFIERS
The C99 standard defines formatting modifiers z and t for size_t and The C99 standard defines formatting modifiers z and t for size_t and
ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers ptrdiff_t values, respectively. By default, PCRE2 uses these modifiers
in environments other than Microsoft Visual Studio when __STDC_VER- in environments other than Microsoft Visual Studio when __STDC_VER-
SION__ is defined and has a value greater than or equal to 199901L SION__ is defined and has a value greater than or equal to 199901L
(indicating C99). However, there is at least one environment that (indicating C99). However, there is at least one environment that
claims to be C99 but does not support these modifiers. If claims to be C99 but does not support these modifiers. If
--disable-percent-zt --disable-percent-zt
@ -4198,39 +4201,39 @@ DISABLING THE Z AND T FORMATTING MODIFIERS
SUPPORT FOR FUZZERS SUPPORT FOR FUZZERS
There is a special option for use by people who want to run fuzzing There is a special option for use by people who want to run fuzzing
tests on PCRE2: tests on PCRE2:
--enable-fuzz-support --enable-fuzz-support
At present this applies only to the 8-bit library. If set, it causes an At present this applies only to the 8-bit library. If set, it causes an
extra library called libpcre2-fuzzsupport.a to be built, but not extra library called libpcre2-fuzzsupport.a to be built, but not
installed. This contains a single function called LLVMFuzzerTestOneIn- installed. This contains a single function called LLVMFuzzerTestOneIn-
put() whose arguments are a pointer to a string and the length of the put() whose arguments are a pointer to a string and the length of the
string. When called, this function tries to compile the string as a string. When called, this function tries to compile the string as a
pattern, and if that succeeds, to match it. This is done both with no pattern, and if that succeeds, to match it. This is done both with no
options and with some random options bits that are generated from the options and with some random options bits that are generated from the
string. string.
Setting --enable-fuzz-support also causes a binary called pcre2fuz- Setting --enable-fuzz-support also causes a binary called pcre2fuz-
zcheck to be created. This is normally run under valgrind or used when zcheck to be created. This is normally run under valgrind or used when
PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing PCRE2 is compiled with address sanitizing enabled. It calls the fuzzing
function and outputs information about what it is doing. The input function and outputs information about what it is doing. The input
strings are specified by arguments: if an argument starts with "=" the strings are specified by arguments: if an argument starts with "=" the
rest of it is a literal input string. Otherwise, it is assumed to be a rest of it is a literal input string. Otherwise, it is assumed to be a
file name, and the contents of the file are the test string. file name, and the contents of the file are the test string.
OBSOLETE OPTION OBSOLETE OPTION
In versions of PCRE2 prior to 10.30, there were two ways of handling In versions of PCRE2 prior to 10.30, there were two ways of handling
backtracking in the pcre2_match() function. The default was to use the backtracking in the pcre2_match() function. The default was to use the
system stack, but if system stack, but if
--disable-stack-for-recursion --disable-stack-for-recursion
was set, memory on the heap was used. From release 10.30 onwards this was set, memory on the heap was used. From release 10.30 onwards this
has changed (the stack is no longer used) and this option now does has changed (the stack is no longer used) and this option now does
nothing except give a warning. nothing except give a warning.
@ -4248,11 +4251,11 @@ AUTHOR
REVISION REVISION
Last updated: 15 November 2018 Last updated: 03 March 2019
Copyright (c) 1997-2018 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3) PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
@ -4682,8 +4685,8 @@ REVISION
Last updated: 03 February 2019 Last updated: 03 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3) PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3)
@ -4887,8 +4890,8 @@ REVISION
Last updated: 12 February 2019 Last updated: 12 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2JIT(3) Library Functions Manual PCRE2JIT(3) PCRE2JIT(3) Library Functions Manual PCRE2JIT(3)
@ -5287,8 +5290,8 @@ REVISION
Last updated: 16 October 2018 Last updated: 16 October 2018
Copyright (c) 1997-2018 University of Cambridge. Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3) PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3)
@ -5357,8 +5360,8 @@ REVISION
Last updated: 02 February 2019 Last updated: 02 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3) PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3)
@ -5578,8 +5581,8 @@ REVISION
Last updated: 10 October 2018 Last updated: 10 October 2018
Copyright (c) 1997-2018 University of Cambridge. Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3) PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3)
@ -6018,8 +6021,8 @@ REVISION
Last updated: 22 December 2014 Last updated: 22 December 2014
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2014 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3) PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3)
@ -9362,8 +9365,8 @@ REVISION
Last updated: 12 February 2019 Last updated: 12 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3) PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3)
@ -9597,8 +9600,8 @@ REVISION
Last updated: 03 February 2019 Last updated: 03 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3) PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3)
@ -9927,8 +9930,8 @@ REVISION
Last updated: 30 January 2019 Last updated: 30 January 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3) PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3)
@ -10206,8 +10209,8 @@ REVISION
Last updated: 27 June 2018 Last updated: 27 June 2018
Copyright (c) 1997-2018 University of Cambridge. Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3) PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3)
@ -10707,8 +10710,8 @@ REVISION
Last updated: 11 February 2019 Last updated: 11 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3) PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
@ -11079,5 +11082,5 @@ REVISION
Last updated: 03 February 2019 Last updated: 03 February 2019
Copyright (c) 1997-2019 University of Cambridge. Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------

View File

@ -75,7 +75,7 @@ PCRE2_UTF, PCRE2_UCP and related options.
.P .P
Additional options may be set in the compile context via the Additional options may be set in the compile context via the
.\" HREF .\" HREF
\fBpcre2_set_compile_extra_options\fP \fBpcre2_set_compile_extra_options\fP
.\" .\"
function. function.
.P .P

View File

@ -40,7 +40,7 @@ characters. The options are:
.sp .sp
PCRE2_ANCHORED Match only at the first position PCRE2_ANCHORED Match only at the first position
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
On success, make a private subject copy On success, make a private subject copy
PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_ENDANCHORED Pattern can match only at end of subject
PCRE2_NOTBOL Subject is not the beginning of a line PCRE2_NOTBOL Subject is not the beginning of a line
PCRE2_NOTEOL Subject is not the end of a line PCRE2_NOTEOL Subject is not the end of a line

View File

@ -49,7 +49,7 @@ terminated by a binary zero code unit. The options are:
.sp .sp
PCRE2_ANCHORED Match only at the first position PCRE2_ANCHORED Match only at the first position
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
On success, make a private subject copy On success, make a private subject copy
PCRE2_ENDANCHORED Pattern can match only at end of subject PCRE2_ENDANCHORED Pattern can match only at end of subject
PCRE2_NOTBOL Subject string is not the beginning of a line PCRE2_NOTBOL Subject string is not the beginning of a line
PCRE2_NOTEOL Subject string is not the end of a line PCRE2_NOTEOL Subject string is not the end of a line

View File

@ -18,7 +18,7 @@ If \fImatch_data\fP is NULL, this function does nothing. Otherwise,
using the memory freeing function from the general context or compiled pattern using the memory freeing function from the general context or compiled pattern
with which it was created, or \fBfree()\fP if that was not set. with which it was created, or \fBfree()\fP if that was not set.
.P .P
If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this If the PCRE2_COPY_MATCHED_SUBJECT option was used for a successful match using this
match data block, the copy of the subject that was remembered with the block is match data block, the copy of the subject that was remembered with the block is
also freed. also freed.
.P .P

View File

@ -23,7 +23,7 @@ options are:
in UTF-8 and UTF-32 modes in UTF-8 and UTF-32 modes
.\" JOIN .\" JOIN
PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and \ex
handling handling
.\" JOIN .\" JOIN
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as
a literal following character a literal following character

View File

@ -247,7 +247,7 @@ document for an overview of all the PCRE2 documentation.
.sp .sp
.B const unsigned char *pcre2_maketables(pcre2_general_context *\fIgcontext\fP); .B const unsigned char *pcre2_maketables(pcre2_general_context *\fIgcontext\fP);
.sp .sp
.B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP, .B int pcre2_pattern_info(const pcre2_code *\fIcode\fP, uint32_t \fIwhat\fP,
.B " void *\fIwhere\fP);" .B " void *\fIwhere\fP);"
.sp .sp
.B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP, .B int pcre2_callout_enumerate(const pcre2_code *\fIcode\fP,
@ -1244,7 +1244,7 @@ until after all operations on the
.\" </a> .\" </a>
match data block match data block
.\" .\"
have taken place, unless, in the case of the subject string, you have used the have taken place, unless, in the case of the subject string, you have used the
PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled PCRE2_COPY_MATCHED_SUBJECT option, which is described in the section entitled
"Option bits for \fBpcre2_match()\fP" "Option bits for \fBpcre2_match()\fP"
.\" HTML <a href="#matchoptions"> .\" HTML <a href="#matchoptions">
@ -1375,8 +1375,8 @@ the PCRE2_EXTRA_ALT_BSUX extra option (see "Extra compile options"
.\" </a> .\" </a>
below). below).
.\" .\"
Note that this alternative escape handling applies only to patterns. Neither of Note that this alternative escape handling applies only to patterns. Neither of
these options affects the processing of replacement strings passed to these options affects the processing of replacement strings passed to
\fBpcre2_substitute()\fP. \fBpcre2_substitute()\fP.
.sp .sp
PCRE2_ALT_CIRCUMFLEX PCRE2_ALT_CIRCUMFLEX
@ -1832,10 +1832,10 @@ characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
.sp .sp
PCRE2_EXTRA_ALT_BSUX PCRE2_EXTRA_ALT_BSUX
.sp .sp
The original option PCRE2_ALT_BSUX causes PCRE2 to process \eU, \eu, and \ex in The original option PCRE2_ALT_BSUX causes PCRE2 to process \eU, \eu, and \ex in
the way that ECMAScript (aka JavaScript) does. Additional functionality was the way that ECMAScript (aka JavaScript) does. Additional functionality was
defined by ECMAScript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of defined by ECMAScript 6; setting PCRE2_EXTRA_ALT_BSUX has the effect of
PCRE2_ALT_BSUX, but in addition it recognizes \eu{hhh..} as a hexadecimal PCRE2_ALT_BSUX, but in addition it recognizes \eu{hhh..} as a hexadecimal
character code, where hhh.. is any number of hexadecimal digits. character code, where hhh.. is any number of hexadecimal digits.
.sp .sp
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL
@ -1852,7 +1852,7 @@ If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to
\fBpcre2_compile()\fP, all unrecognized or malformed escape sequences are \fBpcre2_compile()\fP, all unrecognized or malformed escape sequences are
treated as single-character escapes. For example, \ej is a literal "j" and treated as single-character escapes. For example, \ej is a literal "j" and
\ex{2z} is treated as the literal string "x{2z}". Setting this option means \ex{2z} is treated as the literal string "x{2z}". Setting this option means
that typos in patterns may go undetected and have unexpected results. Also note that typos in patterns may go undetected and have unexpected results. Also note
that a sequence such as [\eN{] is interpreted as a malformed attempt at that a sequence such as [\eN{] is interpreted as a malformed attempt at
[\eN{...}] and so is treated as [N{] whereas [\eN] gives an error because an [\eN{...}] and so is treated as [N{] whereas [\eN] gives an error because an
unqualified \eN is a valid escape sequence but is not supported in a character unqualified \eN is a valid escape sequence but is not supported in a character
@ -1860,9 +1860,9 @@ class. To reiterate: this is a dangerous option. Use with great care.
.sp .sp
PCRE2_EXTRA_ESCAPED_CR_IS_LF PCRE2_EXTRA_ESCAPED_CR_IS_LF
.sp .sp
There are some legacy applications where the escape sequence \er in a pattern There are some legacy applications where the escape sequence \er in a pattern
is expected to match a newline. If this option is set, \er in a pattern is is expected to match a newline. If this option is set, \er in a pattern is
converted to \en so that it matches a LF (linefeed) instead of a CR (carriage converted to \en so that it matches a LF (linefeed) instead of a CR (carriage
return) character. The option does not affect a literal CR in the pattern, nor return) character. The option does not affect a literal CR in the pattern, nor
does it affect CR specified as an explicit code point such as \ex{0D}. does it affect CR specified as an explicit code point such as \ex{0D}.
.sp .sp
@ -2547,7 +2547,7 @@ the use of .* with PCRE2_DOTALL, not by starting the pattern with ^ or \eA.
.rs .rs
.sp .sp
The unused bits of the \fIoptions\fP argument for \fBpcre2_match()\fP must be The unused bits of the \fIoptions\fP argument for \fBpcre2_match()\fP must be
zero. The only bits that may be set are PCRE2_ANCHORED, zero. The only bits that may be set are PCRE2_ANCHORED,
PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_COPY_MATCHED_SUBJECT, PCRE2_ENDANCHORED, PCRE2_NOTBOL, PCRE2_NOTEOL,
PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, PCRE2_NO_JIT, PCRE2_NO_UTF_CHECK,
PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below. PCRE2_PARTIAL_HARD, and PCRE2_PARTIAL_SOFT. Their action is described below.
@ -2567,8 +2567,8 @@ matching.
.sp .sp
PCRE2_COPY_MATCHED_SUBJECT PCRE2_COPY_MATCHED_SUBJECT
.sp .sp
By default, a pointer to the subject is remembered in the match data block so By default, a pointer to the subject is remembered in the match data block so
that, after a successful match, it can be referenced by the substring that, after a successful match, it can be referenced by the substring
extraction functions. This means that the subject's memory must not be freed extraction functions. This means that the subject's memory must not be freed
until all such operations are complete. For some applications where the until all such operations are complete. For some applications where the
lifetime of the subject string is not guaranteed, it may be necessary to make a lifetime of the subject string is not guaranteed, it may be necessary to make a
@ -2868,8 +2868,8 @@ undefined.
.P .P
After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure After a successful match, a partial match (PCRE2_ERROR_PARTIAL), or a failure
to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function to match (PCRE2_ERROR_NOMATCH), a mark name may be available. The function
\fBpcre2_get_mark()\fP can be called to access this name, which can be \fBpcre2_get_mark()\fP can be called to access this name, which can be
specified in the pattern by any of the backtracking control verbs, not just specified in the pattern by any of the backtracking control verbs, not just
(*MARK). The same function applies to all the verbs. It returns a pointer to (*MARK). The same function applies to all the verbs. It returns a pointer to
the zero-terminated name, which is within the compiled pattern. If no name is the zero-terminated name, which is within the compiled pattern. If no name is
available, NULL is returned. The length of the name (excluding the terminating available, NULL is returned. The length of the name (excluding the terminating
@ -3016,7 +3016,7 @@ The backtracking match limit was reached.
If a pattern contains many nested backtracking points, heap memory is used to If a pattern contains many nested backtracking points, heap memory is used to
remember them. This error is given when the memory allocation function (default remember them. This error is given when the memory allocation function (default
or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given or custom) fails. Note that a different error, PCRE2_ERROR_HEAPLIMIT, is given
if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is if the amount of memory needed exceeds the heap limit. PCRE2_ERROR_NOMEMORY is
also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails. also returned if PCRE2_COPY_MATCHED_SUBJECT is set and memory allocation fails.
.sp .sp
PCRE2_ERROR_NULL PCRE2_ERROR_NULL
@ -3407,7 +3407,7 @@ capture groups and letters within \eQ...\eE quoted sequences.
.P .P
Note that case forcing sequences such as \eU...\eE do not nest. For example, Note that case forcing sequences such as \eU...\eE do not nest. For example,
the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no the result of processing "\eUaa\eLBB\eEcc\eE" is "AAbbcc"; the final \eE has no
effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do
not apply to replacement strings. not apply to replacement strings.
.P .P
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more
@ -3439,7 +3439,7 @@ The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended
substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause unknown
groups in the extended syntax forms to be treated as unset. groups in the extended syntax forms to be treated as unset.
.P .P
If successful, \fBpcre2_substitute()\fP returns the number of successful If successful, \fBpcre2_substitute()\fP returns the number of successful
matches. This may be zero if no matches were found, and is never greater than 1 matches. This may be zero if no matches were found, and is never greater than 1
unless PCRE2_SUBSTITUTE_GLOBAL is set. unless PCRE2_SUBSTITUTE_GLOBAL is set.
.P .P
@ -3487,8 +3487,8 @@ above).
.sp .sp
The \fBpcre2_set_substitution_callout()\fP function can be used to specify a The \fBpcre2_set_substitution_callout()\fP function can be used to specify a
callout function for \fBpcre2_substitute()\fP. This information is passed in callout function for \fBpcre2_substitute()\fP. This information is passed in
a match context. The callout function is called after each substitution has a match context. The callout function is called after each substitution has
been processed, but it can cause the replacement not to happen. The callout been processed, but it can cause the replacement not to happen. The callout
function is not called for simulated substitutions that happen as a result of function is not called for simulated substitutions that happen as a result of
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option.
.P .P
@ -3497,10 +3497,10 @@ block structure, which contains the following fields, not necessarily in this
order: order:
.sp .sp
uint32_t \fIversion\fP; uint32_t \fIversion\fP;
uint32_t \fIsubscount\fP; uint32_t \fIsubscount\fP;
PCRE2_SPTR \fIinput\fP; PCRE2_SPTR \fIinput\fP;
PCRE2_SPTR \fIoutput\fP; PCRE2_SPTR \fIoutput\fP;
PCRE2_SIZE \fI*ovector\fP; PCRE2_SIZE \fI*ovector\fP;
uint32_t \fIoveccount\fP; uint32_t \fIoveccount\fP;
PCRE2_SIZE \fIoutput_offsets[2]\fP; PCRE2_SIZE \fIoutput_offsets[2]\fP;
.sp .sp
@ -3512,9 +3512,9 @@ The \fIsubscount\fP field is the number of the current match. It is 1 for the
first callout, 2 for the second, and so on. The \fIinput\fP and \fIoutput\fP first callout, 2 for the second, and so on. The \fIinput\fP and \fIoutput\fP
pointers are copies of the values passed to \fBpcre2_substitute()\fP. pointers are copies of the values passed to \fBpcre2_substitute()\fP.
.P .P
The \fIovector\fP field points to the ovector, which contains the result of the The \fIovector\fP field points to the ovector, which contains the result of the
most recent match. The \fIoveccount\fP field contains the number of pairs that most recent match. The \fIoveccount\fP field contains the number of pairs that
are set in the ovector, and is always greater than zero. are set in the ovector, and is always greater than zero.
.P .P
The \fIoutput_offsets\fP vector contains the offsets of the replacement in the The \fIoutput_offsets\fP vector contains the offsets of the replacement in the
output string. This has already been processed for dollar and (if requested) output string. This has already been processed for dollar and (if requested)

View File

@ -33,7 +33,7 @@ is described in the
.\" HREF .\" HREF
\fBpcre2api\fP \fBpcre2api\fP
.\" .\"
documentation; the rest of this document is concerned with callouts during documentation; the rest of this document is concerned with callouts during
pattern matching. pattern matching.
.P .P
Within a regular expression, (?C<arg>) indicates a point at which the external Within a regular expression, (?C<arg>) indicates a point at which the external

View File

@ -778,8 +778,8 @@ only callouts with string arguments are useful.
.SS "Calling external programs or scripts" .SS "Calling external programs or scripts"
.rs .rs
.sp .sp
This facility can be independently disabled when \fBpcre2grep\fP is built. It This facility can be independently disabled when \fBpcre2grep\fP is built. It
is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS, is supported for Windows, where a call to \fB_spawnvp()\fP is used, for VMS,
where \fBlib$spawn()\fP is used, and for any other Unix-like environment where where \fBlib$spawn()\fP is used, and for any other Unix-like environment where
\fBfork()\fP and \fBexecv()\fP are available. \fBfork()\fP and \fBexecv()\fP are available.
.P .P

View File

@ -390,12 +390,12 @@ two compile-time options. If PCRE2_ALT_BSUX is set, the sequence \ex followed
by { is not recognized. Only if \ex is followed by two hexadecimal digits is it by { is not recognized. Only if \ex is followed by two hexadecimal digits is it
recognized as a character escape. Otherwise it is interpreted as a literal "x" recognized as a character escape. Otherwise it is interpreted as a literal "x"
character. In this mode, support for code points greater than 256 is provided character. In this mode, support for code points greater than 256 is provided
by \eu, which must be followed by four hexadecimal digits; otherwise it is by \eu, which must be followed by four hexadecimal digits; otherwise it is
interpreted as a literal "u" character. interpreted as a literal "u" character.
.P .P
PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition, PCRE2_EXTRA_ALT_BSUX has the same effect as PCRE2_ALT_BSUX and, in addition,
\eu{hhh..} is recognized as the character specified by hexadecimal code point. \eu{hhh..} is recognized as the character specified by hexadecimal code point.
There may be any number of hexadecimal digits. This syntax is from ECMAScript There may be any number of hexadecimal digits. This syntax is from ECMAScript
6. 6.
.P .P
The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
@ -1188,7 +1188,7 @@ character. If any other of these assertions appears in a character class, an
A word boundary is a position in the subject string where the current character A word boundary is a position in the subject string where the current character
and the previous character do not both match \ew or \eW (i.e. one matches and the previous character do not both match \ew or \eW (i.e. one matches
\ew and the other matches \eW), or the start or end of the string if the \ew and the other matches \eW), or the start or end of the string if the
first or last character matches \ew, respectively. When PCRE2 is built with first or last character matches \ew, respectively. When PCRE2 is built with
Unicode support, the meanings of \ew and \eW can be changed by setting the Unicode support, the meanings of \ew and \eW can be changed by setting the
PCRE2_UCP option. When this is done, it also affects \eb and \eB. Neither PCRE2 PCRE2_UCP option. When this is done, it also affects \eb and \eB. Neither PCRE2
nor Perl has a separate "start of word" or "end of word" metasequence. However, nor Perl has a separate "start of word" or "end of word" metasequence. However,

View File

@ -29,12 +29,12 @@ and 32-bit libraries. See the
\fBpcre2api\fP \fBpcre2api\fP
.\" .\"
documentation for a description of PCRE2's native API, which contains much documentation for a description of PCRE2's native API, which contains much
additional functionality. additional functionality.
.P .P
The functions described here are wrapper functions that ultimately call the The functions described here are wrapper functions that ultimately call the
PCRE2 native API. Their prototypes are defined in the \fBpcre2posix.h\fP header PCRE2 native API. Their prototypes are defined in the \fBpcre2posix.h\fP header
file, and they all have unique names starting with \fBpcre2_\fP. However, the file, and they all have unique names starting with \fBpcre2_\fP. However, the
\fBpcre2posix.h\fP header also contains macro definitions that convert the \fBpcre2posix.h\fP header also contains macro definitions that convert the
standard POSIX names such as \fBregcomp()\fP into \fBpcre2_regcomp()\fP etc. This standard POSIX names such as \fBregcomp()\fP into \fBpcre2_regcomp()\fP etc. This
means that a program can use the usual POSIX names without running the risk of means that a program can use the usual POSIX names without running the risk of
accidentally linking with POSIX functions from a different library. accidentally linking with POSIX functions from a different library.
@ -44,7 +44,7 @@ can be accessed by adding \fB-lpcre2-posix\fP to the command for linking an
application. Because the POSIX functions call the native ones, it is also application. Because the POSIX functions call the native ones, it is also
necessary to add \fB-lpcre2-8\fP. necessary to add \fB-lpcre2-8\fP.
.P .P
Although they are not defined as prototypes in \fBpcre2posix.h\fP, the library Although they are not defined as prototypes in \fBpcre2posix.h\fP, the library
does contain functions with the POSIX names \fBregcomp()\fP etc. These simply does contain functions with the POSIX names \fBregcomp()\fP etc. These simply
pass their arguments to the PCRE2 functions. These functions are provided for pass their arguments to the PCRE2 functions. These functions are provided for
backwards compatibility with earlier versions of PCRE2, so that existing backwards compatibility with earlier versions of PCRE2, so that existing

View File

@ -22,7 +22,7 @@ documentation. This document contains a quick-reference summary of the syntax.
.SH "ESCAPED CHARACTERS" .SH "ESCAPED CHARACTERS"
.rs .rs
.sp .sp
This table applies to ASCII and Unicode environments. An unrecognized escape This table applies to ASCII and Unicode environments. An unrecognized escape
sequence causes an error. sequence causes an error.
.sp .sp
\ea alarm, that is, the BEL character (hex 07) \ea alarm, that is, the BEL character (hex 07)
@ -49,7 +49,7 @@ following are also recognized:
When \ex is not followed by {, from zero to two hexadecimal digits are read, When \ex is not followed by {, from zero to two hexadecimal digits are read,
but in ALT_BSUX mode \ex must be followed by two hexadecimal digits to be but in ALT_BSUX mode \ex must be followed by two hexadecimal digits to be
recognized as a hexadecimal escape; otherwise it matches a literal "x". recognized as a hexadecimal escape; otherwise it matches a literal "x".
Likewise, if \eu (in ALT_BSUX mode) is not followed by four hexadecimal digits Likewise, if \eu (in ALT_BSUX mode) is not followed by four hexadecimal digits
or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it or (in EXTRA_ALT_BSUX mode) a sequence of hex digits in curly brackets, it
matches a literal "u". matches a literal "u".
.P .P

View File

@ -565,10 +565,10 @@ for a description of the effects of these options.
/s dotall set PCRE2_DOTALL /s dotall set PCRE2_DOTALL
dupnames set PCRE2_DUPNAMES dupnames set PCRE2_DUPNAMES
endanchored set PCRE2_ENDANCHORED endanchored set PCRE2_ENDANCHORED
escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF
/x extended set PCRE2_EXTENDED /x extended set PCRE2_EXTENDED
/xx extended_more set PCRE2_EXTENDED_MORE /xx extended_more set PCRE2_EXTENDED_MORE
extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX extra_alt_bsux set PCRE2_EXTRA_ALT_BSUX
firstline set PCRE2_FIRSTLINE firstline set PCRE2_FIRSTLINE
literal set PCRE2_LITERAL literal set PCRE2_LITERAL
match_line set PCRE2_EXTRA_MATCH_LINE match_line set PCRE2_EXTRA_MATCH_LINE
@ -1005,7 +1005,7 @@ process.
aftertext show text after match aftertext show text after match
allaftertext show text after captures allaftertext show text after captures
allcaptures show all captures allcaptures show all captures
allvector show the entire ovector allvector show the entire ovector
allusedtext show all consulted text allusedtext show all consulted text
altglobal alternative global matching altglobal alternative global matching
/g global global matching /g global global matching
@ -1013,9 +1013,9 @@ process.
mark show mark values mark show mark values
replace=<string> specify a replacement string replace=<string> specify a replacement string
startchar show starting character when relevant startchar show starting character when relevant
substitute_callout use substitution callouts substitute_callout use substitution callouts
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
substitute_skip=<n> skip substitution number n substitute_skip=<n> skip substitution number n
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
substitute_stop=<n> skip substitution number n and greater substitute_stop=<n> skip substitution number n and greater
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
@ -1160,7 +1160,7 @@ pattern.
aftertext show text after match aftertext show text after match
allaftertext show text after captures allaftertext show text after captures
allcaptures show all captures allcaptures show all captures
allvector show the entire ovector allvector show the entire ovector
allusedtext show all consulted text (non-JIT only) allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching altglobal alternative global matching
callout_capture show captures at callout time callout_capture show captures at callout time
@ -1190,9 +1190,9 @@ pattern.
replace=<string> specify a replacement string replace=<string> specify a replacement string
startchar show startchar when relevant startchar show startchar when relevant
startoffset=<n> same as offset=<n> startoffset=<n> same as offset=<n>
substitute_callout use substitution callouts substitute_callout use substitution callouts
substitute_extended use PCRE2_SUBSTITUTE_EXTENDED substitute_extended use PCRE2_SUBSTITUTE_EXTENDED
substitute_skip=<n> skip substitution number n substitute_skip=<n> skip substitution number n
substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH substitute_overflow_length use PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
substitute_stop=<n> skip substitution number n and greater substitute_stop=<n> skip substitution number n and greater
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
@ -1273,9 +1273,9 @@ result, and also for DFA matching, provides a means of checking that there are
no unexpected modifications to ovector fields. Before each match attempt, the no unexpected modifications to ovector fields. Before each match attempt, the
ovector is filled with a special value, and if this is found in both elements ovector is filled with a special value, and if this is found in both elements
of a capturing pair, "<unchanged>" is output. After a successful match, this of a capturing pair, "<unchanged>" is output. After a successful match, this
applies to all groups after the maximum capture group for the pattern. In other applies to all groups after the maximum capture group for the pattern. In other
cases it applies to the entire ovector. After a partial match, the first two cases it applies to the entire ovector. After a partial match, the first two
elements are the only ones that should be set. After a DFA match, the amount of elements are the only ones that should be set. After a DFA match, the amount of
ovector that is used depends on the number of matches that were found. ovector that is used depends on the number of matches that were found.
. .
. .
@ -1288,13 +1288,13 @@ controlled by various modifiers listed above whose names begin with
\fBcallout_\fP. Details are given in the section entitled "Callouts" \fBcallout_\fP. Details are given in the section entitled "Callouts"
.\" HTML <a href="#callouts"> .\" HTML <a href="#callouts">
.\" </a> .\" </a>
below. below.
.\" .\"
Testing callouts from \fBpcre2_substitute()\fP is described separately in Testing callouts from \fBpcre2_substitute()\fP is described separately in
"Testing the substitution function" "Testing the substitution function"
.\" HTML <a href="#substitution"> .\" HTML <a href="#substitution">
.\" </a> .\" </a>
below. below.
.\" .\"
. .
. .
@ -1416,14 +1416,14 @@ matching provokes an error return ("bad option value") from
.SS "Testing substitute callouts" .SS "Testing substitute callouts"
.rs .rs
.sp .sp
If the \fBsubstitute_callout\fP modifier is set, a substitution callout If the \fBsubstitute_callout\fP modifier is set, a substitution callout
function is set up. When it is called (after each substitution), details of the function is set up. When it is called (after each substitution), details of the
input and output strings are output. For example: input and output strings are output. For example:
.sp .sp
/abc/g,replace=<$0>,substitute_callout /abc/g,replace=<$0>,substitute_callout
abcdefabcpqr abcdefabcpqr
1(1) Old 0 3 "abc" New 0 5 "<abc>" 1(1) Old 0 3 "abc" New 0 5 "<abc>"
2(1) Old 6 9 "abc" New 8 13 "<abc>" 2(1) Old 6 9 "abc" New 8 13 "<abc>"
2: <abc>def<abc>pqr 2: <abc>def<abc>pqr
.sp .sp
The first number on each callout line is the count of matches. The The first number on each callout line is the count of matches. The
@ -1432,11 +1432,11 @@ is, one more than the number of capturing groups that were set). Then are
listed the offsets of the old substring, its contents, and the same for the listed the offsets of the old substring, its contents, and the same for the
replacement. replacement.
.P .P
By default, the substitution callout function returns zero, which accepts the By default, the substitution callout function returns zero, which accepts the
replacement and causes matching to continue if /g was used. Two further replacement and causes matching to continue if /g was used. Two further
modifiers can be used to test other return values. If \fBsubstitute_skip\fP is modifiers can be used to test other return values. If \fBsubstitute_skip\fP is
set to a value greater than zero the callout function returns +1 for the match set to a value greater than zero the callout function returns +1 for the match
of that number, and similarly \fBsubstitute_stop\fP returns -1. These cause the of that number, and similarly \fBsubstitute_stop\fP returns -1. These cause the
replacement to be rejected, and -1 causes no further matching to take place. If replacement to be rejected, and -1 causes no further matching to take place. If
either of them is set, \fBsubstitute_callout\fP is assumed. For example: either of them is set, \fBsubstitute_callout\fP is assumed. For example:
.sp .sp
@ -1449,7 +1449,7 @@ either of them are set, \fBsubstitute_callout\fP is assumed. For example:
1(1) Old 0 3 "abc" New 0 5 "<abc> STOPPED" 1(1) Old 0 3 "abc" New 0 5 "<abc> STOPPED"
1: abcdefabcpqr 1: abcdefabcpqr
.sp .sp
If both are set for the same number, stop takes precedence. Only a single skip If both are set for the same number, stop takes precedence. Only a single skip
or stop is supported, which is sufficient for testing that the feature works. or stop is supported, which is sufficient for testing that the feature works.
. .
. .

View File

@ -72,7 +72,7 @@ characters (see the description of \eC in the
.\" HREF .\" HREF
\fBpcre2pattern\fP \fBpcre2pattern\fP
.\" .\"
documentation). For this reason, there is a build-time option that disables documentation). For this reason, there is a build-time option that disables
support for \eC completely. There is also a less draconian compile-time option support for \eC completely. There is also a less draconian compile-time option
for locking out the use of \eC when a pattern is compiled. for locking out the use of \eC when a pattern is compiled.
.P .P
@ -135,13 +135,13 @@ characters that are all from the same Unicode script. However, because some
scripts are commonly used together, and because some diacritical and other scripts are commonly used together, and because some diacritical and other
marks are used with multiple scripts, it is not that simple. marks are used with multiple scripts, it is not that simple.
.P .P
Every Unicode character has a Script property, mostly with a value Every Unicode character has a Script property, mostly with a value
corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There corresponding to the name of a script, such as Latin, Greek, or Cyrillic. There
are also three special values: are also three special values:
.P .P
"Unknown" is used for code points that have not been assigned, and also for the "Unknown" is used for code points that have not been assigned, and also for the
surrogate code points. In the PCRE2 32-bit library, characters whose code surrogate code points. In the PCRE2 32-bit library, characters whose code
points are greater than the Unicode maximum (U+10FFFF), which are accessible points are greater than the Unicode maximum (U+10FFFF), which are accessible
only in non-UTF mode, are assigned the Unknown script. only in non-UTF mode, are assigned the Unknown script.
.P .P
"Common" is used for characters that are used with many scripts. These include "Common" is used for characters that are used with many scripts. These include
@ -152,19 +152,19 @@ digits 0 to 9.
previous character. These are considered to take on the script of the character previous character. These are considered to take on the script of the character
that they modify. that they modify.
.P .P
Some Inherited characters are used with many scripts, but many of them are only Some Inherited characters are used with many scripts, but many of them are only
normally used with a small number of scripts. For example, U+102E0 (Coptic normally used with a small number of scripts. For example, U+102E0 (Coptic
Epact thousands mark) is used only with Arabic and Coptic. In order to make it Epact thousands mark) is used only with Arabic and Coptic. In order to make it
possible to check this, a Unicode property called Script Extension exists. Its possible to check this, a Unicode property called Script Extension exists. Its
value is a list of scripts that apply to the character. For the majority of value is a list of scripts that apply to the character. For the majority of
characters, the list contains just one script, the same one as the Script characters, the list contains just one script, the same one as the Script
property. However, for characters such as U+102E0 more than one Script is property. However, for characters such as U+102E0 more than one Script is
listed. There are also some Common characters that have a single, non-Common listed. There are also some Common characters that have a single, non-Common
script in their Script Extension list. script in their Script Extension list.
.P .P
The next section describes the basic rules for deciding whether a given string The next section describes the basic rules for deciding whether a given string
of characters is a script run. Note, however, that there are some special cases of characters is a script run. Note, however, that there are some special cases
involving the Chinese Han script, and an additional constraint for decimal involving the Chinese Han script, and an additional constraint for decimal
digits. These are covered in subsequent sections. digits. These are covered in subsequent sections.
. .
. .
@ -185,16 +185,16 @@ all the sets of scripts must not be empty.
.P .P
A simple example is an Internet name such as "google.com". The letters are all A simple example is an Internet name such as "google.com". The letters are all
in the Latin script, and the dot is Common, so this string is a script run. in the Latin script, and the dot is Common, so this string is a script run.
However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a However, the Cyrillic letter "o" looks exactly the same as the Latin "o"; a
string that looks the same, but with Cyrillic "o"s is not a script run. string that looks the same, but with Cyrillic "o"s is not a script run.
.P .P
More interesting examples involve characters with more than one script in their More interesting examples involve characters with more than one script in their
Script Extension. Consider the following characters: Script Extension. Consider the following characters:
.sp .sp
U+060C Arabic comma U+060C Arabic comma
U+06D4 Arabic full stop U+06D4 Arabic full stop
.sp .sp
The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and The first has the Script Extension list Arabic, Hanifi Rohingya, Syriac, and
Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could Thaana; the second has just Arabic and Hanifi Rohingya. Both of them could
appear in script runs of either Arabic or Hanifi Rohingya. The first could also appear in script runs of either Arabic or Hanifi Rohingya. The first could also
appear in Syriac or Thaana script runs, but the second could not. appear in Syriac or Thaana script runs, but the second could not.
@ -202,9 +202,9 @@ appear in Syriac or Thaana script runs, but the second could not.
. .
.SS "The Chinese Han script" .SS "The Chinese Han script"
.rs .rs
.sp .sp
The Chinese Han script is commonly used in conjunction with other scripts for The Chinese Han script is commonly used in conjunction with other scripts for
writing certain languages. Japanese uses the Hiragana and Katakana scripts writing certain languages. Japanese uses the Hiragana and Katakana scripts
together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo together with Han; Korean uses Hangul and Han; Taiwanese Mandarin uses Bopomofo
and Han. These three combinations are treated as special cases when checking and Han. These three combinations are treated as special cases when checking
script runs and are, in effect, "virtual scripts". Thus, a script run may script runs and are, in effect, "virtual scripts". Thus, a script run may

View File

@ -29,7 +29,7 @@ if [ $# -gt 1 -a "$1" = "-perl" ] ; then
shift shift
perl=$1 perl=$1
shift shift
fi fi
if [ $# -gt 0 -a "$1" = "-w" ] ; then if [ $# -gt 0 -a "$1" = "-w" ] ; then
perlarg="-w" perlarg="-w"
@ -386,10 +386,10 @@ for (;;)
} }
} }
# By closing OUTFILE explicitly, we avoid a Perl warning in -w mode: # By closing OUTFILE explicitly, we avoid a Perl warning in -w mode:
# Name "main::OUTFILE" used only once. # Name "main::OUTFILE" used only once.
close(OUTFILE) if $outfile eq "OUTFILE"; close(OUTFILE) if $outfile eq "OUTFILE";
PERLEND PERLEND
) | $perl $perlarg - $@ ) | $perl $perlarg - $@

View File

@ -44,7 +44,7 @@ POSSIBILITY OF SUCH DAMAGE.
#define PCRE2_MAJOR 10 #define PCRE2_MAJOR 10
#define PCRE2_MINOR 33 #define PCRE2_MINOR 33
#define PCRE2_PRERELEASE -RC1 #define PCRE2_PRERELEASE -RC1
#define PCRE2_DATE 2018-09-14 #define PCRE2_DATE 2019-03-03
/* When an application links to a PCRE DLL in Windows, the symbols that are /* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE2, the appropriate imported have to be identified as such. When building PCRE2, the appropriate
@ -150,6 +150,7 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ #define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ #define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */ #define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
/* These are for pcre2_jit_compile(). */ /* These are for pcre2_jit_compile(). */

View File

@ -604,15 +604,15 @@ for(;;)
case OP_SCBRAPOS: case OP_SCBRAPOS:
if (cb->had_recurse) return FALSE; if (cb->had_recurse) return FALSE;
break; break;
/* A script run might have to backtrack if the iterated item can match /* A script run might have to backtrack if the iterated item can match
characters from more than one script. So give up unless repeating an characters from more than one script. So give up unless repeating an
explicit character. */ explicit character. */
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI) if (base_list[0] != OP_CHAR && base_list[0] != OP_CHARI)
return FALSE; return FALSE;
break; break;
/* Atomic sub-patterns and assertions can always auto-possessify their /* Atomic sub-patterns and assertions can always auto-possessify their
last iterator. However, if the group was entered as a result of checking last iterator. However, if the group was entered as a result of checking

View File

@ -407,7 +407,7 @@ return 0;
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_substitute_callout(pcre2_match_context *mcontext, pcre2_set_substitute_callout(pcre2_match_context *mcontext,
int (*substitute_callout)(pcre2_substitute_callout_block *, void *), int (*substitute_callout)(pcre2_substitute_callout_block *, void *),
void *substitute_callout_data) void *substitute_callout_data)
{ {
mcontext->substitute_callout = substitute_callout; mcontext->substitute_callout = substitute_callout;

View File

@ -182,8 +182,8 @@ static const unsigned char compile_error_texts[] =
"\\N{U+dddd} is supported only in Unicode (UTF) mode\0" "\\N{U+dddd} is supported only in Unicode (UTF) mode\0"
"invalid hyphen in option setting\0" "invalid hyphen in option setting\0"
/* 95 */ /* 95 */
"(*alpha_assertion) not recognized\0" "(*alpha_assertion) not recognized\0"
"script runs require Unicode support, which this version of PCRE2 does not have\0" "script runs require Unicode support, which this version of PCRE2 does not have\0"
; ;
/* Match-time and UTF error texts are in the same format. */ /* Match-time and UTF error texts are in the same format. */

View File

@ -525,10 +525,10 @@ bytes in a code unit in that mode. */
enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */ enum { PCRE2_MATCHEDBY_INTERPRETER, /* pcre2_match() */
PCRE2_MATCHEDBY_DFA_INTERPRETER, /* pcre2_dfa_match() */ PCRE2_MATCHEDBY_DFA_INTERPRETER, /* pcre2_dfa_match() */
PCRE2_MATCHEDBY_JIT }; /* pcre2_jit_match() */ PCRE2_MATCHEDBY_JIT }; /* pcre2_jit_match() */
/* Values for the flags field in a match data block. */ /* Values for the flags field in a match data block. */
#define PCRE2_MD_COPIED_SUBJECT 0x01u #define PCRE2_MD_COPIED_SUBJECT 0x01u
/* Magic number to provide a small check against being handed junk. */ /* Magic number to provide a small check against being handed junk. */
@ -1774,7 +1774,7 @@ typedef struct {
uint8_t caseset; /* offset to multichar other cases or zero */ uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */ int32_t other_case; /* offset to other case, or zero if none */
int16_t scriptx; /* script extension value */ int16_t scriptx; /* script extension value */
int16_t dummy; /* spare - to round to multiple of 4 bytes */ int16_t dummy; /* spare - to round to multiple of 4 bytes */
} ucd_record; } ucd_record;
/* UCD access macros */ /* UCD access macros */

View File

@ -7794,12 +7794,12 @@ if (needstype || needsscript)
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0); OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3); // OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
ccbegin = cc; ccbegin = cc;
@ -7848,7 +7848,7 @@ if (needstype || needsscript)
//fprintf(stderr, "~~C\n"); //fprintf(stderr, "~~C\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
@ -7862,12 +7862,12 @@ if (needstype || needsscript)
// PH hacking // PH hacking
//fprintf(stderr, "~~D\n"); //fprintf(stderr, "~~D\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0); OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0);
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
typereg = RETURN_ADDR; typereg = RETURN_ADDR;
} }
@ -9207,9 +9207,9 @@ if (common->utf && *cc == OP_REFI)
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2); OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records)); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records));
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case)); OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case));

View File

@ -138,7 +138,7 @@ for (i = 0; i < 256; i++)
int x = 0; int x = 0;
if (isspace(i)) x += ctype_space; if (isspace(i)) x += ctype_space;
if (isalpha(i)) x += ctype_letter; if (isalpha(i)) x += ctype_letter;
if (islower(i)) x += ctype_lcletter; if (islower(i)) x += ctype_lcletter;
if (isdigit(i)) x += ctype_digit; if (isdigit(i)) x += ctype_digit;
if (isalnum(i) || i == '_') x += ctype_word; if (isalnum(i) || i == '_') x += ctype_word;
*p++ = x; *p++ = x;

View File

@ -96,10 +96,10 @@ pcre2_match_data_free(pcre2_match_data *match_data)
if (match_data != NULL) if (match_data != NULL)
{ {
if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
match_data->memctl.free((void *)match_data->subject, match_data->memctl.free((void *)match_data->subject,
match_data->memctl.memory_data); match_data->memctl.memory_data);
match_data->memctl.free(match_data, match_data->memctl.memory_data); match_data->memctl.free(match_data, match_data->memctl.memory_data);
} }
} }

View File

@ -393,7 +393,7 @@ for(;;)
case OP_ASSERTBACK: case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT: case OP_ASSERTBACK_NOT:
case OP_ONCE: case OP_ONCE:
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
case OP_COND: case OP_COND:
case OP_SCOND: case OP_SCOND:
case OP_REVERSE: case OP_REVERSE:

View File

@ -171,7 +171,7 @@ for (;;)
/* Fall through */ /* Fall through */
case OP_ONCE: case OP_ONCE:
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
case OP_SBRA: case OP_SBRA:
case OP_BRAPOS: case OP_BRAPOS:
case OP_SBRAPOS: case OP_SBRAPOS:
@ -1076,7 +1076,7 @@ do
case OP_CBRAPOS: case OP_CBRAPOS:
case OP_SCBRAPOS: case OP_SCBRAPOS:
case OP_ONCE: case OP_ONCE:
case OP_SCRIPT_RUN: case OP_SCRIPT_RUN:
case OP_ASSERT: case OP_ASSERT:
rc = set_start_bits(re, tcode, utf); rc = set_start_bits(re, tcode, utf);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;

View File

@ -3,8 +3,8 @@
*************************************************/ *************************************************/
/* PCRE2 is a library of functions to support regular expressions whose syntax /* PCRE2 is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. This is and semantics are as close as possible to those of the Perl 5 language. This is
the public header file to be #included by applications that call PCRE2 via the the public header file to be #included by applications that call PCRE2 via the
POSIX wrapper interface. POSIX wrapper interface.
Written by Philip Hazel Written by Philip Hazel
@ -138,7 +138,7 @@ file. */
# endif # endif
#endif #endif
/* The functions. The actual code is in functions with pcre2_xxx names for /* The functions. The actual code is in functions with pcre2_xxx names for
uniqueness. POSIX names are provided as macros for API compatibility with POSIX uniqueness. POSIX names are provided as macros for API compatibility with POSIX
regex functions. It's done this way to ensure that they are always linked from regex functions. It's done this way to ensure that they are always linked from
the PCRE2 library and not by accident from elsewhere (regex_t differs in size the PCRE2 library and not by accident from elsewhere (regex_t differs in size
@ -155,7 +155,7 @@ PCRE2POSIX_EXP_DECL void pcre2_regfree(regex_t *);
#define regerror pcre2_regerror #define regerror pcre2_regerror
#define regfree pcre2_regfree #define regfree pcre2_regfree
/* Debian had a patch that used different names. These are now here to save /* Debian had a patch that used different names. These are now here to save
them having to maintain their own patch, but are not documented by PCRE2. */ them having to maintain their own patch, but are not documented by PCRE2. */
#define PCRE2regcomp pcre2_regcomp #define PCRE2regcomp pcre2_regcomp