Final preparations for 10.00-RC1
This commit is contained in:
parent
91f2e97474
commit
0acc416ed1
19
ChangeLog
19
ChangeLog
|
@ -1,12 +1,12 @@
|
||||||
Change Log for PCRE2
|
Change Log for PCRE2
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
Version 10.0 xx-xxxx-2014
|
Version 10.00 24-November-2014
|
||||||
-------------------------
|
------------------------------
|
||||||
|
|
||||||
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||||
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
|
library. Changes prior to 10.00 are logged in the ChangeLog file for the old
|
||||||
item 20 for release 8.36.
|
API, up to item 20 for release 8.36.
|
||||||
|
|
||||||
The code of the library was heavily revised as part of the new API
|
The code of the library was heavily revised as part of the new API
|
||||||
implementation. Details of each and every modification were not individually
|
implementation. Details of each and every modification were not individually
|
||||||
|
@ -25,7 +25,7 @@ matched by that pattern.
|
||||||
|
|
||||||
4. For the benefit of those who use PCRE2 via some other application, that is,
|
4. For the benefit of those who use PCRE2 via some other application, that is,
|
||||||
not writing the function calls themselves, it is possible to check the PCRE2
|
not writing the function calls themselves, it is possible to check the PCRE2
|
||||||
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
version by matching a pattern such as /(?(VERSION>=10)yes|no)/ against a
|
||||||
string such as "yesno".
|
string such as "yesno".
|
||||||
|
|
||||||
5. There are case-equivalent Unicode characters whose encodings use different
|
5. There are case-equivalent Unicode characters whose encodings use different
|
||||||
|
@ -46,14 +46,15 @@ characters, for example: /(?:(?=.)|(?<!x))a/.
|
||||||
|
|
||||||
7. When an (*ACCEPT) is triggered inside capturing parentheses, it arranges for
|
7. When an (*ACCEPT) is triggered inside capturing parentheses, it arranges for
|
||||||
those parentheses to be closed with whatever has been captured so far. However,
|
those parentheses to be closed with whatever has been captured so far. However,
|
||||||
it was failing to mark any other groups between the hightest capture so far and
|
it was failing to mark any other groups between the highest capture so far and
|
||||||
the current group as "unset". Thus, the ovector for those groups contained
|
the current group as "unset". Thus, the ovector for those groups contained
|
||||||
whatever was previously there. An example is the pattern /(x)|((*ACCEPT))/ when
|
whatever was previously there. An example is the pattern /(x)|((*ACCEPT))/ when
|
||||||
matched against "abcd".
|
matched against "abcd".
|
||||||
|
|
||||||
8. The pcre2_substitute() function has been implemented.
|
8. The pcre2_substitute() function has been implemented.
|
||||||
|
|
||||||
9. If an assertion condition was quantified with a minimum of zero (an odd
|
9. If an assertion used as a condition was quantified with a minimum of zero
|
||||||
thing to do, but it happened), SIGSEGV or other misbehaviour could occur.
|
(an odd thing to do, but it happened), SIGSEGV or other misbehaviour could
|
||||||
|
occur.
|
||||||
|
|
||||||
****
|
****
|
||||||
|
|
50
Makefile.am
50
Makefile.am
|
@ -375,28 +375,34 @@ CLEANFILES += src/pcre2_chartables.c
|
||||||
# when pcre2_jit_compile.c is processed, so they must be distributed.
|
# when pcre2_jit_compile.c is processed, so they must be distributed.
|
||||||
|
|
||||||
EXTRA_DIST += \
|
EXTRA_DIST += \
|
||||||
sljit/sljitConfig.h \
|
src/sljit/sljitConfig.h \
|
||||||
sljit/sljitConfigInternal.h \
|
src/sljit/sljitConfigInternal.h \
|
||||||
sljit/sljitExecAllocator.c \
|
src/sljit/sljitExecAllocator.c \
|
||||||
sljit/sljitLir.c \
|
src/sljit/sljitLir.c \
|
||||||
sljit/sljitLir.h \
|
src/sljit/sljitLir.h \
|
||||||
sljit/sljitNativeARM_32.c \
|
src/sljit/sljitNativeARM_32.c \
|
||||||
sljit/sljitNativeARM_64.c \
|
src/sljit/sljitNativeARM_64.c \
|
||||||
sljit/sljitNativeARM_T2_32.c \
|
src/sljit/sljitNativeARM_T2_32.c \
|
||||||
sljit/sljitNativeMIPS_32.c \
|
src/sljit/sljitNativeMIPS_32.c \
|
||||||
sljit/sljitNativeMIPS_64.c \
|
src/sljit/sljitNativeMIPS_64.c \
|
||||||
sljit/sljitNativeMIPS_common.c \
|
src/sljit/sljitNativeMIPS_common.c \
|
||||||
sljit/sljitNativePPC_32.c \
|
src/sljit/sljitNativePPC_32.c \
|
||||||
sljit/sljitNativePPC_64.c \
|
src/sljit/sljitNativePPC_64.c \
|
||||||
sljit/sljitNativePPC_common.c \
|
src/sljit/sljitNativePPC_common.c \
|
||||||
sljit/sljitNativeSPARC_32.c \
|
src/sljit/sljitNativeSPARC_32.c \
|
||||||
sljit/sljitNativeSPARC_common.c \
|
src/sljit/sljitNativeSPARC_common.c \
|
||||||
sljit/sljitNativeTILEGX-encoder.c \
|
src/sljit/sljitNativeTILEGX-encoder.c \
|
||||||
sljit/sljitNativeTILEGX_64.c \
|
src/sljit/sljitNativeTILEGX_64.c \
|
||||||
sljit/sljitNativeX86_32.c \
|
src/sljit/sljitNativeX86_32.c \
|
||||||
sljit/sljitNativeX86_64.c \
|
src/sljit/sljitNativeX86_64.c \
|
||||||
sljit/sljitNativeX86_common.c \
|
src/sljit/sljitNativeX86_common.c \
|
||||||
sljit/sljitUtils.c
|
src/sljit/sljitUtils.c
|
||||||
|
|
||||||
|
# Some of the JIT sources are also in separate files that are #included.
|
||||||
|
|
||||||
|
EXTRA_DIST += \
|
||||||
|
src/pcre2_jit_match.c \
|
||||||
|
src/pcre2_jit_misc.c
|
||||||
|
|
||||||
if WITH_PCRE2_8
|
if WITH_PCRE2_8
|
||||||
libpcre2_8_la_LDFLAGS = $(EXTRA_LIBPCRE2_8_LDFLAGS)
|
libpcre2_8_la_LDFLAGS = $(EXTRA_LIBPCRE2_8_LDFLAGS)
|
||||||
|
|
12
NEWS
12
NEWS
|
@ -1,11 +1,13 @@
|
||||||
News about PCRE2 releases
|
News about PCRE2 releases
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
Version 10.0 xx-xxxx-2014
|
Version 10.00 24-November-2014
|
||||||
-------------------------
|
------------------------------
|
||||||
|
|
||||||
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||||
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
|
library. Changes prior to 10.00 are logged in the ChangeLog file for the old
|
||||||
item 20 for release 8.36.
|
API, up to item 20 for release 8.36. New programs are recommended to use the
|
||||||
|
new library. Programs that use the original (PCRE1) API will need changing
|
||||||
|
before linking with the new library.
|
||||||
|
|
||||||
****
|
****
|
||||||
|
|
72
README
72
README
|
@ -5,11 +5,9 @@ PCRE2 is a re-implementation of the original PCRE library with an entirely new
|
||||||
API. The latest release of PCRE2 is always available in three alternative
|
API. The latest release of PCRE2 is always available in three alternative
|
||||||
formats from:
|
formats from:
|
||||||
|
|
||||||
FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||||
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
|
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
|
|
||||||
|
|
||||||
There is a mailing list for discussion about the development of PCRE (both the
|
There is a mailing list for discussion about the development of PCRE (both the
|
||||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||||
|
@ -46,7 +44,7 @@ there as yet no C++ wrappers.
|
||||||
|
|
||||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||||
man page). These end up in the library called libpcre2posix. Note that this
|
man page). These can be found in a library called libpcre2posix. Note that this
|
||||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||||
and does not give full access to all of PCRE2's facilities.
|
and does not give full access to all of PCRE2's facilities.
|
||||||
|
@ -72,7 +70,7 @@ new names.
|
||||||
|
|
||||||
|
|
||||||
Documentation for PCRE2
|
Documentation for PCRE2
|
||||||
----------------------
|
-----------------------
|
||||||
|
|
||||||
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
||||||
with a set of man pages whose names all start with "pcre2". The one that is
|
with a set of man pages whose names all start with "pcre2". The one that is
|
||||||
|
@ -95,7 +93,7 @@ PCRE2 documentation is supplied in two other forms:
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 on non-Unix-like systems
|
Building PCRE2 on non-Unix-like systems
|
||||||
--------------------------------------
|
---------------------------------------
|
||||||
|
|
||||||
For a non-Unix-like system, please read the comments in the file
|
For a non-Unix-like system, please read the comments in the file
|
||||||
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||||
|
@ -112,7 +110,7 @@ library, because it uses only Standard C functions.
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 without using autotools
|
Building PCRE2 without using autotools
|
||||||
-------------------------------------
|
--------------------------------------
|
||||||
|
|
||||||
The use of autotools (in particular, libtool) is problematic in some
|
The use of autotools (in particular, libtool) is problematic in some
|
||||||
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||||
|
@ -120,7 +118,7 @@ file for ways of building PCRE2 without using autotools.
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 using autotools
|
Building PCRE2 using autotools
|
||||||
-----------------------------
|
------------------------------
|
||||||
|
|
||||||
The following instructions assume the use of the widely used "configure; make;
|
The following instructions assume the use of the widely used "configure; make;
|
||||||
make install" (autotools) process.
|
make install" (autotools) process.
|
||||||
|
@ -166,15 +164,15 @@ library. They are also documented in the pcre2build man page.
|
||||||
|
|
||||||
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
||||||
the "configure" command, the 16-bit library is also built. If you add
|
the "configure" command, the 16-bit library is also built. If you add
|
||||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also built.
|
--enable-pcre2-32 to the "configure" command, the 32-bit library is also
|
||||||
If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable
|
built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8
|
||||||
building the 8-bit library.
|
to disable building the 8-bit library.
|
||||||
|
|
||||||
. If you want to include support for just-in-time compiling, which can give
|
. If you want to include support for just-in-time compiling, which can give
|
||||||
large performance improvements on certain platforms, add --enable-jit to the
|
large performance improvements on certain platforms, add --enable-jit to the
|
||||||
"configure" command. This support is available only for certain hardware
|
"configure" command. This support is available only for certain hardware
|
||||||
architectures. If you try to enable it on an unsupported architecture, there
|
architectures. If you try to enable it on an unsupported architecture, there
|
||||||
will be a compile time error. FIXME: NOT YET IMPLEMENTED.
|
will be a compile time error.
|
||||||
|
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
@ -196,13 +194,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
However, only the basic two-letter properties such as Lu are supported.
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences, as indicating the
|
||||||
end of a line. Whatever you specify at build time is the default; the caller
|
end of a line. Whatever you specify at build time is the default; the caller
|
||||||
of PCRE2 can change the selection at run time. The default newline indicator
|
of PCRE2 can change the selection at run time. The default newline indicator
|
||||||
is a single LF character (the Unix standard). You can specify the default
|
is a single LF character (the Unix standard). You can specify the default
|
||||||
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf,
|
||||||
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||||
--enable-newline-is-any to the "configure" command, respectively.
|
--enable-newline-is-any to the "configure" command, respectively.
|
||||||
|
|
||||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||||
|
@ -251,8 +249,9 @@ library. They are also documented in the pcre2build man page.
|
||||||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||||
offsets. Increasing the internal link size reduces performance. In the 32-bit
|
offsets. Increasing the internal link size reduces performance in the 8-bit
|
||||||
library, the link size setting is ignored, as 4-byte offsets are always used.
|
and 16-bit libraries. In the 32-bit library, the link size setting is
|
||||||
|
ignored, as 4-byte offsets are always used.
|
||||||
|
|
||||||
. You can build PCRE2 so that its internal match() function that is called from
|
. You can build PCRE2 so that its internal match() function that is called from
|
||||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||||
|
@ -376,12 +375,13 @@ contains compiler output from tests that "configure" runs.
|
||||||
Once "configure" has run, you can run "make". This builds whichever of the
|
Once "configure" has run, you can run "make". This builds whichever of the
|
||||||
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
||||||
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
||||||
test program called pcre2_jit_test is built as well. FIXME: still to be
|
test program called pcre2_jit_test is built as well. If the 8-bit library is
|
||||||
implemented. If the 8-bit library is built, libpcre2-posix and the pcre2grep
|
built, libpcre2-posix and the pcre2grep command are also built. Running
|
||||||
command are also built.
|
"make" with the -j option may speed up compilation on multiprocessor systems.
|
||||||
|
|
||||||
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
||||||
tests are given below in a separate section of this document.
|
tests are given below in a separate section of this document. The -j option of
|
||||||
|
"make" can also be used when running the tests.
|
||||||
|
|
||||||
You can use "make install" to install PCRE2 into live directories on your
|
You can use "make install" to install PCRE2 into live directories on your
|
||||||
system. The following are installed (file names are all relative to the
|
system. The following are installed (file names are all relative to the
|
||||||
|
@ -528,7 +528,7 @@ Testing PCRE2
|
||||||
|
|
||||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||||
There is another script called RunGrepTest that tests the options of the
|
There is another script called RunGrepTest that tests the options of the
|
||||||
pcre2grep command. When JIT support is enabled, another test program called
|
pcre2grep command. When JIT support is enabled, a third test program called
|
||||||
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
||||||
you obey "make check". For other environments, see the instructions in
|
you obey "make check". For other environments, see the instructions in
|
||||||
NON-AUTOTOOLS-BUILD.
|
NON-AUTOTOOLS-BUILD.
|
||||||
|
@ -709,7 +709,6 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_context.c )
|
src/pcre2_context.c )
|
||||||
src/pcre2_dfa_match.c )
|
src/pcre2_dfa_match.c )
|
||||||
src/pcre2_error.c )
|
src/pcre2_error.c )
|
||||||
src/pcre2_exec.c )
|
|
||||||
src/pcre2_jit_compile.c )
|
src/pcre2_jit_compile.c )
|
||||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
@ -721,6 +720,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_pattern_info.c )
|
src/pcre2_pattern_info.c )
|
||||||
src/pcre2_string_utils.c )
|
src/pcre2_string_utils.c )
|
||||||
src/pcre2_study.c )
|
src/pcre2_study.c )
|
||||||
|
src/pcre2_substitute.c )
|
||||||
src/pcre2_substring.c )
|
src/pcre2_substring.c )
|
||||||
src/pcre2_tables.c )
|
src/pcre2_tables.c )
|
||||||
src/pcre2_ucd.c )
|
src/pcre2_ucd.c )
|
||||||
|
@ -736,13 +736,15 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_intmodedep.h a mode-specific internal header
|
src/pcre2_intmodedep.h a mode-specific internal header
|
||||||
src/pcre2_ucp.h header for Unicode property handling
|
src/pcre2_ucp.h header for Unicode property handling
|
||||||
|
|
||||||
sljit/* 16 files that make up the JIT compiler FIXME
|
src/sljit/* source files for the JIT compiler
|
||||||
|
|
||||||
(B) Source files for programs that use PCRE2:
|
(B) Source files for programs that use PCRE2:
|
||||||
|
|
||||||
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
||||||
src/pcre2grep.c source of a grep utility that uses PCRE2
|
src/pcre2grep.c source of a grep utility that uses PCRE2
|
||||||
src/pcre2test.c comprehensive test program
|
src/pcre2test.c comprehensive test program
|
||||||
|
src/pcre2_printint.c part of pcre2test
|
||||||
|
src/pcre2_jit_test.c JIT test program
|
||||||
|
|
||||||
(C) Auxiliary files:
|
(C) Auxiliary files:
|
||||||
|
|
||||||
|
@ -790,7 +792,6 @@ The distribution should contain the files listed below.
|
||||||
mkinstalldirs script for making install directories
|
mkinstalldirs script for making install directories
|
||||||
perltest.sh Script for running a Perl test program
|
perltest.sh Script for running a Perl test program
|
||||||
pcre2-config.in source of script which retains PCRE2 information
|
pcre2-config.in source of script which retains PCRE2 information
|
||||||
pcre2_jit_test.c test program for the JIT compiler
|
|
||||||
testdata/testinput* test data for main library tests
|
testdata/testinput* test data for main library tests
|
||||||
testdata/testoutput* expected test results
|
testdata/testoutput* expected test results
|
||||||
testdata/grep* input and output for pcre2grep tests
|
testdata/grep* input and output for pcre2grep tests
|
||||||
|
@ -805,25 +806,14 @@ The distribution should contain the files listed below.
|
||||||
CMakeLists.txt
|
CMakeLists.txt
|
||||||
config-cmake.h.in
|
config-cmake.h.in
|
||||||
|
|
||||||
(E) Auxiliary files for VPASCAL FIXME FIXME
|
(E) Auxiliary files for building PCRE2 "by hand"
|
||||||
|
|
||||||
makevp.bat
|
|
||||||
makevp_c.txt
|
|
||||||
makevp_l.txt
|
|
||||||
pcre2gexp.pas
|
|
||||||
|
|
||||||
(F) Auxiliary files for building PCRE2 "by hand"
|
|
||||||
|
|
||||||
pcre2.h.generic ) a version of the public PCRE2 header file
|
pcre2.h.generic ) a version of the public PCRE2 header file
|
||||||
) for use in non-"configure" environments
|
) for use in non-"configure" environments
|
||||||
config.h.generic ) a version of config.h for use in non-"configure"
|
config.h.generic ) a version of config.h for use in non-"configure"
|
||||||
) environments
|
) environments
|
||||||
|
|
||||||
(F) Miscellaneous
|
|
||||||
|
|
||||||
RunTest.bat a script for running tests under Windows FIXME
|
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 03 November 2014
|
Last updated: 24 November 2014
|
||||||
|
|
|
@ -10,8 +10,8 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
|
||||||
|
|
||||||
m4_define(pcre2_major, [10])
|
m4_define(pcre2_major, [10])
|
||||||
m4_define(pcre2_minor, [00])
|
m4_define(pcre2_minor, [00])
|
||||||
m4_define(pcre2_prerelease, [-DEV])
|
m4_define(pcre2_prerelease, [-RC1])
|
||||||
m4_define(pcre2_date, [2014-99-99])
|
m4_define(pcre2_date, [2014-11-24])
|
||||||
|
|
||||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||||
# 50 lines of this file. Please update that if the variables above are moved.
|
# 50 lines of this file. Please update that if the variables above are moved.
|
||||||
|
|
|
@ -5,11 +5,9 @@ PCRE2 is a re-implementation of the original PCRE library with an entirely new
|
||||||
API. The latest release of PCRE2 is always available in three alternative
|
API. The latest release of PCRE2 is always available in three alternative
|
||||||
formats from:
|
formats from:
|
||||||
|
|
||||||
FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||||
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
|
|
||||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
|
|
||||||
|
|
||||||
There is a mailing list for discussion about the development of PCRE (both the
|
There is a mailing list for discussion about the development of PCRE (both the
|
||||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||||
|
@ -46,7 +44,7 @@ there as yet no C++ wrappers.
|
||||||
|
|
||||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||||
man page). These end up in the library called libpcre2posix. Note that this
|
man page). These can be found in a library called libpcre2posix. Note that this
|
||||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||||
and does not give full access to all of PCRE2's facilities.
|
and does not give full access to all of PCRE2's facilities.
|
||||||
|
@ -72,7 +70,7 @@ new names.
|
||||||
|
|
||||||
|
|
||||||
Documentation for PCRE2
|
Documentation for PCRE2
|
||||||
----------------------
|
-----------------------
|
||||||
|
|
||||||
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
||||||
with a set of man pages whose names all start with "pcre2". The one that is
|
with a set of man pages whose names all start with "pcre2". The one that is
|
||||||
|
@ -95,7 +93,7 @@ PCRE2 documentation is supplied in two other forms:
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 on non-Unix-like systems
|
Building PCRE2 on non-Unix-like systems
|
||||||
--------------------------------------
|
---------------------------------------
|
||||||
|
|
||||||
For a non-Unix-like system, please read the comments in the file
|
For a non-Unix-like system, please read the comments in the file
|
||||||
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||||
|
@ -112,7 +110,7 @@ library, because it uses only Standard C functions.
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 without using autotools
|
Building PCRE2 without using autotools
|
||||||
-------------------------------------
|
--------------------------------------
|
||||||
|
|
||||||
The use of autotools (in particular, libtool) is problematic in some
|
The use of autotools (in particular, libtool) is problematic in some
|
||||||
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||||
|
@ -120,7 +118,7 @@ file for ways of building PCRE2 without using autotools.
|
||||||
|
|
||||||
|
|
||||||
Building PCRE2 using autotools
|
Building PCRE2 using autotools
|
||||||
-----------------------------
|
------------------------------
|
||||||
|
|
||||||
The following instructions assume the use of the widely used "configure; make;
|
The following instructions assume the use of the widely used "configure; make;
|
||||||
make install" (autotools) process.
|
make install" (autotools) process.
|
||||||
|
@ -166,15 +164,15 @@ library. They are also documented in the pcre2build man page.
|
||||||
|
|
||||||
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
||||||
the "configure" command, the 16-bit library is also built. If you add
|
the "configure" command, the 16-bit library is also built. If you add
|
||||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also built.
|
--enable-pcre2-32 to the "configure" command, the 32-bit library is also
|
||||||
If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable
|
built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8
|
||||||
building the 8-bit library.
|
to disable building the 8-bit library.
|
||||||
|
|
||||||
. If you want to include support for just-in-time compiling, which can give
|
. If you want to include support for just-in-time compiling, which can give
|
||||||
large performance improvements on certain platforms, add --enable-jit to the
|
large performance improvements on certain platforms, add --enable-jit to the
|
||||||
"configure" command. This support is available only for certain hardware
|
"configure" command. This support is available only for certain hardware
|
||||||
architectures. If you try to enable it on an unsupported architecture, there
|
architectures. If you try to enable it on an unsupported architecture, there
|
||||||
will be a compile time error. FIXME: NOT YET IMPLEMENTED.
|
will be a compile time error.
|
||||||
|
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
@ -196,13 +194,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
\P, \p, and \X sequences that recognize Unicode character properties.
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
However, only the basic two-letter properties such as Lu are supported.
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences, as indicating the
|
||||||
end of a line. Whatever you specify at build time is the default; the caller
|
end of a line. Whatever you specify at build time is the default; the caller
|
||||||
of PCRE2 can change the selection at run time. The default newline indicator
|
of PCRE2 can change the selection at run time. The default newline indicator
|
||||||
is a single LF character (the Unix standard). You can specify the default
|
is a single LF character (the Unix standard). You can specify the default
|
||||||
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf,
|
||||||
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||||
--enable-newline-is-any to the "configure" command, respectively.
|
--enable-newline-is-any to the "configure" command, respectively.
|
||||||
|
|
||||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||||
|
@ -251,8 +249,9 @@ library. They are also documented in the pcre2build man page.
|
||||||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||||
offsets. Increasing the internal link size reduces performance. In the 32-bit
|
offsets. Increasing the internal link size reduces performance in the 8-bit
|
||||||
library, the link size setting is ignored, as 4-byte offsets are always used.
|
and 16-bit libraries. In the 32-bit library, the link size setting is
|
||||||
|
ignored, as 4-byte offsets are always used.
|
||||||
|
|
||||||
. You can build PCRE2 so that its internal match() function that is called from
|
. You can build PCRE2 so that its internal match() function that is called from
|
||||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||||
|
@ -376,12 +375,13 @@ contains compiler output from tests that "configure" runs.
|
||||||
Once "configure" has run, you can run "make". This builds whichever of the
|
Once "configure" has run, you can run "make". This builds whichever of the
|
||||||
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
||||||
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
||||||
test program called pcre2_jit_test is built as well. FIXME: still to be
|
test program called pcre2_jit_test is built as well. If the 8-bit library is
|
||||||
implemented. If the 8-bit library is built, libpcre2-posix and the pcre2grep
|
built, libpcre2-posix and the pcre2grep command are also built. Running
|
||||||
command are also built.
|
"make" with the -j option may speed up compilation on multiprocessor systems.
|
||||||
|
|
||||||
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
||||||
tests are given below in a separate section of this document.
|
tests are given below in a separate section of this document. The -j option of
|
||||||
|
"make" can also be used when running the tests.
|
||||||
|
|
||||||
You can use "make install" to install PCRE2 into live directories on your
|
You can use "make install" to install PCRE2 into live directories on your
|
||||||
system. The following are installed (file names are all relative to the
|
system. The following are installed (file names are all relative to the
|
||||||
|
@ -528,7 +528,7 @@ Testing PCRE2
|
||||||
|
|
||||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||||
There is another script called RunGrepTest that tests the options of the
|
There is another script called RunGrepTest that tests the options of the
|
||||||
pcre2grep command. When JIT support is enabled, another test program called
|
pcre2grep command. When JIT support is enabled, a third test program called
|
||||||
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
||||||
you obey "make check". For other environments, see the instructions in
|
you obey "make check". For other environments, see the instructions in
|
||||||
NON-AUTOTOOLS-BUILD.
|
NON-AUTOTOOLS-BUILD.
|
||||||
|
@ -709,7 +709,6 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_context.c )
|
src/pcre2_context.c )
|
||||||
src/pcre2_dfa_match.c )
|
src/pcre2_dfa_match.c )
|
||||||
src/pcre2_error.c )
|
src/pcre2_error.c )
|
||||||
src/pcre2_exec.c )
|
|
||||||
src/pcre2_jit_compile.c )
|
src/pcre2_jit_compile.c )
|
||||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
@ -721,6 +720,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_pattern_info.c )
|
src/pcre2_pattern_info.c )
|
||||||
src/pcre2_string_utils.c )
|
src/pcre2_string_utils.c )
|
||||||
src/pcre2_study.c )
|
src/pcre2_study.c )
|
||||||
|
src/pcre2_substitute.c )
|
||||||
src/pcre2_substring.c )
|
src/pcre2_substring.c )
|
||||||
src/pcre2_tables.c )
|
src/pcre2_tables.c )
|
||||||
src/pcre2_ucd.c )
|
src/pcre2_ucd.c )
|
||||||
|
@ -736,13 +736,15 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_intmodedep.h a mode-specific internal header
|
src/pcre2_intmodedep.h a mode-specific internal header
|
||||||
src/pcre2_ucp.h header for Unicode property handling
|
src/pcre2_ucp.h header for Unicode property handling
|
||||||
|
|
||||||
sljit/* 16 files that make up the JIT compiler FIXME
|
sljit/* source files for the JIT compiler
|
||||||
|
|
||||||
(B) Source files for programs that use PCRE2:
|
(B) Source files for programs that use PCRE2:
|
||||||
|
|
||||||
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
||||||
src/pcre2grep.c source of a grep utility that uses PCRE2
|
src/pcre2grep.c source of a grep utility that uses PCRE2
|
||||||
src/pcre2test.c comprehensive test program
|
src/pcre2test.c comprehensive test program
|
||||||
|
src/pcre2_printint.c part of pcre2test
|
||||||
|
src/pcre2_jit_test.c JIT test program
|
||||||
|
|
||||||
(C) Auxiliary files:
|
(C) Auxiliary files:
|
||||||
|
|
||||||
|
@ -790,7 +792,6 @@ The distribution should contain the files listed below.
|
||||||
mkinstalldirs script for making install directories
|
mkinstalldirs script for making install directories
|
||||||
perltest.sh Script for running a Perl test program
|
perltest.sh Script for running a Perl test program
|
||||||
pcre2-config.in source of script which retains PCRE2 information
|
pcre2-config.in source of script which retains PCRE2 information
|
||||||
pcre2_jit_test.c test program for the JIT compiler
|
|
||||||
testdata/testinput* test data for main library tests
|
testdata/testinput* test data for main library tests
|
||||||
testdata/testoutput* expected test results
|
testdata/testoutput* expected test results
|
||||||
testdata/grep* input and output for pcre2grep tests
|
testdata/grep* input and output for pcre2grep tests
|
||||||
|
@ -805,25 +806,14 @@ The distribution should contain the files listed below.
|
||||||
CMakeLists.txt
|
CMakeLists.txt
|
||||||
config-cmake.h.in
|
config-cmake.h.in
|
||||||
|
|
||||||
(E) Auxiliary files for VPASCAL FIXME FIXME
|
(E) Auxiliary files for building PCRE2 "by hand"
|
||||||
|
|
||||||
makevp.bat
|
|
||||||
makevp_c.txt
|
|
||||||
makevp_l.txt
|
|
||||||
pcre2gexp.pas
|
|
||||||
|
|
||||||
(F) Auxiliary files for building PCRE2 "by hand"
|
|
||||||
|
|
||||||
pcre2.h.generic ) a version of the public PCRE2 header file
|
pcre2.h.generic ) a version of the public PCRE2 header file
|
||||||
) for use in non-"configure" environments
|
) for use in non-"configure" environments
|
||||||
config.h.generic ) a version of config.h for use in non-"configure"
|
config.h.generic ) a version of config.h for use in non-"configure"
|
||||||
) environments
|
) environments
|
||||||
|
|
||||||
(F) Miscellaneous
|
|
||||||
|
|
||||||
RunTest.bat a script for running tests under Windows FIXME
|
|
||||||
|
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 03 November 2014
|
Last updated: 24 November 2014
|
||||||
|
|
|
@ -2074,6 +2074,12 @@ returned by <b>pcre2_get_startchar()</b>. For a non-partial match, this can be
|
||||||
different to the value of <i>ovector[0]</i> if the pattern contains the \K
|
different to the value of <i>ovector[0]</i> if the pattern contains the \K
|
||||||
escape sequence. After a partial match, however, this value is always the same
|
escape sequence. After a partial match, however, this value is always the same
|
||||||
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The <b>startchar</b> field is also used to return the offset of an invalid
|
||||||
|
UTF character when UTF checking fails. Details are given in the
|
||||||
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||||
|
page.
|
||||||
<a name="errorlist"></a></P>
|
<a name="errorlist"></a></P>
|
||||||
<br><a name="SEC26" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
<br><a name="SEC26" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -2658,7 +2664,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC36" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC36" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 21 November 2014
|
Last updated: 23 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -197,9 +197,9 @@ the string "dog" matched against the ungreedy pattern shown above:
|
||||||
<pre>
|
<pre>
|
||||||
/dog(sbody)??/
|
/dog(sbody)??/
|
||||||
</pre>
|
</pre>
|
||||||
Whereas the standard functions stop as soon as they find the complete match for
|
Whereas the standard function stops as soon as it finds the complete match for
|
||||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
"dog", the DFA function also finds the partial match for "dogsbody", and so
|
||||||
return that when PCRE2_PARTIAL_HARD is set.
|
returns that when PCRE2_PARTIAL_HARD is set.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -244,7 +244,7 @@ input lines. Each set starts with a regular expression pattern, followed by any
|
||||||
number of subject lines to be matched against that pattern. In between sets of
|
number of subject lines to be matched against that pattern. In between sets of
|
||||||
test data, command lines that begin with a hash (#) character may appear. This
|
test data, command lines that begin with a hash (#) character may appear. This
|
||||||
file format, with some restrictions, can also be processed by the
|
file format, with some restrictions, can also be processed by the
|
||||||
<b>perltest.pl</b> script that is distributed with PCRE2 as a means of checking
|
<b>perltest.sh</b> script that is distributed with PCRE2 as a means of checking
|
||||||
that the behaviour of PCRE2 and Perl is the same.
|
that the behaviour of PCRE2 and Perl is the same.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -287,11 +287,11 @@ patterns. Modifiers on a pattern can change these settings.
|
||||||
#perltest
|
#perltest
|
||||||
</pre>
|
</pre>
|
||||||
The appearance of this line causes all subsequent modifier settings to be
|
The appearance of this line causes all subsequent modifier settings to be
|
||||||
checked for compatibility with the <b>perltest.pl</b> script, which is used to
|
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
||||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||||
lines, none of the other command lines are permitted, because they and many
|
lines, none of the other command lines are permitted, because they and many
|
||||||
of the modifiers are specific to <b>pcre2test</b>, and should not be used in
|
of the modifiers are specific to <b>pcre2test</b>, and should not be used in
|
||||||
test files that are also processed by <b>perltest.pl</b>. The <b>#perltest</b>
|
test files that are also processed by <b>perltest.sh</b>. The <b>#perltest</b>
|
||||||
command helps detect tests that are accidentally put in the wrong file.
|
command helps detect tests that are accidentally put in the wrong file.
|
||||||
<pre>
|
<pre>
|
||||||
#subject <modifier-list>
|
#subject <modifier-list>
|
||||||
|
@ -307,7 +307,7 @@ for both patterns and subject lines, whereas others are valid for one or the
|
||||||
other only. Each modifier has a long name, for example "anchored", and some of
|
other only. Each modifier has a long name, for example "anchored", and some of
|
||||||
them must be followed by an equals sign and a value, for example, "offset=12".
|
them must be followed by an equals sign and a value, for example, "offset=12".
|
||||||
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
||||||
previous default setting.
|
previous setting.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A few of the more common modifiers can also be specified as single letters, for
|
A few of the more common modifiers can also be specified as single letters, for
|
||||||
|
@ -376,7 +376,7 @@ encoding non-printing characters in a visible way:
|
||||||
\xhh hexadecimal byte (up to 2 hex digits)
|
\xhh hexadecimal byte (up to 2 hex digits)
|
||||||
\x{hh...} hexadecimal character (any number of hex digits)
|
\x{hh...} hexadecimal character (any number of hex digits)
|
||||||
</pre>
|
</pre>
|
||||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
The use of \x{hh...} is not dependent on the use of the <b>utf</b> modifier on
|
||||||
the pattern. It is recognized always. There may be any number of hexadecimal
|
the pattern. It is recognized always. There may be any number of hexadecimal
|
||||||
digits inside the braces; invalid values provoke error messages.
|
digits inside the braces; invalid values provoke error messages.
|
||||||
</P>
|
</P>
|
||||||
|
@ -411,7 +411,7 @@ is converted to "abcabcabcabc". This feature does not support nesting. To
|
||||||
include a closing square bracket in the characters, code it as \x5D.
|
include a closing square bracket in the characters, code it as \x5D.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
A backslash followed by an equals sign marke the end of the subject string and
|
A backslash followed by an equals sign marks the end of the subject string and
|
||||||
the start of a modifier list. For example:
|
the start of a modifier list. For example:
|
||||||
<pre>
|
<pre>
|
||||||
abc\=notbol,notempty
|
abc\=notbol,notempty
|
||||||
|
@ -503,8 +503,8 @@ is built, with the default default being Unicode.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>newline</b> modifier specifies which characters are to be interpreted as
|
The <b>newline</b> modifier specifies which characters are to be interpreted as
|
||||||
newlines, both in the pattern and (by default) in subject lines. The type must
|
newlines, both in the pattern and in subject lines. The type must be one of CR,
|
||||||
be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Information about a pattern
|
Information about a pattern
|
||||||
|
@ -522,8 +522,8 @@ regression tests can be used in different environments.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>fullbincode</b> modifier, by contrast, <i>does</i> include length and
|
The <b>fullbincode</b> modifier, by contrast, <i>does</i> include length and
|
||||||
offset values. This is used in a few special tests and is also useful for
|
offset values. This is used in a few special tests that run only for specific
|
||||||
one-off tests.
|
code unit widths and link sizes, and is also useful for one-off tests.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>info</b> modifier requests information about the compiled pattern
|
The <b>info</b> modifier requests information about the compiled pattern
|
||||||
|
@ -546,13 +546,14 @@ some typical examples:
|
||||||
Last code unit = 'c' (caseless)
|
Last code unit = 'c' (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
</pre>
|
</pre>
|
||||||
"Compile options" are those specified to the compile function; "overall
|
"Compile options" are those specified by modifiers; "overall options" have
|
||||||
options" have added options that are taken or deduced from the pattern. If both
|
added options that are taken or deduced from the pattern. If both sets of
|
||||||
sets of options are the same, just a single "options" line is output. "First
|
options are the same, just a single "options" line is output; if there are no
|
||||||
code unit" is where any match must start; if there is more than one they are
|
options, the line is omitted. "First code unit" is where any match must start;
|
||||||
listed as "starting code units". "Last code unit" is the last literal code unit
|
if there is more than one they are listed as "starting code units". "Last code
|
||||||
that must be present in any match. This is not necessarily the last character.
|
unit" is the last literal code unit that must be present in any match. This is
|
||||||
These lines are omitted if no starting or ending code units are recorded.
|
not necessarily the last character. These lines are omitted if no starting or
|
||||||
|
ending code units are recorded.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
|
@ -565,16 +566,16 @@ pairs. For example:
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
</pre>
|
</pre>
|
||||||
This feature is provided as a way of creating patterns that contain binary zero
|
This feature is provided as a way of creating patterns that contain binary zero
|
||||||
characters. By default, <b>pcre2test</b> passes patterns as zero-terminated
|
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
||||||
strings to <b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED.
|
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
||||||
However, for patterns specified in hexadecimal, the actual length of the
|
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
||||||
pattern is passed.
|
actual length of the pattern is passed.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
JIT compilation
|
JIT compilation
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>/jit</b> modifier may optionally be followed by and equals sign and a
|
The <b>/jit</b> modifier may optionally be followed by an equals sign and a
|
||||||
number in the range 0 to 7:
|
number in the range 0 to 7:
|
||||||
<pre>
|
<pre>
|
||||||
0 disable JIT
|
0 disable JIT
|
||||||
|
@ -606,7 +607,7 @@ pattern shows whether JIT compilation was or was not successful. If
|
||||||
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
|
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
|
||||||
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
|
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
|
||||||
added to the first output line after a match or non match when JIT-compiled
|
added to the first output line after a match or non match when JIT-compiled
|
||||||
code was actually used.
|
code was actually used in the match.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Setting a locale
|
Setting a locale
|
||||||
|
@ -689,8 +690,8 @@ be aborted.
|
||||||
Using alternative character tables
|
Using alternative character tables
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>/tables</b> modifier must be followed by a single digit. It causes a
|
The value specified for the <b>/tables</b> modifier must be one of the digits 0,
|
||||||
specific set of built-in character tables to be passed to
|
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||||
<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
|
<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
|
||||||
different character tables. The digit specifies the tables as follows:
|
different character tables. The digit specifies the tables as follows:
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -800,13 +801,13 @@ The effects of these modifiers are described in the following sections.
|
||||||
Showing more text
|
Showing more text
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>aftertext</b> modifier requests that as well as outputting the substring
|
The <b>aftertext</b> modifier requests that as well as outputting the part of
|
||||||
that matched the entire pattern, <b>pcre2test</b> should in addition output the
|
the subject string that matched the entire pattern, <b>pcre2test</b> should in
|
||||||
remainder of the subject string. This is useful for tests where the subject
|
addition output the remainder of the subject string. This is useful for tests
|
||||||
contains multiple copies of the same substring. The <b>allaftertext</b> modifier
|
where the subject contains multiple copies of the same substring. The
|
||||||
requests the same action for captured substrings as well as the main matched
|
<b>allaftertext</b> modifier requests the same action for captured substrings as
|
||||||
substring. In each case the remainder is output on the following line with a
|
well as the main matched substring. In each case the remainder is output on the
|
||||||
plus character following the capture number.
|
following line with a plus character following the capture number.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||||
|
@ -824,7 +825,8 @@ underneath them. Here is an example:
|
||||||
<<< >>>
|
<<< >>>
|
||||||
</pre>
|
</pre>
|
||||||
This shows that the matched string is "abc", with the preceding and following
|
This shows that the matched string is "abc", with the preceding and following
|
||||||
strings "pqr" and "xyz" also consulted during the match.
|
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||||
|
the assertions).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>startchar</b> modifier requests that the starting character for the match
|
The <b>startchar</b> modifier requests that the starting character for the match
|
||||||
|
@ -881,7 +883,7 @@ function is called again to search the remainder of the subject. The difference
|
||||||
between <b>global</b> and <b>altglobal</b> is that the former uses the
|
between <b>global</b> and <b>altglobal</b> is that the former uses the
|
||||||
<i>start_offset</i> argument to <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>
|
<i>start_offset</i> argument to <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>
|
||||||
to start searching at a new point within the entire string (which is what Perl
|
to start searching at a new point within the entire string (which is what Perl
|
||||||
does), whereas the latter passes over a shortened substring. This makes a
|
does), whereas the latter passes over a shortened subject. This makes a
|
||||||
difference to the matching process if the pattern begins with a lookbehind
|
difference to the matching process if the pattern begins with a lookbehind
|
||||||
assertion (including \b or \B).
|
assertion (including \b or \B).
|
||||||
</P>
|
</P>
|
||||||
|
@ -893,7 +895,7 @@ fails, the start offset is advanced, and the normal match is retried. This
|
||||||
imitates the way Perl handles such cases when using the <b>/g</b> modifier or
|
imitates the way Perl handles such cases when using the <b>/g</b> modifier or
|
||||||
the <b>split()</b> function. Normally, the start offset is advanced by one
|
the <b>split()</b> function. Normally, the start offset is advanced by one
|
||||||
character, but if the newline convention recognizes CRLF as a newline, and the
|
character, but if the newline convention recognizes CRLF as a newline, and the
|
||||||
current character is CR followed by LF, an advance of two is used.
|
current character is CR followed by LF, an advance of two characters occurs.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing substring extraction functions
|
Testing substring extraction functions
|
||||||
|
@ -906,9 +908,9 @@ for example:
|
||||||
<pre>
|
<pre>
|
||||||
abcd\=copy=1,copy=3,get=G1
|
abcd\=copy=1,copy=3,get=G1
|
||||||
</pre>
|
</pre>
|
||||||
If the <b>#subject</b> command is used to set default copy and get lists, these
|
If the <b>#subject</b> command is used to set default copy and/or get lists,
|
||||||
can be unset by specifying a negative number for numbered groups and an empty
|
these can be unset by specifying a negative number to cancel all numbered
|
||||||
name for named groups.
|
groups and an empty name to cancel all named groups.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>getall</b> modifier tests <b>pcre2_substring_list_get()</b>, which
|
The <b>getall</b> modifier tests <b>pcre2_substring_list_get()</b>, which
|
||||||
|
@ -919,7 +921,8 @@ If the subject line is successfully matched, the substrings extracted by the
|
||||||
convenience functions are output with C, G, or L after the string number
|
convenience functions are output with C, G, or L after the string number
|
||||||
instead of a colon. This is in addition to the normal full list. The string
|
instead of a colon. This is in addition to the normal full list. The string
|
||||||
length (that is, the return from the extraction function) is given in
|
length (that is, the return from the extraction function) is given in
|
||||||
parentheses after each substring.
|
parentheses after each substring, followed by the name when the extraction was
|
||||||
|
by name.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
|
@ -1093,11 +1096,10 @@ characters before the actual match start if a lookbehind assertion, \K, \b,
|
||||||
or \B was involved.)
|
or \B was involved.)
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
For any other return, <b>pcre2test</b> outputs the PCRE2
|
For any other return, <b>pcre2test</b> outputs the PCRE2 negative error number
|
||||||
negative error number and a short descriptive phrase. If the error is a failed
|
and a short descriptive phrase. If the error is a failed UTF string check, the
|
||||||
UTF string check, the offset of the start of the failing character and the
|
code unit offset of the start of the failing character is also output. Here is
|
||||||
reason code are also output. Here is an example of an interactive
|
an example of an interactive <b>pcre2test</b> run.
|
||||||
<b>pcre2test</b> run.
|
|
||||||
<pre>
|
<pre>
|
||||||
$ pcre2test
|
$ pcre2test
|
||||||
PCRE2 version 9.00 2014-05-10
|
PCRE2 version 9.00 2014-05-10
|
||||||
|
@ -1110,10 +1112,10 @@ reason code are also output. Here is an example of an interactive
|
||||||
No match
|
No match
|
||||||
</pre>
|
</pre>
|
||||||
Unset capturing substrings that are not followed by one that is set are not
|
Unset capturing substrings that are not followed by one that is set are not
|
||||||
returned by <b>pcre2_match()</b>, and are not shown by <b>pcre2test</b>. In the
|
shown by <b>pcre2test</b> unless the <b>allcaptures</b> modifier is specified. In
|
||||||
following example, there are two capturing substrings, but when the first data
|
the following example, there are two capturing substrings, but when the first
|
||||||
line is matched, the second, unset substring is not shown. An "internal" unset
|
data line is matched, the second, unset substring is not shown. An "internal"
|
||||||
substring is shown as "<unset>", as for the second data line.
|
unset substring is shown as "<unset>", as for the second data line.
|
||||||
<pre>
|
<pre>
|
||||||
re> /(a)|(b)/
|
re> /(a)|(b)/
|
||||||
data> a
|
data> a
|
||||||
|
@ -1149,8 +1151,8 @@ are output in sequence, like this:
|
||||||
1: pp
|
1: pp
|
||||||
</pre>
|
</pre>
|
||||||
"No match" is output only if the first match attempt fails. Here is an example
|
"No match" is output only if the first match attempt fails. Here is an example
|
||||||
of a failure message (the offset 4 that is specified by \>4 is past the end of
|
of a failure message (the offset 4 that is specified by the <b>offset</b>
|
||||||
the subject string):
|
modifier is past the end of the subject string):
|
||||||
<pre>
|
<pre>
|
||||||
re> /xyz/
|
re> /xyz/
|
||||||
data> xyz\=offset=4
|
data> xyz\=offset=4
|
||||||
|
@ -1175,12 +1177,13 @@ the subject where there is at least one match. For example:
|
||||||
1: tang
|
1: tang
|
||||||
2: tan
|
2: tan
|
||||||
</pre>
|
</pre>
|
||||||
(Using the normal matching function on this data finds only "tang".) The
|
Using the normal matching function on this data finds only "tang". The
|
||||||
longest matching string is always given first (and numbered zero). After a
|
longest matching string is always given first (and numbered zero). After a
|
||||||
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
||||||
partially matching substring. (Note that this is the entire substring that was
|
partially matching substring. Note that this is the entire substring that was
|
||||||
inspected during the partial match; it may include characters before the actual
|
inspected during the partial match; it may include characters before the actual
|
||||||
match start if a lookbehind assertion, \K, \b, or \B was involved.)
|
match start if a lookbehind assertion, \b, or \B was involved. (\K is not
|
||||||
|
supported for DFA matching.)
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If global matching is requested, the search for further matches resumes
|
If global matching is requested, the search for further matches resumes
|
||||||
|
@ -1217,9 +1220,9 @@ documentation.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
||||||
<P>
|
<P>
|
||||||
If the pattern contains any callout requests, <b>pcre2test</b>'s callout function
|
If the pattern contains any callout requests, <b>pcre2test</b>'s callout
|
||||||
is called during matching. This works with both matching functions. By default,
|
function is called during matching. This works with both matching functions. By
|
||||||
the called function displays the callout number, the start and current
|
default, the called function displays the callout number, the start and current
|
||||||
positions in the text at the callout time, and the next pattern item to be
|
positions in the text at the callout time, and the next pattern item to be
|
||||||
tested. For example:
|
tested. For example:
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -1306,7 +1309,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 14 November 2014
|
Last updated: 23 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -115,7 +115,10 @@ VALIDITY OF UTF STRINGS
|
||||||
<P>
|
<P>
|
||||||
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
||||||
are (by default) checked for validity on entry to the relevant functions.
|
are (by default) checked for validity on entry to the relevant functions.
|
||||||
If an invalid UTF string is passed, an error return is given.
|
If an invalid UTF string is passed, a negative error code is returned. The
|
||||||
|
code unit offset to the offending character can be extracted from the match
|
||||||
|
data block by calling <b>pcre2_get_startchar()</b>, which is used for this
|
||||||
|
purpose after a UTF error.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
||||||
|
|
|
@ -2057,6 +2057,10 @@ OTHER INFORMATION ABOUT A MATCH
|
||||||
value is always the same as ovector[0] because \K does not affect the
|
value is always the same as ovector[0] because \K does not affect the
|
||||||
result of a partial match.
|
result of a partial match.
|
||||||
|
|
||||||
|
The startchar field is also used to return the offset of an invalid UTF
|
||||||
|
character when UTF checking fails. Details are given in the pcre2uni-
|
||||||
|
code page.
|
||||||
|
|
||||||
|
|
||||||
ERROR RETURNS FROM pcre2_match()
|
ERROR RETURNS FROM pcre2_match()
|
||||||
|
|
||||||
|
@ -2601,7 +2605,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 21 November 2014
|
Last updated: 23 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -4327,9 +4331,9 @@ PARTIAL MATCHING USING pcre2_dfa_match()
|
||||||
|
|
||||||
/dog(sbody)??/
|
/dog(sbody)??/
|
||||||
|
|
||||||
Whereas the standard functions stop as soon as they find the complete
|
Whereas the standard function stops as soon as it finds the complete
|
||||||
match for "dog", the DFA functions also find the partial match for
|
match for "dog", the DFA function also finds the partial match for
|
||||||
"dogsbody", and so return that when PCRE2_PARTIAL_HARD is set.
|
"dogsbody", and so returns that when PCRE2_PARTIAL_HARD is set.
|
||||||
|
|
||||||
|
|
||||||
PARTIAL MATCHING AND WORD BOUNDARIES
|
PARTIAL MATCHING AND WORD BOUNDARIES
|
||||||
|
@ -4681,8 +4685,10 @@ VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||||
subjects are (by default) checked for validity on entry to the relevant
|
subjects are (by default) checked for validity on entry to the relevant
|
||||||
functions. If an invalid UTF string is passed, an error return is
|
functions. If an invalid UTF string is passed, a negative error code
|
||||||
given.
|
is returned. The code unit offset to the offending character can be
|
||||||
|
extracted from the match data block by calling pcre2_get_startchar(),
|
||||||
|
which is used for this purpose after a UTF error.
|
||||||
|
|
||||||
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
||||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||||
|
|
|
@ -2091,12 +2091,12 @@ different to the value of \fIovector[0]\fP if the pattern contains the \eK
|
||||||
escape sequence. After a partial match, however, this value is always the same
|
escape sequence. After a partial match, however, this value is always the same
|
||||||
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
||||||
.P
|
.P
|
||||||
The \fBstartchar\fP field is also used to return the offset of an invalid
|
The \fBstartchar\fP field is also used to return the offset of an invalid
|
||||||
UTF character when UTF checking fails. Details are given in the
|
UTF character when UTF checking fails. Details are given in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2unicode\fP
|
\fBpcre2unicode\fP
|
||||||
.\"
|
.\"
|
||||||
page.
|
page.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.\" HTML <a name="errorlist"></a>
|
.\" HTML <a name="errorlist"></a>
|
||||||
|
|
|
@ -169,9 +169,9 @@ the string "dog" matched against the ungreedy pattern shown above:
|
||||||
.sp
|
.sp
|
||||||
/dog(sbody)??/
|
/dog(sbody)??/
|
||||||
.sp
|
.sp
|
||||||
Whereas the standard functions stop as soon as they find the complete match for
|
Whereas the standard function stops as soon as it finds the complete match for
|
||||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
"dog", the DFA function also finds the partial match for "dogsbody", and so
|
||||||
return that when PCRE2_PARTIAL_HARD is set.
|
returns that when PCRE2_PARTIAL_HARD is set.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
||||||
|
|
|
@ -478,7 +478,7 @@ link sizes and different code unit widths. By using \fBbincode\fP, the same
|
||||||
regression tests can be used in different environments.
|
regression tests can be used in different environments.
|
||||||
.P
|
.P
|
||||||
The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and
|
The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and
|
||||||
offset values. This is used in a few special tests that run only for specific
|
offset values. This is used in a few special tests that run only for specific
|
||||||
code unit widths and link sizes, and is also useful for one-off tests.
|
code unit widths and link sizes, and is also useful for one-off tests.
|
||||||
.P
|
.P
|
||||||
The \fBinfo\fP modifier requests information about the compiled pattern
|
The \fBinfo\fP modifier requests information about the compiled pattern
|
||||||
|
@ -503,7 +503,7 @@ some typical examples:
|
||||||
.sp
|
.sp
|
||||||
"Compile options" are those specified by modifiers; "overall options" have
|
"Compile options" are those specified by modifiers; "overall options" have
|
||||||
added options that are taken or deduced from the pattern. If both sets of
|
added options that are taken or deduced from the pattern. If both sets of
|
||||||
options are the same, just a single "options" line is output; if there are no
|
options are the same, just a single "options" line is output; if there are no
|
||||||
options, the line is omitted. "First code unit" is where any match must start;
|
options, the line is omitted. "First code unit" is where any match must start;
|
||||||
if there is more than one they are listed as "starting code units". "Last code
|
if there is more than one they are listed as "starting code units". "Last code
|
||||||
unit" is the last literal code unit that must be present in any match. This is
|
unit" is the last literal code unit that must be present in any match. This is
|
||||||
|
@ -646,7 +646,7 @@ be aborted.
|
||||||
.SS "Using alternative character tables"
|
.SS "Using alternative character tables"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The value specified for the \fB/tables\fP modifier must be one of the digits 0,
|
The value specified for the \fB/tables\fP modifier must be one of the digits 0,
|
||||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||||
\fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with
|
\fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with
|
||||||
different character tables. The digit specifies the tables as follows:
|
different character tables. The digit specifies the tables as follows:
|
||||||
|
@ -760,7 +760,7 @@ The effects of these modifiers are described in the following sections.
|
||||||
.SS "Showing more text"
|
.SS "Showing more text"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
The \fBaftertext\fP modifier requests that as well as outputting the part of
|
The \fBaftertext\fP modifier requests that as well as outputting the part of
|
||||||
the subject string that matched the entire pattern, \fBpcre2test\fP should in
|
the subject string that matched the entire pattern, \fBpcre2test\fP should in
|
||||||
addition output the remainder of the subject string. This is useful for tests
|
addition output the remainder of the subject string. This is useful for tests
|
||||||
where the subject contains multiple copies of the same substring. The
|
where the subject contains multiple copies of the same substring. The
|
||||||
|
@ -783,7 +783,7 @@ underneath them. Here is an example:
|
||||||
<<< >>>
|
<<< >>>
|
||||||
.sp
|
.sp
|
||||||
This shows that the matched string is "abc", with the preceding and following
|
This shows that the matched string is "abc", with the preceding and following
|
||||||
strings "pqr" and "xyz" having been consulted during the match (when processing
|
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||||
the assertions).
|
the assertions).
|
||||||
.P
|
.P
|
||||||
The \fBstartchar\fP modifier requests that the starting character for the match
|
The \fBstartchar\fP modifier requests that the starting character for the match
|
||||||
|
@ -873,7 +873,7 @@ If the subject line is successfully matched, the substrings extracted by the
|
||||||
convenience functions are output with C, G, or L after the string number
|
convenience functions are output with C, G, or L after the string number
|
||||||
instead of a colon. This is in addition to the normal full list. The string
|
instead of a colon. This is in addition to the normal full list. The string
|
||||||
length (that is, the return from the extraction function) is given in
|
length (that is, the return from the extraction function) is given in
|
||||||
parentheses after each substring, followed by the name when the extraction was
|
parentheses after each substring, followed by the name when the extraction was
|
||||||
by name.
|
by name.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
|
@ -1102,7 +1102,7 @@ are output in sequence, like this:
|
||||||
1: pp
|
1: pp
|
||||||
.sp
|
.sp
|
||||||
"No match" is output only if the first match attempt fails. Here is an example
|
"No match" is output only if the first match attempt fails. Here is an example
|
||||||
of a failure message (the offset 4 that is specified by the \fBoffset\fP
|
of a failure message (the offset 4 that is specified by the \fBoffset\fP
|
||||||
modifier is past the end of the subject string):
|
modifier is past the end of the subject string):
|
||||||
.sp
|
.sp
|
||||||
re> /xyz/
|
re> /xyz/
|
||||||
|
@ -1134,7 +1134,7 @@ longest matching string is always given first (and numbered zero). After a
|
||||||
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
||||||
partially matching substring. Note that this is the entire substring that was
|
partially matching substring. Note that this is the entire substring that was
|
||||||
inspected during the partial match; it may include characters before the actual
|
inspected during the partial match; it may include characters before the actual
|
||||||
match start if a lookbehind assertion, \eb, or \eB was involved. (\eK is not
|
match start if a lookbehind assertion, \eb, or \eB was involved. (\eK is not
|
||||||
supported for DFA matching.)
|
supported for DFA matching.)
|
||||||
.P
|
.P
|
||||||
If global matching is requested, the search for further matches resumes
|
If global matching is requested, the search for further matches resumes
|
||||||
|
|
|
@ -188,7 +188,7 @@ DESCRIPTION
|
||||||
followed by any number of subject lines to be matched against that pat-
|
followed by any number of subject lines to be matched against that pat-
|
||||||
tern. In between sets of test data, command lines that begin with a
|
tern. In between sets of test data, command lines that begin with a
|
||||||
hash (#) character may appear. This file format, with some restric-
|
hash (#) character may appear. This file format, with some restric-
|
||||||
tions, can also be processed by the perltest.pl script that is distrib-
|
tions, can also be processed by the perltest.sh script that is distrib-
|
||||||
uted with PCRE2 as a means of checking that the behaviour of PCRE2 and
|
uted with PCRE2 as a means of checking that the behaviour of PCRE2 and
|
||||||
Perl is the same.
|
Perl is the same.
|
||||||
|
|
||||||
|
@ -232,11 +232,11 @@ COMMAND LINES
|
||||||
#perltest
|
#perltest
|
||||||
|
|
||||||
The appearance of this line causes all subsequent modifier settings to
|
The appearance of this line causes all subsequent modifier settings to
|
||||||
be checked for compatibility with the perltest.pl script, which is used
|
be checked for compatibility with the perltest.sh script, which is used
|
||||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||||
comment lines, none of the other command lines are permitted, because
|
comment lines, none of the other command lines are permitted, because
|
||||||
they and many of the modifiers are specific to pcre2test, and should
|
they and many of the modifiers are specific to pcre2test, and should
|
||||||
not be used in test files that are also processed by perltest.pl. The
|
not be used in test files that are also processed by perltest.sh. The
|
||||||
#perltest command helps detect tests that are accidentally put in the
|
#perltest command helps detect tests that are accidentally put in the
|
||||||
wrong file.
|
wrong file.
|
||||||
|
|
||||||
|
@ -255,53 +255,52 @@ MODIFIER SYNTAX
|
||||||
valid for one or the other only. Each modifier has a long name, for
|
valid for one or the other only. Each modifier has a long name, for
|
||||||
example "anchored", and some of them must be followed by an equals sign
|
example "anchored", and some of them must be followed by an equals sign
|
||||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||||
ues may be preceded by a minus sign to turn off a previous default set-
|
ues may be preceded by a minus sign to turn off a previous setting.
|
||||||
ting.
|
|
||||||
|
|
||||||
A few of the more common modifiers can also be specified as single let-
|
A few of the more common modifiers can also be specified as single let-
|
||||||
ters, for example "i" for "caseless". In documentation, following the
|
ters, for example "i" for "caseless". In documentation, following the
|
||||||
Perl convention, these are written with a slash ("the /i modifier") for
|
Perl convention, these are written with a slash ("the /i modifier") for
|
||||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||||
item of a modifier list. If the first item is not recognized as a long
|
item of a modifier list. If the first item is not recognized as a long
|
||||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
/abc/ig,newline=cr,jit=3
|
/abc/ig,newline=cr,jit=3
|
||||||
|
|
||||||
This is a pattern line whose modifier list starts with two one-letter
|
This is a pattern line whose modifier list starts with two one-letter
|
||||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||||
same as used in Perl.
|
same as used in Perl.
|
||||||
|
|
||||||
|
|
||||||
PATTERN SYNTAX
|
PATTERN SYNTAX
|
||||||
|
|
||||||
A pattern line must start with one of the following characters (common
|
A pattern line must start with one of the following characters (common
|
||||||
symbols, excluding pattern meta-characters):
|
symbols, excluding pattern meta-characters):
|
||||||
|
|
||||||
/ ! " ' ` - = _ : ; , % & @ ~
|
/ ! " ' ` - = _ : ; , % & @ ~
|
||||||
|
|
||||||
This is interpreted as the pattern's delimiter. A regular expression
|
This is interpreted as the pattern's delimiter. A regular expression
|
||||||
may be continued over several input lines, in which case the newline
|
may be continued over several input lines, in which case the newline
|
||||||
characters are included within it. It is possible to include the delim-
|
characters are included within it. It is possible to include the delim-
|
||||||
iter within the pattern by escaping it with a backslash, for example
|
iter within the pattern by escaping it with a backslash, for example
|
||||||
|
|
||||||
/abc\/def/
|
/abc\/def/
|
||||||
|
|
||||||
If you do this, the escape and the delimiter form part of the pattern,
|
If you do this, the escape and the delimiter form part of the pattern,
|
||||||
but since the delimiters are all non-alphanumeric, this does not affect
|
but since the delimiters are all non-alphanumeric, this does not affect
|
||||||
its interpretation. If the terminating delimiter is immediately fol-
|
its interpretation. If the terminating delimiter is immediately fol-
|
||||||
lowed by a backslash, for example,
|
lowed by a backslash, for example,
|
||||||
|
|
||||||
/abc/\
|
/abc/\
|
||||||
|
|
||||||
then a backslash is added to the end of the pattern. This is done to
|
then a backslash is added to the end of the pattern. This is done to
|
||||||
provide a way of testing the error condition that arises if a pattern
|
provide a way of testing the error condition that arises if a pattern
|
||||||
finishes with a backslash, because
|
finishes with a backslash, because
|
||||||
|
|
||||||
/abc\/
|
/abc\/
|
||||||
|
|
||||||
is interpreted as the first line of a pattern that starts with "abc/",
|
is interpreted as the first line of a pattern that starts with "abc/",
|
||||||
causing pcre2test to read the next line as a continuation of the regu-
|
causing pcre2test to read the next line as a continuation of the regu-
|
||||||
lar expression.
|
lar expression.
|
||||||
|
|
||||||
A pattern can be followed by a modifier list (details below).
|
A pattern can be followed by a modifier list (details below).
|
||||||
|
@ -309,7 +308,7 @@ PATTERN SYNTAX
|
||||||
|
|
||||||
SUBJECT LINE SYNTAX
|
SUBJECT LINE SYNTAX
|
||||||
|
|
||||||
Before each subject line is passed to pcre2_match() or
|
Before each subject line is passed to pcre2_match() or
|
||||||
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
||||||
line is scanned for backslash escapes. The following provide a means of
|
line is scanned for backslash escapes. The following provide a means of
|
||||||
encoding non-printing characters in a visible way:
|
encoding non-printing characters in a visible way:
|
||||||
|
@ -329,23 +328,23 @@ SUBJECT LINE SYNTAX
|
||||||
\x{hh...} hexadecimal character (any number of hex digits)
|
\x{hh...} hexadecimal character (any number of hex digits)
|
||||||
|
|
||||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
||||||
the pattern. It is recognized always. There may be any number of hexa-
|
the pattern. It is recognized always. There may be any number of hexa-
|
||||||
decimal digits inside the braces; invalid values provoke error mes-
|
decimal digits inside the braces; invalid values provoke error mes-
|
||||||
sages.
|
sages.
|
||||||
|
|
||||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||||
character in UTF-8 mode, generating more than one byte if the value is
|
character in UTF-8 mode, generating more than one byte if the value is
|
||||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||||
\x{hh} generates one byte for values less than 256, and causes an error
|
\x{hh} generates one byte for values less than 256, and causes an error
|
||||||
for greater values.
|
for greater values.
|
||||||
|
|
||||||
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
||||||
possible to construct invalid UTF-16 sequences for testing purposes.
|
possible to construct invalid UTF-16 sequences for testing purposes.
|
||||||
|
|
||||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||||
makes it possible to construct invalid UTF-32 sequences for testing
|
makes it possible to construct invalid UTF-32 sequences for testing
|
||||||
purposes.
|
purposes.
|
||||||
|
|
||||||
There is a special backslash sequence that specifies replication of one
|
There is a special backslash sequence that specifies replication of one
|
||||||
|
@ -353,38 +352,38 @@ SUBJECT LINE SYNTAX
|
||||||
|
|
||||||
\[<characters>]{<count>}
|
\[<characters>]{<count>}
|
||||||
|
|
||||||
This makes it possible to test long strings without having to provide
|
This makes it possible to test long strings without having to provide
|
||||||
them as part of the file. For example:
|
them as part of the file. For example:
|
||||||
|
|
||||||
\[abc]{4}
|
\[abc]{4}
|
||||||
|
|
||||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||||
To include a closing square bracket in the characters, code it as \x5D.
|
To include a closing square bracket in the characters, code it as \x5D.
|
||||||
|
|
||||||
A backslash followed by an equals sign marke the end of the subject
|
A backslash followed by an equals sign marks the end of the subject
|
||||||
string and the start of a modifier list. For example:
|
string and the start of a modifier list. For example:
|
||||||
|
|
||||||
abc\=notbol,notempty
|
abc\=notbol,notempty
|
||||||
|
|
||||||
A backslash followed by any other non-alphanumeric character just
|
A backslash followed by any other non-alphanumeric character just
|
||||||
escapes that character. A backslash followed by anything else causes an
|
escapes that character. A backslash followed by anything else causes an
|
||||||
error. However, if the very last character in the line is a backslash
|
error. However, if the very last character in the line is a backslash
|
||||||
(and there is no modifier list), it is ignored. This gives a way of
|
(and there is no modifier list), it is ignored. This gives a way of
|
||||||
passing an empty line as data, since a real empty line terminates the
|
passing an empty line as data, since a real empty line terminates the
|
||||||
data input.
|
data input.
|
||||||
|
|
||||||
|
|
||||||
PATTERN MODIFIERS
|
PATTERN MODIFIERS
|
||||||
|
|
||||||
There are three types of modifier that can appear in pattern lines, two
|
There are three types of modifier that can appear in pattern lines, two
|
||||||
of which may also be used in a #pattern command. A pattern's modifier
|
of which may also be used in a #pattern command. A pattern's modifier
|
||||||
list can add to or override default modifiers that were set by a previ-
|
list can add to or override default modifiers that were set by a previ-
|
||||||
ous #pattern command.
|
ous #pattern command.
|
||||||
|
|
||||||
Setting compilation options
|
Setting compilation options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_compile(). The most com-
|
The following modifiers set options for pcre2_compile(). The most com-
|
||||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||||
tion of their effects.
|
tion of their effects.
|
||||||
|
|
||||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||||
|
@ -410,13 +409,13 @@ PATTERN MODIFIERS
|
||||||
utf set PCRE2_UTF
|
utf set PCRE2_UTF
|
||||||
|
|
||||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||||
non-printing characters in output strings to be printed using the
|
non-printing characters in output strings to be printed using the
|
||||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||||
without the curly brackets.
|
without the curly brackets.
|
||||||
|
|
||||||
Setting compilation controls
|
Setting compilation controls
|
||||||
|
|
||||||
The following modifiers affect the compilation process or request
|
The following modifiers affect the compilation process or request
|
||||||
information about the pattern:
|
information about the pattern:
|
||||||
|
|
||||||
bsr=[anycrlf|unicode] specify \R handling
|
bsr=[anycrlf|unicode] specify \R handling
|
||||||
|
@ -441,34 +440,34 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Newline and \R handling
|
Newline and \R handling
|
||||||
|
|
||||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||||
"unicode", \R matches any Unicode newline sequence. The default is
|
"unicode", \R matches any Unicode newline sequence. The default is
|
||||||
specified when PCRE2 is built, with the default default being Unicode.
|
specified when PCRE2 is built, with the default default being Unicode.
|
||||||
|
|
||||||
The newline modifier specifies which characters are to be interpreted
|
The newline modifier specifies which characters are to be interpreted
|
||||||
as newlines, both in the pattern and (by default) in subject lines. The
|
as newlines, both in the pattern and in subject lines. The type must be
|
||||||
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
one of CR, LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||||
|
|
||||||
Information about a pattern
|
Information about a pattern
|
||||||
|
|
||||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||||
available information.
|
available information.
|
||||||
|
|
||||||
The bincode modifier causes a representation of the compiled code to be
|
The bincode modifier causes a representation of the compiled code to be
|
||||||
output after compilation. This information does not contain length and
|
output after compilation. This information does not contain length and
|
||||||
offset values, which ensures that the same output is generated for dif-
|
offset values, which ensures that the same output is generated for dif-
|
||||||
ferent internal link sizes and different code unit widths. By using
|
ferent internal link sizes and different code unit widths. By using
|
||||||
bincode, the same regression tests can be used in different environ-
|
bincode, the same regression tests can be used in different environ-
|
||||||
ments.
|
ments.
|
||||||
|
|
||||||
The fullbincode modifier, by contrast, does include length and offset
|
The fullbincode modifier, by contrast, does include length and offset
|
||||||
values. This is used in a few special tests and is also useful for one-
|
values. This is used in a few special tests that run only for specific
|
||||||
off tests.
|
code unit widths and link sizes, and is also useful for one-off tests.
|
||||||
|
|
||||||
The info modifier requests information about the compiled pattern
|
The info modifier requests information about the compiled pattern
|
||||||
(whether it is anchored, has a fixed first character, and so on). The
|
(whether it is anchored, has a fixed first character, and so on). The
|
||||||
information is obtained from the pcre2_pattern_info() function. Here
|
information is obtained from the pcre2_pattern_info() function. Here
|
||||||
are some typical examples:
|
are some typical examples:
|
||||||
|
|
||||||
re> /(?i)(^a|^b)/m,info
|
re> /(?i)(^a|^b)/m,info
|
||||||
|
@ -486,14 +485,15 @@ PATTERN MODIFIERS
|
||||||
Last code unit = 'c' (caseless)
|
Last code unit = 'c' (caseless)
|
||||||
Subject length lower bound = 3
|
Subject length lower bound = 3
|
||||||
|
|
||||||
"Compile options" are those specified to the compile function; "overall
|
"Compile options" are those specified by modifiers; "overall options"
|
||||||
options" have added options that are taken or deduced from the pattern.
|
have added options that are taken or deduced from the pattern. If both
|
||||||
If both sets of options are the same, just a single "options" line is
|
sets of options are the same, just a single "options" line is output;
|
||||||
output. "First code unit" is where any match must start; if there is
|
if there are no options, the line is omitted. "First code unit" is
|
||||||
more than one they are listed as "starting code units". "Last code
|
where any match must start; if there is more than one they are listed
|
||||||
unit" is the last literal code unit that must be present in any match.
|
as "starting code units". "Last code unit" is the last literal code
|
||||||
This is not necessarily the last character. These lines are omitted if
|
unit that must be present in any match. This is not necessarily the
|
||||||
no starting or ending code units are recorded.
|
last character. These lines are omitted if no starting or ending code
|
||||||
|
units are recorded.
|
||||||
|
|
||||||
Specifying a pattern in hex
|
Specifying a pattern in hex
|
||||||
|
|
||||||
|
@ -504,14 +504,14 @@ PATTERN MODIFIERS
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
|
|
||||||
This feature is provided as a way of creating patterns that contain
|
This feature is provided as a way of creating patterns that contain
|
||||||
binary zero characters. By default, pcre2test passes patterns as zero-
|
binary zero and other non-printing characters. By default, pcre2test
|
||||||
terminated strings to pcre2_compile(), giving the length as
|
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal,
|
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||||
the actual length of the pattern is passed.
|
hexadecimal, the actual length of the pattern is passed.
|
||||||
|
|
||||||
JIT compilation
|
JIT compilation
|
||||||
|
|
||||||
The /jit modifier may optionally be followed by and equals sign and a
|
The /jit modifier may optionally be followed by an equals sign and a
|
||||||
number in the range 0 to 7:
|
number in the range 0 to 7:
|
||||||
|
|
||||||
0 disable JIT
|
0 disable JIT
|
||||||
|
@ -540,7 +540,7 @@ PATTERN MODIFIERS
|
||||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||||
the first output line after a match or non-match when JIT-compiled code
|
the first output line after a match or non-match when JIT-compiled code
|
||||||
was actually used.
|
was actually used in the match.
|
||||||
|
|
||||||
Setting a locale
|
Setting a locale
|
||||||
|
|
||||||
|
@ -609,25 +609,26 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Using alternative character tables
|
Using alternative character tables
|
||||||
|
|
||||||
The /tables modifier must be followed by a single digit. It causes a
|
The value specified for the /tables modifier must be one of the digits
|
||||||
specific set of built-in character tables to be passed to pcre2_com-
|
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||||
pile(). This is used in the PCRE2 tests to check behaviour with differ-
|
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||||
ent character tables. The digit specifies the tables as follows:
|
haviour with different character tables. The digit specifies the tables
|
||||||
|
as follows:
|
||||||
|
|
||||||
0 do not pass any special character tables
|
0 do not pass any special character tables
|
||||||
1 the default ASCII tables, as distributed in
|
1 the default ASCII tables, as distributed in
|
||||||
pcre2_chartables.c.dist
|
pcre2_chartables.c.dist
|
||||||
2 a set of tables defining ISO 8859 characters
|
2 a set of tables defining ISO 8859 characters
|
||||||
|
|
||||||
In table 2, some characters whose codes are greater than 128 are iden-
|
In table 2, some characters whose codes are greater than 128 are iden-
|
||||||
tified as letters, digits, spaces, etc. Setting alternate character
|
tified as letters, digits, spaces, etc. Setting alternate character
|
||||||
tables and a locale are mutually exclusive.
|
tables and a locale are mutually exclusive.
|
||||||
|
|
||||||
Setting certain match controls
|
Setting certain match controls
|
||||||
|
|
||||||
The following modifiers are really subject modifiers, and are described
|
The following modifiers are really subject modifiers, and are described
|
||||||
below. However, they may be included in a pattern's modifier list, in
|
below. However, they may be included in a pattern's modifier list, in
|
||||||
which case they are applied to every subject line that is processed
|
which case they are applied to every subject line that is processed
|
||||||
with that pattern. They do not affect the compilation process.
|
with that pattern. They do not affect the compilation process.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
|
@ -639,7 +640,7 @@ PATTERN MODIFIERS
|
||||||
replace=<string> specify a replacement string
|
replace=<string> specify a replacement string
|
||||||
startchar show starting character when relevant
|
startchar show starting character when relevant
|
||||||
|
|
||||||
These modifiers may not appear in a #pattern command. If you want them
|
These modifiers may not appear in a #pattern command. If you want them
|
||||||
as defaults, set them in a #subject command.
|
as defaults, set them in a #subject command.
|
||||||
|
|
||||||
|
|
||||||
|
@ -650,7 +651,7 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
Setting match options
|
Setting match options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_match() or
|
The following modifiers set options for pcre2_match() or
|
||||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||||
|
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
|
@ -664,20 +665,20 @@ SUBJECT MODIFIERS
|
||||||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||||
|
|
||||||
The partial matching modifiers are provided with abbreviations because
|
The partial matching modifiers are provided with abbreviations because
|
||||||
they appear frequently in tests.
|
they appear frequently in tests.
|
||||||
|
|
||||||
If the /posix modifier was present on the pattern, causing the POSIX
|
If the /posix modifier was present on the pattern, causing the POSIX
|
||||||
wrapper API to be used, the only option-setting modifiers that have any
|
wrapper API to be used, the only option-setting modifiers that have any
|
||||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||||
Any other modifiers cause an error.
|
Any other modifiers cause an error.
|
||||||
|
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
|
||||||
The following modifiers affect the matching process or request addi-
|
The following modifiers affect the matching process or request addi-
|
||||||
tional information. Some of them may also be specified on a pattern
|
tional information. Some of them may also be specified on a pattern
|
||||||
line (see above), in which case they apply to every subject line that
|
line (see above), in which case they apply to every subject line that
|
||||||
is matched against that pattern.
|
is matched against that pattern.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
|
@ -710,23 +711,23 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
Showing more text
|
Showing more text
|
||||||
|
|
||||||
The aftertext modifier requests that as well as outputting the sub-
|
The aftertext modifier requests that as well as outputting the part of
|
||||||
string that matched the entire pattern, pcre2test should in addition
|
the subject string that matched the entire pattern, pcre2test should in
|
||||||
output the remainder of the subject string. This is useful for tests
|
addition output the remainder of the subject string. This is useful for
|
||||||
where the subject contains multiple copies of the same substring. The
|
tests where the subject contains multiple copies of the same substring.
|
||||||
allaftertext modifier requests the same action for captured substrings
|
The allaftertext modifier requests the same action for captured sub-
|
||||||
as well as the main matched substring. In each case the remainder is
|
strings as well as the main matched substring. In each case the remain-
|
||||||
output on the following line with a plus character following the cap-
|
der is output on the following line with a plus character following the
|
||||||
ture number.
|
capture number.
|
||||||
|
|
||||||
The allusedtext modifier requests that all the text that was consulted
|
The allusedtext modifier requests that all the text that was consulted
|
||||||
during a successful pattern match by the interpreter should be shown.
|
during a successful pattern match by the interpreter should be shown.
|
||||||
This feature is not supported for JIT matching, and if requested with
|
This feature is not supported for JIT matching, and if requested with
|
||||||
JIT it is ignored (with a warning message). Setting this modifier
|
JIT it is ignored (with a warning message). Setting this modifier
|
||||||
affects the output if there is a lookbehind at the start of a match, or
|
affects the output if there is a lookbehind at the start of a match, or
|
||||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||||
that precede or follow the start and end of the actual match are indi-
|
that precede or follow the start and end of the actual match are indi-
|
||||||
cated in the output by '<' or '>' characters underneath them. Here is
|
cated in the output by '<' or '>' characters underneath them. Here is
|
||||||
an example:
|
an example:
|
||||||
|
|
||||||
re> /(?<=pqr)abc(?=xyz)/
|
re> /(?<=pqr)abc(?=xyz)/
|
||||||
|
@ -734,8 +735,9 @@ SUBJECT MODIFIERS
|
||||||
0: pqrabcxyz
|
0: pqrabcxyz
|
||||||
<<< >>>
|
<<< >>>
|
||||||
|
|
||||||
This shows that the matched string is "abc", with the preceding and
|
This shows that the matched string is "abc", with the preceding and
|
||||||
following strings "pqr" and "xyz" also consulted during the match.
|
following strings "pqr" and "xyz" having been consulted during the
|
||||||
|
match (when processing the assertions).
|
||||||
|
|
||||||
The startchar modifier requests that the starting character for the
|
The startchar modifier requests that the starting character for the
|
||||||
match be indicated, if it is different to the start of the matched
|
match be indicated, if it is different to the start of the matched
|
||||||
|
@ -784,9 +786,9 @@ SUBJECT MODIFIERS
|
||||||
difference between global and altglobal is that the former uses the
|
difference between global and altglobal is that the former uses the
|
||||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||||
searching at a new point within the entire string (which is what Perl
|
searching at a new point within the entire string (which is what Perl
|
||||||
does), whereas the latter passes over a shortened substring. This makes
|
does), whereas the latter passes over a shortened subject. This makes a
|
||||||
a difference to the matching process if the pattern begins with a look-
|
difference to the matching process if the pattern begins with a lookbe-
|
||||||
behind assertion (including \b or \B).
|
hind assertion (including \b or \B).
|
||||||
|
|
||||||
If an empty string is matched, the next match is done with the
|
If an empty string is matched, the next match is done with the
|
||||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||||
|
@ -796,7 +798,7 @@ SUBJECT MODIFIERS
|
||||||
/g modifier or the split() function. Normally, the start offset is
|
/g modifier or the split() function. Normally, the start offset is
|
||||||
advanced by one character, but if the newline convention recognizes
|
advanced by one character, but if the newline convention recognizes
|
||||||
CRLF as a newline, and the current character is CR followed by LF, an
|
CRLF as a newline, and the current character is CR followed by LF, an
|
||||||
advance of two is used.
|
advance of two characters occurs.
|
||||||
|
|
||||||
Testing substring extraction functions
|
Testing substring extraction functions
|
||||||
|
|
||||||
|
@ -807,9 +809,9 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
abcd\=copy=1,copy=3,get=G1
|
abcd\=copy=1,copy=3,get=G1
|
||||||
|
|
||||||
If the #subject command is used to set default copy and get lists,
|
If the #subject command is used to set default copy and/or get lists,
|
||||||
these can be unset by specifying a negative number for numbered groups
|
these can be unset by specifying a negative number to cancel all num-
|
||||||
and an empty name for named groups.
|
bered groups and an empty name to cancel all named groups.
|
||||||
|
|
||||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||||
all captured substrings.
|
all captured substrings.
|
||||||
|
@ -818,23 +820,24 @@ SUBJECT MODIFIERS
|
||||||
by the convenience functions are output with C, G, or L after the
|
by the convenience functions are output with C, G, or L after the
|
||||||
string number instead of a colon. This is in addition to the normal
|
string number instead of a colon. This is in addition to the normal
|
||||||
full list. The string length (that is, the return from the extraction
|
full list. The string length (that is, the return from the extraction
|
||||||
function) is given in parentheses after each substring.
|
function) is given in parentheses after each substring, followed by the
|
||||||
|
name when the extraction was by name.
|
||||||
|
|
||||||
Testing the substitution function
|
Testing the substitution function
|
||||||
|
|
||||||
If the replace modifier is set, the pcre2_substitute() function is
|
If the replace modifier is set, the pcre2_substitute() function is
|
||||||
called instead of one of the matching functions. Unlike subject
|
called instead of one of the matching functions. Unlike subject
|
||||||
strings, pcre2test does not process replacement strings for escape
|
strings, pcre2test does not process replacement strings for escape
|
||||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
sequences. In UTF mode, a replacement string is checked to see if it is
|
||||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
||||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||||
the individual code units are copied directly. This provides a means of
|
the individual code units are copied directly. This provides a means of
|
||||||
passing an invalid UTF-8 string for testing purposes.
|
passing an invalid UTF-8 string for testing purposes.
|
||||||
|
|
||||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||||
pcre2_substitute(). After a successful substitution, the modified
|
pcre2_substitute(). After a successful substitution, the modified
|
||||||
string is output, preceded by the number of replacements. This may be
|
string is output, preceded by the number of replacements. This may be
|
||||||
zero if there were no matches. Here is a simple example of a substitu-
|
zero if there were no matches. Here is a simple example of a substitu-
|
||||||
tion test:
|
tion test:
|
||||||
|
|
||||||
/abc/replace=xxx
|
/abc/replace=xxx
|
||||||
|
@ -843,11 +846,11 @@ SUBJECT MODIFIERS
|
||||||
=abc=abc=\=global
|
=abc=abc=\=global
|
||||||
2: =xxx=xxx=
|
2: =xxx=xxx=
|
||||||
|
|
||||||
Subject and replacement strings should be kept relatively short for
|
Subject and replacement strings should be kept relatively short for
|
||||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||||
test for buffer overflow, if the replacement string starts with a num-
|
test for buffer overflow, if the replacement string starts with a num-
|
||||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||||
the size of the output buffer, with the replacement string starting at
|
the size of the output buffer, with the replacement string starting at
|
||||||
the next character. Here is an example that tests the edge case:
|
the next character. Here is an example that tests the edge case:
|
||||||
|
|
||||||
/abc/
|
/abc/
|
||||||
|
@ -857,125 +860,124 @@ SUBJECT MODIFIERS
|
||||||
Failed: error -47: no more memory
|
Failed: error -47: no more memory
|
||||||
|
|
||||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||||
partial matching provokes an error return ("bad option value") from
|
partial matching provokes an error return ("bad option value") from
|
||||||
pcre2_substitute().
|
pcre2_substitute().
|
||||||
|
|
||||||
Setting the JIT stack size
|
Setting the JIT stack size
|
||||||
|
|
||||||
The jitstack modifier provides a way of setting the maximum stack size
|
The jitstack modifier provides a way of setting the maximum stack size
|
||||||
that is used by the just-in-time optimization code. It is ignored if
|
that is used by the just-in-time optimization code. It is ignored if
|
||||||
JIT optimization is not being used. The value is a number of kilobytes.
|
JIT optimization is not being used. The value is a number of kilobytes.
|
||||||
Providing a stack that is larger than the default 32K is necessary only
|
Providing a stack that is larger than the default 32K is necessary only
|
||||||
for very complicated patterns.
|
for very complicated patterns.
|
||||||
|
|
||||||
Setting match and recursion limits
|
Setting match and recursion limits
|
||||||
|
|
||||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||||
its in the match context. These values are ignored when the find_limits
|
its in the match context. These values are ignored when the find_limits
|
||||||
modifier is specified.
|
modifier is specified.
|
||||||
|
|
||||||
Finding minimum limits
|
Finding minimum limits
|
||||||
|
|
||||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||||
several times, setting different values in the match context via
|
several times, setting different values in the match context via
|
||||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||||
the minimum values for each parameter that allow pcre2_match() to com-
|
the minimum values for each parameter that allow pcre2_match() to com-
|
||||||
plete without error.
|
plete without error.
|
||||||
|
|
||||||
If JIT is being used, only the match limit is relevant. If DFA matching
|
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||||
is being used, neither limit is relevant, and this modifier is ignored
|
is being used, neither limit is relevant, and this modifier is ignored
|
||||||
(with a warning message).
|
(with a warning message).
|
||||||
|
|
||||||
The match_limit number is a measure of the amount of backtracking that
|
The match_limit number is a measure of the amount of backtracking that
|
||||||
takes place, and learning the minimum value can be instructive. For
|
takes place, and learning the minimum value can be instructive. For
|
||||||
most simple matches, the number is quite small, but for patterns with
|
most simple matches, the number is quite small, but for patterns with
|
||||||
very large numbers of matching possibilities, it can become large very
|
very large numbers of matching possibilities, it can become large very
|
||||||
quickly with increasing length of subject string. The
|
quickly with increasing length of subject string. The
|
||||||
recursion_limit number is a measure of how much stack (or, if
|
recursion_limit number is a measure of how much stack (or, if
|
||||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||||
complete the match attempt.
|
complete the match attempt.
|
||||||
|
|
||||||
Showing MARK names
|
Showing MARK names
|
||||||
|
|
||||||
|
|
||||||
The mark modifier causes the names from backtracking control verbs that
|
The mark modifier causes the names from backtracking control verbs that
|
||||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||||
it is added to the non-match message.
|
it is added to the non-match message.
|
||||||
|
|
||||||
Showing memory usage
|
Showing memory usage
|
||||||
|
|
||||||
The memory modifier causes pcre2test to log all memory allocation and
|
The memory modifier causes pcre2test to log all memory allocation and
|
||||||
freeing calls that occur during a match operation.
|
freeing calls that occur during a match operation.
|
||||||
|
|
||||||
Setting a starting offset
|
Setting a starting offset
|
||||||
|
|
||||||
The offset modifier sets an offset in the subject string at which
|
The offset modifier sets an offset in the subject string at which
|
||||||
matching starts. Its value is a number of code units, not characters.
|
matching starts. Its value is a number of code units, not characters.
|
||||||
|
|
||||||
Setting the size of the output vector
|
Setting the size of the output vector
|
||||||
|
|
||||||
The ovector modifier applies only to the subject line in which it
|
The ovector modifier applies only to the subject line in which it
|
||||||
appears, though of course it can also be used to set a default in a
|
appears, though of course it can also be used to set a default in a
|
||||||
#subject command. It specifies the number of pairs of offsets that are
|
#subject command. It specifies the number of pairs of offsets that are
|
||||||
available for storing matching information. The default is 15.
|
available for storing matching information. The default is 15.
|
||||||
|
|
||||||
A value of zero is useful when testing the POSIX API because it causes
|
A value of zero is useful when testing the POSIX API because it causes
|
||||||
regexec() to be called with a NULL capture vector. When not testing the
|
regexec() to be called with a NULL capture vector. When not testing the
|
||||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||||
ate_from_pattern() to be called, in order to create a match block of
|
ate_from_pattern() to be called, in order to create a match block of
|
||||||
exactly the right size for the pattern. (It is not possible to create a
|
exactly the right size for the pattern. (It is not possible to create a
|
||||||
match block with a zero-length ovector; there is always at least one
|
match block with a zero-length ovector; there is always at least one
|
||||||
pair of offsets.)
|
pair of offsets.)
|
||||||
|
|
||||||
Passing the subject as zero-terminated
|
Passing the subject as zero-terminated
|
||||||
|
|
||||||
By default, the subject string is passed to a native API matching func-
|
By default, the subject string is passed to a native API matching func-
|
||||||
tion with its correct length. In order to test the facility for passing
|
tion with its correct length. In order to test the facility for passing
|
||||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
||||||
via the POSIX interface, this modifier has no effect, as there is no
|
via the POSIX interface, this modifier has no effect, as there is no
|
||||||
facility for passing a length.)
|
facility for passing a length.)
|
||||||
|
|
||||||
When testing pcre2_substitute(), this modifier also has the effect of
|
When testing pcre2_substitute(), this modifier also has the effect of
|
||||||
passing the replacement string as zero-terminated.
|
passing the replacement string as zero-terminated.
|
||||||
|
|
||||||
|
|
||||||
THE ALTERNATIVE MATCHING FUNCTION
|
THE ALTERNATIVE MATCHING FUNCTION
|
||||||
|
|
||||||
By default, pcre2test uses the standard PCRE2 matching function,
|
By default, pcre2test uses the standard PCRE2 matching function,
|
||||||
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
||||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||||
ferent way, and has some restrictions. The differences between the two
|
ferent way, and has some restrictions. The differences between the two
|
||||||
functions are described in the pcre2matching documentation.
|
functions are described in the pcre2matching documentation.
|
||||||
|
|
||||||
If the dfa modifier is set, the alternative matching function is used.
|
If the dfa modifier is set, the alternative matching function is used.
|
||||||
This function finds all possible matches at a given point in the sub-
|
This function finds all possible matches at a given point in the sub-
|
||||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||||
after the first match is found. This is always the shortest possible
|
after the first match is found. This is always the shortest possible
|
||||||
match.
|
match.
|
||||||
|
|
||||||
|
|
||||||
DEFAULT OUTPUT FROM pcre2test
|
DEFAULT OUTPUT FROM pcre2test
|
||||||
|
|
||||||
This section describes the output when the normal matching function,
|
This section describes the output when the normal matching function,
|
||||||
pcre2_match(), is being used.
|
pcre2_match(), is being used.
|
||||||
|
|
||||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||||
strings, starting with number 0 for the string that matched the whole
|
strings, starting with number 0 for the string that matched the whole
|
||||||
pattern. Otherwise, it outputs "No match" when the return is
|
pattern. Otherwise, it outputs "No match" when the return is
|
||||||
PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially
|
PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially
|
||||||
matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that
|
matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that
|
||||||
this is the entire substring that was inspected during the partial
|
this is the entire substring that was inspected during the partial
|
||||||
match; it may include characters before the actual match start if a
|
match; it may include characters before the actual match start if a
|
||||||
lookbehind assertion, \K, \b, or \B was involved.)
|
lookbehind assertion, \K, \b, or \B was involved.)
|
||||||
|
|
||||||
For any other return, pcre2test outputs the PCRE2 negative error number
|
For any other return, pcre2test outputs the PCRE2 negative error number
|
||||||
and a short descriptive phrase. If the error is a failed UTF string
|
and a short descriptive phrase. If the error is a failed UTF string
|
||||||
check, the offset of the start of the failing character and the reason
|
check, the code unit offset of the start of the failing character is
|
||||||
code are also output. Here is an example of an interactive pcre2test
|
also output. Here is an example of an interactive pcre2test run.
|
||||||
run.
|
|
||||||
|
|
||||||
$ pcre2test
|
$ pcre2test
|
||||||
PCRE2 version 9.00 2014-05-10
|
PCRE2 version 9.00 2014-05-10
|
||||||
|
@ -988,8 +990,8 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
No match
|
No match
|
||||||
|
|
||||||
Unset capturing substrings that are not followed by one that is set are
|
Unset capturing substrings that are not followed by one that is set are
|
||||||
not returned by pcre2_match(), and are not shown by pcre2test. In the
|
not shown by pcre2test unless the allcaptures modifier is specified. In
|
||||||
following example, there are two capturing substrings, but when the
|
the following example, there are two capturing substrings, but when the
|
||||||
first data line is matched, the second, unset substring is not shown.
|
first data line is matched, the second, unset substring is not shown.
|
||||||
An "internal" unset substring is shown as "<unset>", as for the second
|
An "internal" unset substring is shown as "<unset>", as for the second
|
||||||
data line.
|
data line.
|
||||||
|
@ -1028,8 +1030,8 @@ DEFAULT OUTPUT FROM pcre2test
|
||||||
1: pp
|
1: pp
|
||||||
|
|
||||||
"No match" is output only if the first match attempt fails. Here is an
|
"No match" is output only if the first match attempt fails. Here is an
|
||||||
example of a failure message (the offset 4 that is specified by \>4 is
|
example of a failure message (the offset 4 that is specified by the
|
||||||
past the end of the subject string):
|
offset modifier is past the end of the subject string):
|
||||||
|
|
||||||
re> /xyz/
|
re> /xyz/
|
||||||
data> xyz\=offset=4
|
data> xyz\=offset=4
|
||||||
|
@ -1053,13 +1055,13 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
||||||
1: tang
|
1: tang
|
||||||
2: tan
|
2: tan
|
||||||
|
|
||||||
(Using the normal matching function on this data finds only "tang".)
|
Using the normal matching function on this data finds only "tang". The
|
||||||
The longest matching string is always given first (and numbered zero).
|
longest matching string is always given first (and numbered zero).
|
||||||
After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",
|
After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",
|
||||||
followed by the partially matching substring. (Note that this is the
|
followed by the partially matching substring. Note that this is the
|
||||||
entire substring that was inspected during the partial match; it may
|
entire substring that was inspected during the partial match; it may
|
||||||
include characters before the actual match start if a lookbehind asser-
|
include characters before the actual match start if a lookbehind asser-
|
||||||
tion, \K, \b, or \B was involved.)
|
tion, \b, or \B was involved. (\K is not supported for DFA matching.)
|
||||||
|
|
||||||
If global matching is requested, the search for further matches resumes
|
If global matching is requested, the search for further matches resumes
|
||||||
at the end of the longest match. For example:
|
at the end of the longest match. For example:
|
||||||
|
@ -1183,5 +1185,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 14 November 2014
|
Last updated: 23 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
|
|
@ -108,8 +108,8 @@ case-equivalent, and these are treated as such.
|
||||||
.sp
|
.sp
|
||||||
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
||||||
are (by default) checked for validity on entry to the relevant functions.
|
are (by default) checked for validity on entry to the relevant functions.
|
||||||
If an invalid UTF string is passed, a negative error code is returned. The
|
If an invalid UTF string is passed, a negative error code is returned. The
|
||||||
code unit offset to the offending character can be extracted from the match
|
code unit offset to the offending character can be extracted from the match
|
||||||
data block by calling \fBpcre2_get_startchar()\fP, which is used for this
|
data block by calling \fBpcre2_get_startchar()\fP, which is used for this
|
||||||
purpose after a UTF error.
|
purpose after a UTF error.
|
||||||
.P
|
.P
|
||||||
|
|
|
@ -18,10 +18,10 @@ to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H,
|
||||||
but if you do, default values will be taken from config.h for non-boolean
|
but if you do, default values will be taken from config.h for non-boolean
|
||||||
macros that are not defined on the command line.
|
macros that are not defined on the command line.
|
||||||
|
|
||||||
Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be
|
Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be defined
|
||||||
defined (conventionally to 1) for TRUE, or not defined at all for FALSE. All
|
(conventionally to 1) for TRUE, or not defined at all for FALSE. All such
|
||||||
such macros are listed as a commented #undef in config.h.generic. Macros such
|
macros are listed as a commented #undef in config.h.generic. Macros such as
|
||||||
as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
|
MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
|
||||||
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
|
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
|
||||||
|
|
||||||
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
|
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
|
||||||
|
@ -201,7 +201,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||||
#define PACKAGE_NAME "PCRE2"
|
#define PACKAGE_NAME "PCRE2"
|
||||||
|
|
||||||
/* Define to the full name and version of this package. */
|
/* Define to the full name and version of this package. */
|
||||||
#define PACKAGE_STRING "PCRE2 10.00-DEV"
|
#define PACKAGE_STRING "PCRE2 10.00-RC1"
|
||||||
|
|
||||||
/* Define to the one symbol short name of this package. */
|
/* Define to the one symbol short name of this package. */
|
||||||
#define PACKAGE_TARNAME "pcre2"
|
#define PACKAGE_TARNAME "pcre2"
|
||||||
|
@ -210,7 +210,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||||
#define PACKAGE_URL ""
|
#define PACKAGE_URL ""
|
||||||
|
|
||||||
/* Define to the version of this package. */
|
/* Define to the version of this package. */
|
||||||
#define PACKAGE_VERSION "10.00-DEV"
|
#define PACKAGE_VERSION "10.00-RC1"
|
||||||
|
|
||||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||||
|
@ -288,7 +288,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||||
/* #undef SUPPORT_VALGRIND */
|
/* #undef SUPPORT_VALGRIND */
|
||||||
|
|
||||||
/* Version number of package */
|
/* Version number of package */
|
||||||
#define VERSION "10.00-DEV"
|
#define VERSION "10.00-RC1"
|
||||||
|
|
||||||
/* Define to empty if `const' does not conform to ANSI C. */
|
/* Define to empty if `const' does not conform to ANSI C. */
|
||||||
/* #undef const */
|
/* #undef const */
|
||||||
|
|
|
@ -43,8 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define PCRE2_MAJOR 10
|
#define PCRE2_MAJOR 10
|
||||||
#define PCRE2_MINOR 00
|
#define PCRE2_MINOR 00
|
||||||
#define PCRE2_PRERELEASE -DEV
|
#define PCRE2_PRERELEASE -RC1
|
||||||
#define PCRE2_DATE 2014-99-99
|
#define PCRE2_DATE 2014-11-24
|
||||||
|
|
||||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||||
imported have to be identified as such. When building PCRE2, the appropriate
|
imported have to be identified as such. When building PCRE2, the appropriate
|
||||||
|
@ -125,8 +125,8 @@ D is inspected during pcre2_dfa_match() execution
|
||||||
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
|
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
|
||||||
|
|
||||||
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED,
|
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED
|
||||||
PCRE2_NO_START_OPTIMIZE, and PCRE2_NO_UTF_CHECK can also be passed to these
|
and PCRE2_NO_UTF_CHECK can also be passed to these functions, so take care not
|
||||||
functions, so take care not to define synonyms by mistake. */
|
to define synonyms by mistake. */
|
||||||
|
|
||||||
#define PCRE2_NOTBOL 0x00000001u
|
#define PCRE2_NOTBOL 0x00000001u
|
||||||
#define PCRE2_NOTEOL 0x00000002u
|
#define PCRE2_NOTEOL 0x00000002u
|
||||||
|
@ -140,6 +140,10 @@ functions, so take care not to define synonyms by mistake. */
|
||||||
#define PCRE2_DFA_RESTART 0x00000040u
|
#define PCRE2_DFA_RESTART 0x00000040u
|
||||||
#define PCRE2_DFA_SHORTEST 0x00000080u
|
#define PCRE2_DFA_SHORTEST 0x00000080u
|
||||||
|
|
||||||
|
/* This is an additional option for pcre2_substitute(). */
|
||||||
|
|
||||||
|
#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u
|
||||||
|
|
||||||
/* Newline and \R settings, for use in compile contexts. The newline values
|
/* Newline and \R settings, for use in compile contexts. The newline values
|
||||||
must be kept in step with values set in config.h and both sets must all be
|
must be kept in step with values set in config.h and both sets must all be
|
||||||
greater than zero. */
|
greater than zero. */
|
||||||
|
@ -202,24 +206,25 @@ context functions. */
|
||||||
#define PCRE2_ERROR_BADMODE (-32)
|
#define PCRE2_ERROR_BADMODE (-32)
|
||||||
#define PCRE2_ERROR_BADOFFSET (-33)
|
#define PCRE2_ERROR_BADOFFSET (-33)
|
||||||
#define PCRE2_ERROR_BADOPTION (-34)
|
#define PCRE2_ERROR_BADOPTION (-34)
|
||||||
#define PCRE2_ERROR_BADUTFOFFSET (-35)
|
#define PCRE2_ERROR_BADREPLACEMENT (-35)
|
||||||
#define PCRE2_ERROR_CALLOUT (-36) /* Never used by PCRE2 itself */
|
#define PCRE2_ERROR_BADUTFOFFSET (-36)
|
||||||
#define PCRE2_ERROR_DFA_BADRESTART (-37)
|
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
|
||||||
#define PCRE2_ERROR_DFA_RECURSE (-38)
|
#define PCRE2_ERROR_DFA_BADRESTART (-38)
|
||||||
#define PCRE2_ERROR_DFA_UCOND (-39)
|
#define PCRE2_ERROR_DFA_RECURSE (-39)
|
||||||
#define PCRE2_ERROR_DFA_UITEM (-40)
|
#define PCRE2_ERROR_DFA_UCOND (-40)
|
||||||
#define PCRE2_ERROR_DFA_WSSIZE (-41)
|
#define PCRE2_ERROR_DFA_UITEM (-41)
|
||||||
#define PCRE2_ERROR_INTERNAL (-42)
|
#define PCRE2_ERROR_DFA_WSSIZE (-42)
|
||||||
#define PCRE2_ERROR_JIT_BADOPTION (-43)
|
#define PCRE2_ERROR_INTERNAL (-43)
|
||||||
#define PCRE2_ERROR_JIT_STACKLIMIT (-44)
|
#define PCRE2_ERROR_JIT_BADOPTION (-44)
|
||||||
#define PCRE2_ERROR_MATCHLIMIT (-45)
|
#define PCRE2_ERROR_JIT_STACKLIMIT (-45)
|
||||||
#define PCRE2_ERROR_NOMEMORY (-46)
|
#define PCRE2_ERROR_MATCHLIMIT (-46)
|
||||||
#define PCRE2_ERROR_NOSUBSTRING (-47)
|
#define PCRE2_ERROR_NOMEMORY (-47)
|
||||||
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-48)
|
#define PCRE2_ERROR_NOSUBSTRING (-48)
|
||||||
#define PCRE2_ERROR_NULL (-49)
|
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-49)
|
||||||
#define PCRE2_ERROR_RECURSELOOP (-50)
|
#define PCRE2_ERROR_NULL (-50)
|
||||||
#define PCRE2_ERROR_RECURSIONLIMIT (-51)
|
#define PCRE2_ERROR_RECURSELOOP (-51)
|
||||||
#define PCRE2_ERROR_UNSET (-52)
|
#define PCRE2_ERROR_RECURSIONLIMIT (-52)
|
||||||
|
#define PCRE2_ERROR_UNSET (-53)
|
||||||
|
|
||||||
/* Request types for pcre2_pattern_info() */
|
/* Request types for pcre2_pattern_info() */
|
||||||
|
|
||||||
|
@ -406,7 +411,8 @@ PCRE2_EXP_DECL \
|
||||||
pcre2_match_data *pcre2_match_data_create(uint32_t, \
|
pcre2_match_data *pcre2_match_data_create(uint32_t, \
|
||||||
pcre2_general_context *); \
|
pcre2_general_context *); \
|
||||||
PCRE2_EXP_DECL \
|
PCRE2_EXP_DECL \
|
||||||
pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *, \
|
pcre2_match_data *pcre2_match_data_create_from_pattern(\
|
||||||
|
const pcre2_code *, \
|
||||||
pcre2_general_context *); \
|
pcre2_general_context *); \
|
||||||
PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \
|
PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \
|
||||||
PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||||
|
@ -447,19 +453,28 @@ PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \
|
||||||
PCRE2_UCHAR ***, PCRE2_SIZE **);
|
PCRE2_UCHAR ***, PCRE2_SIZE **);
|
||||||
|
|
||||||
|
|
||||||
|
/* Convenience function for match + substitute. */
|
||||||
|
|
||||||
|
#define PCRE2_SUBSTITUTE_FUNCTION \
|
||||||
|
PCRE2_EXP_DECL int pcre2_substitute(const pcre2_code *, \
|
||||||
|
PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||||
|
pcre2_match_data *, pcre2_match_context *, \
|
||||||
|
PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, \
|
||||||
|
PCRE2_SIZE *);
|
||||||
|
|
||||||
|
|
||||||
/* Functions for JIT processing */
|
/* Functions for JIT processing */
|
||||||
|
|
||||||
#define PCRE2_JIT_FUNCTIONS \
|
#define PCRE2_JIT_FUNCTIONS \
|
||||||
PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \
|
PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \
|
||||||
PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \
|
PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \
|
||||||
PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||||
pcre2_match_data *, pcre2_match_context *, \
|
pcre2_match_data *, pcre2_match_context *); \
|
||||||
pcre2_jit_stack *); \
|
PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *); \
|
||||||
PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *);\
|
|
||||||
PCRE2_EXP_DECL \
|
PCRE2_EXP_DECL \
|
||||||
pcre2_jit_stack *pcre2_jit_stack_create(pcre2_general_context *, \
|
pcre2_jit_stack *pcre2_jit_stack_create(pcre2_general_context *, \
|
||||||
PCRE2_SIZE, PCRE2_SIZE); \
|
PCRE2_SIZE, PCRE2_SIZE); \
|
||||||
PCRE2_EXP_DECL void pcre2_jit_stack_assign(const pcre2_code *, \
|
PCRE2_EXP_DECL void pcre2_jit_stack_assign(pcre2_match_context *, \
|
||||||
pcre2_jit_callback, void *); \
|
pcre2_jit_callback, void *); \
|
||||||
PCRE2_EXP_DECL void pcre2_jit_stack_free(pcre2_jit_stack *);
|
PCRE2_EXP_DECL void pcre2_jit_stack_free(pcre2_jit_stack *);
|
||||||
|
|
||||||
|
@ -551,6 +566,7 @@ pcre2_compile are called by application code. */
|
||||||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||||
#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_)
|
#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_)
|
||||||
#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_)
|
#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_)
|
||||||
|
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
||||||
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
||||||
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
||||||
#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_)
|
#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_)
|
||||||
|
@ -577,6 +593,7 @@ PCRE2_MATCH_CONTEXT_FUNCTIONS \
|
||||||
PCRE2_COMPILE_FUNCTIONS \
|
PCRE2_COMPILE_FUNCTIONS \
|
||||||
PCRE2_PATTERN_INFO_FUNCTIONS \
|
PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||||
PCRE2_MATCH_FUNCTIONS \
|
PCRE2_MATCH_FUNCTIONS \
|
||||||
|
PCRE2_SUBSTITUTE_FUNCTION \
|
||||||
PCRE2_SUBSTRING_FUNCTIONS \
|
PCRE2_SUBSTRING_FUNCTIONS \
|
||||||
PCRE2_JIT_FUNCTIONS \
|
PCRE2_JIT_FUNCTIONS \
|
||||||
PCRE2_OTHER_FUNCTIONS
|
PCRE2_OTHER_FUNCTIONS
|
||||||
|
|
|
@ -1570,13 +1570,13 @@ enum {
|
||||||
|
|
||||||
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
||||||
definitions that follow must also be updated to match. There are also tables
|
definitions that follow must also be updated to match. There are also tables
|
||||||
called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
|
called "opcode_possessify" in pcre2_compile.c and "coptable" and "poptable" in
|
||||||
pcre_dfa_exec.c that must be updated. */
|
pcre2_dfa_exec.c that must be updated. */
|
||||||
|
|
||||||
|
|
||||||
/* This macro defines textual names for all the opcodes. These are used only
|
/* This macro defines textual names for all the opcodes. These are used only
|
||||||
for debugging, and some of them are only partial names. The macro is referenced
|
for debugging, and some of them are only partial names. The macro is referenced
|
||||||
only in pcre_printint.c, which fills out the full names in many cases (and in
|
only in pcre2_printint.c, which fills out the full names in many cases (and in
|
||||||
some cases doesn't actually use these names at all). */
|
some cases doesn't actually use these names at all). */
|
||||||
|
|
||||||
#define OP_NAME_LIST \
|
#define OP_NAME_LIST \
|
||||||
|
|
|
@ -5570,13 +5570,13 @@ else for (gmatched = 0;; gmatched++)
|
||||||
fprintf(outfile, "Failed: error %d: ", capcount);
|
fprintf(outfile, "Failed: error %d: ", capcount);
|
||||||
PCRE2_GET_ERROR_MESSAGE(mlen, capcount, pbuffer);
|
PCRE2_GET_ERROR_MESSAGE(mlen, capcount, pbuffer);
|
||||||
PCHARSV(CASTVAR(void *, pbuffer), 0, mlen, FALSE, outfile);
|
PCHARSV(CASTVAR(void *, pbuffer), 0, mlen, FALSE, outfile);
|
||||||
if (capcount <= PCRE2_ERROR_UTF8_ERR1 &&
|
if (capcount <= PCRE2_ERROR_UTF8_ERR1 &&
|
||||||
capcount >= PCRE2_ERROR_UTF32_ERR2)
|
capcount >= PCRE2_ERROR_UTF32_ERR2)
|
||||||
{
|
{
|
||||||
PCRE2_SIZE startchar;
|
PCRE2_SIZE startchar;
|
||||||
PCRE2_GET_STARTCHAR(startchar, match_data);
|
PCRE2_GET_STARTCHAR(startchar, match_data);
|
||||||
fprintf(outfile, " at offset %ld", startchar);
|
fprintf(outfile, " at offset %ld", startchar);
|
||||||
}
|
}
|
||||||
fprintf(outfile, "\n");
|
fprintf(outfile, "\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue