Final preparations for 10.00-RC1
This commit is contained in:
parent
91f2e97474
commit
0acc416ed1
19
ChangeLog
19
ChangeLog
|
@ -1,12 +1,12 @@
|
|||
Change Log for PCRE2
|
||||
--------------------
|
||||
|
||||
Version 10.0 xx-xxxx-2014
|
||||
-------------------------
|
||||
Version 10.00 24-November-2014
|
||||
------------------------------
|
||||
|
||||
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
||||
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
|
||||
item 20 for release 8.36.
|
||||
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||
library. Changes prior to 10.00 are logged in the ChangeLog file for the old
|
||||
API, up to item 20 for release 8.36.
|
||||
|
||||
The code of the library was heavily revised as part of the new API
|
||||
implementation. Details of each and every modification were not individually
|
||||
|
@ -25,7 +25,7 @@ matched by that pattern.
|
|||
|
||||
4. For the benefit of those who use PCRE2 via some other application, that is,
|
||||
not writing the function calls themselves, it is possible to check the PCRE2
|
||||
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
||||
version by matching a pattern such as /(?(VERSION>=10)yes|no)/ against a
|
||||
string such as "yesno".
|
||||
|
||||
5. There are case-equivalent Unicode characters whose encodings use different
|
||||
|
@ -46,14 +46,15 @@ characters, for example: /(?:(?=.)|(?<!x))a/.
|
|||
|
||||
7. When an (*ACCEPT) is triggered inside capturing parentheses, it arranges for
|
||||
those parentheses to be closed with whatever has been captured so far. However,
|
||||
it was failing to mark any other groups between the hightest capture so far and
|
||||
it was failing to mark any other groups between the highest capture so far and
|
||||
the current group as "unset". Thus, the ovector for those groups contained
|
||||
whatever was previously there. An example is the pattern /(x)|((*ACCEPT))/ when
|
||||
matched against "abcd".
|
||||
|
||||
8. The pcre2_substitute() function has been implemented.
|
||||
|
||||
9. If an assertion condition was quantified with a minimum of zero (an odd
|
||||
thing to do, but it happened), SIGSEGV or other misbehaviour could occur.
|
||||
9. If an assertion used as a condition was quantified with a minimum of zero
|
||||
(an odd thing to do, but it happened), SIGSEGV or other misbehaviour could
|
||||
occur.
|
||||
|
||||
****
|
||||
|
|
50
Makefile.am
50
Makefile.am
|
@ -375,28 +375,34 @@ CLEANFILES += src/pcre2_chartables.c
|
|||
# when pcre2_jit_compile.c is processed, so they must be distributed.
|
||||
|
||||
EXTRA_DIST += \
|
||||
sljit/sljitConfig.h \
|
||||
sljit/sljitConfigInternal.h \
|
||||
sljit/sljitExecAllocator.c \
|
||||
sljit/sljitLir.c \
|
||||
sljit/sljitLir.h \
|
||||
sljit/sljitNativeARM_32.c \
|
||||
sljit/sljitNativeARM_64.c \
|
||||
sljit/sljitNativeARM_T2_32.c \
|
||||
sljit/sljitNativeMIPS_32.c \
|
||||
sljit/sljitNativeMIPS_64.c \
|
||||
sljit/sljitNativeMIPS_common.c \
|
||||
sljit/sljitNativePPC_32.c \
|
||||
sljit/sljitNativePPC_64.c \
|
||||
sljit/sljitNativePPC_common.c \
|
||||
sljit/sljitNativeSPARC_32.c \
|
||||
sljit/sljitNativeSPARC_common.c \
|
||||
sljit/sljitNativeTILEGX-encoder.c \
|
||||
sljit/sljitNativeTILEGX_64.c \
|
||||
sljit/sljitNativeX86_32.c \
|
||||
sljit/sljitNativeX86_64.c \
|
||||
sljit/sljitNativeX86_common.c \
|
||||
sljit/sljitUtils.c
|
||||
src/sljit/sljitConfig.h \
|
||||
src/sljit/sljitConfigInternal.h \
|
||||
src/sljit/sljitExecAllocator.c \
|
||||
src/sljit/sljitLir.c \
|
||||
src/sljit/sljitLir.h \
|
||||
src/sljit/sljitNativeARM_32.c \
|
||||
src/sljit/sljitNativeARM_64.c \
|
||||
src/sljit/sljitNativeARM_T2_32.c \
|
||||
src/sljit/sljitNativeMIPS_32.c \
|
||||
src/sljit/sljitNativeMIPS_64.c \
|
||||
src/sljit/sljitNativeMIPS_common.c \
|
||||
src/sljit/sljitNativePPC_32.c \
|
||||
src/sljit/sljitNativePPC_64.c \
|
||||
src/sljit/sljitNativePPC_common.c \
|
||||
src/sljit/sljitNativeSPARC_32.c \
|
||||
src/sljit/sljitNativeSPARC_common.c \
|
||||
src/sljit/sljitNativeTILEGX-encoder.c \
|
||||
src/sljit/sljitNativeTILEGX_64.c \
|
||||
src/sljit/sljitNativeX86_32.c \
|
||||
src/sljit/sljitNativeX86_64.c \
|
||||
src/sljit/sljitNativeX86_common.c \
|
||||
src/sljit/sljitUtils.c
|
||||
|
||||
# Some of the JIT sources are also in separate files that are #included.
|
||||
|
||||
EXTRA_DIST += \
|
||||
src/pcre2_jit_match.c \
|
||||
src/pcre2_jit_misc.c
|
||||
|
||||
if WITH_PCRE2_8
|
||||
libpcre2_8_la_LDFLAGS = $(EXTRA_LIBPCRE2_8_LDFLAGS)
|
||||
|
|
12
NEWS
12
NEWS
|
@ -1,11 +1,13 @@
|
|||
News about PCRE2 releases
|
||||
-------------------------
|
||||
|
||||
Version 10.0 xx-xxxx-2014
|
||||
-------------------------
|
||||
Version 10.00 24-November-2014
|
||||
------------------------------
|
||||
|
||||
Version 10.0 is the first release of PCRE2, a revised API for the PCRE library.
|
||||
Changes prior to 10.0 are logged in the ChangeLog file for the old API, up to
|
||||
item 20 for release 8.36.
|
||||
Version 10.00 is the first release of PCRE2, a revised API for the PCRE
|
||||
library. Changes prior to 10.00 are logged in the ChangeLog file for the old
|
||||
API, up to item 20 for release 8.36. New programs are recommended to use the
|
||||
new library. Programs that use the original (PCRE1) API will need changing
|
||||
before linking with the new library.
|
||||
|
||||
****
|
||||
|
|
72
README
72
README
|
@ -5,11 +5,9 @@ PCRE2 is a re-implementation of the original PCRE library with an entirely new
|
|||
API. The latest release of PCRE2 is always available in three alternative
|
||||
formats from:
|
||||
|
||||
FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
|
@ -46,7 +44,7 @@ there are as yet no C++ wrappers.
|
|||
|
||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||
man page). These end up in the library called libpcre2posix. Note that this
|
||||
man page). These can be found in a library called libpcre2posix. Note that this
|
||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||
and does not give full access to all of PCRE2's facilities.
|
||||
|
@ -72,7 +70,7 @@ new names.
|
|||
|
||||
|
||||
Documentation for PCRE2
|
||||
----------------------
|
||||
-----------------------
|
||||
|
||||
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
||||
with a set of man pages whose names all start with "pcre2". The one that is
|
||||
|
@ -95,7 +93,7 @@ PCRE2 documentation is supplied in two other forms:
|
|||
|
||||
|
||||
Building PCRE2 on non-Unix-like systems
|
||||
--------------------------------------
|
||||
---------------------------------------
|
||||
|
||||
For a non-Unix-like system, please read the comments in the file
|
||||
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||
|
@ -112,7 +110,7 @@ library, because it uses only Standard C functions.
|
|||
|
||||
|
||||
Building PCRE2 without using autotools
|
||||
-------------------------------------
|
||||
--------------------------------------
|
||||
|
||||
The use of autotools (in particular, libtool) is problematic in some
|
||||
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||
|
@ -120,7 +118,7 @@ file for ways of building PCRE2 without using autotools.
|
|||
|
||||
|
||||
Building PCRE2 using autotools
|
||||
-----------------------------
|
||||
------------------------------
|
||||
|
||||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
@ -166,15 +164,15 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
||||
the "configure" command, the 16-bit library is also built. If you add
|
||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also built.
|
||||
If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable
|
||||
building the 8-bit library.
|
||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also
|
||||
built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8
|
||||
to disable building the 8-bit library.
|
||||
|
||||
. If you want to include support for just-in-time compiling, which can give
|
||||
large performance improvements on certain platforms, add --enable-jit to the
|
||||
"configure" command. This support is available only for certain hardware
|
||||
architectures. If you try to enable it on an unsupported architecture, there
|
||||
will be a compile time error. FIXME: NOT YET IMPLEMENTED.
|
||||
will be a compile time error.
|
||||
|
||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||
you add --disable-pcre2grep-jit to the "configure" command.
|
||||
|
@ -196,13 +194,13 @@ library. They are also documented in the pcre2build man page.
|
|||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, as indicating the
|
||||
end of a line. Whatever you specify at build time is the default; the caller
|
||||
of PCRE2 can change the selection at run time. The default newline indicator
|
||||
is a single LF character (the Unix standard). You can specify the default
|
||||
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
||||
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
||||
newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf,
|
||||
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||
--enable-newline-is-any to the "configure" command, respectively.
|
||||
|
||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||
|
@ -251,8 +249,9 @@ library. They are also documented in the pcre2build man page.
|
|||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||
offsets. Increasing the internal link size reduces performance. In the 32-bit
|
||||
library, the link size setting is ignored, as 4-byte offsets are always used.
|
||||
offsets. Increasing the internal link size reduces performance in the 8-bit
|
||||
and 16-bit libraries. In the 32-bit library, the link size setting is
|
||||
ignored, as 4-byte offsets are always used.
|
||||
|
||||
. You can build PCRE2 so that its internal match() function that is called from
|
||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||
|
@ -376,12 +375,13 @@ contains compiler output from tests that "configure" runs.
|
|||
Once "configure" has run, you can run "make". This builds whichever of the
|
||||
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
||||
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
||||
test program called pcre2_jit_test is built as well. FIXME: still to be
|
||||
implemented. If the 8-bit library is built, libpcre2-posix and the pcre2grep
|
||||
command are also built.
|
||||
test program called pcre2_jit_test is built as well. If the 8-bit library is
|
||||
built, libpcre2-posix and the pcre2grep command are also built. Running
|
||||
"make" with the -j option may speed up compilation on multiprocessor systems.
|
||||
|
||||
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
||||
tests are given below in a separate section of this document.
|
||||
tests are given below in a separate section of this document. The -j option of
|
||||
"make" can also be used when running the tests.
|
||||
|
||||
You can use "make install" to install PCRE2 into live directories on your
|
||||
system. The following are installed (file names are all relative to the
|
||||
|
@ -528,7 +528,7 @@ Testing PCRE2
|
|||
|
||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||
There is another script called RunGrepTest that tests the options of the
|
||||
pcre2grep command. When JIT support is enabled, another test program called
|
||||
pcre2grep command. When JIT support is enabled, a third test program called
|
||||
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
||||
you obey "make check". For other environments, see the instructions in
|
||||
NON-AUTOTOOLS-BUILD.
|
||||
|
@ -709,7 +709,6 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_exec.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
@ -721,6 +720,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_pattern_info.c )
|
||||
src/pcre2_string_utils.c )
|
||||
src/pcre2_study.c )
|
||||
src/pcre2_substitute.c )
|
||||
src/pcre2_substring.c )
|
||||
src/pcre2_tables.c )
|
||||
src/pcre2_ucd.c )
|
||||
|
@ -736,13 +736,15 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_intmodedep.h a mode-specific internal header
|
||||
src/pcre2_ucp.h header for Unicode property handling
|
||||
|
||||
sljit/* 16 files that make up the JIT compiler FIXME
|
||||
sljit/* source files for the JIT compiler
|
||||
|
||||
(B) Source files for programs that use PCRE2:
|
||||
|
||||
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
||||
src/pcre2grep.c source of a grep utility that uses PCRE2
|
||||
src/pcre2test.c comprehensive test program
|
||||
src/pcre2_printint.c part of pcre2test
|
||||
src/pcre2_jit_test.c JIT test program
|
||||
|
||||
(C) Auxiliary files:
|
||||
|
||||
|
@ -790,7 +792,6 @@ The distribution should contain the files listed below.
|
|||
mkinstalldirs script for making install directories
|
||||
perltest.sh script for running a Perl test program
|
||||
pcre2-config.in source of script which retains PCRE2 information
|
||||
pcre2_jit_test.c test program for the JIT compiler
|
||||
testdata/testinput* test data for main library tests
|
||||
testdata/testoutput* expected test results
|
||||
testdata/grep* input and output for pcre2grep tests
|
||||
|
@ -805,25 +806,14 @@ The distribution should contain the files listed below.
|
|||
CMakeLists.txt
|
||||
config-cmake.h.in
|
||||
|
||||
(E) Auxiliary files for VPASCAL FIXME FIXME
|
||||
|
||||
makevp.bat
|
||||
makevp_c.txt
|
||||
makevp_l.txt
|
||||
pcre2gexp.pas
|
||||
|
||||
(F) Auxiliary files for building PCRE2 "by hand"
|
||||
(E) Auxiliary files for building PCRE2 "by hand"
|
||||
|
||||
pcre2.h.generic ) a version of the public PCRE2 header file
|
||||
) for use in non-"configure" environments
|
||||
config.h.generic ) a version of config.h for use in non-"configure"
|
||||
) environments
|
||||
|
||||
(F) Miscellaneous
|
||||
|
||||
RunTest.bat a script for running tests under Windows FIXME
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 03 November 2014
|
||||
Last updated: 24 November 2014
|
||||
|
|
|
@ -10,8 +10,8 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
|
|||
|
||||
m4_define(pcre2_major, [10])
|
||||
m4_define(pcre2_minor, [00])
|
||||
m4_define(pcre2_prerelease, [-DEV])
|
||||
m4_define(pcre2_date, [2014-99-99])
|
||||
m4_define(pcre2_prerelease, [-RC1])
|
||||
m4_define(pcre2_date, [2014-11-24])
|
||||
|
||||
# NOTE: The CMakeLists.txt file searches for the above variables in the first
|
||||
# 50 lines of this file. Please update that if the variables above are moved.
|
||||
|
|
|
@ -5,11 +5,9 @@ PCRE2 is a re-implementation of the original PCRE library with an entirely new
|
|||
API. The latest release of PCRE2 is always available in three alternative
|
||||
formats from:
|
||||
|
||||
FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
|
||||
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.gz
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.tar.bz2
|
||||
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre2-xxx.zip
|
||||
|
||||
There is a mailing list for discussion about the development of PCRE (both the
|
||||
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||
|
@ -46,7 +44,7 @@ there are as yet no C++ wrappers.
|
|||
|
||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||
man page). These end up in the library called libpcre2posix. Note that this
|
||||
man page). These can be found in a library called libpcre2posix. Note that this
|
||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||
and does not give full access to all of PCRE2's facilities.
|
||||
|
@ -72,7 +70,7 @@ new names.
|
|||
|
||||
|
||||
Documentation for PCRE2
|
||||
----------------------
|
||||
-----------------------
|
||||
|
||||
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
||||
with a set of man pages whose names all start with "pcre2". The one that is
|
||||
|
@ -95,7 +93,7 @@ PCRE2 documentation is supplied in two other forms:
|
|||
|
||||
|
||||
Building PCRE2 on non-Unix-like systems
|
||||
--------------------------------------
|
||||
---------------------------------------
|
||||
|
||||
For a non-Unix-like system, please read the comments in the file
|
||||
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||
|
@ -112,7 +110,7 @@ library, because it uses only Standard C functions.
|
|||
|
||||
|
||||
Building PCRE2 without using autotools
|
||||
-------------------------------------
|
||||
--------------------------------------
|
||||
|
||||
The use of autotools (in particular, libtool) is problematic in some
|
||||
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||
|
@ -120,7 +118,7 @@ file for ways of building PCRE2 without using autotools.
|
|||
|
||||
|
||||
Building PCRE2 using autotools
|
||||
-----------------------------
|
||||
------------------------------
|
||||
|
||||
The following instructions assume the use of the widely used "configure; make;
|
||||
make install" (autotools) process.
|
||||
|
@ -166,15 +164,15 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
. By default, only the 8-bit library is built. If you add --enable-pcre2-16 to
|
||||
the "configure" command, the 16-bit library is also built. If you add
|
||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also built.
|
||||
If you want only the 16-bit or 32-bit library, use --disable-pcre2-8 to disable
|
||||
building the 8-bit library.
|
||||
--enable-pcre2-32 to the "configure" command, the 32-bit library is also
|
||||
built. If you want only the 16-bit or 32-bit library, use --disable-pcre2-8
|
||||
to disable building the 8-bit library.
|
||||
|
||||
. If you want to include support for just-in-time compiling, which can give
|
||||
large performance improvements on certain platforms, add --enable-jit to the
|
||||
"configure" command. This support is available only for certain hardware
|
||||
architectures. If you try to enable it on an unsupported architecture, there
|
||||
will be a compile time error. FIXME: NOT YET IMPLEMENTED.
|
||||
will be a compile time error.
|
||||
|
||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||
you add --disable-pcre2grep-jit to the "configure" command.
|
||||
|
@ -196,13 +194,13 @@ library. They are also documented in the pcre2build man page.
|
|||
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||
However, only the basic two-letter properties such as Lu are supported.
|
||||
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF, or any
|
||||
of the preceding, or any of the Unicode newline sequences, as indicating the
|
||||
end of a line. Whatever you specify at build time is the default; the caller
|
||||
of PCRE2 can change the selection at run time. The default newline indicator
|
||||
is a single LF character (the Unix standard). You can specify the default
|
||||
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
||||
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
||||
newline indicator by adding --enable-newline-is-cr, --enable-newline-is-lf,
|
||||
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||
--enable-newline-is-any to the "configure" command, respectively.
|
||||
|
||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||
|
@ -251,8 +249,9 @@ library. They are also documented in the pcre2build man page.
|
|||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||
offsets. Increasing the internal link size reduces performance. In the 32-bit
|
||||
library, the link size setting is ignored, as 4-byte offsets are always used.
|
||||
offsets. Increasing the internal link size reduces performance in the 8-bit
|
||||
and 16-bit libraries. In the 32-bit library, the link size setting is
|
||||
ignored, as 4-byte offsets are always used.
|
||||
|
||||
. You can build PCRE2 so that its internal match() function that is called from
|
||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||
|
@ -376,12 +375,13 @@ contains compiler output from tests that "configure" runs.
|
|||
Once "configure" has run, you can run "make". This builds whichever of the
|
||||
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
||||
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
||||
test program called pcre2_jit_test is built as well. FIXME: still to be
|
||||
implemented. If the 8-bit library is built, libpcre2-posix and the pcre2grep
|
||||
command are also built.
|
||||
test program called pcre2_jit_test is built as well. If the 8-bit library is
|
||||
built, libpcre2-posix and the pcre2grep command are also built. Running
|
||||
"make" with the -j option may speed up compilation on multiprocessor systems.
|
||||
|
||||
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
||||
tests are given below in a separate section of this document.
|
||||
tests are given below in a separate section of this document. The -j option of
|
||||
"make" can also be used when running the tests.
|
||||
|
||||
You can use "make install" to install PCRE2 into live directories on your
|
||||
system. The following are installed (file names are all relative to the
|
||||
|
@ -528,7 +528,7 @@ Testing PCRE2
|
|||
|
||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||
There is another script called RunGrepTest that tests the options of the
|
||||
pcre2grep command. When JIT support is enabled, another test program called
|
||||
pcre2grep command. When JIT support is enabled, a third test program called
|
||||
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
||||
you obey "make check". For other environments, see the instructions in
|
||||
NON-AUTOTOOLS-BUILD.
|
||||
|
@ -709,7 +709,6 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_context.c )
|
||||
src/pcre2_dfa_match.c )
|
||||
src/pcre2_error.c )
|
||||
src/pcre2_exec.c )
|
||||
src/pcre2_jit_compile.c )
|
||||
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||
|
@ -721,6 +720,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_pattern_info.c )
|
||||
src/pcre2_string_utils.c )
|
||||
src/pcre2_study.c )
|
||||
src/pcre2_substitute.c )
|
||||
src/pcre2_substring.c )
|
||||
src/pcre2_tables.c )
|
||||
src/pcre2_ucd.c )
|
||||
|
@ -736,13 +736,15 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_intmodedep.h a mode-specific internal header
|
||||
src/pcre2_ucp.h header for Unicode property handling
|
||||
|
||||
sljit/* 16 files that make up the JIT compiler FIXME
|
||||
sljit/* source files for the JIT compiler
|
||||
|
||||
(B) Source files for programs that use PCRE2:
|
||||
|
||||
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
||||
src/pcre2grep.c source of a grep utility that uses PCRE2
|
||||
src/pcre2test.c comprehensive test program
|
||||
src/pcre2_printint.c part of pcre2test
|
||||
src/pcre2_jit_test.c JIT test program
|
||||
|
||||
(C) Auxiliary files:
|
||||
|
||||
|
@ -790,7 +792,6 @@ The distribution should contain the files listed below.
|
|||
mkinstalldirs script for making install directories
|
||||
perltest.sh script for running a Perl test program
|
||||
pcre2-config.in source of script which retains PCRE2 information
|
||||
pcre2_jit_test.c test program for the JIT compiler
|
||||
testdata/testinput* test data for main library tests
|
||||
testdata/testoutput* expected test results
|
||||
testdata/grep* input and output for pcre2grep tests
|
||||
|
@ -805,25 +806,14 @@ The distribution should contain the files listed below.
|
|||
CMakeLists.txt
|
||||
config-cmake.h.in
|
||||
|
||||
(E) Auxiliary files for VPASCAL FIXME FIXME
|
||||
|
||||
makevp.bat
|
||||
makevp_c.txt
|
||||
makevp_l.txt
|
||||
pcre2gexp.pas
|
||||
|
||||
(F) Auxiliary files for building PCRE2 "by hand"
|
||||
(E) Auxiliary files for building PCRE2 "by hand"
|
||||
|
||||
pcre2.h.generic ) a version of the public PCRE2 header file
|
||||
) for use in non-"configure" environments
|
||||
config.h.generic ) a version of config.h for use in non-"configure"
|
||||
) environments
|
||||
|
||||
(F) Miscellaneous
|
||||
|
||||
RunTest.bat a script for running tests under Windows FIXME
|
||||
|
||||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 03 November 2014
|
||||
Last updated: 24 November 2014
|
||||
|
|
|
@ -2074,6 +2074,12 @@ returned by <b>pcre2_get_startchar()</b>. For a non-partial match, this can be
|
|||
different to the value of <i>ovector[0]</i> if the pattern contains the \K
|
||||
escape sequence. After a partial match, however, this value is always the same
|
||||
as <i>ovector[0]</i> because \K does not affect the result of a partial match.
|
||||
</P>
|
||||
<P>
|
||||
The <b>startchar</b> field is also used to return the offset of an invalid
|
||||
UTF character when UTF checking fails. Details are given in the
|
||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||
page.
|
||||
<a name="errorlist"></a></P>
|
||||
<br><a name="SEC26" href="#TOC1">ERROR RETURNS FROM <b>pcre2_match()</b></a><br>
|
||||
<P>
|
||||
|
@ -2658,7 +2664,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC36" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 21 November 2014
|
||||
Last updated: 23 November 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -197,9 +197,9 @@ the string "dog" matched against the ungreedy pattern shown above:
|
|||
<pre>
|
||||
/dog(sbody)??/
|
||||
</pre>
|
||||
Whereas the standard functions stop as soon as they find the complete match for
|
||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||
return that when PCRE2_PARTIAL_HARD is set.
|
||||
Whereas the standard function stops as soon as it finds the complete match for
|
||||
"dog", the DFA function also finds the partial match for "dogsbody", and so
|
||||
returns that when PCRE2_PARTIAL_HARD is set.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">PARTIAL MATCHING AND WORD BOUNDARIES</a><br>
|
||||
<P>
|
||||
|
|
|
@ -244,7 +244,7 @@ input lines. Each set starts with a regular expression pattern, followed by any
|
|||
number of subject lines to be matched against that pattern. In between sets of
|
||||
test data, command lines that begin with a hash (#) character may appear. This
|
||||
file format, with some restrictions, can also be processed by the
|
||||
<b>perltest.pl</b> script that is distributed with PCRE2 as a means of checking
|
||||
<b>perltest.sh</b> script that is distributed with PCRE2 as a means of checking
|
||||
that the behaviour of PCRE2 and Perl is the same.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -287,11 +287,11 @@ patterns. Modifiers on a pattern can change these settings.
|
|||
#perltest
|
||||
</pre>
|
||||
The appearance of this line causes all subsequent modifier settings to be
|
||||
checked for compatibility with the <b>perltest.pl</b> script, which is used to
|
||||
checked for compatibility with the <b>perltest.sh</b> script, which is used to
|
||||
confirm that Perl gives the same results as PCRE2. Also, apart from comment
|
||||
lines, none of the other command lines are permitted, because they and many
|
||||
of the modifiers are specific to <b>pcre2test</b>, and should not be used in
|
||||
test files that are also processed by <b>perltest.pl</b>. The \fP#perltest\fB
|
||||
test files that are also processed by <b>perltest.sh</b>. The <b>#perltest</b>
|
||||
command helps detect tests that are accidentally put in the wrong file.
|
||||
<pre>
|
||||
#subject <modifier-list>
|
||||
|
@ -307,7 +307,7 @@ for both patterns and subject lines, whereas others are valid for one or the
|
|||
other only. Each modifier has a long name, for example "anchored", and some of
|
||||
them must be followed by an equals sign and a value, for example, "offset=12".
|
||||
Modifiers that do not take values may be preceded by a minus sign to turn off a
|
||||
previous default setting.
|
||||
previous setting.
|
||||
</P>
|
||||
<P>
|
||||
A few of the more common modifiers can also be specified as single letters, for
|
||||
|
@ -376,7 +376,7 @@ encoding non-printing characters in a visible way:
|
|||
\xhh hexadecimal byte (up to 2 hex digits)
|
||||
\x{hh...} hexadecimal character (any number of hex digits)
|
||||
</pre>
|
||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
||||
The use of \x{hh...} is not dependent on the use of the <b>utf</b> modifier on
|
||||
the pattern. It is recognized always. There may be any number of hexadecimal
|
||||
digits inside the braces; invalid values provoke error messages.
|
||||
</P>
|
||||
|
@ -411,7 +411,7 @@ is converted to "abcabcabcabc". This feature does not support nesting. To
|
|||
include a closing square bracket in the characters, code it as \x5D.
|
||||
</P>
|
||||
<P>
|
||||
A backslash followed by an equals sign marke the end of the subject string and
|
||||
A backslash followed by an equals sign marks the end of the subject string and
|
||||
the start of a modifier list. For example:
|
||||
<pre>
|
||||
abc\=notbol,notempty
|
||||
|
@ -503,8 +503,8 @@ is built, with the default default being Unicode.
|
|||
</P>
|
||||
<P>
|
||||
The <b>newline</b> modifier specifies which characters are to be interpreted as
|
||||
newlines, both in the pattern and (by default) in subject lines. The type must
|
||||
be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||
newlines, both in the pattern and in subject lines. The type must be one of CR,
|
||||
LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||
</P>
|
||||
<br><b>
|
||||
Information about a pattern
|
||||
|
@ -522,8 +522,8 @@ regression tests can be used in different environments.
|
|||
</P>
|
||||
<P>
|
||||
The <b>fullbincode</b> modifier, by contrast, <i>does</i> include length and
|
||||
offset values. This is used in a few special tests and is also useful for
|
||||
one-off tests.
|
||||
offset values. This is used in a few special tests that run only for specific
|
||||
code unit widths and link sizes, and is also useful for one-off tests.
|
||||
</P>
|
||||
<P>
|
||||
The <b>info</b> modifier requests information about the compiled pattern
|
||||
|
@ -546,13 +546,14 @@ some typical examples:
|
|||
Last code unit = 'c' (caseless)
|
||||
Subject length lower bound = 3
|
||||
</pre>
|
||||
"Compile options" are those specified to the compile function; "overall
|
||||
options" have added options that are taken or deduced from the pattern. If both
|
||||
sets of options are the same, just a single "options" line is output. "First
|
||||
code unit" is where any match must start; if there is more than one they are
|
||||
listed as "starting code units". "Last code unit" is the last literal code unit
|
||||
that must be present in any match. This is not necessarily the last character.
|
||||
These lines are omitted if no starting or ending code units are recorded.
|
||||
"Compile options" are those specified by modifiers; "overall options" have
|
||||
added options that are taken or deduced from the pattern. If both sets of
|
||||
options are the same, just a single "options" line is output; if there are no
|
||||
options, the line is omitted. "First code unit" is where any match must start;
|
||||
if there is more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match. This is
|
||||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
|
@ -565,16 +566,16 @@ pairs. For example:
|
|||
/ab 32 59/hex
|
||||
</pre>
|
||||
This feature is provided as a way of creating patterns that contain binary zero
|
||||
characters. By default, <b>pcre2test</b> passes patterns as zero-terminated
|
||||
strings to <b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED.
|
||||
However, for patterns specified in hexadecimal, the actual length of the
|
||||
pattern is passed.
|
||||
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
||||
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
||||
actual length of the pattern is passed.
|
||||
</P>
|
||||
<br><b>
|
||||
JIT compilation
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>/jit</b> modifier may optionally be followed by and equals sign and a
|
||||
The <b>/jit</b> modifier may optionally be followed by an equals sign and a
|
||||
number in the range 0 to 7:
|
||||
<pre>
|
||||
0 disable JIT
|
||||
|
@ -606,7 +607,7 @@ pattern shows whether JIT compilation was or was not successful. If
|
|||
<b>jitverify</b> is specified without <b>jit</b>, jit=7 is assumed. If JIT
|
||||
compilation is successful when <b>jitverify</b> is set, the text "(JIT)" is
|
||||
added to the first output line after a match or non match when JIT-compiled
|
||||
code was actually used.
|
||||
code was actually used in the match.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting a locale
|
||||
|
@ -689,8 +690,8 @@ be aborted.
|
|||
Using alternative character tables
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>/tables</b> modifier must be followed by a single digit. It causes a
|
||||
specific set of built-in character tables to be passed to
|
||||
The value specified for the <b>/tables</b> modifier must be one of the digits 0,
|
||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||
<b>pcre2_compile()</b>. This is used in the PCRE2 tests to check behaviour with
|
||||
different character tables. The digit specifies the tables as follows:
|
||||
<pre>
|
||||
|
@ -800,13 +801,13 @@ The effects of these modifiers are described in the following sections.
|
|||
Showing more text
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>aftertext</b> modifier requests that as well as outputting the substring
|
||||
that matched the entire pattern, <b>pcre2test</b> should in addition output the
|
||||
remainder of the subject string. This is useful for tests where the subject
|
||||
contains multiple copies of the same substring. The <b>allaftertext</b> modifier
|
||||
requests the same action for captured substrings as well as the main matched
|
||||
substring. In each case the remainder is output on the following line with a
|
||||
plus character following the capture number.
|
||||
The <b>aftertext</b> modifier requests that as well as outputting the part of
|
||||
the subject string that matched the entire pattern, <b>pcre2test</b> should in
|
||||
addition output the remainder of the subject string. This is useful for tests
|
||||
where the subject contains multiple copies of the same substring. The
|
||||
<b>allaftertext</b> modifier requests the same action for captured substrings as
|
||||
well as the main matched substring. In each case the remainder is output on the
|
||||
following line with a plus character following the capture number.
|
||||
</P>
|
||||
<P>
|
||||
The <b>allusedtext</b> modifier requests that all the text that was consulted
|
||||
|
@ -824,7 +825,8 @@ underneath them. Here is an example:
|
|||
<<< >>>
|
||||
</pre>
|
||||
This shows that the matched string is "abc", with the preceding and following
|
||||
strings "pqr" and "xyz" also consulted during the match.
|
||||
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||
the assertions).
|
||||
</P>
|
||||
<P>
|
||||
The <b>startchar</b> modifier requests that the starting character for the match
|
||||
|
@ -881,7 +883,7 @@ function is called again to search the remainder of the subject. The difference
|
|||
between <b>global</b> and <b>altglobal</b> is that the former uses the
|
||||
<i>start_offset</i> argument to <b>pcre2_match()</b> or <b>pcre2_dfa_match()</b>
|
||||
to start searching at a new point within the entire string (which is what Perl
|
||||
does), whereas the latter passes over a shortened substring. This makes a
|
||||
does), whereas the latter passes over a shortened subject. This makes a
|
||||
difference to the matching process if the pattern begins with a lookbehind
|
||||
assertion (including \b or \B).
|
||||
</P>
|
||||
|
@ -893,7 +895,7 @@ fails, the start offset is advanced, and the normal match is retried. This
|
|||
imitates the way Perl handles such cases when using the <b>/g</b> modifier or
|
||||
the <b>split()</b> function. Normally, the start offset is advanced by one
|
||||
character, but if the newline convention recognizes CRLF as a newline, and the
|
||||
current character is CR followed by LF, an advance of two is used.
|
||||
current character is CR followed by LF, an advance of two characters occurs.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing substring extraction functions
|
||||
|
@ -906,9 +908,9 @@ for example:
|
|||
<pre>
|
||||
abcd\=copy=1,copy=3,get=G1
|
||||
</pre>
|
||||
If the <b>#subject</b> command is used to set default copy and get lists, these
|
||||
can be unset by specifying a negative number for numbered groups and an empty
|
||||
name for named groups.
|
||||
If the <b>#subject</b> command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all numbered
|
||||
groups and an empty name to cancel all named groups.
|
||||
</P>
|
||||
<P>
|
||||
The <b>getall</b> modifier tests <b>pcre2_substring_list_get()</b>, which
|
||||
|
@ -919,7 +921,8 @@ If the subject line is successfully matched, the substrings extracted by the
|
|||
convenience functions are output with C, G, or L after the string number
|
||||
instead of a colon. This is in addition to the normal full list. The string
|
||||
length (that is, the return from the extraction function) is given in
|
||||
parentheses after each substring.
|
||||
parentheses after each substring, followed by the name when the extraction was
|
||||
by name.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing the substitution function
|
||||
|
@ -1093,11 +1096,10 @@ characters before the actual match start if a lookbehind assertion, \K, \b,
|
|||
or \B was involved.)
|
||||
</P>
|
||||
<P>
|
||||
For any other return, <b>pcre2test</b> outputs the PCRE2
|
||||
negative error number and a short descriptive phrase. If the error is a failed
|
||||
UTF string check, the offset of the start of the failing character and the
|
||||
reason code are also output. Here is an example of an interactive
|
||||
<b>pcre2test</b> run.
|
||||
For any other return, <b>pcre2test</b> outputs the PCRE2 negative error number
|
||||
and a short descriptive phrase. If the error is a failed UTF string check, the
|
||||
code unit offset of the start of the failing character is also output. Here is
|
||||
an example of an interactive <b>pcre2test</b> run.
|
||||
<pre>
|
||||
$ pcre2test
|
||||
PCRE2 version 9.00 2014-05-10
|
||||
|
@ -1110,10 +1112,10 @@ reason code are also output. Here is an example of an interactive
|
|||
No match
|
||||
</pre>
|
||||
Unset capturing substrings that are not followed by one that is set are not
|
||||
returned by <b>pcre2_match()</b>, and are not shown by <b>pcre2test</b>. In the
|
||||
following example, there are two capturing substrings, but when the first data
|
||||
line is matched, the second, unset substring is not shown. An "internal" unset
|
||||
substring is shown as "<unset>", as for the second data line.
|
||||
shown by <b>pcre2test</b> unless the <b>allcaptures</b> modifier is specified. In
|
||||
the following example, there are two capturing substrings, but when the first
|
||||
data line is matched, the second, unset substring is not shown. An "internal"
|
||||
unset substring is shown as "<unset>", as for the second data line.
|
||||
<pre>
|
||||
re> /(a)|(b)/
|
||||
data> a
|
||||
|
@ -1149,8 +1151,8 @@ are output in sequence, like this:
|
|||
1: pp
|
||||
</pre>
|
||||
"No match" is output only if the first match attempt fails. Here is an example
|
||||
of a failure message (the offset 4 that is specified by \>4 is past the end of
|
||||
the subject string):
|
||||
of a failure message (the offset 4 that is specified by the <b>offset</b>
|
||||
modifier is past the end of the subject string):
|
||||
<pre>
|
||||
re> /xyz/
|
||||
data> xyz\=offset=4
|
||||
|
@ -1175,12 +1177,13 @@ the subject where there is at least one match. For example:
|
|||
1: tang
|
||||
2: tan
|
||||
</pre>
|
||||
(Using the normal matching function on this data finds only "tang".) The
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero). After a
|
||||
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
||||
partially matching substring. (Note that this is the entire substring that was
|
||||
partially matching substring. Note that this is the entire substring that was
|
||||
inspected during the partial match; it may include characters before the actual
|
||||
match start if a lookbehind assertion, \K, \b, or \B was involved.)
|
||||
match start if a lookbehind assertion, \b, or \B was involved. (\K is not
|
||||
supported for DFA matching.)
|
||||
</P>
|
||||
<P>
|
||||
If global matching is requested, the search for further matches resumes
|
||||
|
@ -1217,9 +1220,9 @@ documentation.
|
|||
</P>
|
||||
<br><a name="SEC16" href="#TOC1">CALLOUTS</a><br>
|
||||
<P>
|
||||
If the pattern contains any callout requests, <b>pcre2test</b>'s callout function
|
||||
is called during matching. This works with both matching functions. By default,
|
||||
the called function displays the callout number, the start and current
|
||||
If the pattern contains any callout requests, <b>pcre2test</b>'s callout
|
||||
function is called during matching. This works with both matching functions. By
|
||||
default, the called function displays the callout number, the start and current
|
||||
positions in the text at the callout time, and the next pattern item to be
|
||||
tested. For example:
|
||||
<pre>
|
||||
|
@ -1306,7 +1309,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC20" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 14 November 2014
|
||||
Last updated: 23 November 2014
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -115,7 +115,10 @@ VALIDITY OF UTF STRINGS
|
|||
<P>
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
||||
are (by default) checked for validity on entry to the relevant functions.
|
||||
If an invalid UTF string is passed, an error return is given.
|
||||
If an invalid UTF string is passed, a negative error code is returned. The
|
||||
code unit offset to the offending character can be extracted from the match
|
||||
data block by calling <b>pcre2_get_startchar()</b>, which is used for this
|
||||
purpose after a UTF error.
|
||||
</P>
|
||||
<P>
|
||||
UTF-16 and UTF-32 strings can indicate their endianness by special code known
|
||||
|
|
|
@ -2057,6 +2057,10 @@ OTHER INFORMATION ABOUT A MATCH
|
|||
value is always the same as ovector[0] because \K does not affect the
|
||||
result of a partial match.
|
||||
|
||||
The startchar field is also used to return the offset of an invalid UTF
|
||||
character when UTF checking fails. Details are given in the pcre2uni-
|
||||
code page.
|
||||
|
||||
|
||||
ERROR RETURNS FROM pcre2_match()
|
||||
|
||||
|
@ -2601,7 +2605,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 21 November 2014
|
||||
Last updated: 23 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -4327,9 +4331,9 @@ PARTIAL MATCHING USING pcre2_dfa_match()
|
|||
|
||||
/dog(sbody)??/
|
||||
|
||||
Whereas the standard functions stop as soon as they find the complete
|
||||
match for "dog", the DFA functions also find the partial match for
|
||||
"dogsbody", and so return that when PCRE2_PARTIAL_HARD is set.
|
||||
Whereas the standard function stops as soon as it finds the complete
|
||||
match for "dog", the DFA function also finds the partial match for
|
||||
"dogsbody", and so returns that when PCRE2_PARTIAL_HARD is set.
|
||||
|
||||
|
||||
PARTIAL MATCHING AND WORD BOUNDARIES
|
||||
|
@ -4681,8 +4685,10 @@ VALIDITY OF UTF STRINGS
|
|||
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||
subjects are (by default) checked for validity on entry to the relevant
|
||||
functions. If an invalid UTF string is passed, an error return is
|
||||
given.
|
||||
functions. If an invalid UTF string is passed, a negative error code
|
||||
is returned. The code unit offset to the offending character can be
|
||||
extracted from the match data block by calling pcre2_get_startchar(),
|
||||
which is used for this purpose after a UTF error.
|
||||
|
||||
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||
|
|
|
@ -2091,12 +2091,12 @@ different to the value of \fIovector[0]\fP if the pattern contains the \eK
|
|||
escape sequence. After a partial match, however, this value is always the same
|
||||
as \fIovector[0]\fP because \eK does not affect the result of a partial match.
|
||||
.P
|
||||
The \fBstartchar\fP field is also used to return the offset of an invalid
|
||||
The \fBstartchar\fP field is also used to return the offset of an invalid
|
||||
UTF character when UTF checking fails. Details are given in the
|
||||
.\" HREF
|
||||
\fBpcre2unicode\fP
|
||||
.\"
|
||||
page.
|
||||
page.
|
||||
.
|
||||
.
|
||||
.\" HTML <a name="errorlist"></a>
|
||||
|
|
|
@ -169,9 +169,9 @@ the string "dog" matched against the ungreedy pattern shown above:
|
|||
.sp
|
||||
/dog(sbody)??/
|
||||
.sp
|
||||
Whereas the standard functions stop as soon as they find the complete match for
|
||||
"dog", the DFA functions also find the partial match for "dogsbody", and so
|
||||
return that when PCRE2_PARTIAL_HARD is set.
|
||||
Whereas the standard function stops as soon as it finds the complete match for
|
||||
"dog", the DFA function also finds the partial match for "dogsbody", and so
|
||||
returns that when PCRE2_PARTIAL_HARD is set.
|
||||
.
|
||||
.
|
||||
.SH "PARTIAL MATCHING AND WORD BOUNDARIES"
|
||||
|
|
|
@ -478,7 +478,7 @@ link sizes and different code unit widths. By using \fBbincode\fP, the same
|
|||
regression tests can be used in different environments.
|
||||
.P
|
||||
The \fBfullbincode\fP modifier, by contrast, \fIdoes\fP include length and
|
||||
offset values. This is used in a few special tests that run only for specific
|
||||
offset values. This is used in a few special tests that run only for specific
|
||||
code unit widths and link sizes, and is also useful for one-off tests.
|
||||
.P
|
||||
The \fBinfo\fP modifier requests information about the compiled pattern
|
||||
|
@ -503,7 +503,7 @@ some typical examples:
|
|||
.sp
|
||||
"Compile options" are those specified by modifiers; "overall options" have
|
||||
added options that are taken or deduced from the pattern. If both sets of
|
||||
options are the same, just a single "options" line is output; if there are no
|
||||
options are the same, just a single "options" line is output; if there are no
|
||||
options, the line is omitted. "First code unit" is where any match must start;
|
||||
if there is more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match. This is
|
||||
|
@ -646,7 +646,7 @@ be aborted.
|
|||
.SS "Using alternative character tables"
|
||||
.rs
|
||||
.sp
|
||||
The value specified for the \fB/tables\fP modifier must be one of the digits 0,
|
||||
The value specified for the \fB/tables\fP modifier must be one of the digits 0,
|
||||
1, or 2. It causes a specific set of built-in character tables to be passed to
|
||||
\fBpcre2_compile()\fP. This is used in the PCRE2 tests to check behaviour with
|
||||
different character tables. The digit specifies the tables as follows:
|
||||
|
@ -760,7 +760,7 @@ The effects of these modifiers are described in the following sections.
|
|||
.SS "Showing more text"
|
||||
.rs
|
||||
.sp
|
||||
The \fBaftertext\fP modifier requests that as well as outputting the part of
|
||||
The \fBaftertext\fP modifier requests that as well as outputting the part of
|
||||
the subject string that matched the entire pattern, \fBpcre2test\fP should in
|
||||
addition output the remainder of the subject string. This is useful for tests
|
||||
where the subject contains multiple copies of the same substring. The
|
||||
|
@ -783,7 +783,7 @@ underneath them. Here is an example:
|
|||
<<< >>>
|
||||
.sp
|
||||
This shows that the matched string is "abc", with the preceding and following
|
||||
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||
strings "pqr" and "xyz" having been consulted during the match (when processing
|
||||
the assertions).
|
||||
.P
|
||||
The \fBstartchar\fP modifier requests that the starting character for the match
|
||||
|
@ -873,7 +873,7 @@ If the subject line is successfully matched, the substrings extracted by the
|
|||
convenience functions are output with C, G, or L after the string number
|
||||
instead of a colon. This is in addition to the normal full list. The string
|
||||
length (that is, the return from the extraction function) is given in
|
||||
parentheses after each substring, followed by the name when the extraction was
|
||||
parentheses after each substring, followed by the name when the extraction was
|
||||
by name.
|
||||
.
|
||||
.
|
||||
|
@ -1102,7 +1102,7 @@ are output in sequence, like this:
|
|||
1: pp
|
||||
.sp
|
||||
"No match" is output only if the first match attempt fails. Here is an example
|
||||
of a failure message (the offset 4 that is specified by the \fBoffset\fP
|
||||
of a failure message (the offset 4 that is specified by the \fBoffset\fP
|
||||
modifier is past the end of the subject string):
|
||||
.sp
|
||||
re> /xyz/
|
||||
|
@ -1134,7 +1134,7 @@ longest matching string is always given first (and numbered zero). After a
|
|||
PCRE2_ERROR_PARTIAL return, the output is "Partial match:", followed by the
|
||||
partially matching substring. Note that this is the entire substring that was
|
||||
inspected during the partial match; it may include characters before the actual
|
||||
match start if a lookbehind assertion, \eb, or \eB was involved. (\eK is not
|
||||
match start if a lookbehind assertion, \eb, or \eB was involved. (\eK is not
|
||||
supported for DFA matching.)
|
||||
.P
|
||||
If global matching is requested, the search for further matches resumes
|
||||
|
|
|
@ -188,7 +188,7 @@ DESCRIPTION
|
|||
followed by any number of subject lines to be matched against that pat-
|
||||
tern. In between sets of test data, command lines that begin with a
|
||||
hash (#) character may appear. This file format, with some restric-
|
||||
tions, can also be processed by the perltest.pl script that is distrib-
|
||||
tions, can also be processed by the perltest.sh script that is distrib-
|
||||
uted with PCRE2 as a means of checking that the behaviour of PCRE2 and
|
||||
Perl is the same.
|
||||
|
||||
|
@ -232,11 +232,11 @@ COMMAND LINES
|
|||
#perltest
|
||||
|
||||
The appearance of this line causes all subsequent modifier settings to
|
||||
be checked for compatibility with the perltest.pl script, which is used
|
||||
be checked for compatibility with the perltest.sh script, which is used
|
||||
to confirm that Perl gives the same results as PCRE2. Also, apart from
|
||||
comment lines, none of the other command lines are permitted, because
|
||||
they and many of the modifiers are specific to pcre2test, and should
|
||||
not be used in test files that are also processed by perltest.pl. The
|
||||
not be used in test files that are also processed by perltest.sh. The
|
||||
#perltest command helps detect tests that are accidentally put in the
|
||||
wrong file.
|
||||
|
||||
|
@ -255,53 +255,52 @@ MODIFIER SYNTAX
|
|||
valid for one or the other only. Each modifier has a long name, for
|
||||
example "anchored", and some of them must be followed by an equals sign
|
||||
and a value, for example, "offset=12". Modifiers that do not take val-
|
||||
ues may be preceded by a minus sign to turn off a previous default set-
|
||||
ting.
|
||||
ues may be preceded by a minus sign to turn off a previous setting.
|
||||
|
||||
A few of the more common modifiers can also be specified as single let-
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
ters, for example "i" for "caseless". In documentation, following the
|
||||
Perl convention, these are written with a slash ("the /i modifier") for
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
clarity. Abbreviated modifiers must all be concatenated in the first
|
||||
item of a modifier list. If the first item is not recognized as a long
|
||||
modifier name, it is interpreted as a sequence of these abbreviations.
|
||||
For example:
|
||||
|
||||
/abc/ig,newline=cr,jit=3
|
||||
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
This is a pattern line whose modifier list starts with two one-letter
|
||||
modifiers (/i and /g). The lower-case abbreviated modifiers are the
|
||||
same as used in Perl.
|
||||
|
||||
|
||||
PATTERN SYNTAX
|
||||
|
||||
A pattern line must start with one of the following characters (common
|
||||
A pattern line must start with one of the following characters (common
|
||||
symbols, excluding pattern meta-characters):
|
||||
|
||||
/ ! " ' ` - = _ : ; , % & @ ~
|
||||
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
This is interpreted as the pattern's delimiter. A regular expression
|
||||
may be continued over several input lines, in which case the newline
|
||||
characters are included within it. It is possible to include the delim-
|
||||
iter within the pattern by escaping it with a backslash, for example
|
||||
|
||||
/abc\/def/
|
||||
|
||||
If you do this, the escape and the delimiter form part of the pattern,
|
||||
If you do this, the escape and the delimiter form part of the pattern,
|
||||
but since the delimiters are all non-alphanumeric, this does not affect
|
||||
its interpretation. If the terminating delimiter is immediately fol-
|
||||
its interpretation. If the terminating delimiter is immediately fol-
|
||||
lowed by a backslash, for example,
|
||||
|
||||
/abc/\
|
||||
|
||||
then a backslash is added to the end of the pattern. This is done to
|
||||
provide a way of testing the error condition that arises if a pattern
|
||||
then a backslash is added to the end of the pattern. This is done to
|
||||
provide a way of testing the error condition that arises if a pattern
|
||||
finishes with a backslash, because
|
||||
|
||||
/abc\/
|
||||
|
||||
is interpreted as the first line of a pattern that starts with "abc/",
|
||||
causing pcre2test to read the next line as a continuation of the regu-
|
||||
is interpreted as the first line of a pattern that starts with "abc/",
|
||||
causing pcre2test to read the next line as a continuation of the regu-
|
||||
lar expression.
|
||||
|
||||
A pattern can be followed by a modifier list (details below).
|
||||
|
@ -309,7 +308,7 @@ PATTERN SYNTAX
|
|||
|
||||
SUBJECT LINE SYNTAX
|
||||
|
||||
Before each subject line is passed to pcre2_match() or
|
||||
Before each subject line is passed to pcre2_match() or
|
||||
pcre2_dfa_match(), leading and trailing white space is removed, and the
|
||||
line is scanned for backslash escapes. The following provide a means of
|
||||
encoding non-printing characters in a visible way:
|
||||
|
@ -329,23 +328,23 @@ SUBJECT LINE SYNTAX
|
|||
\x{hh...} hexadecimal character (any number of hex digits)
|
||||
|
||||
The use of \x{hh...} is not dependent on the use of the utf modifier on
|
||||
the pattern. It is recognized always. There may be any number of hexa-
|
||||
decimal digits inside the braces; invalid values provoke error mes-
|
||||
the pattern. It is recognized always. There may be any number of hexa-
|
||||
decimal digits inside the braces; invalid values provoke error mes-
|
||||
sages.
|
||||
|
||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||
character in UTF-8 mode, generating more than one byte if the value is
|
||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||
Note that \xhh specifies one byte rather than one character in UTF-8
|
||||
mode; this makes it possible to construct invalid UTF-8 sequences for
|
||||
testing purposes. On the other hand, \x{hh} is interpreted as a UTF-8
|
||||
character in UTF-8 mode, generating more than one byte if the value is
|
||||
greater than 127. When testing the 8-bit library not in UTF-8 mode,
|
||||
\x{hh} generates one byte for values less than 256, and causes an error
|
||||
for greater values.
|
||||
|
||||
In UTF-16 mode, all 4-digit \x{hhhh} values are accepted. This makes it
|
||||
possible to construct invalid UTF-16 sequences for testing purposes.
|
||||
|
||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||
makes it possible to construct invalid UTF-32 sequences for testing
|
||||
In UTF-32 mode, all 4- to 8-digit \x{...} values are accepted. This
|
||||
makes it possible to construct invalid UTF-32 sequences for testing
|
||||
purposes.
|
||||
|
||||
There is a special backslash sequence that specifies replication of one
|
||||
|
@ -353,38 +352,38 @@ SUBJECT LINE SYNTAX
|
|||
|
||||
\[<characters>]{<count>}
|
||||
|
||||
This makes it possible to test long strings without having to provide
|
||||
This makes it possible to test long strings without having to provide
|
||||
them as part of the file. For example:
|
||||
|
||||
\[abc]{4}
|
||||
|
||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||
is converted to "abcabcabcabc". This feature does not support nesting.
|
||||
To include a closing square bracket in the characters, code it as \x5D.
|
||||
|
||||
A backslash followed by an equals sign marke the end of the subject
|
||||
A backslash followed by an equals sign marks the end of the subject
|
||||
string and the start of a modifier list. For example:
|
||||
|
||||
abc\=notbol,notempty
|
||||
|
||||
A backslash followed by any other non-alphanumeric character just
|
||||
A backslash followed by any other non-alphanumeric character just
|
||||
escapes that character. A backslash followed by anything else causes an
|
||||
error. However, if the very last character in the line is a backslash
|
||||
(and there is no modifier list), it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the
|
||||
error. However, if the very last character in the line is a backslash
|
||||
(and there is no modifier list), it is ignored. This gives a way of
|
||||
passing an empty line as data, since a real empty line terminates the
|
||||
data input.
|
||||
|
||||
|
||||
PATTERN MODIFIERS
|
||||
|
||||
There are three types of modifier that can appear in pattern lines, two
|
||||
of which may also be used in a #pattern command. A pattern's modifier
|
||||
of which may also be used in a #pattern command. A pattern's modifier
|
||||
list can add to or override default modifiers that were set by a previ-
|
||||
ous #pattern command.
|
||||
|
||||
Setting compilation options
|
||||
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcreapi for a descrip-
|
||||
The following modifiers set options for pcre2_compile(). The most com-
|
||||
mon ones have single-letter abbreviations. See pcre2api for a descrip-
|
||||
tion of their effects.
|
||||
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
|
@ -410,13 +409,13 @@ PATTERN MODIFIERS
|
|||
utf set PCRE2_UTF
|
||||
|
||||
As well as turning on the PCRE2_UTF option, the utf modifier causes all
|
||||
non-printing characters in output strings to be printed using the
|
||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||
non-printing characters in output strings to be printed using the
|
||||
\x{hh...} notation. Otherwise, those less than 0x100 are output in hex
|
||||
without the curly brackets.
|
||||
|
||||
Setting compilation controls
|
||||
|
||||
The following modifiers affect the compilation process or request
|
||||
The following modifiers affect the compilation process or request
|
||||
information about the pattern:
|
||||
|
||||
bsr=[anycrlf|unicode] specify \R handling
|
||||
|
@ -441,34 +440,34 @@ PATTERN MODIFIERS
|
|||
|
||||
Newline and \R handling
|
||||
|
||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||
"unicode", \R matches any Unicode newline sequence. The default is
|
||||
The bsr modifier specifies what \R in a pattern should match. If it is
|
||||
set to "anycrlf", \R matches CR, LF, or CRLF only. If it is set to
|
||||
"unicode", \R matches any Unicode newline sequence. The default is
|
||||
specified when PCRE2 is built, with the default default being Unicode.
|
||||
|
||||
The newline modifier specifies which characters are to be interpreted
|
||||
as newlines, both in the pattern and (by default) in subject lines. The
|
||||
type must be one of CR, LF, CRLF, ANYCRLF, or ANY.
|
||||
The newline modifier specifies which characters are to be interpreted
|
||||
as newlines, both in the pattern and in subject lines. The type must be
|
||||
one of CR, LF, CRLF, ANYCRLF, or ANY (in upper or lower case).
|
||||
|
||||
Information about a pattern
|
||||
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
The debug modifier is a shorthand for info,fullbincode, requesting all
|
||||
available information.
|
||||
|
||||
The bincode modifier causes a representation of the compiled code to be
|
||||
output after compilation. This information does not contain length and
|
||||
output after compilation. This information does not contain length and
|
||||
offset values, which ensures that the same output is generated for dif-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ferent internal link sizes and different code unit widths. By using
|
||||
bincode, the same regression tests can be used in different environ-
|
||||
ments.
|
||||
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
values. This is used in a few special tests and is also useful for one-
|
||||
off tests.
|
||||
The fullbincode modifier, by contrast, does include length and offset
|
||||
values. This is used in a few special tests that run only for specific
|
||||
code unit widths and link sizes, and is also useful for one-off tests.
|
||||
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
information is obtained from the pcre2_pattern_info() function. Here
|
||||
The info modifier requests information about the compiled pattern
|
||||
(whether it is anchored, has a fixed first character, and so on). The
|
||||
information is obtained from the pcre2_pattern_info() function. Here
|
||||
are some typical examples:
|
||||
|
||||
re> /(?i)(^a|^b)/m,info
|
||||
|
@ -486,14 +485,15 @@ PATTERN MODIFIERS
|
|||
Last code unit = 'c' (caseless)
|
||||
Subject length lower bound = 3
|
||||
|
||||
"Compile options" are those specified to the compile function; "overall
|
||||
options" have added options that are taken or deduced from the pattern.
|
||||
If both sets of options are the same, just a single "options" line is
|
||||
output. "First code unit" is where any match must start; if there is
|
||||
more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match.
|
||||
This is not necessarily the last character. These lines are omitted if
|
||||
no starting or ending code units are recorded.
|
||||
"Compile options" are those specified by modifiers; "overall options"
|
||||
have added options that are taken or deduced from the pattern. If both
|
||||
sets of options are the same, just a single "options" line is output;
|
||||
if there are no options, the line is omitted. "First code unit" is
|
||||
where any match must start; if there is more than one they are listed
|
||||
as "starting code units". "Last code unit" is the last literal code
|
||||
unit that must be present in any match. This is not necessarily the
|
||||
last character. These lines are omitted if no starting or ending code
|
||||
units are recorded.
|
||||
|
||||
Specifying a pattern in hex
|
||||
|
||||
|
@ -504,14 +504,14 @@ PATTERN MODIFIERS
|
|||
/ab 32 59/hex
|
||||
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero characters. By default, pcre2test passes patterns as zero-
|
||||
terminated strings to pcre2_compile(), giving the length as
|
||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal,
|
||||
the actual length of the pattern is passed.
|
||||
binary zero and other non-printing characters. By default, pcre2test
|
||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||
hexadecimal, the actual length of the pattern is passed.
|
||||
|
||||
JIT compilation
|
||||
|
||||
The /jit modifier may optionally be followed by and equals sign and a
|
||||
The /jit modifier may optionally be followed by an equals sign and a
|
||||
number in the range 0 to 7:
|
||||
|
||||
0 disable JIT
|
||||
|
@ -540,7 +540,7 @@ PATTERN MODIFIERS
|
|||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
the first output line after a match or non match when JIT-compiled code
|
||||
was actually used.
|
||||
was actually used in the match.
|
||||
|
||||
Setting a locale
|
||||
|
||||
|
@ -609,25 +609,26 @@ PATTERN MODIFIERS
|
|||
|
||||
Using alternative character tables
|
||||
|
||||
The /tables modifier must be followed by a single digit. It causes a
|
||||
specific set of built-in character tables to be passed to pcre2_com-
|
||||
pile(). This is used in the PCRE2 tests to check behaviour with differ-
|
||||
ent character tables. The digit specifies the tables as follows:
|
||||
The value specified for the /tables modifier must be one of the digits
|
||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||
haviour with different character tables. The digit specifies the tables
|
||||
as follows:
|
||||
|
||||
0 do not pass any special character tables
|
||||
1 the default ASCII tables, as distributed in
|
||||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
tables and a locale are mutually exclusive.
|
||||
|
||||
Setting certain match controls
|
||||
|
||||
The following modifiers are really subject modifiers, and are described
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
with that pattern. They do not affect the compilation process.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -639,7 +640,7 @@ PATTERN MODIFIERS
|
|||
replace=<string> specify a replacement string
|
||||
startchar show starting character when relevant
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
||||
|
||||
|
@ -650,7 +651,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting match options
|
||||
|
||||
The following modifiers set options for pcre2_match() or
|
||||
The following modifiers set options for pcre2_match() or
|
||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||
|
||||
anchored set PCRE2_ANCHORED
|
||||
|
@ -664,20 +665,20 @@ SUBJECT MODIFIERS
|
|||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
they appear frequently in tests.
|
||||
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
wrapper API to be used, the only option-setting modifiers that have any
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
Any other modifiers cause an error.
|
||||
|
||||
Setting match controls
|
||||
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -710,23 +711,23 @@ SUBJECT MODIFIERS
|
|||
|
||||
Showing more text
|
||||
|
||||
The aftertext modifier requests that as well as outputting the sub-
|
||||
string that matched the entire pattern, pcre2test should in addition
|
||||
output the remainder of the subject string. This is useful for tests
|
||||
where the subject contains multiple copies of the same substring. The
|
||||
allaftertext modifier requests the same action for captured substrings
|
||||
as well as the main matched substring. In each case the remainder is
|
||||
output on the following line with a plus character following the cap-
|
||||
ture number.
|
||||
The aftertext modifier requests that as well as outputting the part of
|
||||
the subject string that matched the entire pattern, pcre2test should in
|
||||
addition output the remainder of the subject string. This is useful for
|
||||
tests where the subject contains multiple copies of the same substring.
|
||||
The allaftertext modifier requests the same action for captured sub-
|
||||
strings as well as the main matched substring. In each case the remain-
|
||||
der is output on the following line with a plus character following the
|
||||
capture number.
|
||||
|
||||
The allusedtext modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown.
|
||||
This feature is not supported for JIT matching, and if requested with
|
||||
JIT it is ignored (with a warning message). Setting this modifier
|
||||
The allusedtext modifier requests that all the text that was consulted
|
||||
during a successful pattern match by the interpreter should be shown.
|
||||
This feature is not supported for JIT matching, and if requested with
|
||||
JIT it is ignored (with a warning message). Setting this modifier
|
||||
affects the output if there is a lookbehind at the start of a match, or
|
||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||
that precede or follow the start and end of the actual match are indi-
|
||||
cated in the output by '<' or '>' characters underneath them. Here is
|
||||
a lookahead at the end, or if \K is used in the pattern. Characters
|
||||
that precede or follow the start and end of the actual match are indi-
|
||||
cated in the output by '<' or '>' characters underneath them. Here is
|
||||
an example:
|
||||
|
||||
re> /(?<=pqr)abc(?=xyz)/
|
||||
|
@ -734,8 +735,9 @@ SUBJECT MODIFIERS
|
|||
0: pqrabcxyz
|
||||
<<< >>>
|
||||
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
following strings "pqr" and "xyz" also consulted during the match.
|
||||
This shows that the matched string is "abc", with the preceding and
|
||||
following strings "pqr" and "xyz" having been consulted during the
|
||||
match (when processing the assertions).
|
||||
|
||||
The startchar modifier requests that the starting character for the
|
||||
match be indicated, if it is different to the start of the matched
|
||||
|
@ -784,9 +786,9 @@ SUBJECT MODIFIERS
|
|||
difference between global and altglobal is that the former uses the
|
||||
start_offset argument to pcre2_match() or pcre2_dfa_match() to start
|
||||
searching at a new point within the entire string (which is what Perl
|
||||
does), whereas the latter passes over a shortened substring. This makes
|
||||
a difference to the matching process if the pattern begins with a look-
|
||||
behind assertion (including \b or \B).
|
||||
does), whereas the latter passes over a shortened subject. This makes a
|
||||
difference to the matching process if the pattern begins with a lookbe-
|
||||
hind assertion (including \b or \B).
|
||||
|
||||
If an empty string is matched, the next match is done with the
|
||||
PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search
|
||||
|
@ -796,7 +798,7 @@ SUBJECT MODIFIERS
|
|||
/g modifier or the split() function. Normally, the start offset is
|
||||
advanced by one character, but if the newline convention recognizes
|
||||
CRLF as a newline, and the current character is CR followed by LF, an
|
||||
advance of two is used.
|
||||
advance of two characters occurs.
|
||||
|
||||
Testing substring extraction functions
|
||||
|
||||
|
@ -807,9 +809,9 @@ SUBJECT MODIFIERS
|
|||
|
||||
abcd\=copy=1,copy=3,get=G1
|
||||
|
||||
If the #subject command is used to set default copy and get lists,
|
||||
these can be unset by specifying a negative number for numbered groups
|
||||
and an empty name for named groups.
|
||||
If the #subject command is used to set default copy and/or get lists,
|
||||
these can be unset by specifying a negative number to cancel all num-
|
||||
bered groups and an empty name to cancel all named groups.
|
||||
|
||||
The getall modifier tests pcre2_substring_list_get(), which extracts
|
||||
all captured substrings.
|
||||
|
@ -818,23 +820,24 @@ SUBJECT MODIFIERS
|
|||
by the convenience functions are output with C, G, or L after the
|
||||
string number instead of a colon. This is in addition to the normal
|
||||
full list. The string length (that is, the return from the extraction
|
||||
function) is given in parentheses after each substring.
|
||||
function) is given in parentheses after each substring, followed by the
|
||||
name when the extraction was by name.
|
||||
|
||||
Testing the substitution function
|
||||
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Unlike subject
|
||||
strings, pcre2test does not process replacement strings for escape
|
||||
If the replace modifier is set, the pcre2_substitute() function is
|
||||
called instead of one of the matching functions. Unlike subject
|
||||
strings, pcre2test does not process replacement strings for escape
|
||||
sequences. In UTF mode, a replacement string is checked to see if it is
|
||||
a valid UTF-8 string. If so, it is correctly converted to a UTF string
|
||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||
of the appropriate code unit width. If it is not a valid UTF-8 string,
|
||||
the individual code units are copied directly. This provides a means of
|
||||
passing an invalid UTF-8 string for testing purposes.
|
||||
|
||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
If the global modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
|
||||
pcre2_substitute(). After a successful substitution, the modified
|
||||
string is output, preceded by the number of replacements. This may be
|
||||
zero if there were no matches. Here is a simple example of a substitu-
|
||||
string is output, preceded by the number of replacements. This may be
|
||||
zero if there were no matches. Here is a simple example of a substitu-
|
||||
tion test:
|
||||
|
||||
/abc/replace=xxx
|
||||
|
@ -843,11 +846,11 @@ SUBJECT MODIFIERS
|
|||
=abc=abc=\=global
|
||||
2: =xxx=xxx=
|
||||
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||
test for buffer overflow, if the replacement string starts with a num-
|
||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||
the size of the output buffer, with the replacement string starting at
|
||||
Subject and replacement strings should be kept relatively short for
|
||||
substitution tests, as fixed-size buffers are used. To make it easy to
|
||||
test for buffer overflow, if the replacement string starts with a num-
|
||||
ber in square brackets, that number is passed to pcre2_substitute() as
|
||||
the size of the output buffer, with the replacement string starting at
|
||||
the next character. Here is an example that tests the edge case:
|
||||
|
||||
/abc/
|
||||
|
@ -857,125 +860,124 @@ SUBJECT MODIFIERS
|
|||
Failed: error -47: no more memory
|
||||
|
||||
A replacement string is ignored with POSIX and DFA matching. Specifying
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
partial matching provokes an error return ("bad option value") from
|
||||
pcre2_substitute().
|
||||
|
||||
Setting the JIT stack size
|
||||
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
The jitstack modifier provides a way of setting the maximum stack size
|
||||
that is used by the just-in-time optimization code. It is ignored if
|
||||
JIT optimization is not being used. The value is a number of kilobytes.
|
||||
Providing a stack that is larger than the default 32K is necessary only
|
||||
for very complicated patterns.
|
||||
|
||||
Setting match and recursion limits
|
||||
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
The match_limit and recursion_limit modifiers set the appropriate lim-
|
||||
its in the match context. These values are ignored when the find_limits
|
||||
modifier is specified.
|
||||
|
||||
Finding minimum limits
|
||||
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
If the find_limits modifier is present, pcre2test calls pcre2_match()
|
||||
several times, setting different values in the match context via
|
||||
pcre2_set_match_limit() and pcre2_set_recursion_limit() until it finds
|
||||
the minimum values for each parameter that allow pcre2_match() to com-
|
||||
plete without error.
|
||||
|
||||
If JIT is being used, only the match limit is relevant. If DFA matching
|
||||
is being used, neither limit is relevant, and this modifier is ignored
|
||||
is being used, neither limit is relevant, and this modifier is ignored
|
||||
(with a warning message).
|
||||
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
match_limit_recursion number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
The match_limit number is a measure of the amount of backtracking that
|
||||
takes place, and learning the minimum value can be instructive. For
|
||||
most simple matches, the number is quite small, but for patterns with
|
||||
very large numbers of matching possibilities, it can become large very
|
||||
quickly with increasing length of subject string. The
|
||||
recursion_limit number is a measure of how much stack (or, if
|
||||
PCRE2 is compiled with NO_RECURSE, how much heap) memory is needed to
|
||||
complete the match attempt.
|
||||
|
||||
Showing MARK names
|
||||
|
||||
|
||||
The mark modifier causes the names from backtracking control verbs that
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
are returned from calls to pcre2_match() to be displayed. If a mark is
|
||||
returned for a match, non-match, or partial match, pcre2test shows it.
|
||||
For a match, it is on a line by itself, tagged with "MK:". Otherwise,
|
||||
it is added to the non-match message.
|
||||
|
||||
Showing memory usage
|
||||
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
The memory modifier causes pcre2test to log all memory allocation and
|
||||
freeing calls that occur during a match operation.
|
||||
|
||||
Setting a starting offset
|
||||
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
The offset modifier sets an offset in the subject string at which
|
||||
matching starts. Its value is a number of code units, not characters.
|
||||
|
||||
Setting the size of the output vector
|
||||
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
The ovector modifier applies only to the subject line in which it
|
||||
appears, though of course it can also be used to set a default in a
|
||||
#subject command. It specifies the number of pairs of offsets that are
|
||||
available for storing matching information. The default is 15.
|
||||
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
A value of zero is useful when testing the POSIX API because it causes
|
||||
regexec() to be called with a NULL capture vector. When not testing the
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
POSIX API, a value of zero is used to cause pcre2_match_data_cre-
|
||||
ate_from_pattern() to be called, in order to create a match block of
|
||||
exactly the right size for the pattern. (It is not possible to create a
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
match block with a zero-length ovector; there is always at least one
|
||||
pair of offsets.)
|
||||
|
||||
Passing the subject as zero-terminated
|
||||
|
||||
By default, the subject string is passed to a native API matching func-
|
||||
tion with its correct length. In order to test the facility for passing
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
a zero-terminated string, the zero_terminate modifier is provided. It
|
||||
causes the length to be passed as PCRE2_ZERO_TERMINATED. (When matching
|
||||
via the POSIX interface, this modifier has no effect, as there is no
|
||||
via the POSIX interface, this modifier has no effect, as there is no
|
||||
facility for passing a length.)
|
||||
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
When testing pcre2_substitute(), this modifier also has the effect of
|
||||
passing the replacement string as zero-terminated.
|
||||
|
||||
|
||||
THE ALTERNATIVE MATCHING FUNCTION
|
||||
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
By default, pcre2test uses the standard PCRE2 matching function,
|
||||
pcre2_match() to match each subject line. PCRE2 also supports an alter-
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
native matching function, pcre2_dfa_match(), which operates in a dif-
|
||||
ferent way, and has some restrictions. The differences between the two
|
||||
functions are described in the pcre2matching documentation.
|
||||
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
If the dfa modifier is set, the alternative matching function is used.
|
||||
This function finds all possible matches at a given point in the sub-
|
||||
ject. If, however, the dfa_shortest modifier is set, processing stops
|
||||
after the first match is found. This is always the shortest possible
|
||||
match.
|
||||
|
||||
|
||||
DEFAULT OUTPUT FROM pcre2test
|
||||
|
||||
This section describes the output when the normal matching function,
|
||||
This section describes the output when the normal matching function,
|
||||
pcre2_match(), is being used.
|
||||
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
pattern. Otherwise, it outputs "No match" when the return is
|
||||
PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially
|
||||
matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that
|
||||
this is the entire substring that was inspected during the partial
|
||||
match; it may include characters before the actual match start if a
|
||||
When a match succeeds, pcre2test outputs the list of captured sub-
|
||||
strings, starting with number 0 for the string that matched the whole
|
||||
pattern. Otherwise, it outputs "No match" when the return is
|
||||
PCRE2_ERROR_NOMATCH, or "Partial match:" followed by the partially
|
||||
matching substring when the return is PCRE2_ERROR_PARTIAL. (Note that
|
||||
this is the entire substring that was inspected during the partial
|
||||
match; it may include characters before the actual match start if a
|
||||
lookbehind assertion, \K, \b, or \B was involved.)
|
||||
|
||||
For any other return, pcre2test outputs the PCRE2 negative error number
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the offset of the start of the failing character and the reason
|
||||
code are also output. Here is an example of an interactive pcre2test
|
||||
run.
|
||||
and a short descriptive phrase. If the error is a failed UTF string
|
||||
check, the code unit offset of the start of the failing character is
|
||||
also output. Here is an example of an interactive pcre2test run.
|
||||
|
||||
$ pcre2test
|
||||
PCRE2 version 9.00 2014-05-10
|
||||
|
@ -988,8 +990,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
No match
|
||||
|
||||
Unset capturing substrings that are not followed by one that is set are
|
||||
not returned by pcre2_match(), and are not shown by pcre2test. In the
|
||||
following example, there are two capturing substrings, but when the
|
||||
not shown by pcre2test unless the allcaptures modifier is specified. In
|
||||
the following example, there are two capturing substrings, but when the
|
||||
first data line is matched, the second, unset substring is not shown.
|
||||
An "internal" unset substring is shown as "<unset>", as for the second
|
||||
data line.
|
||||
|
@ -1028,8 +1030,8 @@ DEFAULT OUTPUT FROM pcre2test
|
|||
1: pp
|
||||
|
||||
"No match" is output only if the first match attempt fails. Here is an
|
||||
example of a failure message (the offset 4 that is specified by \>4 is
|
||||
past the end of the subject string):
|
||||
example of a failure message (the offset 4 that is specified by the
|
||||
offset modifier is past the end of the subject string):
|
||||
|
||||
re> /xyz/
|
||||
data> xyz\=offset=4
|
||||
|
@ -1053,13 +1055,13 @@ OUTPUT FROM THE ALTERNATIVE MATCHING FUNCTION
|
|||
1: tang
|
||||
2: tan
|
||||
|
||||
(Using the normal matching function on this data finds only "tang".)
|
||||
The longest matching string is always given first (and numbered zero).
|
||||
Using the normal matching function on this data finds only "tang". The
|
||||
longest matching string is always given first (and numbered zero).
|
||||
After a PCRE2_ERROR_PARTIAL return, the output is "Partial match:",
|
||||
followed by the partially matching substring. (Note that this is the
|
||||
followed by the partially matching substring. Note that this is the
|
||||
entire substring that was inspected during the partial match; it may
|
||||
include characters before the actual match start if a lookbehind asser-
|
||||
tion, \K, \b, or \B was involved.)
|
||||
tion, \b, or \B was involved. (\K is not supported for DFA matching.)
|
||||
|
||||
If global matching is requested, the search for further matches resumes
|
||||
at the end of the longest match. For example:
|
||||
|
@ -1183,5 +1185,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 14 November 2014
|
||||
Last updated: 23 November 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
|
|
|
@ -108,8 +108,8 @@ case-equivalent, and these are treated as such.
|
|||
.sp
|
||||
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
|
||||
are (by default) checked for validity on entry to the relevant functions.
|
||||
If an invalid UTF string is passed, an negative error code is returned. The
|
||||
code unit offset to the offending character can be extracted from the match
|
||||
If an invalid UTF string is passed, a negative error code is returned. The
|
||||
code unit offset to the offending character can be extracted from the match
|
||||
data block by calling \fBpcre2_get_startchar()\fP, which is used for this
|
||||
purpose after a UTF error.
|
||||
.P
|
||||
|
|
|
@ -18,10 +18,10 @@ to set the macro values. In this case, you do not have to set -DHAVE_CONFIG_H,
|
|||
but if you do, default values will be taken from config.h for non-boolean
|
||||
macros that are not defined on the command line.
|
||||
|
||||
Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be
|
||||
defined (conventionally to 1) for TRUE, and not defined at all for FALSE. All
|
||||
such macros are listed as a commented #undef in config.h.generic. Macros such
|
||||
as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
|
||||
Boolean macros such as HAVE_STDLIB_H and SUPPORT_PCRE2_8 should either be defined
|
||||
(conventionally to 1) for TRUE, or not defined at all for FALSE. All such
|
||||
macros are listed as a commented #undef in config.h.generic. Macros such as
|
||||
MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
|
||||
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
|
||||
|
||||
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
|
||||
|
@ -201,7 +201,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.00-DEV"
|
||||
#define PACKAGE_STRING "PCRE2 10.00-RC1"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
|
@ -210,7 +210,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.00-DEV"
|
||||
#define PACKAGE_VERSION "10.00-RC1"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
|
@ -288,7 +288,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
|||
/* #undef SUPPORT_VALGRIND */
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.00-DEV"
|
||||
#define VERSION "10.00-RC1"
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
|
|
@ -43,8 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 00
|
||||
#define PCRE2_PRERELEASE -DEV
|
||||
#define PCRE2_DATE 2014-99-99
|
||||
#define PCRE2_PRERELEASE -RC1
|
||||
#define PCRE2_DATE 2014-11-24
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
|
@ -125,8 +125,8 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
|
||||
|
||||
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED,
|
||||
PCRE2_NO_START_OPTIMIZE, and PCRE2_NO_UTF_CHECK can also be passed to these
|
||||
functions, so take care not to define synonyms by mistake. */
|
||||
and PCRE2_NO_UTF_CHECK can also be passed to these functions, so take care not
|
||||
to define synonyms by mistake. */
|
||||
|
||||
#define PCRE2_NOTBOL 0x00000001u
|
||||
#define PCRE2_NOTEOL 0x00000002u
|
||||
|
@ -140,6 +140,10 @@ functions, so take care not to define synonyms by mistake. */
|
|||
#define PCRE2_DFA_RESTART 0x00000040u
|
||||
#define PCRE2_DFA_SHORTEST 0x00000080u
|
||||
|
||||
/* This is an additional option for pcre2_substitute(). */
|
||||
|
||||
#define PCRE2_SUBSTITUTE_GLOBAL 0x00000100u
|
||||
|
||||
/* Newline and \R settings, for use in compile contexts. The newline values
|
||||
must be kept in step with values set in config.h and both sets must all be
|
||||
greater than zero. */
|
||||
|
@ -202,24 +206,25 @@ context functions. */
|
|||
#define PCRE2_ERROR_BADMODE (-32)
|
||||
#define PCRE2_ERROR_BADOFFSET (-33)
|
||||
#define PCRE2_ERROR_BADOPTION (-34)
|
||||
#define PCRE2_ERROR_BADUTFOFFSET (-35)
|
||||
#define PCRE2_ERROR_CALLOUT (-36) /* Never used by PCRE2 itself */
|
||||
#define PCRE2_ERROR_DFA_BADRESTART (-37)
|
||||
#define PCRE2_ERROR_DFA_RECURSE (-38)
|
||||
#define PCRE2_ERROR_DFA_UCOND (-39)
|
||||
#define PCRE2_ERROR_DFA_UITEM (-40)
|
||||
#define PCRE2_ERROR_DFA_WSSIZE (-41)
|
||||
#define PCRE2_ERROR_INTERNAL (-42)
|
||||
#define PCRE2_ERROR_JIT_BADOPTION (-43)
|
||||
#define PCRE2_ERROR_JIT_STACKLIMIT (-44)
|
||||
#define PCRE2_ERROR_MATCHLIMIT (-45)
|
||||
#define PCRE2_ERROR_NOMEMORY (-46)
|
||||
#define PCRE2_ERROR_NOSUBSTRING (-47)
|
||||
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-48)
|
||||
#define PCRE2_ERROR_NULL (-49)
|
||||
#define PCRE2_ERROR_RECURSELOOP (-50)
|
||||
#define PCRE2_ERROR_RECURSIONLIMIT (-51)
|
||||
#define PCRE2_ERROR_UNSET (-52)
|
||||
#define PCRE2_ERROR_BADREPLACEMENT (-35)
|
||||
#define PCRE2_ERROR_BADUTFOFFSET (-36)
|
||||
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
|
||||
#define PCRE2_ERROR_DFA_BADRESTART (-38)
|
||||
#define PCRE2_ERROR_DFA_RECURSE (-39)
|
||||
#define PCRE2_ERROR_DFA_UCOND (-40)
|
||||
#define PCRE2_ERROR_DFA_UITEM (-41)
|
||||
#define PCRE2_ERROR_DFA_WSSIZE (-42)
|
||||
#define PCRE2_ERROR_INTERNAL (-43)
|
||||
#define PCRE2_ERROR_JIT_BADOPTION (-44)
|
||||
#define PCRE2_ERROR_JIT_STACKLIMIT (-45)
|
||||
#define PCRE2_ERROR_MATCHLIMIT (-46)
|
||||
#define PCRE2_ERROR_NOMEMORY (-47)
|
||||
#define PCRE2_ERROR_NOSUBSTRING (-48)
|
||||
#define PCRE2_ERROR_NOUNIQUESUBSTRING (-49)
|
||||
#define PCRE2_ERROR_NULL (-50)
|
||||
#define PCRE2_ERROR_RECURSELOOP (-51)
|
||||
#define PCRE2_ERROR_RECURSIONLIMIT (-52)
|
||||
#define PCRE2_ERROR_UNSET (-53)
|
||||
|
||||
/* Request types for pcre2_pattern_info() */
|
||||
|
||||
|
@ -406,7 +411,8 @@ PCRE2_EXP_DECL \
|
|||
pcre2_match_data *pcre2_match_data_create(uint32_t, \
|
||||
pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL \
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *, \
|
||||
pcre2_match_data *pcre2_match_data_create_from_pattern(\
|
||||
const pcre2_code *, \
|
||||
pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \
|
||||
PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||
|
@ -447,19 +453,28 @@ PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \
|
|||
PCRE2_UCHAR ***, PCRE2_SIZE **);
|
||||
|
||||
|
||||
/* Convenience function for match + substitute. */
|
||||
|
||||
#define PCRE2_SUBSTITUTE_FUNCTION \
|
||||
PCRE2_EXP_DECL int pcre2_substitute(const pcre2_code *, \
|
||||
PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||
pcre2_match_data *, pcre2_match_context *, \
|
||||
PCRE2_SPTR, PCRE2_SIZE, PCRE2_UCHAR *, \
|
||||
PCRE2_SIZE *);
|
||||
|
||||
|
||||
/* Functions for JIT processing */
|
||||
|
||||
#define PCRE2_JIT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \
|
||||
PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \
|
||||
PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \
|
||||
pcre2_match_data *, pcre2_match_context *, \
|
||||
pcre2_jit_stack *); \
|
||||
PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *);\
|
||||
pcre2_match_data *, pcre2_match_context *); \
|
||||
PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL \
|
||||
pcre2_jit_stack *pcre2_jit_stack_create(pcre2_general_context *, \
|
||||
PCRE2_SIZE, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL void pcre2_jit_stack_assign(const pcre2_code *, \
|
||||
PCRE2_EXP_DECL void pcre2_jit_stack_assign(pcre2_match_context *, \
|
||||
pcre2_jit_callback, void *); \
|
||||
PCRE2_EXP_DECL void pcre2_jit_stack_free(pcre2_jit_stack *);
|
||||
|
||||
|
@ -551,6 +566,7 @@ pcre2_compile are called by application code. */
|
|||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||
#define pcre2_set_recursion_limit PCRE2_SUFFIX(pcre2_set_recursion_limit_)
|
||||
#define pcre2_set_recursion_memory_management PCRE2_SUFFIX(pcre2_set_recursion_memory_management_)
|
||||
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
|
||||
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
|
||||
#define pcre2_substring_copy_bynumber PCRE2_SUFFIX(pcre2_substring_copy_bynumber_)
|
||||
#define pcre2_substring_free PCRE2_SUFFIX(pcre2_substring_free_)
|
||||
|
@ -577,6 +593,7 @@ PCRE2_MATCH_CONTEXT_FUNCTIONS \
|
|||
PCRE2_COMPILE_FUNCTIONS \
|
||||
PCRE2_PATTERN_INFO_FUNCTIONS \
|
||||
PCRE2_MATCH_FUNCTIONS \
|
||||
PCRE2_SUBSTITUTE_FUNCTION \
|
||||
PCRE2_SUBSTRING_FUNCTIONS \
|
||||
PCRE2_JIT_FUNCTIONS \
|
||||
PCRE2_OTHER_FUNCTIONS
|
||||
|
|
|
@ -1570,13 +1570,13 @@ enum {
|
|||
|
||||
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
||||
definitions that follow must also be updated to match. There are also tables
|
||||
called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
|
||||
pcre_dfa_exec.c that must be updated. */
|
||||
called "opcode_possessify" in pcre2_compile.c and "coptable" and "poptable" in
|
||||
pcre2_dfa_exec.c that must be updated. */
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
for debugging, and some of them are only partial names. The macro is referenced
|
||||
only in pcre_printint.c, which fills out the full names in many cases (and in
|
||||
only in pcre2_printint.c, which fills out the full names in many cases (and in
|
||||
some cases doesn't actually use these names at all). */
|
||||
|
||||
#define OP_NAME_LIST \
|
||||
|
|
|
@ -5570,13 +5570,13 @@ else for (gmatched = 0;; gmatched++)
|
|||
fprintf(outfile, "Failed: error %d: ", capcount);
|
||||
PCRE2_GET_ERROR_MESSAGE(mlen, capcount, pbuffer);
|
||||
PCHARSV(CASTVAR(void *, pbuffer), 0, mlen, FALSE, outfile);
|
||||
if (capcount <= PCRE2_ERROR_UTF8_ERR1 &&
|
||||
if (capcount <= PCRE2_ERROR_UTF8_ERR1 &&
|
||||
capcount >= PCRE2_ERROR_UTF32_ERR2)
|
||||
{
|
||||
PCRE2_SIZE startchar;
|
||||
PCRE2_GET_STARTCHAR(startchar, match_data);
|
||||
fprintf(outfile, " at offset %ld", startchar);
|
||||
}
|
||||
}
|
||||
fprintf(outfile, "\n");
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue