Add pcre2_code_copy_with_tables().
This commit is contained in:
parent
43e541adda
commit
2aec84e37e
|
@ -181,6 +181,9 @@ wrong name.
|
||||||
|
|
||||||
27. In pcre2test, give some offset information for errors in hex patterns.
|
27. In pcre2test, give some offset information for errors in hex patterns.
|
||||||
|
|
||||||
|
28. Implemented pcre2_code_copy_with_tables(), and added pushtablescopy to
|
||||||
|
pcre2test for testing it.
|
||||||
|
|
||||||
|
|
||||||
Version 10.22 29-July-2016
|
Version 10.22 29-July-2016
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -25,6 +25,7 @@ dist_html_DATA = \
|
||||||
doc/html/pcre2.html \
|
doc/html/pcre2.html \
|
||||||
doc/html/pcre2_callout_enumerate.html \
|
doc/html/pcre2_callout_enumerate.html \
|
||||||
doc/html/pcre2_code_copy.html \
|
doc/html/pcre2_code_copy.html \
|
||||||
|
doc/html/pcre2_code_copy_with_tables.html \
|
||||||
doc/html/pcre2_code_free.html \
|
doc/html/pcre2_code_free.html \
|
||||||
doc/html/pcre2_compile.html \
|
doc/html/pcre2_compile.html \
|
||||||
doc/html/pcre2_compile_context_copy.html \
|
doc/html/pcre2_compile_context_copy.html \
|
||||||
|
@ -107,6 +108,7 @@ dist_man_MANS = \
|
||||||
doc/pcre2.3 \
|
doc/pcre2.3 \
|
||||||
doc/pcre2_callout_enumerate.3 \
|
doc/pcre2_callout_enumerate.3 \
|
||||||
doc/pcre2_code_copy.3 \
|
doc/pcre2_code_copy.3 \
|
||||||
|
doc/pcre2_code_copy_with_tables.3 \
|
||||||
doc/pcre2_code_free.3 \
|
doc/pcre2_code_free.3 \
|
||||||
doc/pcre2_compile.3 \
|
doc/pcre2_compile.3 \
|
||||||
doc/pcre2_compile_context_copy.3 \
|
doc/pcre2_compile_context_copy.3 \
|
||||||
|
|
|
@ -174,7 +174,11 @@ can skip ahead to the CMake section.
|
||||||
|
|
||||||
(11) If you want to use the pcre2grep command, compile and link
|
(11) If you want to use the pcre2grep command, compile and link
|
||||||
src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not
|
src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not
|
||||||
need the pcre2posix library).
|
need the pcre2posix library). If you have built the PCRE2 library with JIT
|
||||||
|
support by defining SUPPORT_JIT in src/config.h, you can also define
|
||||||
|
SUPPORT_PCRE2GREP_JIT, which causes pcre2grep to make use of JIT (unless
|
||||||
|
it is run with --no-jit). If you define SUPPORT_PCRE2GREP_JIT without
|
||||||
|
defining SUPPORT_JIT, pcre2grep does not try to make use of JIT.
|
||||||
|
|
||||||
|
|
||||||
STACK SIZE IN WINDOWS ENVIRONMENTS
|
STACK SIZE IN WINDOWS ENVIRONMENTS
|
||||||
|
@ -389,4 +393,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the
|
||||||
recommended download site.
|
recommended download site.
|
||||||
|
|
||||||
=============================
|
=============================
|
||||||
Last Updated: 16 July 2015
|
Last Updated: 13 October 2016
|
||||||
|
|
|
@ -44,7 +44,7 @@ wrappers.
|
||||||
|
|
||||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||||
man page). These can be found in a library called libpcre2posix. Note that this
|
man page). These can be found in a library called libpcre2-posix. Note that this
|
||||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||||
and does not give full access to all of PCRE2's facilities.
|
and does not give full access to all of PCRE2's facilities.
|
||||||
|
@ -58,8 +58,8 @@ renamed or pointed at by a link.
|
||||||
If you are using the POSIX interface to PCRE2 and there is already a POSIX
|
If you are using the POSIX interface to PCRE2 and there is already a POSIX
|
||||||
regex library installed on your system, as well as worrying about the regex.h
|
regex library installed on your system, as well as worrying about the regex.h
|
||||||
header file (as mentioned above), you must also take care when linking programs
|
header file (as mentioned above), you must also take care when linking programs
|
||||||
to ensure that they link with PCRE2's libpcre2posix library. Otherwise they may
|
to ensure that they link with PCRE2's libpcre2-posix library. Otherwise they
|
||||||
pick up the POSIX functions of the same name from the other library.
|
may pick up the POSIX functions of the same name from the other library.
|
||||||
|
|
||||||
One way of avoiding this confusion is to compile PCRE2 with the addition of
|
One way of avoiding this confusion is to compile PCRE2 with the addition of
|
||||||
-Dregcomp=PCRE2regcomp (and similarly for the other POSIX functions) to the
|
-Dregcomp=PCRE2regcomp (and similarly for the other POSIX functions) to the
|
||||||
|
@ -204,13 +204,6 @@ library. They are also documented in the pcre2build man page.
|
||||||
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||||
--enable-newline-is-any to the "configure" command, respectively.
|
--enable-newline-is-any to the "configure" command, respectively.
|
||||||
|
|
||||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
|
||||||
the standard tests will fail, because the lines in the test files end with
|
|
||||||
LF. Even if the files are edited to change the line endings, there are likely
|
|
||||||
to be some failures. With --enable-newline-is-anycrlf or
|
|
||||||
--enable-newline-is-any, many tests should succeed, but there may be some
|
|
||||||
failures.
|
|
||||||
|
|
||||||
. By default, the sequence \R in a pattern matches any Unicode line ending
|
. By default, the sequence \R in a pattern matches any Unicode line ending
|
||||||
sequence. This is independent of the option specifying what PCRE2 considers
|
sequence. This is independent of the option specifying what PCRE2 considers
|
||||||
to be the end of a line (see above). However, the caller of PCRE2 can
|
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||||
|
@ -253,13 +246,13 @@ library. They are also documented in the pcre2build man page.
|
||||||
sizes in the pcre2stack man page.
|
sizes in the pcre2stack man page.
|
||||||
|
|
||||||
. In the 8-bit library, the default maximum compiled pattern size is around
|
. In the 8-bit library, the default maximum compiled pattern size is around
|
||||||
64K. You can increase this by adding --with-link-size=3 to the "configure"
|
64K bytes. You can increase this by adding --with-link-size=3 to the
|
||||||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
"configure" command. PCRE2 then uses three bytes instead of two for offsets
|
||||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
to different parts of the compiled pattern. In the 16-bit library,
|
||||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
--with-link-size=3 is the same as --with-link-size=4, which (in both
|
||||||
offsets. Increasing the internal link size reduces performance in the 8-bit
|
libraries) uses four-byte offsets. Increasing the internal link size reduces
|
||||||
and 16-bit libraries. In the 32-bit library, the link size setting is
|
performance in the 8-bit and 16-bit libraries. In the 32-bit library, the
|
||||||
ignored, as 4-byte offsets are always used.
|
link size setting is ignored, as 4-byte offsets are always used.
|
||||||
|
|
||||||
. You can build PCRE2 so that its internal match() function that is called from
|
. You can build PCRE2 so that its internal match() function that is called from
|
||||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||||
|
@ -339,12 +332,23 @@ library. They are also documented in the pcre2build man page.
|
||||||
|
|
||||||
Of course, the relevant libraries must be installed on your system.
|
Of course, the relevant libraries must be installed on your system.
|
||||||
|
|
||||||
. The default size (in bytes) of the internal buffer used by pcre2grep can be
|
. The default starting size (in bytes) of the internal buffer used by pcre2grep
|
||||||
set by, for example:
|
can be set by, for example:
|
||||||
|
|
||||||
--with-pcre2grep-bufsize=51200
|
--with-pcre2grep-bufsize=51200
|
||||||
|
|
||||||
The value must be a plain integer. The default is 20480.
|
The value must be a plain integer. The default is 20480. The amount of memory
|
||||||
|
used by pcre2grep is actually three times this number, to allow for "before"
|
||||||
|
and "after" lines. If very long lines are encountered, the buffer is
|
||||||
|
automatically enlarged, up to a fixed maximum size.
|
||||||
|
|
||||||
|
. The default maximum size of pcre2grep's internal buffer can be set by, for
|
||||||
|
example:
|
||||||
|
|
||||||
|
--with-pcre2grep-max-bufsize=2097152
|
||||||
|
|
||||||
|
The default is either 1048576 or the value of --with-pcre2grep-bufsize,
|
||||||
|
whichever is the larger.
|
||||||
|
|
||||||
. It is possible to compile pcre2test so that it links with the libreadline
|
. It is possible to compile pcre2test so that it links with the libreadline
|
||||||
or libedit libraries, by specifying, respectively,
|
or libedit libraries, by specifying, respectively,
|
||||||
|
@ -369,6 +373,22 @@ library. They are also documented in the pcre2build man page.
|
||||||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||||
should fix it.
|
should fix it.
|
||||||
|
|
||||||
|
. There is a special option called --enable-fuzz-support for use by people who
|
||||||
|
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||||
|
library. If set, it causes an extra library called libpcre2-fuzzsupport.a to
|
||||||
|
be built, but not installed. This contains a single function called
|
||||||
|
LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the
|
||||||
|
length of the string. When called, this function tries to compile the string
|
||||||
|
as a pattern, and if that succeeds, to match it. This is done both with no
|
||||||
|
options and with some random option bits that are generated from the string.
|
||||||
|
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||||
|
be created. This is normally run under valgrind or used when PCRE2 is
|
||||||
|
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||||
|
outputs information about what it is doing. The input strings are specified by
|
||||||
|
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||||
|
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||||
|
file are the test string.
|
||||||
|
|
||||||
The "configure" script builds the following files for the basic C library:
|
The "configure" script builds the following files for the basic C library:
|
||||||
|
|
||||||
. Makefile the makefile that builds the library
|
. Makefile the makefile that builds the library
|
||||||
|
@ -543,7 +563,7 @@ script creates the .txt and HTML forms of the documentation from the man pages.
|
||||||
|
|
||||||
|
|
||||||
Testing PCRE2
|
Testing PCRE2
|
||||||
------------
|
-------------
|
||||||
|
|
||||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||||
There is another script called RunGrepTest that tests the pcre2grep command.
|
There is another script called RunGrepTest that tests the pcre2grep command.
|
||||||
|
@ -757,6 +777,7 @@ The distribution should contain the files listed below.
|
||||||
src/pcre2_xclass.c )
|
src/pcre2_xclass.c )
|
||||||
|
|
||||||
src/pcre2_printint.c debugging function that is used by pcre2test,
|
src/pcre2_printint.c debugging function that is used by pcre2test,
|
||||||
|
src/pcre2_fuzzsupport.c function for (optional) fuzzing support
|
||||||
|
|
||||||
src/config.h.in template for config.h, when built by "configure"
|
src/config.h.in template for config.h, when built by "configure"
|
||||||
src/pcre2.h.in template for pcre2.h when built by "configure"
|
src/pcre2.h.in template for pcre2.h when built by "configure"
|
||||||
|
@ -814,7 +835,7 @@ The distribution should contain the files listed below.
|
||||||
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config
|
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config
|
||||||
libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config
|
libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config
|
||||||
libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config
|
libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config
|
||||||
libpcre2posix.pc.in template for libpcre2posix.pc for pkg-config
|
libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config
|
||||||
ltmain.sh file used to build a libtool script
|
ltmain.sh file used to build a libtool script
|
||||||
missing ) common stub for a few missing GNU programs while
|
missing ) common stub for a few missing GNU programs while
|
||||||
) installing, generated by automake
|
) installing, generated by automake
|
||||||
|
@ -845,4 +866,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 01 April 2016
|
Last updated: 01 November 2016
|
||||||
|
|
|
@ -94,6 +94,9 @@ in the library.
|
||||||
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
||||||
<td> Copy a compiled pattern</td></tr>
|
<td> Copy a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
<tr><td><a href="pcre2_code_copy_with_tables.html">pcre2_code_copy_with_tables</a></td>
|
||||||
|
<td> Copy a compiled pattern and its character tables</td></tr>
|
||||||
|
|
||||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||||
<td> Free a compiled pattern</td></tr>
|
<td> Free a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
|
|
@ -28,8 +28,9 @@ DESCRIPTION
|
||||||
This function makes a copy of the memory used for a compiled pattern, excluding
|
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||||
any memory used by the JIT compiler. Without a subsequent call to
|
any memory used by the JIT compiler. Without a subsequent call to
|
||||||
<b>pcre2_jit_compile()</b>, the copy can be used only for non-JIT matching. The
|
<b>pcre2_jit_compile()</b>, the copy can be used only for non-JIT matching. The
|
||||||
yield of the function is NULL if <i>code</i> is NULL or if sufficient memory
|
pointer to the character tables is copied, not the tables themselves (see
|
||||||
cannot be obtained.
|
<b>pcre2_code_copy_with_tables()</b>). The yield of the function is NULL if
|
||||||
|
<i>code</i> is NULL or if sufficient memory cannot be obtained.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There is a complete description of the PCRE2 native API in the
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2_code_copy_with_tables specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2_code_copy_with_tables man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<br><b>
|
||||||
|
SYNOPSIS
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
<b>#include &lt;pcre2.h&gt;</b>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
|
DESCRIPTION
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||||
|
any memory used by the JIT compiler. Without a subsequent call to
|
||||||
|
<b>pcre2_jit_compile()</b>, the copy can be used only for non-JIT matching.
|
||||||
|
Unlike <b>pcre2_code_copy()</b>, a separate copy of the character tables is also
|
||||||
|
made, with the new code pointing to it. This memory will be automatically freed
|
||||||
|
when <b>pcre2_code_free()</b> is called. The yield of the function is NULL if
|
||||||
|
<i>code</i> is NULL or if sufficient memory cannot be obtained.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||||
|
page.
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -26,8 +26,11 @@ SYNOPSIS
|
||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
This function sets, in a compile context, the maximum length (in code units) of
|
This function sets, in a compile context, the maximum text length (in code
|
||||||
the pattern that can be compiled. The result is always zero.
|
units) of the pattern that can be compiled. The result is always zero. If a
|
||||||
|
longer pattern is passed to <b>pcre2_compile()</b> there is an immediate error
|
||||||
|
return. The default is effectively unlimited, being the largest value a
|
||||||
|
PCRE2_SIZE variable can hold.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There is a complete description of the PCRE2 native API in the
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
|
|
@ -294,6 +294,9 @@ document for an overview of all the PCRE2 documentation.
|
||||||
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
|
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
|
<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
|
||||||
<b> PCRE2_SIZE <i>bufflen</i>);</b>
|
<b> PCRE2_SIZE <i>bufflen</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -567,8 +570,9 @@ If JIT is being used, but the JIT compilation is not being done immediately,
|
||||||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||||
required. JIT compilation updates a pointer within the compiled code block, so
|
required. JIT compilation updates a pointer within the compiled code block, so
|
||||||
a thread must gain unique write access to the pointer before calling
|
a thread must gain unique write access to the pointer before calling
|
||||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> can be used
|
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> or
|
||||||
to obtain a private copy of the compiled code.
|
<b>pcre2_code_copy_with_tables()</b> can be used to obtain a private copy of the
|
||||||
|
compiled code.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Context blocks
|
Context blocks
|
||||||
|
@ -736,7 +740,8 @@ functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
||||||
<br>
|
<br>
|
||||||
This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
|
This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
|
||||||
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
||||||
using up too much system stack when being compiled.
|
using up too much system stack when being compiled. The limit applies to
|
||||||
|
parentheses of all kinds, not just capturing parentheses.
|
||||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||||
<br>
|
<br>
|
||||||
|
@ -1058,6 +1063,9 @@ zero.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
||||||
|
@ -1079,9 +1087,22 @@ if the code has been processed by the JIT compiler (see
|
||||||
<a href="#jitcompiling">below),</a>
|
<a href="#jitcompiling">below),</a>
|
||||||
the JIT information cannot be copied (because it is position-dependent).
|
the JIT information cannot be copied (because it is position-dependent).
|
||||||
The new copy can initially be used only for non-JIT matching, though it can be
|
The new copy can initially be used only for non-JIT matching, though it can be
|
||||||
passed to <b>pcre2_jit_compile()</b> if required. The <b>pcre2_code_copy()</b>
|
passed to <b>pcre2_jit_compile()</b> if required.
|
||||||
function provides a way for individual threads in a multithreaded application
|
</P>
|
||||||
to acquire a private copy of shared compiled code.
|
<P>
|
||||||
|
The <b>pcre2_code_copy()</b> function provides a way for individual threads in a
|
||||||
|
multithreaded application to acquire a private copy of shared compiled code.
|
||||||
|
However, it does not make a copy of the character tables used by the compiled
|
||||||
|
pattern; the new pattern code points to the same tables as the original code.
|
||||||
|
(See
|
||||||
|
<a href="#localesupport">"Locale Support"</a>
|
||||||
|
below for details of these character tables.) In many applications the same
|
||||||
|
tables are used throughout, so this behaviour is appropriate. Nevertheless,
|
||||||
|
there are occasions when a copy of a compiled pattern and the relevant tables
|
||||||
|
are needed. The <b>pcre2_code_copy_with_tables()</b> function provides this facility.
|
||||||
|
Copies of both the code and the tables are made, with the new code pointing to
|
||||||
|
the new tables. The memory for the new tables is automatically freed when
|
||||||
|
<b>pcre2_code_free()</b> is called for the new copy of the compiled code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
NOTE: When one of the matching functions is called, pointers to the compiled
|
NOTE: When one of the matching functions is called, pointers to the compiled
|
||||||
|
@ -1122,6 +1143,13 @@ error has occurred. The values are not defined when compilation is successful
|
||||||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||||
|
pattern the error occurred. It is not necessarily the furthest point in the
|
||||||
|
pattern that was read. For example, after the error "lookbehind assertion is
|
||||||
|
not fixed length", the error offset points to the start of the failing
|
||||||
|
assertion.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
The <b>pcre2_get_error_message()</b> function (see "Obtaining a textual error
|
The <b>pcre2_get_error_message()</b> function (see "Obtaining a textual error
|
||||||
message"
|
message"
|
||||||
<a href="#geterrormessage">below)</a>
|
<a href="#geterrormessage">below)</a>
|
||||||
|
@ -1215,8 +1243,8 @@ recognized, exactly as in the rest of the pattern.
|
||||||
PCRE2_AUTO_CALLOUT
|
PCRE2_AUTO_CALLOUT
|
||||||
</pre>
|
</pre>
|
||||||
If this bit is set, <b>pcre2_compile()</b> automatically inserts callout items,
|
If this bit is set, <b>pcre2_compile()</b> automatically inserts callout items,
|
||||||
all with number 255, before each pattern item. For discussion of the callout
|
all with number 255, before each pattern item, except immediately before or
|
||||||
facility, see the
|
after a callout in the pattern. For discussion of the callout facility, see the
|
||||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||||
documentation.
|
documentation.
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -3235,7 +3263,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC41" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC41" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 17 June 2016
|
Last updated: 22 November 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2016 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -34,9 +34,10 @@ please consult the man page, in case the conversion went wrong.
|
||||||
<li><a name="TOC19" href="#SEC19">INCLUDING DEBUGGING CODE</a>
|
<li><a name="TOC19" href="#SEC19">INCLUDING DEBUGGING CODE</a>
|
||||||
<li><a name="TOC20" href="#SEC20">DEBUGGING WITH VALGRIND SUPPORT</a>
|
<li><a name="TOC20" href="#SEC20">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||||
<li><a name="TOC21" href="#SEC21">CODE COVERAGE REPORTING</a>
|
<li><a name="TOC21" href="#SEC21">CODE COVERAGE REPORTING</a>
|
||||||
<li><a name="TOC22" href="#SEC22">SEE ALSO</a>
|
<li><a name="TOC22" href="#SEC22">SUPPORT FOR FUZZERS</a>
|
||||||
<li><a name="TOC23" href="#SEC23">AUTHOR</a>
|
<li><a name="TOC23" href="#SEC23">SEE ALSO</a>
|
||||||
<li><a name="TOC24" href="#SEC24">REVISION</a>
|
<li><a name="TOC24" href="#SEC24">AUTHOR</a>
|
||||||
|
<li><a name="TOC25" href="#SEC25">REVISION</a>
|
||||||
</ul>
|
</ul>
|
||||||
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -376,16 +377,19 @@ they are not.
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
||||||
scanning, in order to be able to output "before" and "after" lines when it
|
scanning, in order to be able to output "before" and "after" lines when it
|
||||||
finds a match. The size of the buffer is controlled by a parameter whose
|
finds a match. The starting size of the buffer is controlled by a parameter
|
||||||
default value is 20K. The buffer itself is three times this size, but because
|
whose default value is 20K. The buffer itself is three times this size, but
|
||||||
of the way it is used for holding "before" lines, the longest line that is
|
because of the way it is used for holding "before" lines, the longest line that
|
||||||
guaranteed to be processable is the parameter size. You can change the default
|
is guaranteed to be processable is the parameter size. If a longer line is
|
||||||
parameter value by adding, for example,
|
encountered, <b>pcre2grep</b> automatically expands the buffer, up to a
|
||||||
|
specified maximum size, whose default is 1M or the starting size, whichever is
|
||||||
|
the larger. You can change the default parameter values by adding, for example,
|
||||||
<pre>
|
<pre>
|
||||||
--with-pcre2grep-bufsize=50K
|
--with-pcre2grep-bufsize=51200
|
||||||
|
--with-pcre2grep-max-bufsize=2097152
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override this
|
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override
|
||||||
value by using --buffer-size on the command line.
|
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC18" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
<br><a name="SEC18" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -497,11 +501,32 @@ This cleans all coverage data including the generated coverage report. For more
|
||||||
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
||||||
documentation.
|
documentation.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC22" href="#TOC1">SEE ALSO</a><br>
|
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||||
|
<P>
|
||||||
|
There is a special option for use by people who want to run fuzzing tests on
|
||||||
|
PCRE2:
|
||||||
|
<pre>
|
||||||
|
--enable-fuzz-support
|
||||||
|
</pre>
|
||||||
|
At present this applies only to the 8-bit library. If set, it causes an extra
|
||||||
|
library called libpcre2-fuzzsupport.a to be built, but not installed. This
|
||||||
|
contains a single function called LLVMFuzzerTestOneInput() whose arguments are
|
||||||
|
a pointer to a string and the length of the string. When called, this function
|
||||||
|
tries to compile the string as a pattern, and if that succeeds, to match it.
|
||||||
|
This is done both with no options and with some random option bits that are
|
||||||
|
generated from the string. Setting --enable-fuzz-support also causes a binary
|
||||||
|
called <b>pcre2fuzzcheck</b> to be created. This is normally run under valgrind
|
||||||
|
or used when PCRE2 is compiled with address sanitizing enabled. It calls the
|
||||||
|
fuzzing function and outputs information about what it is doing. The input strings
|
||||||
|
are specified by arguments: if an argument starts with "=" the rest of it is a
|
||||||
|
literal input string. Otherwise, it is assumed to be a file name, and the
|
||||||
|
contents of the file are the test string.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
|
||||||
<P>
|
<P>
|
||||||
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC23" href="#TOC1">AUTHOR</a><br>
|
<br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
|
||||||
<P>
|
<P>
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
<br>
|
<br>
|
||||||
|
@ -510,9 +535,9 @@ University Computing Service
|
||||||
Cambridge, England.
|
Cambridge, England.
|
||||||
<br>
|
<br>
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC24" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC25" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 01 April 2016
|
Last updated: 01 November 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2016 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -57,11 +57,20 @@ two callout points:
|
||||||
</pre>
|
</pre>
|
||||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
||||||
automatically inserts callouts, all with number 255, before each item in the
|
automatically inserts callouts, all with number 255, before each item in the
|
||||||
pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
pattern except for immediately before or after a callout item in the pattern.
|
||||||
|
For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
||||||
|
<pre>
|
||||||
|
A(?C3)B
|
||||||
|
</pre>
|
||||||
|
it is processed as if it were
|
||||||
|
<pre>
|
||||||
|
(?C255)A(?C3)B(?C255)
|
||||||
|
</pre>
|
||||||
|
Here is a more complicated example:
|
||||||
<pre>
|
<pre>
|
||||||
A(\d{2}|--)
|
A(\d{2}|--)
|
||||||
</pre>
|
</pre>
|
||||||
it is processed as if it were
|
With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
||||||
|
@ -107,10 +116,10 @@ with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string
|
||||||
No match
|
No match
|
||||||
</pre>
|
</pre>
|
||||||
This indicates that when matching [bc] fails, there is no backtracking into a+
|
This indicates that when matching [bc] fails, there is no backtracking into a+
|
||||||
and therefore the callouts that would be taken for the backtracks do not occur.
|
(because it is being treated as a++) and therefore the callouts that would be
|
||||||
You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
taken for the backtracks do not occur. You can disable the auto-possessify
|
||||||
<b>pcre2_compile()</b>, or starting the pattern with (*NO_AUTO_POSSESS). In this
|
feature by passing PCRE2_NO_AUTO_POSSESS to <b>pcre2_compile()</b>, or starting
|
||||||
case, the output changes to this:
|
the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
||||||
<pre>
|
<pre>
|
||||||
--->aaaa
|
--->aaaa
|
||||||
+0 ^ a+
|
+0 ^ a+
|
||||||
|
@ -235,8 +244,8 @@ Fields for numerical callouts
|
||||||
<P>
|
<P>
|
||||||
For a numerical callout, <i>callout_string</i> is NULL, and <i>callout_number</i>
|
For a numerical callout, <i>callout_string</i> is NULL, and <i>callout_number</i>
|
||||||
contains the number of the callout, in the range 0-255. This is the number
|
contains the number of the callout, in the range 0-255. This is the number
|
||||||
that follows (?C for manual callouts; it is 255 for automatically generated
|
that follows (?C for callouts that are part of the pattern; it is 255 for
|
||||||
callouts.
|
automatically generated callouts.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Fields for string callouts
|
Fields for string callouts
|
||||||
|
@ -310,10 +319,15 @@ the next item to be matched.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>next_item_length</i> field contains the length of the next item to be
|
The <i>next_item_length</i> field contains the length of the next item to be
|
||||||
matched in the pattern string. When the callout immediately precedes an
|
processed in the pattern string. When the callout is at the end of the pattern,
|
||||||
alternation bar, a closing parenthesis, or the end of the pattern, the length
|
the length is zero. When the callout precedes an opening parenthesis, the
|
||||||
is zero. When the callout precedes an opening parenthesis, the length is that
|
length includes meta characters that follow the parenthesis. For example, in a
|
||||||
of the entire subpattern.
|
callout before an assertion such as (?=ab) the length is 3. For an
|
||||||
|
alternation bar or a closing parenthesis, the length is one, unless a closing
|
||||||
|
parenthesis is followed by a quantifier, in which case its length is included.
|
||||||
|
(This changed in release 10.23. In earlier releases, before an opening
|
||||||
|
parenthesis the length was that of the entire subpattern, and before an
|
||||||
|
alternation bar or a closing parenthesis the length was zero.)
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
||||||
|
@ -399,9 +413,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 23 March 2015
|
Last updated: 29 September 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -107,7 +107,7 @@ processed as anchored at the point where they are tested.
|
||||||
one that is backtracked onto acts. For example, in the pattern
|
one that is backtracked onto acts. For example, in the pattern
|
||||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||||
same as PCRE2, but there are examples where it differs.
|
same as PCRE2, but there are cases where it differs.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||||
|
@ -123,7 +123,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||||
13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern
|
13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern
|
||||||
names is not as general as Perl's. This is a consequence of the fact that PCRE2
|
names is not as general as Perl's. This is a consequence of the fact that PCRE2
|
||||||
works internally just with numbers, using an external table to translate
|
works internally just with numbers, using an external table to translate
|
||||||
between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b)B),
|
between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)),
|
||||||
where the two capturing parentheses have the same number but different names,
|
where the two capturing parentheses have the same number but different names,
|
||||||
is not supported, and causes an error at compile time. If it were allowed, it
|
is not supported, and causes an error at compile time. If it were allowed, it
|
||||||
would not be possible to distinguish which parentheses matched, because both
|
would not be possible to distinguish which parentheses matched, because both
|
||||||
|
@ -131,10 +131,11 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
|
||||||
an error is given at compile time.
|
an error is given at compile time.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
14. Perl recognizes comments in some places that PCRE2 does not, for example,
|
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||||
between the ( and ? at the start of a subpattern. If the /x modifier is set,
|
example, between the ( and ? at the start of a subpattern. If the /x modifier
|
||||||
Perl allows white space between ( and ? (though current Perls warn that this is
|
is set, Perl allowed white space between ( and ? though the latest Perls give
|
||||||
deprecated) but PCRE2 never does, even if the PCRE2_EXTENDED option is set.
|
an error (for a while it was just deprecated). There may still be some cases
|
||||||
|
where Perl behaves differently.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||||
|
@ -161,42 +162,47 @@ each alternative branch of a lookbehind assertion can match a different length
|
||||||
of string. Perl requires them all to have the same length.
|
of string. Perl requires them all to have the same length.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
|
(b) From PCRE2 10.23, back references to groups of fixed length are supported
|
||||||
|
in lookbehinds, provided that there is no possibility of referencing a
|
||||||
|
non-unique number or name. Perl does not support backreferences in lookbehinds.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
(c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
|
||||||
meta-character matches only at the very end of the string.
|
meta-character matches only at the very end of the string.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(c) A backslash followed by a letter with no special meaning is faulted. (Perl
|
(d) A backslash followed by a letter with no special meaning is faulted. (Perl
|
||||||
can be made to issue a warning.)
|
can be made to issue a warning.)
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
|
(e) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||||
inverted, that is, by default they are not greedy, but if followed by a
|
inverted, that is, by default they are not greedy, but if followed by a
|
||||||
question mark they are.
|
question mark they are.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
|
(f) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
|
||||||
only at the first matching position in the subject string.
|
only at the first matching position in the subject string.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
|
(g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
|
||||||
PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents.
|
PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
|
(h) The \R escape sequence can be restricted to match only CR, LF, or CRLF
|
||||||
by the PCRE2_BSR_ANYCRLF option.
|
by the PCRE2_BSR_ANYCRLF option.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(h) The callout facility is PCRE2-specific.
|
(i) The callout facility is PCRE2-specific.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(i) The partial matching facility is PCRE2-specific.
|
(j) The partial matching facility is PCRE2-specific.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(j) The alternative matching function (<b>pcre2_dfa_match()</b> matches in a
|
(k) The alternative matching function (<b>pcre2_dfa_match()</b>) matches in a
|
||||||
different way and is not Perl-compatible.
|
different way and is not Perl-compatible.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start of
|
(l) PCRE2 recognizes some special sequences such as (*CR) at the start of
|
||||||
a pattern that set overall options that cannot be changed within the pattern.
|
a pattern that set overall options that cannot be changed within the pattern.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
@ -214,9 +220,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 15 March 2015
|
Last updated: 18 October 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -80,11 +80,19 @@ span line boundaries. What defines a line boundary is controlled by the
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The amount of memory used for buffering files that are being scanned is
|
The amount of memory used for buffering files that are being scanned is
|
||||||
controlled by a parameter that can be set by the <b>--buffer-size</b> option.
|
controlled by parameters that can be set by the <b>--buffer-size</b> and
|
||||||
The default value for this parameter is specified when <b>pcre2grep</b> is
|
<b>--max-buffer-size</b> options. The first of these sets the size of buffer
|
||||||
built, with the default default being 20K. A block of memory three times this
|
that is obtained at the start of processing. If an input file contains very
|
||||||
size is used (to allow for buffering "before" and "after" lines). An error
|
long lines, a larger buffer may be needed; this is handled by automatically
|
||||||
occurs if a line overflows the buffer.
|
extending the buffer, up to the limit specified by <b>--max-buffer-size</b>. The
|
||||||
|
default values for these parameters are specified when <b>pcre2grep</b> is
|
||||||
|
built, with the default defaults being 20K and 1M respectively. An error occurs
|
||||||
|
if a line is too long and the buffer can no longer be expanded.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The block of memory that is actually used is three times the "buffer size", to
|
||||||
|
allow for buffering "before" and "after" lines. If the buffer size is too
|
||||||
|
small, fewer than requested "before" and "after" lines may be output.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
||||||
|
@ -155,12 +163,13 @@ processing of patterns and file names that start with hyphens.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-A</b> <i>number</i>, <b>--after-context=</b><i>number</i>
|
<b>-A</b> <i>number</i>, <b>--after-context=</b><i>number</i>
|
||||||
Output <i>number</i> lines of context after each matching line. If file names
|
Output up to <i>number</i> lines of context after each matching line. Fewer
|
||||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
lines are output if the next match or the end of the file is reached, or if the
|
||||||
colon for the context lines. A line containing "--" is output between each
|
processing buffer size has been set too small. If file names and/or line
|
||||||
group of lines, unless they are in fact contiguous in the input file. The value
|
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||||
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
context lines. A line containing "--" is output between each group of lines,
|
||||||
guarantees to have up to 8K of following text available for context output.
|
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||||
|
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-a</b>, <b>--text</b>
|
<b>-a</b>, <b>--text</b>
|
||||||
|
@ -169,12 +178,14 @@ Treat binary files as text. This is equivalent to
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||||
Output <i>number</i> lines of context before each matching line. If file names
|
Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
lines are output if the previous match or the start of the file is within
|
||||||
colon for the context lines. A line containing "--" is output between each
|
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||||
group of lines, unless they are in fact contiguous in the input file. The value
|
file names and/or line numbers are being output, a hyphen separator is used
|
||||||
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
instead of a colon for the context lines. A line containing "--" is output
|
||||||
guarantees to have up to 8K of preceding text available for context output.
|
between each group of lines, unless they are in fact contiguous in the input
|
||||||
|
file. The value of <i>number</i> is expected to be relatively small. When
|
||||||
|
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--binary-files=</b><i>word</i>
|
<b>--binary-files=</b><i>word</i>
|
||||||
|
@ -191,8 +202,9 @@ return code.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--buffer-size=</b><i>number</i>
|
<b>--buffer-size=</b><i>number</i>
|
||||||
Set the parameter that controls how much memory is used for buffering files
|
Set the parameter that controls how much memory is obtained at the start of
|
||||||
that are being scanned.
|
processing for buffering files that are being scanned. See also
|
||||||
|
<b>--max-buffer-size</b> below.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
|
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
|
||||||
|
@ -202,14 +214,16 @@ This is equivalent to setting both <b>-A</b> and <b>-B</b> to the same value.
|
||||||
<P>
|
<P>
|
||||||
<b>-c</b>, <b>--count</b>
|
<b>-c</b>, <b>--count</b>
|
||||||
Do not output lines from the files that are being scanned; instead output the
|
Do not output lines from the files that are being scanned; instead output the
|
||||||
number of matches (or non-matches if <b>-v</b> is used) that would otherwise
|
number of lines that would have been shown, either because they matched, or, if
|
||||||
have caused lines to be shown. By default, this count is the same as the number
|
<b>-v</b> is set, because they failed to match. By default, this count is
|
||||||
of suppressed lines, but if the <b>-M</b> (multiline) option is used (without
|
exactly the same as the number of lines that would have been output, but if the
|
||||||
<b>-v</b>), there may be more suppressed lines than the number of matches.
|
<b>-M</b> (multiline) option is used (without <b>-v</b>), there may be more
|
||||||
|
suppressed lines than the count (that is, the number of matches).
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
If no lines are selected, the number zero is output. If several files are
|
If no lines are selected, the number zero is output. If several files are
|
||||||
being scanned, a count is output for each of them. However, if the
|
being scanned, a count is output for each of them and the <b>-t</b> option can
|
||||||
|
be used to cause a total to be output at the end. However, if the
|
||||||
<b>--files-with-matches</b> option is also used, only those files whose counts
|
<b>--files-with-matches</b> option is also used, only those files whose counts
|
||||||
are greater than zero are listed. When <b>-c</b> is used, the <b>-A</b>,
|
are greater than zero are listed. When <b>-c</b> is used, the <b>-A</b>,
|
||||||
<b>-B</b>, and <b>-C</b> options are ignored.
|
<b>-B</b>, and <b>-C</b> options are ignored.
|
||||||
|
@ -232,11 +246,12 @@ just one, in order to colour them all.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
The colour that is used can be specified by setting the environment variable
|
The colour that is used can be specified by setting the environment variable
|
||||||
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
|
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. If neither of these is set,
|
||||||
string of two numbers, separated by a semicolon. They are copied directly into
|
<b>pcre2grep</b> looks for GREP_COLOUR or GREP_COLOR. The value of the variable
|
||||||
the control string for setting colour on a terminal, so it is your
|
should be a string of two numbers, separated by a semicolon. They are copied
|
||||||
responsibility to ensure that they make sense. If neither of the environment
|
directly into the control string for setting colour on a terminal, so it is
|
||||||
variables is set, the default is "1;31", which gives red.
|
your responsibility to ensure that they make sense. If neither of the
|
||||||
|
environment variables is set, the default is "1;31", which gives red.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
|
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
|
||||||
|
@ -321,18 +336,18 @@ files; it does not apply to patterns specified by any of the <b>--include</b> or
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
|
<b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
|
||||||
Read patterns from the file, one per line, and match them against
|
Read patterns from the file, one per line, and match them against each line of
|
||||||
each line of input. What constitutes a newline when reading the file is the
|
input. What constitutes a newline when reading the file is the operating
|
||||||
operating system's default. The <b>--newline</b> option has no effect on this
|
system's default. The <b>--newline</b> option has no effect on this option.
|
||||||
option. Trailing white space is removed from each line, and blank lines are
|
Trailing white space is removed from each line, and blank lines are ignored. An
|
||||||
ignored. An empty file contains no patterns and therefore matches nothing. See
|
empty file contains no patterns and therefore matches nothing. See also the
|
||||||
also the comments about multiple patterns versus a single pattern with
|
comments about multiple patterns versus a single pattern with alternatives in
|
||||||
alternatives in the description of <b>-e</b> above.
|
the description of <b>-e</b> above.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
If this option is given more than once, all the specified files are
|
If this option is given more than once, all the specified files are read. A
|
||||||
read. A data line is output if any of the patterns match it. A file name can
|
data line is output if any of the patterns match it. A file name can be given
|
||||||
be given as "-" to refer to the standard input. When <b>-f</b> is used, patterns
|
as "-" to refer to the standard input. When <b>-f</b> is used, patterns
|
||||||
specified on the command line using <b>-e</b> may also be present; they are
|
specified on the command line using <b>-e</b> may also be present; they are
|
||||||
tested before the file's patterns. However, no other pattern is taken from the
|
tested before the file's patterns. However, no other pattern is taken from the
|
||||||
command line; all arguments are treated as the names of paths to be searched.
|
command line; all arguments are treated as the names of paths to be searched.
|
||||||
|
@ -502,22 +517,24 @@ There are no short forms for these options. The default settings are specified
|
||||||
when the PCRE2 library is compiled, with the default default being 10 million.
|
when the PCRE2 library is compiled, with the default default being 10 million.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
<b>--max-buffer-size=</b><i>number</i>
|
||||||
|
This limits the expansion of the processing buffer, whose initial size can be
|
||||||
|
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||||
|
smaller than the starting buffer size.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
<b>-M</b>, <b>--multiline</b>
|
<b>-M</b>, <b>--multiline</b>
|
||||||
Allow patterns to match more than one line. When this option is given, patterns
|
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||||
may usefully contain literal newline characters and internal occurrences of ^
|
library is called in "multiline" mode. This allows a matched string to extend
|
||||||
and $ characters. The output for a successful match may consist of more than
|
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||||
one line. The first is the line in which the match started, and the last is the
|
used with <b>-M</b> may usefully contain literal newline characters and internal
|
||||||
line in which the match ended. If the matched string ends with a newline
|
occurrences of ^ and $ characters. The output for a successful match may
|
||||||
sequence the output ends at the end of that line.
|
consist of more than one line. The first line is the line in which the match
|
||||||
<br>
|
started, and the last line is the line in which the match ended. If the matched
|
||||||
<br>
|
string ends with a newline sequence, the output ends at the end of that line.
|
||||||
When this option is set, the PCRE2 library is called in "multiline" mode. This
|
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
||||||
allows a matched string to extend past the end of a line and continue on one or
|
match has been handled, scanning restarts at the beginning of the line after
|
||||||
more subsequent lines. However, <b>pcre2grep</b> still processes the input line
|
the one in which the match ended.
|
||||||
by line. Once a match has been handled, scanning restarts at the beginning of
|
|
||||||
the next line, just as it does when <b>-M</b> is not present. This means that it
|
|
||||||
is possible for the second or subsequent lines in a multiline match to be
|
|
||||||
output again as part of another match.
|
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
The newline sequence that separates multiple lines must be matched as part of
|
The newline sequence that separates multiple lines must be matched as part of
|
||||||
|
@ -533,11 +550,8 @@ well as possibly handling a two-character newline sequence.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
There is a limit to the number of lines that can be matched, imposed by the way
|
There is a limit to the number of lines that can be matched, imposed by the way
|
||||||
that <b>pcre2grep</b> buffers the input file as it scans it. However,
|
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||||
<b>pcre2grep</b> ensures that at least 8K characters or the rest of the file
|
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||||
(whichever is the shorter) are available for forward matching, and similarly
|
|
||||||
the previous 8K characters (or all the previous characters, if fewer than 8K)
|
|
||||||
are guaranteed to be available for lookbehind assertions. The <b>-M</b> option
|
|
||||||
does not work when input is read line by line (see <b>--line-buffered</b>).
|
does not work when input is read line by line (see <b>--line-buffered</b>).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -585,12 +599,13 @@ It should never be needed in normal use.
|
||||||
Show only the part of the line that matched a pattern instead of the whole
|
Show only the part of the line that matched a pattern instead of the whole
|
||||||
line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
|
line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
|
||||||
<b>-C</b> options are ignored. If there is more than one match in a line, each
|
<b>-C</b> options are ignored. If there is more than one match in a line, each
|
||||||
of them is shown separately. If <b>-o</b> is combined with <b>-v</b> (invert the
|
of them is shown separately, on a separate line of output. If <b>-o</b> is
|
||||||
sense of the match to find non-matching lines), no output is generated, but the
|
combined with <b>-v</b> (invert the sense of the match to find non-matching
|
||||||
return code is set appropriately. If the matched portion of the line is empty,
|
lines), no output is generated, but the return code is set appropriately. If
|
||||||
nothing is output unless the file name or line number are being printed, in
|
the matched portion of the line is empty, nothing is output unless the file
|
||||||
which case they are shown on an otherwise empty line. This option is mutually
|
name or line number are being printed, in which case they are shown on an
|
||||||
exclusive with <b>--file-offsets</b> and <b>--line-offsets</b>.
|
otherwise empty line. This option is mutually exclusive with
|
||||||
|
<b>--file-offsets</b> and <b>--line-offsets</b>.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||||
|
@ -604,10 +619,11 @@ capturing parentheses do not exist in the pattern, or were not set in the
|
||||||
match, nothing is output unless the file name or line number are being output.
|
match, nothing is output unless the file name or line number are being output.
|
||||||
<br>
|
<br>
|
||||||
<br>
|
<br>
|
||||||
If this option is given multiple times, multiple substrings are output, in the
|
If this option is given multiple times, multiple substrings are output for each
|
||||||
order the options are given. For example, -o3 -o1 -o3 causes the substrings
|
match, in the order the options are given, and all on one line. For example,
|
||||||
matched by capturing parentheses 3 and 1 and then 3 again to be output. By
|
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||||
default, there is no separator (but see the next option).
|
then 3 again to be output. By default, there is no separator (but see the next
|
||||||
|
option).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<b>--om-separator</b>=<i>text</i>
|
<b>--om-separator</b>=<i>text</i>
|
||||||
|
@ -638,6 +654,18 @@ quietly skipped. However, the return code is still 2, even if matches were
|
||||||
found in other files.
|
found in other files.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
<b>-t</b>, <b>--total-count</b>
|
||||||
|
This option is useful when scanning more than one file. If used on its own,
|
||||||
|
<b>-t</b> suppresses all output except for a grand total number of matching
|
||||||
|
lines (or non-matching lines if <b>-v</b> is used) in all the files. If <b>-t</b>
|
||||||
|
is used with <b>-c</b>, a grand total is output except when the previous output
|
||||||
|
is just one line. In other words, it is not output when just one file's count
|
||||||
|
is listed. If file names are being output, the grand total is preceded by
|
||||||
|
"TOTAL:". Otherwise, it appears as just another number. The <b>-t</b> option is
|
||||||
|
ignored when used with <b>-L</b> (list files without matches), because the grand
|
||||||
|
total would always be zero.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
<b>-u</b>, <b>--utf-8</b>
|
<b>-u</b>, <b>--utf-8</b>
|
||||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||||
|
@ -665,11 +693,12 @@ specified by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||||
<P>
|
<P>
|
||||||
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
|
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
|
||||||
Force the patterns to be anchored (each must start matching at the beginning of
|
Force the patterns to be anchored (each must start matching at the beginning of
|
||||||
a line) and in addition, require them to match entire lines. This is equivalent
|
a line) and in addition, require them to match entire lines. In multiline mode
|
||||||
to having ^ and $ characters at the start and end of each alternative top-level
|
the match may be more than one line. This is equivalent to having \A and \Z
|
||||||
branch in every pattern. This option applies only to the patterns that are
|
characters at the start and end of each alternative top-level branch in every
|
||||||
matched against the contents of files; it does not apply to patterns specified
|
pattern. This option applies only to the patterns that are matched against the
|
||||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
contents of files; it does not apply to patterns specified by any of the
|
||||||
|
<b>--include</b> or <b>--exclude</b> options.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC6" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
<br><a name="SEC6" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -831,7 +860,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 19 June 2016
|
Last updated: 31 October 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2016 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -61,14 +61,10 @@ The maximum length of a lookbehind assertion is 65535 characters.
|
||||||
There is no limit to the number of parenthesized subpatterns, but there can be
|
There is no limit to the number of parenthesized subpatterns, but there can be
|
||||||
no more than 65535 capturing subpatterns. There is, however, a limit to the
|
no more than 65535 capturing subpatterns. There is, however, a limit to the
|
||||||
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
|
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
|
||||||
order to limit the amount of system stack used at compile time. The limit can
|
order to limit the amount of system stack used at compile time. The default
|
||||||
be specified when PCRE2 is built; the default is 250.
|
limit can be specified when PCRE2 is built; if not, the default is 250. An
|
||||||
</P>
|
application can change this limit by calling pcre2_set_parens_nest_limit() to
|
||||||
<P>
|
set the limit in a compile context.
|
||||||
There is a limit to the number of forward references to subsequent subpatterns
|
|
||||||
of around 200,000. Repeated forward references with fixed upper limits, for
|
|
||||||
example, (?2){0,100} when subpattern number 2 is to the right, are included in
|
|
||||||
the count. There is no limit to the number of backward references.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The maximum length of name for a named subpattern is 32 code units, and the
|
The maximum length of name for a named subpattern is 32 code units, and the
|
||||||
|
@ -76,7 +72,12 @@ maximum number of named subpatterns is 10000.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb
|
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb
|
||||||
is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries.
|
is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
||||||
|
32-bit libraries.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The maximum length of a string argument to a callout is the largest number a
|
||||||
|
32-bit unsigned integer can hold.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
@ -93,9 +94,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 05 November 2015
|
Last updated: 26 October 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -379,32 +379,31 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex
|
||||||
40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is 5A),
|
40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is 5A),
|
||||||
but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If the
|
but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If the
|
||||||
code unit following \c has a value less than 32 or greater than 126, a
|
code unit following \c has a value less than 32 or greater than 126, a
|
||||||
compile-time error occurs. This locks out non-printable ASCII characters in all
|
compile-time error occurs.
|
||||||
modes.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t
|
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t
|
||||||
generate the appropriate EBCDIC code values. The \c escape is processed
|
generate the appropriate EBCDIC code values. The \c escape is processed
|
||||||
as specified for Perl in the <b>perlebcdic</b> document. The only characters
|
as specified for Perl in the <b>perlebcdic</b> document. The only characters
|
||||||
that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any
|
that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any
|
||||||
other character provokes a compile-time error. The sequence \@ encodes
|
other character provokes a compile-time error. The sequence \c@ encodes
|
||||||
character code 0; the letters (in either case) encode characters 1-26 (hex 01
|
character code 0; after \c the letters (in either case) encode characters 1-26
|
||||||
to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and
|
(hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex
|
||||||
\? becomes either 255 (hex FF) or 95 (hex 5F).
|
1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Thus, apart from \?, these escapes generate the same character code values as
|
Thus, apart from \c?, these escapes generate the same character code values as
|
||||||
they do in an ASCII environment, though the meanings of the values mostly
|
they do in an ASCII environment, though the meanings of the values mostly
|
||||||
differ. For example, \G always generates code value 7, which is BEL in ASCII
|
differ. For example, \cG always generates code value 7, which is BEL in ASCII
|
||||||
but DEL in EBCDIC.
|
but DEL in EBCDIC.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The sequence \? generates DEL (127, hex 7F) in an ASCII environment, but
|
The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but
|
||||||
because 127 is not a control character in EBCDIC, Perl makes it generate the
|
because 127 is not a control character in EBCDIC, Perl makes it generate the
|
||||||
APC character. Unfortunately, there are several variants of EBCDIC. In most of
|
APC character. Unfortunately, there are several variants of EBCDIC. In most of
|
||||||
them the APC character has the value 255 (hex FF), but in the one Perl calls
|
them the APC character has the value 255 (hex FF), but in the one Perl calls
|
||||||
POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC
|
POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC
|
||||||
values, PCRE2 makes \? generate 95; otherwise it generates 255.
|
values, PCRE2 makes \c? generate 95; otherwise it generates 255.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
After \0 up to two further octal digits are read. If there are fewer than two
|
After \0 up to two further octal digits are read. If there are fewer than two
|
||||||
|
@ -526,9 +525,9 @@ by code point, as described in the previous section.
|
||||||
Absolute and relative back references
|
Absolute and relative back references
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The sequence \g followed by an unsigned or a negative number, optionally
|
The sequence \g followed by a signed or unsigned number, optionally enclosed
|
||||||
enclosed in braces, is an absolute or relative back reference. A named back
|
in braces, is an absolute or relative back reference. A named back reference
|
||||||
reference can be coded as \g{name}. Back references are discussed
|
can be coded as \g{name}. Back references are discussed
|
||||||
<a href="#backreferences">later,</a>
|
<a href="#backreferences">later,</a>
|
||||||
following the discussion of
|
following the discussion of
|
||||||
<a href="#subpattern">parenthesized subpatterns.</a>
|
<a href="#subpattern">parenthesized subpatterns.</a>
|
||||||
|
@ -1326,13 +1325,32 @@ whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A
|
||||||
class such as [^a] always matches one of these characters.
|
class such as [^a] always matches one of these characters.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v,
|
||||||
|
\V, \w, and \W may appear in a character class, and add the characters that
|
||||||
|
they match to the class. For example, [\dABCDEF] matches any hexadecimal
|
||||||
|
digit. In UTF modes, the PCRE2_UCP option affects the meanings of \d, \s, \w
|
||||||
|
and their upper case partners, just as it does when they appear outside a
|
||||||
|
character class, as described in the section entitled
|
||||||
|
<a href="#genericchartypes">"Generic character types"</a>
|
||||||
|
above. The escape sequence \b has a different meaning inside a character
|
||||||
|
class; it matches the backspace character. The sequences \B, \N, \R, and \X
|
||||||
|
are not special inside a character class. Like any other unrecognized escape
|
||||||
|
sequences, they cause an error.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
The minus (hyphen) character can be used to specify a range of characters in a
|
The minus (hyphen) character can be used to specify a range of characters in a
|
||||||
character class. For example, [d-m] matches any letter between d and m,
|
character class. For example, [d-m] matches any letter between d and m,
|
||||||
inclusive. If a minus character is required in a class, it must be escaped with
|
inclusive. If a minus character is required in a class, it must be escaped with
|
||||||
a backslash or appear in a position where it cannot be interpreted as
|
a backslash or appear in a position where it cannot be interpreted as
|
||||||
indicating a range, typically as the first or last character in the class, or
|
indicating a range, typically as the first or last character in the class,
|
||||||
immediately after a range. For example, [b-d-z] matches letters in the range b
|
or immediately after a range. For example, [b-d-z] matches letters in the range
|
||||||
to d, a hyphen character, or z.
|
b to d, a hyphen character, or z.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Perl treats a hyphen as a literal if it appears before a POSIX class (see
|
||||||
|
below) or a character type escape such as \d, but gives a warning in its
|
||||||
|
warning mode, as this is most likely a user error. As PCRE2 has no facility for
|
||||||
|
warning, an error is given in these cases.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
It is not possible to have the literal character "]" as the end character of a
|
It is not possible to have the literal character "]" as the end character of a
|
||||||
|
@ -1344,12 +1362,6 @@ followed by two other characters. The octal or hexadecimal representation of
|
||||||
"]" can also be used to end a range.
|
"]" can also be used to end a range.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
An error is generated if a POSIX character class (see below) or an escape
|
|
||||||
sequence other than one that defines a single character appears at a point
|
|
||||||
where a range ending character is expected. For example, [z-\xff] is valid,
|
|
||||||
but [A-\d] and [A-[:digit:]] are not.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
Ranges normally include all code points between the start and end characters,
|
Ranges normally include all code points between the start and end characters,
|
||||||
inclusive. They can also be used for code points specified numerically, for
|
inclusive. They can also be used for code points specified numerically, for
|
||||||
example [\000-\037]. Ranges can include any characters that are valid for the
|
example [\000-\037]. Ranges can include any characters that are valid for the
|
||||||
|
@ -1372,19 +1384,6 @@ tables for a French locale are in use, [\xc8-\xcb] matches accented E
|
||||||
characters in both cases.
|
characters in both cases.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v,
|
|
||||||
\V, \w, and \W may appear in a character class, and add the characters that
|
|
||||||
they match to the class. For example, [\dABCDEF] matches any hexadecimal
|
|
||||||
digit. In UTF modes, the PCRE2_UCP option affects the meanings of \d, \s, \w
|
|
||||||
and their upper case partners, just as it does when they appear outside a
|
|
||||||
character class, as described in the section entitled
|
|
||||||
<a href="#genericchartypes">"Generic character types"</a>
|
|
||||||
above. The escape sequence \b has a different meaning inside a character
|
|
||||||
class; it matches the backspace character. The sequences \B, \N, \R, and \X
|
|
||||||
are not special inside a character class. Like any other unrecognized escape
|
|
||||||
sequences, they cause an error.
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
A circumflex can conveniently be used with the upper case character types to
|
A circumflex can conveniently be used with the upper case character types to
|
||||||
specify a more restricted set of characters than the matching lower case type.
|
specify a more restricted set of characters than the matching lower case type.
|
||||||
For example, the class [^\W_] matches any letter or digit, but not underscore,
|
For example, the class [^\W_] matches any letter or digit, but not underscore,
|
||||||
|
@ -1552,13 +1551,8 @@ respectively.
|
||||||
<P>
|
<P>
|
||||||
When one of these option changes occurs at top level (that is, not inside
|
When one of these option changes occurs at top level (that is, not inside
|
||||||
subpattern parentheses), the change applies to the remainder of the pattern
|
subpattern parentheses), the change applies to the remainder of the pattern
|
||||||
that follows. If the change is placed right at the start of a pattern, PCRE2
|
that follows. An option change within a subpattern (see below for a description
|
||||||
extracts it into the global options (and it will therefore show up in data
|
of subpatterns) affects only that part of the subpattern that follows it, so
|
||||||
extracted by the <b>pcre2_pattern_info()</b> function).
|
|
||||||
</P>
|
|
||||||
<P>
|
|
||||||
An option change within a subpattern (see below for a description of
|
|
||||||
subpatterns) affects only that part of the subpattern that follows it, so
|
|
||||||
<pre>
|
<pre>
|
||||||
(a(?i)b)c
|
(a(?i)b)c
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2093,9 +2087,9 @@ subpattern is possible using named parentheses (see below).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Another way of avoiding the ambiguity inherent in the use of digits following a
|
Another way of avoiding the ambiguity inherent in the use of digits following a
|
||||||
backslash is to use the \g escape sequence. This escape must be followed by an
|
backslash is to use the \g escape sequence. This escape must be followed by a
|
||||||
unsigned number or a negative number, optionally enclosed in braces. These
|
signed or unsigned number, optionally enclosed in braces. These examples are
|
||||||
examples are all identical:
|
all identical:
|
||||||
<pre>
|
<pre>
|
||||||
(ring), \1
|
(ring), \1
|
||||||
(ring), \g1
|
(ring), \g1
|
||||||
|
@ -2103,8 +2097,7 @@ examples are all identical:
|
||||||
</pre>
|
</pre>
|
||||||
An unsigned number specifies an absolute reference without the ambiguity that
|
An unsigned number specifies an absolute reference without the ambiguity that
|
||||||
is present in the older syntax. It is also useful when literal digits follow
|
is present in the older syntax. It is also useful when literal digits follow
|
||||||
the reference. A negative number is a relative reference. Consider this
|
the reference. A signed number is a relative reference. Consider this example:
|
||||||
example:
|
|
||||||
<pre>
|
<pre>
|
||||||
(abc(def)ghi)\g{-1}
|
(abc(def)ghi)\g{-1}
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -2115,6 +2108,11 @@ can be helpful in long patterns, and also in patterns that are created by
|
||||||
joining together fragments that contain references within themselves.
|
joining together fragments that contain references within themselves.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The sequence \g{+1} is a reference to the next capturing subpattern. This kind
|
||||||
|
of forward reference can be useful in patterns that repeat. Perl does not
|
||||||
|
support the use of + in this way.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
A back reference matches whatever actually matched the capturing subpattern in
|
A back reference matches whatever actually matched the capturing subpattern in
|
||||||
the current subject string, rather than anything matching the subpattern
|
the current subject string, rather than anything matching the subpattern
|
||||||
itself (see
|
itself (see
|
||||||
|
@ -2214,6 +2212,14 @@ capturing is carried out only for positive assertions. (Perl sometimes, but not
|
||||||
always, does do capturing in negative assertions.)
|
always, does do capturing in negative assertions.)
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
WARNING: If a positive assertion containing one or more capturing subpatterns
|
||||||
|
succeeds, but failure to match later in the pattern causes backtracking over
|
||||||
|
this assertion, the captures within the assertion are reset only if no higher
|
||||||
|
numbered captures are already set. This is, unfortunately, a fundamental
|
||||||
|
limitation of the current implementation; it may get removed in a future
|
||||||
|
reworking.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
For compatibility with Perl, most assertion subpatterns may be repeated; though
|
For compatibility with Perl, most assertion subpatterns may be repeated; though
|
||||||
it makes no sense to assert the same thing several times, the side effect of
|
it makes no sense to assert the same thing several times, the side effect of
|
||||||
capturing parentheses may occasionally be useful. However, an assertion that
|
capturing parentheses may occasionally be useful. However, an assertion that
|
||||||
|
@ -2310,18 +2316,31 @@ match. If there are insufficient characters before the current position, the
|
||||||
assertion fails.
|
assertion fails.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
In a UTF mode, PCRE2 does not allow the \C escape (which matches a single code
|
In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which matches a
|
||||||
unit even in a UTF mode) to appear in lookbehind assertions, because it makes
|
single code unit even in a UTF mode) to appear in lookbehind assertions,
|
||||||
it impossible to calculate the length of the lookbehind. The \X and \R
|
because it makes it impossible to calculate the length of the lookbehind. The
|
||||||
escapes, which can match different numbers of code units, are also not
|
\X and \R escapes, which can match different numbers of code units, are never
|
||||||
permitted.
|
permitted in lookbehinds.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
<a href="#subpatternsassubroutines">"Subroutine"</a>
|
<a href="#subpatternsassubroutines">"Subroutine"</a>
|
||||||
calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long
|
calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long
|
||||||
as the subpattern matches a fixed-length string.
|
as the subpattern matches a fixed-length string. However,
|
||||||
<a href="#recursion">Recursion,</a>
|
<a href="#recursion">recursion,</a>
|
||||||
however, is not supported.
|
that is, a "subroutine" call into a group that is already active,
|
||||||
|
is not supported.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Perl does not support back references in lookbehinds. PCRE2 does support them,
|
||||||
|
but only if certain conditions are met. The PCRE2_MATCH_UNSET_BACKREF option
|
||||||
|
must not be set, there must be no use of (?| in the pattern (it creates
|
||||||
|
duplicate subpattern numbers), and if the back reference is by name, the name
|
||||||
|
must be unique. Of course, the referenced subpattern must itself be of fixed
|
||||||
|
length. The following pattern matches words containing at least two characters
|
||||||
|
that begin and end with the same character:
|
||||||
|
<pre>
|
||||||
|
\b(\w)\w++(?<=\1)
|
||||||
|
</PRE>
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Possessive quantifiers can be used in conjunction with lookbehind assertions to
|
Possessive quantifiers can be used in conjunction with lookbehind assertions to
|
||||||
|
@ -2459,7 +2478,9 @@ Checking for a used subpattern by name
|
||||||
<P>
|
<P>
|
||||||
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
|
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
|
||||||
subpattern by name. For compatibility with earlier versions of PCRE1, which had
|
subpattern by name. For compatibility with earlier versions of PCRE1, which had
|
||||||
this facility before Perl, the syntax (?(name)...) is also recognized.
|
this facility before Perl, the syntax (?(name)...) is also recognized. Note,
|
||||||
|
however, that undelimited names consisting of the letter R followed by digits
|
||||||
|
are ambiguous (see the following section).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Rewriting the above example to use a named subpattern gives this:
|
Rewriting the above example to use a named subpattern gives this:
|
||||||
|
@ -2474,30 +2495,52 @@ matched.
|
||||||
Checking for pattern recursion
|
Checking for pattern recursion
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
If the condition is the string (R), and there is no subpattern with the name R,
|
"Recursion" in this sense refers to any subroutine-like call from one part of
|
||||||
the condition is true if a recursive call to the whole pattern or any
|
the pattern to another, whether or not it is actually recursive. See the
|
||||||
subpattern has been made. If digits or a name preceded by ampersand follow the
|
sections entitled
|
||||||
letter R, for example:
|
<a href="#recursion">"Recursive patterns"</a>
|
||||||
|
and
|
||||||
|
<a href="#subpatternsassubroutines">"Subpatterns as subroutines"</a>
|
||||||
|
below for details of recursion and subpattern calls.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If a condition is the string (R), and there is no subpattern with the name R,
|
||||||
|
the condition is true if matching is currently in a recursion or subroutine
|
||||||
|
call to the whole pattern or any subpattern. If digits follow the letter R, and
|
||||||
|
there is no subpattern with that name, the condition is true if the most recent
|
||||||
|
call is into a subpattern with the given number, which must exist somewhere in
|
||||||
|
the overall pattern. This is a contrived example that is equivalent to a+b:
|
||||||
<pre>
|
<pre>
|
||||||
(?(R3)...) or (?(R&name)...)
|
((?(R1)a+|(?1)b))
|
||||||
</pre>
|
</pre>
|
||||||
the condition is true if the most recent recursion is into a subpattern whose
|
However, in both cases, if there is a subpattern with a matching name, the
|
||||||
number or name is given. This condition does not check the entire recursion
|
condition tests for its being set, as described in the section above, instead
|
||||||
stack. If the name used in a condition of this kind is a duplicate, the test is
|
of testing for recursion. For example, creating a group with the name R1 by
|
||||||
applied to all subpatterns of the same name, and is true if any one of them is
|
adding (?<R1>) to the above pattern completely changes its meaning.
|
||||||
the most recent recursion.
|
</P>
|
||||||
|
<P>
|
||||||
|
If a name preceded by ampersand follows the letter R, for example:
|
||||||
|
<pre>
|
||||||
|
(?(R&name)...)
|
||||||
|
</pre>
|
||||||
|
the condition is true if the most recent recursion is into a subpattern of that
|
||||||
|
name (which must exist within the pattern).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
This condition does not check the entire recursion stack. It tests only the
|
||||||
|
current level. If the name used in a condition of this kind is a duplicate, the
|
||||||
|
test is applied to all subpatterns of the same name, and is true if any one of
|
||||||
|
them is the most recent recursion.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
At "top level", all these recursion test conditions are false.
|
At "top level", all these recursion test conditions are false.
|
||||||
<a href="#recursion">The syntax for recursive patterns</a>
|
|
||||||
is described below.
|
|
||||||
<a name="subdefine"></a></P>
|
<a name="subdefine"></a></P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Defining subpatterns for use by reference only
|
Defining subpatterns for use by reference only
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
If the condition is the string (DEFINE), and there is no subpattern with the
|
If the condition is the string (DEFINE), the condition is always false, even if
|
||||||
name DEFINE, the condition is always false. In this case, there may be only one
|
there is a group with the name DEFINE. In this case, there may be only one
|
||||||
alternative in the subpattern. It is always skipped if control reaches this
|
alternative in the subpattern. It is always skipped if control reaches this
|
||||||
point in the pattern; the idea of DEFINE is that it can be used to define
|
point in the pattern; the idea of DEFINE is that it can be used to define
|
||||||
subroutines that can be referenced from elsewhere. (The use of
|
subroutines that can be referenced from elsewhere. (The use of
|
||||||
|
@ -2965,12 +3008,22 @@ depending on whether or not a name is present.
|
||||||
By default, for compatibility with Perl, a name is any sequence of characters
|
By default, for compatibility with Perl, a name is any sequence of characters
|
||||||
that does not include a closing parenthesis. The name is not processed in
|
that does not include a closing parenthesis. The name is not processed in
|
||||||
any way, and it is not possible to include a closing parenthesis in the name.
|
any way, and it is not possible to include a closing parenthesis in the name.
|
||||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result
|
||||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
is no longer Perl-compatible.
|
||||||
the name. A closing parenthesis can be included in a name either as \) or
|
</P>
|
||||||
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
<P>
|
||||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names
|
||||||
of the pattern.
|
and only an unescaped closing parenthesis terminates the name. However, the
|
||||||
|
only backslash items that are permitted are \Q, \E, and sequences such as
|
||||||
|
\x{100} that define character code points. Character type escapes such as \d
|
||||||
|
are faulted.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
A closing parenthesis can be included in a name either as \) or between \Q
|
||||||
|
and \E. In addition to backslash processing, if the PCRE2_EXTENDED option is
|
||||||
|
also set, unescaped whitespace in verb names is skipped, and #-comments are
|
||||||
|
recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED does not
|
||||||
|
affect verb names unless PCRE2_ALT_VERBNAMES is also set.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||||
|
@ -3393,7 +3446,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 20 June 2016
|
Last updated: 23 October 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2016 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -492,6 +492,9 @@ Each top-level branch of a look behind must be of a fixed length.
|
||||||
\n reference by number (can be ambiguous)
|
\n reference by number (can be ambiguous)
|
||||||
\gn reference by number
|
\gn reference by number
|
||||||
\g{n} reference by number
|
\g{n} reference by number
|
||||||
|
\g+n relative reference by number (PCRE2 extension)
|
||||||
|
\g-n relative reference by number
|
||||||
|
\g{+n} relative reference by number (PCRE2 extension)
|
||||||
\g{-n} relative reference by number
|
\g{-n} relative reference by number
|
||||||
\k<name> reference by name (Perl)
|
\k<name> reference by name (Perl)
|
||||||
\k'name' reference by name (Perl)
|
\k'name' reference by name (Perl)
|
||||||
|
@ -530,14 +533,17 @@ Each top-level branch of a look behind must be of a fixed length.
|
||||||
(?(-n) relative reference condition
|
(?(-n) relative reference condition
|
||||||
(?(<name>) named reference condition (Perl)
|
(?(<name>) named reference condition (Perl)
|
||||||
(?('name') named reference condition (Perl)
|
(?('name') named reference condition (Perl)
|
||||||
(?(name) named reference condition (PCRE2)
|
(?(name) named reference condition (PCRE2, deprecated)
|
||||||
(?(R) overall recursion condition
|
(?(R) overall recursion condition
|
||||||
(?(Rn) specific group recursion condition
|
(?(Rn) specific numbered group recursion condition
|
||||||
(?(R&name) specific recursion condition
|
(?(R&name) specific named group recursion condition
|
||||||
(?(DEFINE) define subpattern for reference
|
(?(DEFINE) define subpattern for reference
|
||||||
(?(VERSION[>]=n.m) test PCRE2 version
|
(?(VERSION[>]=n.m) test PCRE2 version
|
||||||
(?(assert) assertion condition
|
(?(assert) assertion condition
|
||||||
</PRE>
|
</pre>
|
||||||
|
Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
||||||
|
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||||
|
condition if the relevant named group exists.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC23" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
<br><a name="SEC23" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -589,9 +595,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 October 2015
|
Last updated: 28 September 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -615,6 +615,7 @@ about the pattern:
|
||||||
pushcopy push a copy onto the stack
|
pushcopy push a copy onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
use_length do not zero-terminate the pattern
|
||||||
utf8_input treat input as UTF-8
|
utf8_input treat input as UTF-8
|
||||||
</pre>
|
</pre>
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
@ -698,6 +699,18 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
|
Specifying the pattern's length
|
||||||
|
</b><br>
|
||||||
|
<P>
|
||||||
|
By default, patterns are passed to the compiling functions as zero-terminated
|
||||||
|
strings. When using the POSIX wrapper API, there is no other option. However,
|
||||||
|
when using PCRE2's native API, patterns can be passed by length instead of
|
||||||
|
being zero-terminated. The <b>use_length</b> modifier causes this to happen.
|
||||||
|
Using a length happens automatically (whether or not <b>use_length</b> is set)
|
||||||
|
when <b>hex</b> is set, because patterns specified in hexadecimal may contain
|
||||||
|
binary zeros.
|
||||||
|
</P>
|
||||||
|
<br><b>
|
||||||
Specifying pattern characters in hexadecimal
|
Specifying pattern characters in hexadecimal
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -720,10 +733,10 @@ the delimiter within a substring. The <b>hex</b> and <b>expand</b> modifiers are
|
||||||
mutually exclusive.
|
mutually exclusive.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
The POSIX API cannot be used with patterns specified in hexadecimal because
|
||||||
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
they may contain binary zeros, which conflicts with <b>regcomp()</b>'s
|
||||||
patterns specified with the <b>hex</b> modifier, the actual length of the
|
requirement for a zero-terminated string. Such patterns are always passed to
|
||||||
pattern is passed.
|
<b>pcre2_compile()</b> as a string with a length, not as zero-terminated.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Specifying wide characters in 16-bit and 32-bit modes
|
Specifying wide characters in 16-bit and 32-bit modes
|
||||||
|
@ -1753,7 +1766,7 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 02 August 2016
|
Last updated: 04 November 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2016 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -94,6 +94,9 @@ in the library.
|
||||||
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
||||||
<td> Copy a compiled pattern</td></tr>
|
<td> Copy a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
<tr><td><a href="pcre2_code_copy_with_tables.html">pcre2_code_copy_with_tables</a></td>
|
||||||
|
<td> Copy a compiled pattern and its character tables</td></tr>
|
||||||
|
|
||||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||||
<td> Free a compiled pattern</td></tr>
|
<td> Free a compiled pattern</td></tr>
|
||||||
|
|
||||||
|
|
411
doc/pcre2.txt
411
doc/pcre2.txt
|
@ -379,6 +379,8 @@ PCRE2 NATIVE API AUXILIARY FUNCTIONS
|
||||||
|
|
||||||
pcre2_code *pcre2_code_copy(const pcre2_code *code);
|
pcre2_code *pcre2_code_copy(const pcre2_code *code);
|
||||||
|
|
||||||
|
pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
|
||||||
|
|
||||||
int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
|
int pcre2_get_error_message(int errorcode, PCRE2_UCHAR *buffer,
|
||||||
PCRE2_SIZE bufflen);
|
PCRE2_SIZE bufflen);
|
||||||
|
|
||||||
|
@ -626,8 +628,8 @@ MULTITHREADING
|
||||||
similar logic is required. JIT compilation updates a pointer within the
|
similar logic is required. JIT compilation updates a pointer within the
|
||||||
compiled code block, so a thread must gain unique write access to the
|
compiled code block, so a thread must gain unique write access to the
|
||||||
pointer before calling pcre2_jit_compile(). Alternatively,
|
pointer before calling pcre2_jit_compile(). Alternatively,
|
||||||
pcre2_code_copy() can be used to obtain a private copy of the compiled
|
pcre2_code_copy() or pcre2_code_copy_with_tables() can be used to
|
||||||
code.
|
obtain a private copy of the compiled code.
|
||||||
|
|
||||||
Context blocks
|
Context blocks
|
||||||
|
|
||||||
|
@ -789,7 +791,9 @@ PCRE2 CONTEXTS
|
||||||
|
|
||||||
This parameter adjusts the limit, set when PCRE2 is built (default 250),
|
This parameter adjusts the limit, set when PCRE2 is built (default 250),
|
||||||
on the depth of parenthesis nesting in a pattern. This limit stops
|
on the depth of parenthesis nesting in a pattern. This limit stops
|
||||||
rogue patterns using up too much system stack when being compiled.
|
rogue patterns using up too much system stack when being compiled. The
|
||||||
|
limit applies to parentheses of all kinds, not just capturing parenthe-
|
||||||
|
ses.
|
||||||
|
|
||||||
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
|
||||||
int (*guard_function)(uint32_t, void *), void *user_data);
|
int (*guard_function)(uint32_t, void *), void *user_data);
|
||||||
|
@ -1102,6 +1106,8 @@ COMPILING A PATTERN
|
||||||
|
|
||||||
pcre2_code *pcre2_code_copy(const pcre2_code *code);
|
pcre2_code *pcre2_code_copy(const pcre2_code *code);
|
||||||
|
|
||||||
|
pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code);
|
||||||
|
|
||||||
The pcre2_compile() function compiles a pattern into an internal form.
|
The pcre2_compile() function compiles a pattern into an internal form.
|
||||||
The pattern is defined by a pointer to a string of code units and a
|
The pattern is defined by a pointer to a string of code units and a
|
||||||
length. If the pattern is zero-terminated, the length can be specified
|
length. If the pattern is zero-terminated, the length can be specified
|
||||||
|
@ -1120,10 +1126,21 @@ COMPILING A PATTERN
|
||||||
However, if the code has been processed by the JIT compiler (see
|
However, if the code has been processed by the JIT compiler (see
|
||||||
below), the JIT information cannot be copied (because it is position-
|
below), the JIT information cannot be copied (because it is position-
|
||||||
dependent). The new copy can initially be used only for non-JIT match-
|
dependent). The new copy can initially be used only for non-JIT match-
|
||||||
ing, though it can be passed to pcre2_jit_compile() if required. The
|
ing, though it can be passed to pcre2_jit_compile() if required.
|
||||||
pcre2_code_copy() function provides a way for individual threads in a
|
|
||||||
multithreaded application to acquire a private copy of shared compiled
|
The pcre2_code_copy() function provides a way for individual threads in
|
||||||
code.
|
a multithreaded application to acquire a private copy of shared com-
|
||||||
|
piled code. However, it does not make a copy of the character tables
|
||||||
|
used by the compiled pattern; the new pattern code points to the same
|
||||||
|
tables as the original code. (See "Locale Support" below for details
|
||||||
|
of these character tables.) In many applications the same tables are
|
||||||
|
used throughout, so this behaviour is appropriate. Nevertheless, there
|
||||||
|
are occasions when a copy of a compiled pattern and the relevant tables
|
||||||
|
are needed. The pcre2_code_copy_with_tables() function provides this facility.
|
||||||
|
Copies of both the code and the tables are made, with the new code
|
||||||
|
pointing to the new tables. The memory for the new tables is automati-
|
||||||
|
cally freed when pcre2_code_free() is called for the new copy of the
|
||||||
|
compiled code.
|
||||||
|
|
||||||
NOTE: When one of the matching functions is called, pointers to the
|
NOTE: When one of the matching functions is called, pointers to the
|
||||||
compiled pattern and the subject string are set in the match data block
|
compiled pattern and the subject string are set in the match data block
|
||||||
|
@ -1155,6 +1172,12 @@ COMPILING A PATTERN
|
||||||
error has occurred. The values are not defined when compilation is suc-
|
error has occurred. The values are not defined when compilation is suc-
|
||||||
cessful and pcre2_compile() returns a non-NULL value.
|
cessful and pcre2_compile() returns a non-NULL value.
|
||||||
|
|
||||||
|
The value returned in erroroffset is an indication of where in the pat-
|
||||||
|
tern the error occurred. It is not necessarily the furthest point in
|
||||||
|
the pattern that was read. For example, after the error "lookbehind
|
||||||
|
assertion is not fixed length", the error offset points to the start of
|
||||||
|
the failing assertion.
|
||||||
|
|
||||||
The pcre2_get_error_message() function (see "Obtaining a textual error
|
The pcre2_get_error_message() function (see "Obtaining a textual error
|
||||||
message" below) provides a textual message for each error code. Compi-
|
message" below) provides a textual message for each error code. Compi-
|
||||||
lation errors have positive error codes; UTF formatting error codes are
|
lation errors have positive error codes; UTF formatting error codes are
|
||||||
|
@ -1244,8 +1267,9 @@ COMPILING A PATTERN
|
||||||
PCRE2_AUTO_CALLOUT
|
PCRE2_AUTO_CALLOUT
|
||||||
|
|
||||||
If this bit is set, pcre2_compile() automatically inserts callout
|
If this bit is set, pcre2_compile() automatically inserts callout
|
||||||
items, all with number 255, before each pattern item. For discussion of
|
items, all with number 255, before each pattern item, except immedi-
|
||||||
the callout facility, see the pcre2callout documentation.
|
ately before or after a callout in the pattern. For discussion of the
|
||||||
|
callout facility, see the pcre2callout documentation.
|
||||||
|
|
||||||
PCRE2_CASELESS
|
PCRE2_CASELESS
|
||||||
|
|
||||||
|
@ -3151,7 +3175,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 17 June 2016
|
Last updated: 22 November 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -3506,16 +3530,21 @@ PCRE2GREP BUFFER SIZE
|
||||||
|
|
||||||
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
pcre2grep uses an internal buffer to hold a "window" on the file it is
|
||||||
scanning, in order to be able to output "before" and "after" lines when
|
scanning, in order to be able to output "before" and "after" lines when
|
||||||
it finds a match. The size of the buffer is controlled by a parameter
|
it finds a match. The starting size of the buffer is controlled by a
|
||||||
whose default value is 20K. The buffer itself is three times this size,
|
parameter whose default value is 20K. The buffer itself is three times
|
||||||
but because of the way it is used for holding "before" lines, the long-
|
this size, but because of the way it is used for holding "before"
|
||||||
est line that is guaranteed to be processable is the parameter size.
|
lines, the longest line that is guaranteed to be processable is the
|
||||||
You can change the default parameter value by adding, for example,
|
parameter size. If a longer line is encountered, pcre2grep automati-
|
||||||
|
cally expands the buffer, up to a specified maximum size, whose default
|
||||||
|
is 1M or the starting size, whichever is the larger. You can change the
|
||||||
|
default parameter values by adding, for example,
|
||||||
|
|
||||||
--with-pcre2grep-bufsize=50K
|
--with-pcre2grep-bufsize=51200
|
||||||
|
--with-pcre2grep-max-bufsize=2097152
|
||||||
|
|
||||||
to the configure command. The caller of pcre2grep can override this
|
to the configure command. The caller of pcre2grep can override these
|
||||||
value by using --buffer-size on the command line.
|
values by using --buffer-size and --max-buffer-size on the command
|
||||||
|
line.
|
||||||
|
|
||||||
|
|
||||||
PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
PCRE2TEST OPTION FOR LIBREADLINE SUPPORT
|
||||||
|
@ -3630,6 +3659,29 @@ CODE COVERAGE REPORTING
|
||||||
mentation.
|
mentation.
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORT FOR FUZZERS
|
||||||
|
|
||||||
|
There is a special option for use by people who want to run fuzzing
|
||||||
|
tests on PCRE2:
|
||||||
|
|
||||||
|
--enable-fuzz-support
|
||||||
|
|
||||||
|
At present this applies only to the 8-bit library. If set, it causes an
|
||||||
|
extra library called libpcre2-fuzzsupport.a to be built, but not
|
||||||
|
installed. This contains a single function called LLVMFuzzerTestOneIn-
|
||||||
|
put() whose arguments are a pointer to a string and the length of the
|
||||||
|
string. When called, this function tries to compile the string as a
|
||||||
|
pattern, and if that succeeds, to match it. This is done both with no
|
||||||
|
options and with some random option bits that are generated from the
|
||||||
|
string. Setting --enable-fuzz-support also causes a binary called
|
||||||
|
pcre2fuzzcheck to be created. This is normally run under valgrind or
|
||||||
|
used when PCRE2 is compiled with address sanitizing enabled. It calls
|
||||||
|
the fuzzing function and outputs information about what it is doing. The
|
||||||
|
input strings are specified by arguments: if an argument starts with
|
||||||
|
"=" the rest of it is a literal input string. Otherwise, it is assumed
|
||||||
|
to be a file name, and the contents of the file are the test string.
|
||||||
|
|
||||||
|
|
||||||
SEE ALSO
|
SEE ALSO
|
||||||
|
|
||||||
pcre2api(3), pcre2-config(3).
|
pcre2api(3), pcre2-config(3).
|
||||||
|
@ -3644,7 +3696,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 01 April 2016
|
Last updated: 01 November 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -3689,13 +3741,22 @@ DESCRIPTION
|
||||||
|
|
||||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled,
|
||||||
PCRE2 automatically inserts callouts, all with number 255, before each
|
PCRE2 automatically inserts callouts, all with number 255, before each
|
||||||
|
item in the pattern except for immediately before or after a callout
|
||||||
item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with
|
item in the pattern. For example, if PCRE2_AUTO_CALLOUT is used with
|
||||||
the pattern
|
the pattern
|
||||||
|
|
||||||
A(\d{2}|--)
|
A(?C3)B
|
||||||
|
|
||||||
it is processed as if it were
|
it is processed as if it were
|
||||||
|
|
||||||
|
(?C255)A(?C3)B(?C255)
|
||||||
|
|
||||||
|
Here is a more complicated example:
|
||||||
|
|
||||||
|
A(\d{2}|--)
|
||||||
|
|
||||||
|
With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
|
||||||
|
|
||||||
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
||||||
|
|
||||||
Notice that there is a callout before and after each parenthesis and
|
Notice that there is a callout before and after each parenthesis and
|
||||||
|
@ -3737,10 +3798,11 @@ MISSING CALLOUTS
|
||||||
No match
|
No match
|
||||||
|
|
||||||
This indicates that when matching [bc] fails, there is no backtracking
|
This indicates that when matching [bc] fails, there is no backtracking
|
||||||
into a+ and therefore the callouts that would be taken for the back-
|
into a+ (because it is being treated as a++) and therefore the callouts
|
||||||
tracks do not occur. You can disable the auto-possessify feature by
|
that would be taken for the backtracks do not occur. You can disable
|
||||||
passing PCRE2_NO_AUTO_POSSESS to pcre2_compile(), or starting the pat-
|
the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
||||||
tern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
pcre2_compile(), or starting the pattern with (*NO_AUTO_POSSESS). In
|
||||||
|
this case, the output changes to this:
|
||||||
|
|
||||||
--->aaaa
|
--->aaaa
|
||||||
+0 ^ a+
|
+0 ^ a+
|
||||||
|
@ -3859,8 +3921,8 @@ THE CALLOUT INTERFACE
|
||||||
|
|
||||||
For a numerical callout, callout_string is NULL, and callout_number
|
For a numerical callout, callout_string is NULL, and callout_number
|
||||||
contains the number of the callout, in the range 0-255. This is the
|
contains the number of the callout, in the range 0-255. This is the
|
||||||
number that follows (?C for manual callouts; it is 255 for automati-
|
number that follows (?C for callouts that are part of the pattern; it is
|
||||||
cally generated callouts.
|
255 for automatically generated callouts.
|
||||||
|
|
||||||
Fields for string callouts
|
Fields for string callouts
|
||||||
|
|
||||||
|
@ -3921,10 +3983,16 @@ THE CALLOUT INTERFACE
|
||||||
the next item to be matched.
|
the next item to be matched.
|
||||||
|
|
||||||
The next_item_length field contains the length of the next item to be
|
The next_item_length field contains the length of the next item to be
|
||||||
matched in the pattern string. When the callout immediately precedes an
|
processed in the pattern string. When the callout is at the end of the
|
||||||
alternation bar, a closing parenthesis, or the end of the pattern, the
|
pattern, the length is zero. When the callout precedes an opening
|
||||||
length is zero. When the callout precedes an opening parenthesis, the
|
parenthesis, the length includes meta characters that follow the paren-
|
||||||
length is that of the entire subpattern.
|
thesis. For example, in a callout before an assertion such as (?=ab)
|
||||||
|
the length is 3. For an alternation bar or a closing parenthesis,
|
||||||
|
the length is one, unless a closing parenthesis is followed by a quan-
|
||||||
|
tifier, in which case its length is included. (This changed in release
|
||||||
|
10.23. In earlier releases, before an opening parenthesis the length
|
||||||
|
was that of the entire subpattern, and before an alternation bar or a
|
||||||
|
closing parenthesis the length was zero.)
|
||||||
|
|
||||||
The pattern_position and next_item_length fields are intended to help
|
The pattern_position and next_item_length fields are intended to help
|
||||||
in distinguishing between different automatic callouts, which all have
|
in distinguishing between different automatic callouts, which all have
|
||||||
|
@ -4008,8 +4076,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 23 March 2015
|
Last updated: 29 September 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -4103,7 +4171,7 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||||
first one that is backtracked onto acts. For example, in the pattern
|
first one that is backtracked onto acts. For example, in the pattern
|
||||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure
|
||||||
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases
|
||||||
it is the same as PCRE2, but there are examples where it differs.
|
it is the same as PCRE2, but there are cases where it differs.
|
||||||
|
|
||||||
11. Most backtracking verbs in assertions have their normal actions.
|
11. Most backtracking verbs in assertions have their normal actions.
|
||||||
They are not confined to the assertion.
|
They are not confined to the assertion.
|
||||||
|
@ -4117,18 +4185,18 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||||
pattern names is not as general as Perl's. This is a consequence of the
|
pattern names is not as general as Perl's. This is a consequence of the
|
||||||
fact that PCRE2 works internally just with numbers, using an external
|
fact that PCRE2 works internally just with numbers, using an external
|
||||||
table to translate between numbers and names. In particular, a pattern
|
table to translate between numbers and names. In particular, a pattern
|
||||||
such as (?|(?<a>A)|(?<b)B), where the two capturing parentheses have
|
such as (?|(?<a>A)|(?<b>B)), where the two capturing parentheses have
|
||||||
the same number but different names, is not supported, and causes an
|
the same number but different names, is not supported, and causes an
|
||||||
error at compile time. If it were allowed, it would not be possible to
|
error at compile time. If it were allowed, it would not be possible to
|
||||||
distinguish which parentheses matched, because both names map to cap-
|
distinguish which parentheses matched, because both names map to cap-
|
||||||
turing subpattern number 1. To avoid this confusing situation, an error
|
turing subpattern number 1. To avoid this confusing situation, an error
|
||||||
is given at compile time.
|
is given at compile time.
|
||||||
|
|
||||||
14. Perl recognizes comments in some places that PCRE2 does not, for
|
14. Perl used to recognize comments in some places that PCRE2 does not,
|
||||||
example, between the ( and ? at the start of a subpattern. If the /x
|
for example, between the ( and ? at the start of a subpattern. If the
|
||||||
modifier is set, Perl allows white space between ( and ? (though cur-
|
/x modifier is set, Perl allowed white space between ( and ? though the
|
||||||
rent Perls warn that this is deprecated) but PCRE2 never does, even if
|
latest Perls give an error (for a while it was just deprecated). There
|
||||||
the PCRE2_EXTENDED option is set.
|
may still be some cases where Perl behaves differently.
|
||||||
|
|
||||||
15. Perl, when in warning mode, gives warnings for character classes
|
15. Perl, when in warning mode, gives warnings for character classes
|
||||||
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter-
|
||||||
|
@ -4152,34 +4220,39 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
|
||||||
different length of string. Perl requires them all to have the same
|
different length of string. Perl requires them all to have the same
|
||||||
length.
|
length.
|
||||||
|
|
||||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
(b) From PCRE2 10.23, back references to groups of fixed length are
|
||||||
|
supported in lookbehinds, provided that there is no possibility of ref-
|
||||||
|
erencing a non-unique number or name. Perl does not support backrefer-
|
||||||
|
ences in lookbehinds.
|
||||||
|
|
||||||
|
(c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the
|
||||||
$ meta-character matches only at the very end of the string.
|
$ meta-character matches only at the very end of the string.
|
||||||
|
|
||||||
(c) A backslash followed by a letter with no special meaning is
|
(d) A backslash followed by a letter with no special meaning is
|
||||||
faulted. (Perl can be made to issue a warning.)
|
faulted. (Perl can be made to issue a warning.)
|
||||||
|
|
||||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
(e) If PCRE2_UNGREEDY is set, the greediness of the repetition quanti-
|
||||||
fiers is inverted, that is, by default they are not greedy, but if fol-
|
fiers is inverted, that is, by default they are not greedy, but if fol-
|
||||||
lowed by a question mark they are.
|
lowed by a question mark they are.
|
||||||
|
|
||||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
(f) PCRE2_ANCHORED can be used at matching time to force a pattern to
|
||||||
be tried only at the first matching position in the subject string.
|
be tried only at the first matching position in the subject string.
|
||||||
|
|
||||||
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
(g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY,
|
||||||
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
PCRE2_NOTEMPTY_ATSTART, and PCRE2_NO_AUTO_CAPTURE options have no Perl
|
||||||
equivalents.
|
equivalents.
|
||||||
|
|
||||||
(g) The \R escape sequence can be restricted to match only CR, LF, or
|
(h) The \R escape sequence can be restricted to match only CR, LF, or
|
||||||
CRLF by the PCRE2_BSR_ANYCRLF option.
|
CRLF by the PCRE2_BSR_ANYCRLF option.
|
||||||
|
|
||||||
(h) The callout facility is PCRE2-specific.
|
(i) The callout facility is PCRE2-specific.
|
||||||
|
|
||||||
(i) The partial matching facility is PCRE2-specific.
|
(j) The partial matching facility is PCRE2-specific.
|
||||||
|
|
||||||
(j) The alternative matching function (pcre2_dfa_match() matches in a
|
(k) The alternative matching function (pcre2_dfa_match()) matches in a
|
||||||
different way and is not Perl-compatible.
|
different way and is not Perl-compatible.
|
||||||
|
|
||||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start
|
(l) PCRE2 recognizes some special sequences such as (*CR) at the start
|
||||||
of a pattern that set overall options that cannot be changed within the
|
of a pattern that set overall options that cannot be changed within the
|
||||||
pattern.
|
pattern.
|
||||||
|
|
||||||
|
@ -4193,8 +4266,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 15 March 2015
|
Last updated: 18 October 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -4642,21 +4715,20 @@ SIZE AND OTHER LIMITATIONS
|
||||||
can be no more than 65535 capturing subpatterns. There is, however, a
|
can be no more than 65535 capturing subpatterns. There is, however, a
|
||||||
limit to the depth of nesting of parenthesized subpatterns of all
|
limit to the depth of nesting of parenthesized subpatterns of all
|
||||||
kinds. This is imposed in order to limit the amount of system stack
|
kinds. This is imposed in order to limit the amount of system stack
|
||||||
used at compile time. The limit can be specified when PCRE2 is built;
|
used at compile time. The default limit can be specified when PCRE2 is
|
||||||
the default is 250.
|
built; the built-in default is 250. An application can change this limit
|
||||||
|
by calling pcre2_set_parens_nest_limit() to set the limit in a compile
|
||||||
There is a limit to the number of forward references to subsequent sub-
|
context.
|
||||||
patterns of around 200,000. Repeated forward references with fixed
|
|
||||||
upper limits, for example, (?2){0,100} when subpattern number 2 is to
|
|
||||||
the right, are included in the count. There is no limit to the number
|
|
||||||
of backward references.
|
|
||||||
|
|
||||||
The maximum length of name for a named subpattern is 32 code units, and
|
The maximum length of name for a named subpattern is 32 code units, and
|
||||||
the maximum number of named subpatterns is 10000.
|
the maximum number of named subpatterns is 10000.
|
||||||
|
|
||||||
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or
|
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or
|
||||||
(*THEN) verb is 255 for the 8-bit library and 65535 for the 16-bit and
|
(*THEN) verb is 255 code units for the 8-bit library and 65535 code
|
||||||
32-bit libraries.
|
units for the 16-bit and 32-bit libraries.
|
||||||
|
|
||||||
|
The maximum length of a string argument to a callout is the largest
|
||||||
|
number a 32-bit unsigned integer can hold.
|
||||||
|
|
||||||
|
|
||||||
AUTHOR
|
AUTHOR
|
||||||
|
@ -4668,8 +4740,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 05 November 2015
|
Last updated: 26 October 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -5644,29 +5716,29 @@ BACKSLASH
|
||||||
character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A
|
character (hex 40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A
|
||||||
(A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
|
(A is 41, Z is 5A), but \c{ becomes hex 3B ({ is 7B), and \c; becomes
|
||||||
hex 7B (; is 3B). If the code unit following \c has a value less than
|
hex 7B (; is 3B). If the code unit following \c has a value less than
|
||||||
32 or greater than 126, a compile-time error occurs. This locks out
|
32 or greater than 126, a compile-time error occurs.
|
||||||
non-printable ASCII characters in all modes.
|
|
||||||
|
|
||||||
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gen-
|
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t gen-
|
||||||
erate the appropriate EBCDIC code values. The \c escape is processed as
|
erate the appropriate EBCDIC code values. The \c escape is processed as
|
||||||
specified for Perl in the perlebcdic document. The only characters that
|
specified for Perl in the perlebcdic document. The only characters that
|
||||||
are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?.
|
are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?.
|
||||||
Any other character provokes a compile-time error. The sequence \@
|
Any other character provokes a compile-time error. The sequence \c@
|
||||||
encodes character code 0; the letters (in either case) encode charac-
|
encodes character code 0; after \c the letters (in either case) encode
|
||||||
ters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31
|
characters 1-26 (hex 01 to hex 1A); [, \, ], ^, and _ encode characters
|
||||||
(hex 1B to hex 1F), and \? becomes either 255 (hex FF) or 95 (hex 5F).
|
27-31 (hex 1B to hex 1F), and \c? becomes either 255 (hex FF) or 95
|
||||||
|
(hex 5F).
|
||||||
|
|
||||||
Thus, apart from \?, these escapes generate the same character code
|
Thus, apart from \c?, these escapes generate the same character code
|
||||||
values as they do in an ASCII environment, though the meanings of the
|
values as they do in an ASCII environment, though the meanings of the
|
||||||
values mostly differ. For example, \G always generates code value 7,
|
values mostly differ. For example, \cG always generates code value 7,
|
||||||
which is BEL in ASCII but DEL in EBCDIC.
|
which is BEL in ASCII but DEL in EBCDIC.
|
||||||
|
|
||||||
The sequence \? generates DEL (127, hex 7F) in an ASCII environment,
|
The sequence \c? generates DEL (127, hex 7F) in an ASCII environment,
|
||||||
but because 127 is not a control character in EBCDIC, Perl makes it
|
but because 127 is not a control character in EBCDIC, Perl makes it
|
||||||
generate the APC character. Unfortunately, there are several variants
|
generate the APC character. Unfortunately, there are several variants
|
||||||
of EBCDIC. In most of them the APC character has the value 255 (hex
|
of EBCDIC. In most of them the APC character has the value 255 (hex
|
||||||
FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If
|
FF), but in the one Perl calls POSIX-BC its value is 95 (hex 5F). If
|
||||||
certain other characters have POSIX-BC values, PCRE2 makes \? generate
|
certain other characters have POSIX-BC values, PCRE2 makes \c? generate
|
||||||
95; otherwise it generates 255.
|
95; otherwise it generates 255.
|
||||||
|
|
||||||
After \0 up to two further octal digits are read. If there are fewer
|
After \0 up to two further octal digits are read. If there are fewer
|
||||||
|
@ -5776,10 +5848,10 @@ BACKSLASH
|
||||||
|
|
||||||
Absolute and relative back references
|
Absolute and relative back references
|
||||||
|
|
||||||
The sequence \g followed by an unsigned or a negative number, option-
|
The sequence \g followed by a signed or unsigned number, optionally
|
||||||
ally enclosed in braces, is an absolute or relative back reference. A
|
enclosed in braces, is an absolute or relative back reference. A named
|
||||||
named back reference can be coded as \g{name}. Back references are dis-
|
back reference can be coded as \g{name}. Back references are discussed
|
||||||
cussed later, following the discussion of parenthesized subpatterns.
|
later, following the discussion of parenthesized subpatterns.
|
||||||
|
|
||||||
Absolute and relative subroutine calls
|
Absolute and relative subroutine calls
|
||||||
|
|
||||||
|
@ -6404,6 +6476,18 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
||||||
PCRE2_MULTILINE options is used. A class such as [^a] always matches
|
PCRE2_MULTILINE options is used. A class such as [^a] always matches
|
||||||
one of these characters.
|
one of these characters.
|
||||||
|
|
||||||
|
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
|
||||||
|
\w, and \W may appear in a character class, and add the characters that
|
||||||
|
they match to the class. For example, [\dABCDEF] matches any hexadeci-
|
||||||
|
mal digit. In UTF modes, the PCRE2_UCP option affects the meanings of
|
||||||
|
\d, \s, \w and their upper case partners, just as it does when they
|
||||||
|
appear outside a character class, as described in the section entitled
|
||||||
|
"Generic character types" above. The escape sequence \b has a different
|
||||||
|
meaning inside a character class; it matches the backspace character.
|
||||||
|
The sequences \B, \N, \R, and \X are not special inside a character
|
||||||
|
class. Like any other unrecognized escape sequences, they cause an
|
||||||
|
error.
|
||||||
|
|
||||||
The minus (hyphen) character can be used to specify a range of charac-
|
The minus (hyphen) character can be used to specify a range of charac-
|
||||||
ters in a character class. For example, [d-m] matches any letter
|
ters in a character class. For example, [d-m] matches any letter
|
||||||
between d and m, inclusive. If a minus character is required in a
|
between d and m, inclusive. If a minus character is required in a
|
||||||
|
@ -6413,6 +6497,11 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
||||||
example, [b-d-z] matches letters in the range b to d, a hyphen charac-
|
example, [b-d-z] matches letters in the range b to d, a hyphen charac-
|
||||||
ter, or z.
|
ter, or z.
|
||||||
|
|
||||||
|
Perl treats a hyphen as a literal if it appears before a POSIX class
|
||||||
|
(see below) or a character type escape such as \d, but gives a warn-
|
||||||
|
ing in its warning mode, as this is most likely a user error. As PCRE2
|
||||||
|
has no facility for warning, an error is given in these cases.
|
||||||
|
|
||||||
It is not possible to have the literal character "]" as the end charac-
|
It is not possible to have the literal character "]" as the end charac-
|
||||||
ter of a range. A pattern such as [W-]46] is interpreted as a class of
|
ter of a range. A pattern such as [W-]46] is interpreted as a class of
|
||||||
two characters ("W" and "-") followed by a literal string "46]", so it
|
two characters ("W" and "-") followed by a literal string "46]", so it
|
||||||
|
@ -6422,11 +6511,6 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
||||||
The octal or hexadecimal representation of "]" can also be used to end
|
The octal or hexadecimal representation of "]" can also be used to end
|
||||||
a range.
|
a range.
|
||||||
|
|
||||||
An error is generated if a POSIX character class (see below) or an
|
|
||||||
escape sequence other than one that defines a single character appears
|
|
||||||
at a point where a range ending character is expected. For example,
|
|
||||||
[z-\xff] is valid, but [A-\d] and [A-[:digit:]] are not.
|
|
||||||
|
|
||||||
Ranges normally include all code points between the start and end char-
|
Ranges normally include all code points between the start and end char-
|
||||||
acters, inclusive. They can also be used for code points specified
|
acters, inclusive. They can also be used for code points specified
|
||||||
numerically, for example [\000-\037]. Ranges can include any characters
|
numerically, for example [\000-\037]. Ranges can include any characters
|
||||||
|
@ -6446,18 +6530,6 @@ SQUARE BRACKETS AND CHARACTER CLASSES
|
||||||
character tables for a French locale are in use, [\xc8-\xcb] matches
|
character tables for a French locale are in use, [\xc8-\xcb] matches
|
||||||
accented E characters in both cases.
|
accented E characters in both cases.
|
||||||
|
|
||||||
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v, \V,
|
|
||||||
\w, and \W may appear in a character class, and add the characters that
|
|
||||||
they match to the class. For example, [\dABCDEF] matches any hexadeci-
|
|
||||||
mal digit. In UTF modes, the PCRE2_UCP option affects the meanings of
|
|
||||||
\d, \s, \w and their upper case partners, just as it does when they
|
|
||||||
appear outside a character class, as described in the section entitled
|
|
||||||
"Generic character types" above. The escape sequence \b has a different
|
|
||||||
meaning inside a character class; it matches the backspace character.
|
|
||||||
The sequences \B, \N, \R, and \X are not special inside a character
|
|
||||||
class. Like any other unrecognized escape sequences, they cause an
|
|
||||||
error.
|
|
||||||
|
|
||||||
A circumflex can conveniently be used with the upper case character
|
A circumflex can conveniently be used with the upper case character
|
||||||
types to specify a more restricted set of characters than the matching
|
types to specify a more restricted set of characters than the matching
|
||||||
lower case type. For example, the class [^\W_] matches any letter or
|
lower case type. For example, the class [^\W_] matches any letter or
|
||||||
|
@ -6618,14 +6690,9 @@ INTERNAL OPTION SETTING
|
||||||
|
|
||||||
When one of these option changes occurs at top level (that is, not
|
When one of these option changes occurs at top level (that is, not
|
||||||
inside subpattern parentheses), the change applies to the remainder of
|
inside subpattern parentheses), the change applies to the remainder of
|
||||||
the pattern that follows. If the change is placed right at the start of
|
the pattern that follows. An option change within a subpattern (see
|
||||||
a pattern, PCRE2 extracts it into the global options (and it will
|
below for a description of subpatterns) affects only that part of the
|
||||||
therefore show up in data extracted by the pcre2_pattern_info() func-
|
subpattern that follows it, so
|
||||||
tion).
|
|
||||||
|
|
||||||
An option change within a subpattern (see below for a description of
|
|
||||||
subpatterns) affects only that part of the subpattern that follows it,
|
|
||||||
so
|
|
||||||
|
|
||||||
(a(?i)b)c
|
(a(?i)b)c
|
||||||
|
|
||||||
|
@ -7140,8 +7207,8 @@ BACK REFERENCES
|
||||||
|
|
||||||
Another way of avoiding the ambiguity inherent in the use of digits
|
Another way of avoiding the ambiguity inherent in the use of digits
|
||||||
following a backslash is to use the \g escape sequence. This escape
|
following a backslash is to use the \g escape sequence. This escape
|
||||||
must be followed by an unsigned number or a negative number, optionally
|
must be followed by a signed or unsigned number, optionally enclosed in
|
||||||
enclosed in braces. These examples are all identical:
|
braces. These examples are all identical:
|
||||||
|
|
||||||
(ring), \1
|
(ring), \1
|
||||||
(ring), \g1
|
(ring), \g1
|
||||||
|
@ -7149,7 +7216,7 @@ BACK REFERENCES
|
||||||
|
|
||||||
An unsigned number specifies an absolute reference without the ambigu-
|
An unsigned number specifies an absolute reference without the ambigu-
|
||||||
ity that is present in the older syntax. It is also useful when literal
|
ity that is present in the older syntax. It is also useful when literal
|
||||||
digits follow the reference. A negative number is a relative reference.
|
digits follow the reference. A signed number is a relative reference.
|
||||||
Consider this example:
|
Consider this example:
|
||||||
|
|
||||||
(abc(def)ghi)\g{-1}
|
(abc(def)ghi)\g{-1}
|
||||||
|
@ -7161,6 +7228,10 @@ BACK REFERENCES
|
||||||
are created by joining together fragments that contain references
|
are created by joining together fragments that contain references
|
||||||
within themselves.
|
within themselves.
|
||||||
|
|
||||||
|
The sequence \g{+1} is a reference to the next capturing subpattern.
|
||||||
|
This kind of forward reference can be useful in patterns that repeat.
|
||||||
|
Perl does not support the use of + in this way.
|
||||||
|
|
||||||
A back reference matches whatever actually matched the capturing sub-
|
A back reference matches whatever actually matched the capturing sub-
|
||||||
pattern in the current subject string, rather than anything matching
|
pattern in the current subject string, rather than anything matching
|
||||||
the subpattern itself (see "Subpatterns as subroutines" below for a way
|
the subpattern itself (see "Subpatterns as subroutines" below for a way
|
||||||
|
@ -7252,6 +7323,13 @@ ASSERTIONS
|
||||||
assertions. (Perl sometimes, but not always, does do capturing in nega-
|
assertions. (Perl sometimes, but not always, does do capturing in nega-
|
||||||
tive assertions.)
|
tive assertions.)
|
||||||
|
|
||||||
|
WARNING: If a positive assertion containing one or more capturing sub-
|
||||||
|
patterns succeeds, but failure to match later in the pattern causes
|
||||||
|
backtracking over this assertion, the captures within the assertion are
|
||||||
|
reset only if no higher numbered captures are already set. This is,
|
||||||
|
unfortunately, a fundamental limitation of the current implementation;
|
||||||
|
it may get removed in a future reworking.
|
||||||
|
|
||||||
For compatibility with Perl, most assertion subpatterns may be
|
For compatibility with Perl, most assertion subpatterns may be
|
||||||
repeated; though it makes no sense to assert the same thing several
|
repeated; though it makes no sense to assert the same thing several
|
||||||
times, the side effect of capturing parentheses may occasionally be
|
times, the side effect of capturing parentheses may occasionally be
|
||||||
|
@ -7340,15 +7418,27 @@ ASSERTIONS
|
||||||
then try to match. If there are insufficient characters before the cur-
|
then try to match. If there are insufficient characters before the cur-
|
||||||
rent position, the assertion fails.
|
rent position, the assertion fails.
|
||||||
|
|
||||||
In a UTF mode, PCRE2 does not allow the \C escape (which matches a sin-
|
In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which
|
||||||
gle code unit even in a UTF mode) to appear in lookbehind assertions,
|
matches a single code unit even in a UTF mode) to appear in lookbehind
|
||||||
because it makes it impossible to calculate the length of the lookbe-
|
assertions, because it makes it impossible to calculate the length of
|
||||||
hind. The \X and \R escapes, which can match different numbers of code
|
the lookbehind. The \X and \R escapes, which can match different num-
|
||||||
units, are also not permitted.
|
bers of code units, are never permitted in lookbehinds.
|
||||||
|
|
||||||
"Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
|
"Subroutine" calls (see below) such as (?2) or (?&X) are permitted in
|
||||||
lookbehinds, as long as the subpattern matches a fixed-length string.
|
lookbehinds, as long as the subpattern matches a fixed-length string.
|
||||||
Recursion, however, is not supported.
|
However, recursion, that is, a "subroutine" call into a group that is
|
||||||
|
already active, is not supported.
|
||||||
|
|
||||||
|
Perl does not support back references in lookbehinds. PCRE2 does sup-
|
||||||
|
port them, but only if certain conditions are met. The
|
||||||
|
PCRE2_MATCH_UNSET_BACKREF option must not be set, there must be no use
|
||||||
|
of (?| in the pattern (it creates duplicate subpattern numbers), and if
|
||||||
|
the back reference is by name, the name must be unique. Of course, the
|
||||||
|
referenced subpattern must itself be of fixed length. The following
|
||||||
|
pattern matches words containing at least two characters that begin and
|
||||||
|
end with the same character:
|
||||||
|
|
||||||
|
\b(\w)\w++(?<=\1)
|
||||||
|
|
||||||
Possessive quantifiers can be used in conjunction with lookbehind
|
Possessive quantifiers can be used in conjunction with lookbehind
|
||||||
assertions to specify efficient matching of fixed-length strings at the
|
assertions to specify efficient matching of fixed-length strings at the
|
||||||
|
@ -7482,7 +7572,9 @@ CONDITIONAL SUBPATTERNS
|
||||||
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
|
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a
|
||||||
used subpattern by name. For compatibility with earlier versions of
|
used subpattern by name. For compatibility with earlier versions of
|
||||||
PCRE1, which had this facility before Perl, the syntax (?(name)...) is
|
PCRE1, which had this facility before Perl, the syntax (?(name)...) is
|
||||||
also recognized.
|
also recognized. Note, however, that undelimited names consisting of
|
||||||
|
the letter R followed by digits are ambiguous (see the following sec-
|
||||||
|
tion).
|
||||||
|
|
||||||
Rewriting the above example to use a named subpattern gives this:
|
Rewriting the above example to use a named subpattern gives this:
|
||||||
|
|
||||||
|
@ -7494,32 +7586,51 @@ CONDITIONAL SUBPATTERNS
|
||||||
|
|
||||||
Checking for pattern recursion
|
Checking for pattern recursion
|
||||||
|
|
||||||
If the condition is the string (R), and there is no subpattern with the
|
"Recursion" in this sense refers to any subroutine-like call from one
|
||||||
name R, the condition is true if a recursive call to the whole pattern
|
part of the pattern to another, whether or not it is actually recur-
|
||||||
or any subpattern has been made. If digits or a name preceded by amper-
|
sive. See the sections entitled "Recursive patterns" and "Subpatterns
|
||||||
sand follow the letter R, for example:
|
as subroutines" below for details of recursion and subpattern calls.
|
||||||
|
|
||||||
(?(R3)...) or (?(R&name)...)
|
If a condition is the string (R), and there is no subpattern with the
|
||||||
|
name R, the condition is true if matching is currently in a recursion
|
||||||
|
or subroutine call to the whole pattern or any subpattern. If digits
|
||||||
|
follow the letter R, and there is no subpattern with that name, the
|
||||||
|
condition is true if the most recent call is into a subpattern with the
|
||||||
|
given number, which must exist somewhere in the overall pattern. This
|
||||||
|
is a contrived example that is equivalent to a+b:
|
||||||
|
|
||||||
|
((?(R1)a+|(?1)b))
|
||||||
|
|
||||||
|
However, in both cases, if there is a subpattern with a matching name,
|
||||||
|
the condition tests for its being set, as described in the section
|
||||||
|
above, instead of testing for recursion. For example, creating a group
|
||||||
|
with the name R1 by adding (?<R1>) to the above pattern completely
|
||||||
|
changes its meaning.
|
||||||
|
|
||||||
|
If a name preceded by ampersand follows the letter R, for example:
|
||||||
|
|
||||||
|
(?(R&name)...)
|
||||||
|
|
||||||
the condition is true if the most recent recursion is into a subpattern
|
the condition is true if the most recent recursion is into a subpattern
|
||||||
whose number or name is given. This condition does not check the entire
|
of that name (which must exist within the pattern).
|
||||||
recursion stack. If the name used in a condition of this kind is a
|
|
||||||
|
This condition does not check the entire recursion stack. It tests only
|
||||||
|
the current level. If the name used in a condition of this kind is a
|
||||||
duplicate, the test is applied to all subpatterns of the same name, and
|
duplicate, the test is applied to all subpatterns of the same name, and
|
||||||
is true if any one of them is the most recent recursion.
|
is true if any one of them is the most recent recursion.
|
||||||
|
|
||||||
At "top level", all these recursion test conditions are false. The
|
At "top level", all these recursion test conditions are false.
|
||||||
syntax for recursive patterns is described below.
|
|
||||||
|
|
||||||
Defining subpatterns for use by reference only
|
Defining subpatterns for use by reference only
|
||||||
|
|
||||||
If the condition is the string (DEFINE), and there is no subpattern
|
If the condition is the string (DEFINE), the condition is always false,
|
||||||
with the name DEFINE, the condition is always false. In this case,
|
even if there is a group with the name DEFINE. In this case, there may
|
||||||
there may be only one alternative in the subpattern. It is always
|
be only one alternative in the subpattern. It is always skipped if con-
|
||||||
skipped if control reaches this point in the pattern; the idea of
|
trol reaches this point in the pattern; the idea of DEFINE is that it
|
||||||
DEFINE is that it can be used to define subroutines that can be refer-
|
can be used to define subroutines that can be referenced from else-
|
||||||
enced from elsewhere. (The use of subroutines is described below.) For
|
where. (The use of subroutines is described below.) For example, a pat-
|
||||||
example, a pattern to match an IPv4 address such as "192.168.23.245"
|
tern to match an IPv4 address such as "192.168.23.245" could be written
|
||||||
could be written like this (ignore white space and line breaks):
|
like this (ignore white space and line breaks):
|
||||||
|
|
||||||
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
(?(DEFINE) (?<byte> 2[0-4]\d | 25[0-5] | 1\d\d | [1-9]?\d) )
|
||||||
\b (?&byte) (\.(?&byte)){3} \b
|
\b (?&byte) (\.(?&byte)){3} \b
|
||||||
|
@ -7971,13 +8082,22 @@ BACKTRACKING CONTROL
|
||||||
By default, for compatibility with Perl, a name is any sequence of
|
By default, for compatibility with Perl, a name is any sequence of
|
||||||
characters that does not include a closing parenthesis. The name is not
|
characters that does not include a closing parenthesis. The name is not
|
||||||
processed in any way, and it is not possible to include a closing
|
processed in any way, and it is not possible to include a closing
|
||||||
parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES option is
|
parenthesis in the name. This can be changed by setting the
|
||||||
set, normal backslash processing is applied to verb names and only an
|
PCRE2_ALT_VERBNAMES option, but the result is no longer Perl-compati-
|
||||||
unescaped closing parenthesis terminates the name. A closing parenthe-
|
ble.
|
||||||
sis can be included in a name either as \) or between \Q and \E. If the
|
|
||||||
PCRE2_EXTENDED option is set, unescaped whitespace in verb names is
|
When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to
|
||||||
skipped and #-comments are recognized, exactly as in the rest of the
|
verb names and only an unescaped closing parenthesis terminates the
|
||||||
pattern.
|
name. However, the only backslash items that are permitted are \Q, \E,
|
||||||
|
and sequences such as \x{100} that define character code points. Char-
|
||||||
|
acter type escapes such as \d are faulted.
|
||||||
|
|
||||||
|
A closing parenthesis can be included in a name either as \) or between
|
||||||
|
\Q and \E. In addition to backslash processing, if the PCRE2_EXTENDED
|
||||||
|
option is also set, unescaped whitespace in verb names is skipped, and
|
||||||
|
#-comments are recognized, exactly as in the rest of the pattern.
|
||||||
|
PCRE2_EXTENDED does not affect verb names unless PCRE2_ALT_VERBNAMES is
|
||||||
|
also set.
|
||||||
|
|
||||||
The maximum length of a name is 255 in the 8-bit library and 65535 in
|
The maximum length of a name is 255 in the 8-bit library and 65535 in
|
||||||
the 16-bit and 32-bit libraries. If the name is empty, that is, if the
|
the 16-bit and 32-bit libraries. If the name is empty, that is, if the
|
||||||
|
@ -8367,7 +8487,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 20 June 2016
|
Last updated: 23 October 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -9589,6 +9709,9 @@ BACKREFERENCES
|
||||||
\n reference by number (can be ambiguous)
|
\n reference by number (can be ambiguous)
|
||||||
\gn reference by number
|
\gn reference by number
|
||||||
\g{n} reference by number
|
\g{n} reference by number
|
||||||
|
\g+n relative reference by number (PCRE2 extension)
|
||||||
|
\g-n relative reference by number
|
||||||
|
\g{+n} relative reference by number (PCRE2 extension)
|
||||||
\g{-n} relative reference by number
|
\g{-n} relative reference by number
|
||||||
\k<name> reference by name (Perl)
|
\k<name> reference by name (Perl)
|
||||||
\k'name' reference by name (Perl)
|
\k'name' reference by name (Perl)
|
||||||
|
@ -9625,14 +9748,18 @@ CONDITIONAL PATTERNS
|
||||||
(?(-n) relative reference condition
|
(?(-n) relative reference condition
|
||||||
(?(<name>) named reference condition (Perl)
|
(?(<name>) named reference condition (Perl)
|
||||||
(?('name') named reference condition (Perl)
|
(?('name') named reference condition (Perl)
|
||||||
(?(name) named reference condition (PCRE2)
|
(?(name) named reference condition (PCRE2, deprecated)
|
||||||
(?(R) overall recursion condition
|
(?(R) overall recursion condition
|
||||||
(?(Rn) specific group recursion condition
|
(?(Rn) specific numbered group recursion condition
|
||||||
(?(R&name) specific recursion condition
|
(?(R&name) specific named group recursion condition
|
||||||
(?(DEFINE) define subpattern for reference
|
(?(DEFINE) define subpattern for reference
|
||||||
(?(VERSION[>]=n.m) test PCRE2 version
|
(?(VERSION[>]=n.m) test PCRE2 version
|
||||||
(?(assert) assertion condition
|
(?(assert) assertion condition
|
||||||
|
|
||||||
|
Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
||||||
|
conditions or recursion tests. Such a condition is interpreted as a
|
||||||
|
reference condition if the relevant named group exists.
|
||||||
|
|
||||||
|
|
||||||
BACKTRACKING CONTROL
|
BACKTRACKING CONTROL
|
||||||
|
|
||||||
|
@ -9684,8 +9811,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 16 October 2015
|
Last updated: 28 September 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2_CODE_COPY 3 "26 February 2016" "PCRE2 10.22"
|
.TH PCRE2_CODE_COPY 3 "22 November 2016" "PCRE2 10.23"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
|
@ -16,8 +16,9 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
This function makes a copy of the memory used for a compiled pattern, excluding
|
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||||
any memory used by the JIT compiler. Without a subsequent call to
|
any memory used by the JIT compiler. Without a subsequent call to
|
||||||
\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. The
|
\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. The
|
||||||
yield of the function is NULL if \fIcode\fP is NULL or if sufficient memory
|
pointer to the character tables is copied, not the tables themselves (see
|
||||||
cannot be obtained.
|
\fBpcre2_code_copy_with_tables()\fP). The yield of the function is NULL if
|
||||||
|
\fIcode\fP is NULL or if sufficient memory cannot be obtained.
|
||||||
.P
|
.P
|
||||||
There is a complete description of the PCRE2 native API in the
|
There is a complete description of the PCRE2 native API in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
.TH PCRE2_CODE_COPY_WITH_TABLES 3 "22 November 2016" "PCRE2 10.23"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.B #include <pcre2.h>
|
||||||
|
.PP
|
||||||
|
.nf
|
||||||
|
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||||
|
any memory used by the JIT compiler. Without a subsequent call to
|
||||||
|
\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching.
|
||||||
|
Unlike \fBpcre2_code_copy()\fP, a separate copy of the character tables is also
|
||||||
|
made, with the new code pointing to it. This memory will be automatically freed
|
||||||
|
when \fBpcre2_code_free()\fP is called. The yield of the function is NULL if
|
||||||
|
\fIcode\fP is NULL or if sufficient memory cannot be obtained.
|
||||||
|
.P
|
||||||
|
There is a complete description of the PCRE2 native API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
page and a description of the POSIX API in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2posix\fP
|
||||||
|
.\"
|
||||||
|
page.
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2API 3 "30 September 2016" "PCRE2 10.23"
|
.TH PCRE2API 3 "22 November 2016" "PCRE2 10.23"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.sp
|
.sp
|
||||||
|
@ -235,6 +235,8 @@ document for an overview of all the PCRE2 documentation.
|
||||||
.nf
|
.nf
|
||||||
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
||||||
.sp
|
.sp
|
||||||
|
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||||
|
.sp
|
||||||
.B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP,
|
.B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP,
|
||||||
.B " PCRE2_SIZE \fIbufflen\fP);"
|
.B " PCRE2_SIZE \fIbufflen\fP);"
|
||||||
.sp
|
.sp
|
||||||
|
@ -509,8 +511,9 @@ If JIT is being used, but the JIT compilation is not being done immediately,
|
||||||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||||
required. JIT compilation updates a pointer within the compiled code block, so
|
required. JIT compilation updates a pointer within the compiled code block, so
|
||||||
a thread must gain unique write access to the pointer before calling
|
a thread must gain unique write access to the pointer before calling
|
||||||
\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP can be used
|
\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or
|
||||||
to obtain a private copy of the compiled code.
|
\fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the
|
||||||
|
compiled code.
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SS "Context blocks"
|
.SS "Context blocks"
|
||||||
|
@ -1027,6 +1030,8 @@ zero.
|
||||||
.B void pcre2_code_free(pcre2_code *\fIcode\fP);
|
.B void pcre2_code_free(pcre2_code *\fIcode\fP);
|
||||||
.sp
|
.sp
|
||||||
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
||||||
|
.sp
|
||||||
|
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||||
.fi
|
.fi
|
||||||
.P
|
.P
|
||||||
The \fBpcre2_compile()\fP function compiles a pattern into an internal form.
|
The \fBpcre2_compile()\fP function compiles a pattern into an internal form.
|
||||||
|
@ -1049,9 +1054,24 @@ below),
|
||||||
.\"
|
.\"
|
||||||
the JIT information cannot be copied (because it is position-dependent).
|
the JIT information cannot be copied (because it is position-dependent).
|
||||||
The new copy can initially be used only for non-JIT matching, though it can be
|
The new copy can initially be used only for non-JIT matching, though it can be
|
||||||
passed to \fBpcre2_jit_compile()\fP if required. The \fBpcre2_code_copy()\fP
|
passed to \fBpcre2_jit_compile()\fP if required.
|
||||||
function provides a way for individual threads in a multithreaded application
|
.P
|
||||||
to acquire a private copy of shared compiled code.
|
The \fBpcre2_code_copy()\fP function provides a way for individual threads in a
|
||||||
|
multithreaded application to acquire a private copy of shared compiled code.
|
||||||
|
However, it does not make a copy of the character tables used by the compiled
|
||||||
|
pattern; the new pattern code points to the same tables as the original code.
|
||||||
|
(See
|
||||||
|
.\" HTML <a href="#localesupport">
|
||||||
|
.\" </a>
|
||||||
|
"Locale Support"
|
||||||
|
.\"
|
||||||
|
below for details of these character tables.) In many applications the same
|
||||||
|
tables are used throughout, so this behaviour is appropriate. Nevertheless,
|
||||||
|
there are occasions when a copy of a compiled pattern and the relevant tables
|
||||||
|
are needed. The \fBpcre2_code_copy_with_tables()\fP function provides this facility.
|
||||||
|
Copies of both the code and the tables are made, with the new code pointing to
|
||||||
|
the new tables. The memory for the new tables is automatically freed when
|
||||||
|
\fBpcre2_code_free()\fP is called for the new copy of the compiled code.
|
||||||
.P
|
.P
|
||||||
NOTE: When one of the matching functions is called, pointers to the compiled
|
NOTE: When one of the matching functions is called, pointers to the compiled
|
||||||
pattern and the subject string are set in the match data block so that they can
|
pattern and the subject string are set in the match data block so that they can
|
||||||
|
@ -3299,6 +3319,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 30 September 2016
|
Last updated: 22 November 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -51,11 +51,20 @@ DESCRIPTION
|
||||||
boundary is controlled by the -N (--newline) option.
|
boundary is controlled by the -N (--newline) option.
|
||||||
|
|
||||||
The amount of memory used for buffering files that are being scanned is
|
The amount of memory used for buffering files that are being scanned is
|
||||||
controlled by a parameter that can be set by the --buffer-size option.
|
controlled by parameters that can be set by the --buffer-size and
|
||||||
The default value for this parameter is specified when pcre2grep is
|
--max-buffer-size options. The first of these sets the size of buffer
|
||||||
built, with the default default being 20K. A block of memory three
|
that is obtained at the start of processing. If an input file contains
|
||||||
times this size is used (to allow for buffering "before" and "after"
|
very long lines, a larger buffer may be needed; this is handled by
|
||||||
lines). An error occurs if a line overflows the buffer.
|
automatically extending the buffer, up to the limit specified by --max-
|
||||||
|
buffer-size. The default values for these parameters are specified when
|
||||||
|
pcre2grep is built, with the default defaults being 20K and 1M respec-
|
||||||
|
tively. An error occurs if a line is too long and the buffer can no
|
||||||
|
longer be expanded.
|
||||||
|
|
||||||
|
The block of memory that is actually used is three times the "buffer
|
||||||
|
size", to allow for buffering "before" and "after" lines. If the buffer
|
||||||
|
size is too small, fewer than requested "before" and "after" lines may
|
||||||
|
be output.
|
||||||
|
|
||||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the
|
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the
|
||||||
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
||||||
|
@ -126,28 +135,31 @@ OPTIONS
|
||||||
names that start with hyphens.
|
names that start with hyphens.
|
||||||
|
|
||||||
-A number, --after-context=number
|
-A number, --after-context=number
|
||||||
Output number lines of context after each matching line. If
|
Output up to number lines of context after each matching
|
||||||
file names and/or line numbers are being output, a hyphen
|
line. Fewer lines are output if the next match or the end of
|
||||||
separator is used instead of a colon for the context lines. A
|
the file is reached, or if the processing buffer size has
|
||||||
line containing "--" is output between each group of lines,
|
been set too small. If file names and/or line numbers are
|
||||||
unless they are in fact contiguous in the input file. The
|
being output, a hyphen separator is used instead of a colon
|
||||||
value of number is expected to be relatively small. However,
|
for the context lines. A line containing "--" is output
|
||||||
pcre2grep guarantees to have up to 8K of following text
|
between each group of lines, unless they are in fact contigu-
|
||||||
available for context output.
|
ous in the input file. The value of number is expected to be
|
||||||
|
relatively small. When -c is used, -A is ignored.
|
||||||
|
|
||||||
-a, --text
|
-a, --text
|
||||||
Treat binary files as text. This is equivalent to --binary-
|
Treat binary files as text. This is equivalent to --binary-
|
||||||
files=text.
|
files=text.
|
||||||
|
|
||||||
-B number, --before-context=number
|
-B number, --before-context=number
|
||||||
Output number lines of context before each matching line. If
|
Output up to number lines of context before each matching
|
||||||
file names and/or line numbers are being output, a hyphen
|
line. Fewer lines are output if the previous match or the
|
||||||
separator is used instead of a colon for the context lines. A
|
start of the file is within number lines, or if the process-
|
||||||
line containing "--" is output between each group of lines,
|
ing buffer size has been set too small. If file names and/or
|
||||||
unless they are in fact contiguous in the input file. The
|
line numbers are being output, a hyphen separator is used
|
||||||
value of number is expected to be relatively small. However,
|
instead of a colon for the context lines. A line containing
|
||||||
pcre2grep guarantees to have up to 8K of preceding text
|
"--" is output between each group of lines, unless they are
|
||||||
available for context output.
|
in fact contiguous in the input file. The value of number is
|
||||||
|
expected to be relatively small. When -c is used, -B is
|
||||||
|
ignored.
|
||||||
|
|
||||||
--binary-files=word
|
--binary-files=word
|
||||||
Specify how binary files are to be processed. If the word is
|
Specify how binary files are to be processed. If the word is
|
||||||
|
@ -164,8 +176,9 @@ OPTIONS
|
||||||
any output or affecting the return code.
|
any output or affecting the return code.
|
||||||
|
|
||||||
--buffer-size=number
|
--buffer-size=number
|
||||||
Set the parameter that controls how much memory is used for
|
Set the parameter that controls how much memory is obtained
|
||||||
buffering files that are being scanned.
|
at the start of processing for buffering files that are being
|
||||||
|
scanned. See also --max-buffer-size below.
|
||||||
|
|
||||||
-C number, --context=number
|
-C number, --context=number
|
||||||
Output number lines of context both before and after each
|
Output number lines of context both before and after each
|
||||||
|
@ -174,19 +187,21 @@ OPTIONS
|
||||||
|
|
||||||
-c, --count
|
-c, --count
|
||||||
Do not output lines from the files that are being scanned;
|
Do not output lines from the files that are being scanned;
|
||||||
instead output the number of matches (or non-matches if -v is
|
instead output the number of lines that would have been
|
||||||
used) that would otherwise have caused lines to be shown. By
|
shown, either because they matched, or, if -v is set, because
|
||||||
default, this count is the same as the number of suppressed
|
they failed to match. By default, this count is exactly the
|
||||||
lines, but if the -M (multiline) option is used (without -v),
|
same as the number of lines that would have been output, but
|
||||||
there may be more suppressed lines than the number of
|
if the -M (multiline) option is used (without -v), there may
|
||||||
matches.
|
be more suppressed lines than the count (that is, the number
|
||||||
|
of matches).
|
||||||
|
|
||||||
If no lines are selected, the number zero is output. If sev-
|
If no lines are selected, the number zero is output. If sev-
|
||||||
eral files are being scanned, a count is output for each
|
eral files are being scanned, a count is output for each
|
||||||
of them. However, if the --files-with-matches option is also
|
of them and the -t option can be used to cause a total to be
|
||||||
used, only those files whose counts are greater than zero are
|
output at the end. However, if the --files-with-matches
|
||||||
listed. When -c is used, the -A, -B, and -C options are
|
option is also used, only those files whose counts are
|
||||||
ignored.
|
greater than zero are listed. When -c is used, the -A, -B,
|
||||||
|
and -C options are ignored.
|
||||||
|
|
||||||
--colour, --color
|
--colour, --color
|
||||||
If this option is given without any data, it is equivalent to
|
If this option is given without any data, it is equivalent to
|
||||||
|
@ -205,13 +220,14 @@ OPTIONS
|
||||||
them all.
|
them all.
|
||||||
|
|
||||||
The colour that is used can be specified by setting the envi-
|
The colour that is used can be specified by setting the envi-
|
||||||
ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The
|
ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. If nei-
|
||||||
value of this variable should be a string of two numbers,
|
ther of these is set, pcre2grep looks for GREP_COLOUR or
|
||||||
separated by a semicolon. They are copied directly into the
|
GREP_COLOR. The value of the variable should be a string of
|
||||||
control string for setting colour on a terminal, so it is
|
two numbers, separated by a semicolon. They are copied
|
||||||
your responsibility to ensure that they make sense. If nei-
|
directly into the control string for setting colour on a ter-
|
||||||
ther of the environment variables is set, the default is
|
minal, so it is your responsibility to ensure that they make
|
||||||
"1;31", which gives red.
|
sense. If neither of the environment variables is set, the
|
||||||
|
default is "1;31", which gives red.
|
||||||
|
|
||||||
-D action, --devices=action
|
-D action, --devices=action
|
||||||
If an input path is not a regular file or a directory,
|
If an input path is not a regular file or a directory,
|
||||||
|
@ -299,12 +315,12 @@ OPTIONS
|
||||||
Read patterns from the file, one per line, and match them
|
Read patterns from the file, one per line, and match them
|
||||||
against each line of input. What constitutes a newline when
|
against each line of input. What constitutes a newline when
|
||||||
reading the file is the operating system's default. The
|
reading the file is the operating system's default. The
|
||||||
--newline option has no effect on this option. Trailing white
|
--newline option has no effect on this option. Trailing
|
||||||
space is removed from each line, and blank lines are ignored.
|
white space is removed from each line, and blank lines are
|
||||||
An empty file contains no patterns and therefore matches
|
ignored. An empty file contains no patterns and therefore
|
||||||
nothing. See also the comments about multiple patterns versus
|
matches nothing. See also the comments about multiple pat-
|
||||||
a single pattern with alternatives in the description of -e
|
terns versus a single pattern with alternatives in the
|
||||||
above.
|
description of -e above.
|
||||||
|
|
||||||
If this option is given more than once, all the specified
|
If this option is given more than once, all the specified
|
||||||
files are read. A data line is output if any of the patterns
|
files are read. A data line is output if any of the patterns
|
||||||
|
@ -482,25 +498,27 @@ OPTIONS
|
||||||
tings are specified when the PCRE2 library is compiled, with
|
tings are specified when the PCRE2 library is compiled, with
|
||||||
the default default being 10 million.
|
the default default being 10 million.
|
||||||
|
|
||||||
|
--max-buffer-size=number
|
||||||
|
This limits the expansion of the processing buffer, whose
|
||||||
|
initial size can be set by --buffer-size. The maximum buffer
|
||||||
|
size is silently forced to be no smaller than the starting
|
||||||
|
buffer size.
|
||||||
|
|
||||||
-M, --multiline
|
-M, --multiline
|
||||||
Allow patterns to match more than one line. When this option
|
Allow patterns to match more than one line. When this option
|
||||||
is given, patterns may usefully contain literal newline char-
|
is set, the PCRE2 library is called in "multiline" mode. This
|
||||||
acters and internal occurrences of ^ and $ characters. The
|
allows a matched string to extend past the end of a line and
|
||||||
output for a successful match may consist of more than one
|
continue on one or more subsequent lines. Patterns used with
|
||||||
line. The first is the line in which the match started, and
|
-M may usefully contain literal newline characters and inter-
|
||||||
the last is the line in which the match ended. If the matched
|
nal occurrences of ^ and $ characters. The output for a suc-
|
||||||
string ends with a newline sequence the output ends at the
|
cessful match may consist of more than one line. The first
|
||||||
end of that line.
|
line is the line in which the match started, and the last
|
||||||
|
line is the line in which the match ended. If the matched
|
||||||
When this option is set, the PCRE2 library is called in "mul-
|
string ends with a newline sequence, the output ends at the
|
||||||
tiline" mode. This allows a matched string to extend past the
|
end of that line. If -v is set, none of the lines in a
|
||||||
end of a line and continue on one or more subsequent lines.
|
multi-line match are output. Once a match has been handled,
|
||||||
However, pcre2grep still processes the input line by line.
|
scanning restarts at the beginning of the line after the one
|
||||||
Once a match has been handled, scanning restarts at the
|
in which the match ended.
|
||||||
beginning of the next line, just as it does when -M is not
|
|
||||||
present. This means that it is possible for the second or
|
|
||||||
subsequent lines in a multiline match to be output again as
|
|
||||||
part of another match.
|
|
||||||
|
|
||||||
The newline sequence that separates multiple lines must be
|
The newline sequence that separates multiple lines must be
|
||||||
matched as part of the pattern. For example, to find the
|
matched as part of the pattern. For example, to find the
|
||||||
|
@ -517,13 +535,9 @@ OPTIONS
|
||||||
|
|
||||||
There is a limit to the number of lines that can be matched,
|
There is a limit to the number of lines that can be matched,
|
||||||
imposed by the way that pcre2grep buffers the input file as
|
imposed by the way that pcre2grep buffers the input file as
|
||||||
it scans it. However, pcre2grep ensures that at least 8K
|
it scans it. With a sufficiently large processing buffer,
|
||||||
characters or the rest of the file (whichever is the shorter)
|
this should not be a problem, but the -M option does not work
|
||||||
are available for forward matching, and similarly the previ-
|
when input is read line by line (see --line-buffered).
|
||||||
ous 8K characters (or all the previous characters, if fewer
|
|
||||||
than 8K) are guaranteed to be available for lookbehind asser-
|
|
||||||
tions. The -M option does not work when input is read line by
|
|
||||||
line (see --line-buffered).
|
|
||||||
|
|
||||||
-N newline-type, --newline=newline-type
|
-N newline-type, --newline=newline-type
|
||||||
The PCRE2 library supports five different conventions for
|
The PCRE2 library supports five different conventions for
|
||||||
|
@ -570,14 +584,15 @@ OPTIONS
|
||||||
Show only the part of the line that matched a pattern instead
|
Show only the part of the line that matched a pattern instead
|
||||||
of the whole line. In this mode, no context is shown. That
|
of the whole line. In this mode, no context is shown. That
|
||||||
is, the -A, -B, and -C options are ignored. If there is more
|
is, the -A, -B, and -C options are ignored. If there is more
|
||||||
than one match in a line, each of them is shown separately.
|
than one match in a line, each of them is shown separately,
|
||||||
If -o is combined with -v (invert the sense of the match to
|
on a separate line of output. If -o is combined with -v
|
||||||
find non-matching lines), no output is generated, but the
|
(invert the sense of the match to find non-matching lines),
|
||||||
return code is set appropriately. If the matched portion of
|
no output is generated, but the return code is set appropri-
|
||||||
the line is empty, nothing is output unless the file name or
|
ately. If the matched portion of the line is empty, nothing
|
||||||
line number are being printed, in which case they are shown
|
is output unless the file name or line number are being
|
||||||
on an otherwise empty line. This option is mutually exclusive
|
printed, in which case they are shown on an otherwise empty
|
||||||
with --file-offsets and --line-offsets.
|
line. This option is mutually exclusive with --file-offsets
|
||||||
|
and --line-offsets.
|
||||||
|
|
||||||
-onumber, --only-matching=number
|
-onumber, --only-matching=number
|
||||||
Show only the part of the line that matched the capturing
|
Show only the part of the line that matched the capturing
|
||||||
|
@ -593,10 +608,11 @@ OPTIONS
|
||||||
put.
|
put.
|
||||||
|
|
||||||
If this option is given multiple times, multiple substrings
|
If this option is given multiple times, multiple substrings
|
||||||
are output, in the order the options are given. For example,
|
are output for each match, in the order the options are
|
||||||
-o3 -o1 -o3 causes the substrings matched by capturing paren-
|
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||||
theses 3 and 1 and then 3 again to be output. By default,
|
the substrings matched by capturing parentheses 3 and 1 and
|
||||||
there is no separator (but see the next option).
|
then 3 again to be output. By default, there is no separator
|
||||||
|
(but see the next option).
|
||||||
|
|
||||||
--om-separator=text
|
--om-separator=text
|
||||||
Specify a separating string for multiple occurrences of -o.
|
Specify a separating string for multiple occurrences of -o.
|
||||||
|
@ -624,6 +640,19 @@ OPTIONS
|
||||||
files. Such files are quietly skipped. However, the return
|
files. Such files are quietly skipped. However, the return
|
||||||
code is still 2, even if matches were found in other files.
|
code is still 2, even if matches were found in other files.
|
||||||
|
|
||||||
|
-t, --total-count
|
||||||
|
This option is useful when scanning more than one file. If
|
||||||
|
used on its own, -t suppresses all output except for a grand
|
||||||
|
total number of matching lines (or non-matching lines if -v
|
||||||
|
is used) in all the files. If -t is used with -c, a grand
|
||||||
|
total is output except when the previous output is just one
|
||||||
|
line. In other words, it is not output when just one file's
|
||||||
|
count is listed. If file names are being output, the grand
|
||||||
|
total is preceded by "TOTAL:". Otherwise, it appears as just
|
||||||
|
another number. The -t option is ignored when used with -L
|
||||||
|
(list files without matches), because the grand total would
|
||||||
|
always be zero.
|
||||||
|
|
||||||
-u, --utf-8
|
-u, --utf-8
|
||||||
Operate in UTF-8 mode. This option is available only if PCRE2
|
Operate in UTF-8 mode. This option is available only if PCRE2
|
||||||
has been compiled with UTF-8 support. All patterns (including
|
has been compiled with UTF-8 support. All patterns (including
|
||||||
|
@ -650,8 +679,9 @@ OPTIONS
|
||||||
-x, --line-regex, --line-regexp
|
-x, --line-regex, --line-regexp
|
||||||
Force the patterns to be anchored (each must start matching
|
Force the patterns to be anchored (each must start matching
|
||||||
at the beginning of a line) and in addition, require them to
|
at the beginning of a line) and in addition, require them to
|
||||||
match entire lines. This is equivalent to having ^ and $
|
match entire lines. In multiline mode the match may be more
|
||||||
characters at the start and end of each alternative top-level
|
than one line. This is equivalent to having \A and \Z charac-
|
||||||
|
ters at the start and end of each alternative top-level
|
||||||
branch in every pattern. This option applies only to the pat-
|
branch in every pattern. This option applies only to the pat-
|
||||||
terns that are matched against the contents of files; it does
|
terns that are matched against the contents of files; it does
|
||||||
not apply to patterns specified by any of the --include or
|
not apply to patterns specified by any of the --include or
|
||||||
|
@ -822,5 +852,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 19 June 2016
|
Last updated: 31 October 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
|
|
|
@ -558,6 +558,7 @@ PATTERN MODIFIERS
|
||||||
pushcopy push a copy onto the stack
|
pushcopy push a copy onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
use_length do not zero-terminate the pattern
|
||||||
utf8_input treat input as UTF-8
|
utf8_input treat input as UTF-8
|
||||||
|
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
@ -631,6 +632,16 @@ PATTERN MODIFIERS
|
||||||
testing that pcre2_compile() behaves correctly in this case (it uses
|
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
|
|
||||||
|
Specifying the pattern's length
|
||||||
|
|
||||||
|
By default, patterns are passed to the compiling functions as zero-ter-
|
||||||
|
minated strings. When using the POSIX wrapper API, there is no other
|
||||||
|
option. However, when using PCRE2's native API, patterns can be passed
|
||||||
|
by length instead of being zero-terminated. The use_length modifier
|
||||||
|
causes this to happen. Using a length happens automatically (whether
|
||||||
|
or not use_length is set) when hex is set, because patterns specified
|
||||||
|
in hexadecimal may contain binary zeros.
|
||||||
|
|
||||||
Specifying pattern characters in hexadecimal
|
Specifying pattern characters in hexadecimal
|
||||||
|
|
||||||
The hex modifier specifies that the characters of the pattern, except
|
The hex modifier specifies that the characters of the pattern, except
|
||||||
|
@ -652,10 +663,11 @@ PATTERN MODIFIERS
|
||||||
ing the delimiter within a substring. The hex and expand modifiers are
|
ing the delimiter within a substring. The hex and expand modifiers are
|
||||||
mutually exclusive.
|
mutually exclusive.
|
||||||
|
|
||||||
By default, pcre2test passes patterns as zero-terminated strings to
|
The POSIX API cannot be used with patterns specified in hexadecimal
|
||||||
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
|
because they may contain binary zeros, which conflicts with regcomp()'s
|
||||||
for patterns specified with the hex modifier, the actual length of the
|
requirement for a zero-terminated string. Such patterns are always
|
||||||
pattern is passed.
|
passed to pcre2_compile() as a string with a length, not as zero-termi-
|
||||||
|
nated.
|
||||||
|
|
||||||
Specifying wide characters in 16-bit and 32-bit modes
|
Specifying wide characters in 16-bit and 32-bit modes
|
||||||
|
|
||||||
|
@ -1589,5 +1601,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 02 August 2016
|
Last updated: 04 November 2016
|
||||||
Copyright (c) 1997-2016 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
|
|
|
@ -465,7 +465,9 @@ PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||||
pcre2_code_free(pcre2_code *); \
|
pcre2_code_free(pcre2_code *); \
|
||||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
*pcre2_code_copy(const pcre2_code *);
|
*pcre2_code_copy(const pcre2_code *); \
|
||||||
|
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
|
*pcre2_code_copy_with_tables(const pcre2_code *);
|
||||||
|
|
||||||
|
|
||||||
/* Functions that give information about a compiled pattern. */
|
/* Functions that give information about a compiled pattern. */
|
||||||
|
@ -629,6 +631,7 @@ pcre2_compile are called by application code. */
|
||||||
|
|
||||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||||
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
||||||
|
#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_)
|
||||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||||
|
|
|
@ -465,7 +465,9 @@ PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||||
pcre2_code_free(pcre2_code *); \
|
pcre2_code_free(pcre2_code *); \
|
||||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
*pcre2_code_copy(const pcre2_code *);
|
*pcre2_code_copy(const pcre2_code *); \
|
||||||
|
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||||
|
*pcre2_code_copy_with_tables(const pcre2_code *);
|
||||||
|
|
||||||
|
|
||||||
/* Functions that give information about a compiled pattern. */
|
/* Functions that give information about a compiled pattern. */
|
||||||
|
@ -629,6 +631,7 @@ pcre2_compile are called by application code. */
|
||||||
|
|
||||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||||
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
||||||
|
#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_)
|
||||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||||
|
|
|
@ -1042,6 +1042,45 @@ return newcode;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*************************************************
|
||||||
|
* Copy compiled code and character tables *
|
||||||
|
*************************************************/
|
||||||
|
|
||||||
|
/* Compiled JIT code cannot be copied, so the new compiled block has no
|
||||||
|
associated JIT data. This version of code_copy also makes a separate copy of
|
||||||
|
the character tables. */
|
||||||
|
|
||||||
|
/* Make a private copy of a compiled pattern together with a private copy of
its character tables.

Argument:  code   the compiled pattern to copy; may be NULL
Returns:   pointer to the new compiled block, or NULL if code is NULL or if
           either memory allocation fails

Both allocations use the memory functions recorded in the original block's
memctl. The new block never carries JIT data (executable_jit is cleared). */
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
|
||||||
|
pcre2_code_copy_with_tables(const pcre2_code *code)
|
||||||
|
{
|
||||||
|
PCRE2_SIZE* ref_count;
|
||||||
|
pcre2_code *newcode;
|
||||||
|
uint8_t *newtables;
|
||||||
|
|
||||||
|
/* Nothing to copy. */
if (code == NULL) return NULL;
|
||||||
|
/* Duplicate the whole compiled block (blocksize bytes) byte for byte. */
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
|
||||||
|
if (newcode == NULL) return NULL;
|
||||||
|
memcpy(newcode, code, code->blocksize);
|
||||||
|
/* The memcpy above also copied the JIT pointer; drop it so the copy does not
refer to the original's JIT data. */
newcode->executable_jit = NULL;
|
||||||
|
|
||||||
|
/* Allocate room for the tables plus one trailing PCRE2_SIZE that holds a
reference count. */
newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
|
||||||
|
code->memctl.memory_data);
|
||||||
|
if (newtables == NULL)
|
||||||
|
{
|
||||||
|
/* Release the already-allocated code copy so that nothing leaks. */
code->memctl.free((void *)newcode, code->memctl.memory_data);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
memcpy(newtables, code->tables, tables_length);
|
||||||
|
/* The reference count lives immediately after the table bytes; the new
tables start with exactly one user, namely the new code block. */
ref_count = (PCRE2_SIZE *)(newtables + tables_length);
|
||||||
|
*ref_count = 1;
|
||||||
|
|
||||||
|
/* Point the copy at its own tables. NOTE(review): PCRE2_DEREF_TABLES
presumably marks the tables as owned/reference-counted so that
pcre2_code_free() releases them - confirm against pcre2_code_free(). */
newcode->tables = newtables;
|
||||||
|
newcode->flags |= PCRE2_DEREF_TABLES;
|
||||||
|
return newcode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*************************************************
|
/*************************************************
|
||||||
* Free compiled code *
|
* Free compiled code *
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
|
@ -427,15 +427,13 @@ so many of them that they are split into two fields. */
|
||||||
#define CTL_NULLCONTEXT 0x00200000u
|
#define CTL_NULLCONTEXT 0x00200000u
|
||||||
#define CTL_POSIX 0x00400000u
|
#define CTL_POSIX 0x00400000u
|
||||||
#define CTL_POSIX_NOSUB 0x00800000u
|
#define CTL_POSIX_NOSUB 0x00800000u
|
||||||
#define CTL_PUSH 0x01000000u
|
#define CTL_PUSH 0x01000000u /* These three must be */
|
||||||
#define CTL_PUSHCOPY 0x02000000u
|
#define CTL_PUSHCOPY 0x02000000u /* all in the same */
|
||||||
#define CTL_STARTCHAR 0x04000000u
|
#define CTL_PUSHTABLESCOPY 0x04000000u /* word. */
|
||||||
#define CTL_USE_LENGTH 0x08000000u /* Same word as HEXPAT */
|
#define CTL_STARTCHAR 0x08000000u
|
||||||
#define CTL_UTF8_INPUT 0x10000000u
|
#define CTL_USE_LENGTH 0x10000000u /* Same word as HEXPAT */
|
||||||
#define CTL_ZERO_TERMINATE 0x20000000u
|
#define CTL_UTF8_INPUT 0x20000000u
|
||||||
|
#define CTL_ZERO_TERMINATE 0x40000000u
|
||||||
#define CTL_NL_SET 0x40000000u /* Informational */
|
|
||||||
#define CTL_BSR_SET 0x80000000u /* Informational */
|
|
||||||
|
|
||||||
/* Second control word */
|
/* Second control word */
|
||||||
|
|
||||||
|
@ -444,6 +442,9 @@ so many of them that they are split into two fields. */
|
||||||
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u
|
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u
|
||||||
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u
|
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u
|
||||||
|
|
||||||
|
#define CTL_NL_SET 0x40000000u /* Informational */
|
||||||
|
#define CTL_BSR_SET 0x80000000u /* Informational */
|
||||||
|
|
||||||
/* Combinations */
|
/* Combinations */
|
||||||
|
|
||||||
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
||||||
|
@ -608,6 +609,7 @@ static modstruct modlist[] = {
|
||||||
{ "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) },
|
{ "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) },
|
||||||
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
|
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
|
||||||
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
|
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
|
||||||
|
{ "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) },
|
||||||
{ "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) },
|
{ "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) },
|
||||||
{ "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) },
|
{ "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) },
|
||||||
{ "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) },
|
{ "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) },
|
||||||
|
@ -651,10 +653,10 @@ static modstruct modlist[] = {
|
||||||
|
|
||||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
||||||
CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
|
CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
|
||||||
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY|CTL_BSR_SET|CTL_NL_SET| \
|
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY| \
|
||||||
CTL_USE_LENGTH)
|
CTL_USE_LENGTH)
|
||||||
|
|
||||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (0)
|
#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (CTL_BSR_SET|CTL_NL_SET)
|
||||||
|
|
||||||
/* Controls that apply only at compile time with 'push'. */
|
/* Controls that apply only at compile time with 'push'. */
|
||||||
|
|
||||||
|
@ -664,7 +666,7 @@ static modstruct modlist[] = {
|
||||||
/* Controls that are forbidden with #pop or #popcopy. */
|
/* Controls that are forbidden with #pop or #popcopy. */
|
||||||
|
|
||||||
#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_POSIX_NOSUB|CTL_PUSH| \
|
#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_POSIX_NOSUB|CTL_PUSH| \
|
||||||
CTL_PUSHCOPY|CTL_USE_LENGTH)
|
CTL_PUSHCOPY|CTL_PUSHTABLESCOPY|CTL_USE_LENGTH)
|
||||||
|
|
||||||
/* Pattern controls that are mutually exclusive. At present these are all in
|
/* Pattern controls that are mutually exclusive. At present these are all in
|
||||||
the first control word. Note that CTL_POSIX_NOSUB is always accompanied by
|
the first control word. Note that CTL_POSIX_NOSUB is always accompanied by
|
||||||
|
@ -674,6 +676,7 @@ static uint32_t exclusive_pat_controls[] = {
|
||||||
CTL_POSIX | CTL_HEXPAT,
|
CTL_POSIX | CTL_HEXPAT,
|
||||||
CTL_POSIX | CTL_PUSH,
|
CTL_POSIX | CTL_PUSH,
|
||||||
CTL_POSIX | CTL_PUSHCOPY,
|
CTL_POSIX | CTL_PUSHCOPY,
|
||||||
|
CTL_POSIX | CTL_PUSHTABLESCOPY,
|
||||||
CTL_POSIX | CTL_USE_LENGTH,
|
CTL_POSIX | CTL_USE_LENGTH,
|
||||||
CTL_EXPAND | CTL_HEXPAT };
|
CTL_EXPAND | CTL_HEXPAT };
|
||||||
|
|
||||||
|
@ -973,6 +976,14 @@ are supported. */
|
||||||
else \
|
else \
|
||||||
a = (void *)pcre2_code_copy_32(G(b,32))
|
a = (void *)pcre2_code_copy_32(G(b,32))
|
||||||
|
|
||||||
|
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \
|
||||||
|
if (test_mode == PCRE8_MODE) \
|
||||||
|
a = (void *)pcre2_code_copy_with_tables_8(G(b,8)); \
|
||||||
|
else if (test_mode == PCRE16_MODE) \
|
||||||
|
a = (void *)pcre2_code_copy_with_tables_16(G(b,16)); \
|
||||||
|
else \
|
||||||
|
a = (void *)pcre2_code_copy_with_tables_32(G(b,32))
|
||||||
|
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
if (test_mode == PCRE8_MODE) \
|
if (test_mode == PCRE8_MODE) \
|
||||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g); \
|
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g); \
|
||||||
|
@ -1436,6 +1447,12 @@ the three different cases. */
|
||||||
else \
|
else \
|
||||||
a = (void *)G(pcre2_code_copy_,BITTWO)(G(b,BITTWO))
|
a = (void *)G(pcre2_code_copy_,BITTWO)(G(b,BITTWO))
|
||||||
|
|
||||||
|
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \
|
||||||
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
|
a = (void *)G(pcre2_code_copy_with_tables_,BITONE)(G(b,BITONE)); \
|
||||||
|
else \
|
||||||
|
a = (void *)G(pcre2_code_copy_with_tables_,BITTWO)(G(b,BITTWO))
|
||||||
|
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||||
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,g); \
|
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,g); \
|
||||||
|
@ -1773,6 +1790,7 @@ the three different cases. */
|
||||||
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c)
|
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c)
|
||||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,8) = pcre2_code_copy_8(b)
|
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,8) = pcre2_code_copy_8(b)
|
||||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_8(G(b,8))
|
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_8(G(b,8))
|
||||||
|
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_8(G(b,8))
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g)
|
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g)
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -1868,6 +1886,7 @@ the three different cases. */
|
||||||
(int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c)
|
(int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c)
|
||||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,16) = pcre2_code_copy_16(b)
|
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,16) = pcre2_code_copy_16(b)
|
||||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_16(G(b,16))
|
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_16(G(b,16))
|
||||||
|
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_16(G(b,16))
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,g)
|
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,g)
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -1963,6 +1982,7 @@ the three different cases. */
|
||||||
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,32) = pcre2_code_copy_32(b)
|
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,32) = pcre2_code_copy_32(b)
|
||||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_32(G(b,32))
|
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_32(G(b,32))
|
||||||
|
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_32(G(b,32))
|
||||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||||
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,g)
|
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,g)
|
||||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||||
|
@ -3435,8 +3455,8 @@ for (;;)
|
||||||
#else
|
#else
|
||||||
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
||||||
#endif
|
#endif
|
||||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_BSR_SET;
|
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_BSR_SET;
|
||||||
else dctl->control &= ~CTL_BSR_SET;
|
else dctl->control2 &= ~CTL_BSR_SET;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -3445,8 +3465,8 @@ for (;;)
|
||||||
else if (len == 7 && strncmpic(pp, (const uint8_t *)"unicode", 7) == 0)
|
else if (len == 7 && strncmpic(pp, (const uint8_t *)"unicode", 7) == 0)
|
||||||
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
||||||
else goto INVALID_VALUE;
|
else goto INVALID_VALUE;
|
||||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_BSR_SET;
|
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_BSR_SET;
|
||||||
else dctl->control |= CTL_BSR_SET;
|
else dctl->control2 |= CTL_BSR_SET;
|
||||||
}
|
}
|
||||||
pp = ep;
|
pp = ep;
|
||||||
break;
|
break;
|
||||||
|
@ -3513,14 +3533,14 @@ for (;;)
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
{
|
{
|
||||||
*((uint16_t *)field) = NEWLINE_DEFAULT;
|
*((uint16_t *)field) = NEWLINE_DEFAULT;
|
||||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_NL_SET;
|
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_NL_SET;
|
||||||
else dctl->control &= ~CTL_NL_SET;
|
else dctl->control2 &= ~CTL_NL_SET;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
*((uint16_t *)field) = i;
|
*((uint16_t *)field) = i;
|
||||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_NL_SET;
|
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_NL_SET;
|
||||||
else dctl->control |= CTL_NL_SET;
|
else dctl->control2 |= CTL_NL_SET;
|
||||||
}
|
}
|
||||||
pp = ep;
|
pp = ep;
|
||||||
break;
|
break;
|
||||||
|
@ -3691,7 +3711,7 @@ Returns: nothing
|
||||||
static void
|
static void
|
||||||
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
||||||
{
|
{
|
||||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
before,
|
before,
|
||||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||||
|
@ -3699,7 +3719,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
||||||
((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "",
|
((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "",
|
||||||
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
||||||
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
||||||
((controls & CTL_BSR_SET) != 0)? " bsr" : "",
|
((controls2 & CTL_BSR_SET) != 0)? " bsr" : "",
|
||||||
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
||||||
((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "",
|
((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "",
|
||||||
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
||||||
|
@ -3715,12 +3735,13 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
||||||
((controls & CTL_JITVERIFY) != 0)? " jitverify" : "",
|
((controls & CTL_JITVERIFY) != 0)? " jitverify" : "",
|
||||||
((controls & CTL_MARK) != 0)? " mark" : "",
|
((controls & CTL_MARK) != 0)? " mark" : "",
|
||||||
((controls & CTL_MEMORY) != 0)? " memory" : "",
|
((controls & CTL_MEMORY) != 0)? " memory" : "",
|
||||||
((controls & CTL_NL_SET) != 0)? " newline" : "",
|
((controls2 & CTL_NL_SET) != 0)? " newline" : "",
|
||||||
((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "",
|
((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "",
|
||||||
((controls & CTL_POSIX) != 0)? " posix" : "",
|
((controls & CTL_POSIX) != 0)? " posix" : "",
|
||||||
((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "",
|
((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "",
|
||||||
((controls & CTL_PUSH) != 0)? " push" : "",
|
((controls & CTL_PUSH) != 0)? " push" : "",
|
||||||
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
||||||
|
((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "",
|
||||||
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
||||||
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
||||||
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
||||||
|
@ -4061,7 +4082,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
||||||
|
|
||||||
if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
|
if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
|
||||||
|
|
||||||
if ((pat_patctl.control & CTL_BSR_SET) != 0 ||
|
if ((pat_patctl.control2 & CTL_BSR_SET) != 0 ||
|
||||||
(FLD(compiled_code, flags) & PCRE2_BSR_SET) != 0)
|
(FLD(compiled_code, flags) & PCRE2_BSR_SET) != 0)
|
||||||
fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)?
|
fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)?
|
||||||
"any Unicode newline" : "CR, LF, or CRLF");
|
"any Unicode newline" : "CR, LF, or CRLF");
|
||||||
|
@ -4930,7 +4951,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
||||||
/* Handle compiling via the native interface. Controls that act later are
|
/* Handle compiling via the native interface. Controls that act later are
|
||||||
ignored with "push". Replacements are locked out. */
|
ignored with "push". Replacements are locked out. */
|
||||||
|
|
||||||
if ((pat_patctl.control & (CTL_PUSH|CTL_PUSHCOPY)) != 0)
|
if ((pat_patctl.control & (CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0)
|
||||||
{
|
{
|
||||||
if (pat_patctl.replacement[0] != 0)
|
if (pat_patctl.replacement[0] != 0)
|
||||||
{
|
{
|
||||||
|
@ -5031,7 +5052,7 @@ if (test_mode == PCRE32_MODE && pbuffer32 != NULL)
|
||||||
appropriate default newline setting, local_newline_default will be non-zero. We
|
appropriate default newline setting, local_newline_default will be non-zero. We
|
||||||
use this if there is no explicit newline modifier. */
|
use this if there is no explicit newline modifier. */
|
||||||
|
|
||||||
if ((pat_patctl.control & CTL_NL_SET) == 0 && local_newline_default != 0)
|
if ((pat_patctl.control2 & CTL_NL_SET) == 0 && local_newline_default != 0)
|
||||||
{
|
{
|
||||||
SETFLD(pat_context, newline_convention, local_newline_default);
|
SETFLD(pat_context, newline_convention, local_newline_default);
|
||||||
}
|
}
|
||||||
|
@ -5163,7 +5184,7 @@ if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
|
||||||
/* If an explicit newline modifier was given, set the information flag in the
|
/* If an explicit newline modifier was given, set the information flag in the
|
||||||
pattern so that it is preserved over push/pop. */
|
pattern so that it is preserved over push/pop. */
|
||||||
|
|
||||||
if ((pat_patctl.control & CTL_NL_SET) != 0)
|
if ((pat_patctl.control2 & CTL_NL_SET) != 0)
|
||||||
{
|
{
|
||||||
SETFLD(compiled_code, flags, FLD(compiled_code, flags) | PCRE2_NL_SET);
|
SETFLD(compiled_code, flags, FLD(compiled_code, flags) | PCRE2_NL_SET);
|
||||||
}
|
}
|
||||||
|
@ -5191,18 +5212,26 @@ if ((pat_patctl.control & CTL_PUSH) != 0)
|
||||||
SET(compiled_code, NULL);
|
SET(compiled_code, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* The "pushcopy" control is similar, but pushes a copy of the pattern. This
|
/* The "pushcopy" and "pushtablescopy" controls are similar, but push a
|
||||||
tests the pcre2_code_copy() function. */
|
copy of the pattern, the latter with a copy of its character tables. This tests
|
||||||
|
the pcre2_code_copy() and pcre2_code_copy_with_tables() functions. */
|
||||||
|
|
||||||
if ((pat_patctl.control & CTL_PUSHCOPY) != 0)
|
if ((pat_patctl.control & (CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0)
|
||||||
{
|
{
|
||||||
if (patstacknext >= PATSTACKSIZE)
|
if (patstacknext >= PATSTACKSIZE)
|
||||||
{
|
{
|
||||||
fprintf(outfile, "** Too many pushed patterns (max %d)\n", PATSTACKSIZE);
|
fprintf(outfile, "** Too many pushed patterns (max %d)\n", PATSTACKSIZE);
|
||||||
return PR_ABEND;
|
return PR_ABEND;
|
||||||
}
|
}
|
||||||
|
if ((pat_patctl.control & CTL_PUSHCOPY) != 0)
|
||||||
|
{
|
||||||
PCRE2_CODE_COPY_TO_VOID(patstack[patstacknext++], compiled_code);
|
PCRE2_CODE_COPY_TO_VOID(patstack[patstacknext++], compiled_code);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(patstack[patstacknext++],
|
||||||
|
compiled_code); }
|
||||||
|
}
|
||||||
|
|
||||||
return PR_OK;
|
return PR_OK;
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,4 +88,13 @@
|
||||||
|
|
||||||
#pop should give an error
|
#pop should give an error
|
||||||
|
|
||||||
|
/abcd/pushtablescopy
|
||||||
|
abcd
|
||||||
|
|
||||||
|
#popcopy
|
||||||
|
abcd
|
||||||
|
|
||||||
|
#pop
|
||||||
|
abcd
|
||||||
|
|
||||||
# End of testinput20
|
# End of testinput20
|
||||||
|
|
|
@ -135,4 +135,16 @@ Serialization failed: error -30: patterns do not all use the same character tabl
|
||||||
#pop should give an error
|
#pop should give an error
|
||||||
** Can't pop off an empty stack
|
** Can't pop off an empty stack
|
||||||
|
|
||||||
|
/abcd/pushtablescopy
|
||||||
|
abcd
|
||||||
|
0: abcd
|
||||||
|
|
||||||
|
#popcopy
|
||||||
|
abcd
|
||||||
|
0: abcd
|
||||||
|
|
||||||
|
#pop
|
||||||
|
abcd
|
||||||
|
0: abcd
|
||||||
|
|
||||||
# End of testinput20
|
# End of testinput20
|
||||||
|
|
Loading…
Reference in New Issue