Add pcre2_code_copy_with_tables().
This commit is contained in:
parent
43e541adda
commit
2aec84e37e
|
@ -181,6 +181,9 @@ wrong name.
|
|||
|
||||
27. In pcre2test, give some offset information for errors in hex patterns.
|
||||
|
||||
28. Implemented pcre2_code_copy_with_tables(), and added pushtablescopy to
|
||||
pcre2test for testing it.
|
||||
|
||||
|
||||
Version 10.22 29-July-2016
|
||||
--------------------------
|
||||
|
|
|
@ -25,6 +25,7 @@ dist_html_DATA = \
|
|||
doc/html/pcre2.html \
|
||||
doc/html/pcre2_callout_enumerate.html \
|
||||
doc/html/pcre2_code_copy.html \
|
||||
doc/html/pcre2_code_copy_with_tables.html \
|
||||
doc/html/pcre2_code_free.html \
|
||||
doc/html/pcre2_compile.html \
|
||||
doc/html/pcre2_compile_context_copy.html \
|
||||
|
@ -107,6 +108,7 @@ dist_man_MANS = \
|
|||
doc/pcre2.3 \
|
||||
doc/pcre2_callout_enumerate.3 \
|
||||
doc/pcre2_code_copy.3 \
|
||||
doc/pcre2_code_copy_with_tables.3 \
|
||||
doc/pcre2_code_free.3 \
|
||||
doc/pcre2_compile.3 \
|
||||
doc/pcre2_compile_context_copy.3 \
|
||||
|
|
|
@ -174,7 +174,11 @@ can skip ahead to the CMake section.
|
|||
|
||||
(11) If you want to use the pcre2grep command, compile and link
|
||||
src/pcre2grep.c; it uses only the basic 8-bit PCRE2 library (it does not
|
||||
need the pcre2posix library).
|
||||
need the pcre2posix library). If you have built the PCRE2 library with JIT
|
||||
support by defining SUPPORT_JIT in src/config.h, you can also define
|
||||
SUPPORT_PCRE2GREP_JIT, which causes pcre2grep to make use of JIT (unless
|
||||
it is run with --no-jit). If you define SUPPORT_PCRE2GREP_JIT without
|
||||
defining SUPPORT_JIT, pcre2grep does not try to make use of JIT.
|
||||
|
||||
|
||||
STACK SIZE IN WINDOWS ENVIRONMENTS
|
||||
|
@ -389,4 +393,4 @@ and executable, is in EBCDIC and native z/OS file formats and this is the
|
|||
recommended download site.
|
||||
|
||||
=============================
|
||||
Last Updated: 16 July 2015
|
||||
Last Updated: 13 October 2016
|
||||
|
|
|
@ -44,7 +44,7 @@ wrappers.
|
|||
|
||||
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||
man page). These can be found in a library called libpcre2posix. Note that this
|
||||
man page). These can be found in a library called libpcre2-posix. Note that this
|
||||
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||
and does not give full access to all of PCRE2's facilities.
|
||||
|
@ -58,8 +58,8 @@ renamed or pointed at by a link.
|
|||
If you are using the POSIX interface to PCRE2 and there is already a POSIX
|
||||
regex library installed on your system, as well as worrying about the regex.h
|
||||
header file (as mentioned above), you must also take care when linking programs
|
||||
to ensure that they link with PCRE2's libpcre2posix library. Otherwise they may
|
||||
pick up the POSIX functions of the same name from the other library.
|
||||
to ensure that they link with PCRE2's libpcre2-posix library. Otherwise they
|
||||
may pick up the POSIX functions of the same name from the other library.
|
||||
|
||||
One way of avoiding this confusion is to compile PCRE2 with the addition of
|
||||
-Dregcomp=PCRE2regcomp (and similarly for the other POSIX functions) to the
|
||||
|
@ -204,13 +204,6 @@ library. They are also documented in the pcre2build man page.
|
|||
--enable-newline-is-crlf, --enable-newline-is-anycrlf, or
|
||||
--enable-newline-is-any to the "configure" command, respectively.
|
||||
|
||||
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||
the standard tests will fail, because the lines in the test files end with
|
||||
LF. Even if the files are edited to change the line endings, there are likely
|
||||
to be some failures. With --enable-newline-is-anycrlf or
|
||||
--enable-newline-is-any, many tests should succeed, but there may be some
|
||||
failures.
|
||||
|
||||
. By default, the sequence \R in a pattern matches any Unicode line ending
|
||||
sequence. This is independent of the option specifying what PCRE2 considers
|
||||
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||
|
@ -253,13 +246,13 @@ library. They are also documented in the pcre2build man page.
|
|||
sizes in the pcre2stack man page.
|
||||
|
||||
. In the 8-bit library, the default maximum compiled pattern size is around
|
||||
64K. You can increase this by adding --with-link-size=3 to the "configure"
|
||||
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||
offsets. Increasing the internal link size reduces performance in the 8-bit
|
||||
and 16-bit libraries. In the 32-bit library, the link size setting is
|
||||
ignored, as 4-byte offsets are always used.
|
||||
64K bytes. You can increase this by adding --with-link-size=3 to the
|
||||
"configure" command. PCRE2 then uses three bytes instead of two for offsets
|
||||
to different parts of the compiled pattern. In the 16-bit library,
|
||||
--with-link-size=3 is the same as --with-link-size=4, which (in both
|
||||
libraries) uses four-byte offsets. Increasing the internal link size reduces
|
||||
performance in the 8-bit and 16-bit libraries. In the 32-bit library, the
|
||||
link size setting is ignored, as 4-byte offsets are always used.
|
||||
|
||||
. You can build PCRE2 so that its internal match() function that is called from
|
||||
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||
|
@ -339,12 +332,23 @@ library. They are also documented in the pcre2build man page.
|
|||
|
||||
Of course, the relevant libraries must be installed on your system.
|
||||
|
||||
. The default size (in bytes) of the internal buffer used by pcre2grep can be
|
||||
set by, for example:
|
||||
. The default starting size (in bytes) of the internal buffer used by pcre2grep
|
||||
can be set by, for example:
|
||||
|
||||
--with-pcre2grep-bufsize=51200
|
||||
|
||||
The value must be a plain integer. The default is 20480.
|
||||
The value must be a plain integer. The default is 20480. The amount of memory
|
||||
used by pcre2grep is actually three times this number, to allow for "before"
|
||||
and "after" lines. If very long lines are encountered, the buffer is
|
||||
automatically enlarged, up to a fixed maximum size.
|
||||
|
||||
. The default maximum size of pcre2grep's internal buffer can be set by, for
|
||||
example:
|
||||
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
|
||||
The default is either 1048576 or the value of --with-pcre2grep-bufsize,
|
||||
whichever is the larger.
|
||||
|
||||
. It is possible to compile pcre2test so that it links with the libreadline
|
||||
or libedit libraries, by specifying, respectively,
|
||||
|
@ -369,6 +373,22 @@ library. They are also documented in the pcre2build man page.
|
|||
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||
should fix it.
|
||||
|
||||
. There is a special option called --enable-fuzz-support for use by people who
|
||||
want to run fuzzing tests on PCRE2. At present this applies only to the 8-bit
|
||||
library. If set, it causes an extra library called libpcre2-fuzzsupport.a to
|
||||
be built, but not installed. This contains a single function called
|
||||
LLVMFuzzerTestOneInput() whose arguments are a pointer to a string and the
|
||||
length of the string. When called, this function tries to compile the string
|
||||
as a pattern, and if that succeeds, to match it. This is done both with no
|
||||
options and with some random options bits that are generated from the string.
|
||||
Setting --enable-fuzz-support also causes a binary called pcre2fuzzcheck to
|
||||
be created. This is normally run under valgrind or used when PCRE2 is
|
||||
compiled with address sanitizing enabled. It calls the fuzzing function and
|
||||
outputs information about what it is doing. The input strings are specified by
|
||||
arguments: if an argument starts with "=" the rest of it is a literal input
|
||||
string. Otherwise, it is assumed to be a file name, and the contents of the
|
||||
file are the test string.
|
||||
|
||||
The "configure" script builds the following files for the basic C library:
|
||||
|
||||
. Makefile the makefile that builds the library
|
||||
|
@ -543,7 +563,7 @@ script creates the .txt and HTML forms of the documentation from the man pages.
|
|||
|
||||
|
||||
Testing PCRE2
|
||||
------------
|
||||
-------------
|
||||
|
||||
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||
There is another script called RunGrepTest that tests the pcre2grep command.
|
||||
|
@ -757,6 +777,7 @@ The distribution should contain the files listed below.
|
|||
src/pcre2_xclass.c )
|
||||
|
||||
src/pcre2_printint.c debugging function that is used by pcre2test,
|
||||
src/pcre2_fuzzsupport.c function for (optional) fuzzing support
|
||||
|
||||
src/config.h.in template for config.h, when built by "configure"
|
||||
src/pcre2.h.in template for pcre2.h when built by "configure"
|
||||
|
@ -814,7 +835,7 @@ The distribution should contain the files listed below.
|
|||
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config
|
||||
libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config
|
||||
libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config
|
||||
libpcre2posix.pc.in template for libpcre2posix.pc for pkg-config
|
||||
libpcre2-posix.pc.in template for libpcre2-posix.pc for pkg-config
|
||||
ltmain.sh file used to build a libtool script
|
||||
missing ) common stub for a few missing GNU programs while
|
||||
) installing, generated by automake
|
||||
|
@ -845,4 +866,4 @@ The distribution should contain the files listed below.
|
|||
Philip Hazel
|
||||
Email local part: ph10
|
||||
Email domain: cam.ac.uk
|
||||
Last updated: 01 April 2016
|
||||
Last updated: 01 November 2016
|
||||
|
|
|
@ -94,6 +94,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
||||
<td> Copy a compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_copy_with_tables.html">pcre2_code_copy_with_tables</a></td>
|
||||
<td> Copy a compiled pattern and its character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||
<td> Free a compiled pattern</td></tr>
|
||||
|
||||
|
|
|
@ -28,8 +28,9 @@ DESCRIPTION
|
|||
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||
any memory used by the JIT compiler. Without a subsequent call to
|
||||
<b>pcre2_jit_compile()</b>, the copy can be used only for non-JIT matching. The
|
||||
yield of the function is NULL if <i>code</i> is NULL or if sufficient memory
|
||||
cannot be obtained.
|
||||
pointer to the character tables is copied, not the tables themselves (see
|
||||
<b>pcre2_code_copy_with_tables()</b>). The yield of the function is NULL if
|
||||
<i>code</i> is NULL or if sufficient memory cannot be obtained.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>pcre2_code_copy_with_tables specification</title>
|
||||
</head>
|
||||
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||
<h1>pcre2_code_copy_with_tables man page</h1>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
||||
<p>
|
||||
This page is part of the PCRE2 HTML documentation. It was generated
|
||||
automatically from the original man page. If there is any nonsense in it,
|
||||
please consult the man page, in case the conversion went wrong.
|
||||
<br>
|
||||
<br><b>
|
||||
SYNOPSIS
|
||||
</b><br>
|
||||
<P>
|
||||
<b>#include <pcre2.h></b>
|
||||
</P>
|
||||
<P>
|
||||
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<br><b>
|
||||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||
any memory used by the JIT compiler. Without a subsequent call to
|
||||
<b>pcre2_jit_compile()</b>, the copy can be used only for non-JIT matching.
|
||||
Unlike <b>pcre2_code_copy()</b>, a separate copy of the character tables is also
|
||||
made, with the new code pointing to it. This memory will be automatically freed
|
||||
when <b>pcre2_code_free()</b> is called. The yield of the function is NULL if
|
||||
<i>code</i> is NULL or if sufficient memory cannot be obtained.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
page and a description of the POSIX API in the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
page.
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
</p>
|
|
@ -26,8 +26,11 @@ SYNOPSIS
|
|||
DESCRIPTION
|
||||
</b><br>
|
||||
<P>
|
||||
This function sets, in a compile context, the maximum length (in code units) of
|
||||
the pattern that can be compiled. The result is always zero.
|
||||
This function sets, in a compile context, the maximum text length (in code
|
||||
units) of the pattern that can be compiled. The result is always zero. If a
|
||||
longer pattern is passed to <b>pcre2_compile()</b> there is an immediate error
|
||||
return. The default is effectively unlimited, being the largest value a
|
||||
PCRE2_SIZE variable can hold.
|
||||
</P>
|
||||
<P>
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
|
|
|
@ -294,6 +294,9 @@ document for an overview of all the PCRE2 documentation.
|
|||
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>int pcre2_get_error_message(int <i>errorcode</i>, PCRE2_UCHAR *<i>buffer</i>,</b>
|
||||
<b> PCRE2_SIZE <i>bufflen</i>);</b>
|
||||
<br>
|
||||
|
@ -567,8 +570,9 @@ If JIT is being used, but the JIT compilation is not being done immediately,
|
|||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||
required. JIT compilation updates a pointer within the compiled code block, so
|
||||
a thread must gain unique write access to the pointer before calling
|
||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> can be used
|
||||
to obtain a private copy of the compiled code.
|
||||
<b>pcre2_jit_compile()</b>. Alternatively, <b>pcre2_code_copy()</b> or
|
||||
<b>pcre2_code_copy_with_tables()</b> can be used to obtain a private copy of the
|
||||
compiled code.
|
||||
</P>
|
||||
<br><b>
|
||||
Context blocks
|
||||
|
@ -736,7 +740,8 @@ functions, <i>pcre2_match()</i> and <i>pcre2_dfa_match()</i>.
|
|||
<br>
|
||||
This parameter adjusts the limit, set when PCRE2 is built (default 250), on the
|
||||
depth of parenthesis nesting in a pattern. This limit stops rogue patterns
|
||||
using up too much system stack when being compiled.
|
||||
using up too much system stack when being compiled. The limit applies to
|
||||
parentheses of all kinds, not just capturing parentheses.
|
||||
<b>int pcre2_set_compile_recursion_guard(pcre2_compile_context *<i>ccontext</i>,</b>
|
||||
<b> int (*<i>guard_function</i>)(uint32_t, void *), void *<i>user_data</i>);</b>
|
||||
<br>
|
||||
|
@ -1058,6 +1063,9 @@ zero.
|
|||
<br>
|
||||
<br>
|
||||
<b>pcre2_code *pcre2_code_copy(const pcre2_code *<i>code</i>);</b>
|
||||
<br>
|
||||
<br>
|
||||
<b>pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *<i>code</i>);</b>
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_compile()</b> function compiles a pattern into an internal form.
|
||||
|
@ -1079,9 +1087,22 @@ if the code has been processed by the JIT compiler (see
|
|||
<a href="#jitcompiling">below),</a>
|
||||
the JIT information cannot be copied (because it is position-dependent).
|
||||
The new copy can initially be used only for non-JIT matching, though it can be
|
||||
passed to <b>pcre2_jit_compile()</b> if required. The <b>pcre2_code_copy()</b>
|
||||
function provides a way for individual threads in a multithreaded application
|
||||
to acquire a private copy of shared compiled code.
|
||||
passed to <b>pcre2_jit_compile()</b> if required.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_code_copy()</b> function provides a way for individual threads in a
|
||||
multithreaded application to acquire a private copy of shared compiled code.
|
||||
However, it does not make a copy of the character tables used by the compiled
|
||||
pattern; the new pattern code points to the same tables as the original code.
|
||||
(See
|
||||
<a href="#localesupport">"Locale Support"</a>
|
||||
below for details of these character tables.) In many applications the same
|
||||
tables are used throughout, so this behaviour is appropriate. Nevertheless,
|
||||
there are occasions when a copy of a compiled pattern and the relevant tables
|
||||
are needed. The <b>pcre2_code_copy_with_tables()</b> function provides this facility.
|
||||
Copies of both the code and the tables are made, with the new code pointing to
|
||||
the new tables. The memory for the new tables is automatically freed when
|
||||
<b>pcre2_code_free()</b> is called for the new copy of the compiled code.
|
||||
</P>
|
||||
<P>
|
||||
NOTE: When one of the matching functions is called, pointers to the compiled
|
||||
|
@ -1122,6 +1143,13 @@ error has occurred. The values are not defined when compilation is successful
|
|||
and <b>pcre2_compile()</b> returns a non-NULL value.
|
||||
</P>
|
||||
<P>
|
||||
The value returned in <i>erroroffset</i> is an indication of where in the
|
||||
pattern the error occurred. It is not necessarily the furthest point in the
|
||||
pattern that was read. For example, after the error "lookbehind assertion is
|
||||
not fixed length", the error offset points to the start of the failing
|
||||
assertion.
|
||||
</P>
|
||||
<P>
|
||||
The <b>pcre2_get_error_message()</b> function (see "Obtaining a textual error
|
||||
message"
|
||||
<a href="#geterrormessage">below)</a>
|
||||
|
@ -1215,8 +1243,8 @@ recognized, exactly as in the rest of the pattern.
|
|||
PCRE2_AUTO_CALLOUT
|
||||
</pre>
|
||||
If this bit is set, <b>pcre2_compile()</b> automatically inserts callout items,
|
||||
all with number 255, before each pattern item. For discussion of the callout
|
||||
facility, see the
|
||||
all with number 255, before each pattern item, except immediately before or
|
||||
after a callout in the pattern. For discussion of the callout facility, see the
|
||||
<a href="pcre2callout.html"><b>pcre2callout</b></a>
|
||||
documentation.
|
||||
<pre>
|
||||
|
@ -3235,7 +3263,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC41" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 June 2016
|
||||
Last updated: 22 November 2016
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -34,9 +34,10 @@ please consult the man page, in case the conversion went wrong.
|
|||
<li><a name="TOC19" href="#SEC19">INCLUDING DEBUGGING CODE</a>
|
||||
<li><a name="TOC20" href="#SEC20">DEBUGGING WITH VALGRIND SUPPORT</a>
|
||||
<li><a name="TOC21" href="#SEC21">CODE COVERAGE REPORTING</a>
|
||||
<li><a name="TOC22" href="#SEC22">SEE ALSO</a>
|
||||
<li><a name="TOC23" href="#SEC23">AUTHOR</a>
|
||||
<li><a name="TOC24" href="#SEC24">REVISION</a>
|
||||
<li><a name="TOC22" href="#SEC22">SUPPORT FOR FUZZERS</a>
|
||||
<li><a name="TOC23" href="#SEC23">SEE ALSO</a>
|
||||
<li><a name="TOC24" href="#SEC24">AUTHOR</a>
|
||||
<li><a name="TOC25" href="#SEC25">REVISION</a>
|
||||
</ul>
|
||||
<br><a name="SEC1" href="#TOC1">BUILDING PCRE2</a><br>
|
||||
<P>
|
||||
|
@ -376,16 +377,19 @@ they are not.
|
|||
<P>
|
||||
<b>pcre2grep</b> uses an internal buffer to hold a "window" on the file it is
|
||||
scanning, in order to be able to output "before" and "after" lines when it
|
||||
finds a match. The size of the buffer is controlled by a parameter whose
|
||||
default value is 20K. The buffer itself is three times this size, but because
|
||||
of the way it is used for holding "before" lines, the longest line that is
|
||||
guaranteed to be processable is the parameter size. You can change the default
|
||||
parameter value by adding, for example,
|
||||
finds a match. The starting size of the buffer is controlled by a parameter
|
||||
whose default value is 20K. The buffer itself is three times this size, but
|
||||
because of the way it is used for holding "before" lines, the longest line that
|
||||
is guaranteed to be processable is the parameter size. If a longer line is
|
||||
encountered, <b>pcre2grep</b> automatically expands the buffer, up to a
|
||||
specified maximum size, whose default is 1M or the starting size, whichever is
|
||||
the larger. You can change the default parameter values by adding, for example,
|
||||
<pre>
|
||||
--with-pcre2grep-bufsize=50K
|
||||
--with-pcre2grep-bufsize=51200
|
||||
--with-pcre2grep-max-bufsize=2097152
|
||||
</pre>
|
||||
to the <b>configure</b> command. The caller of \fPpcre2grep\fP can override this
|
||||
value by using --buffer-size on the command line.
|
||||
to the <b>configure</b> command. The caller of <b>pcre2grep</b> can override
|
||||
these values by using --buffer-size and --max-buffer-size on the command line.
|
||||
</P>
|
||||
<br><a name="SEC18" href="#TOC1">PCRE2TEST OPTION FOR LIBREADLINE SUPPORT</a><br>
|
||||
<P>
|
||||
|
@ -497,11 +501,32 @@ This cleans all coverage data including the generated coverage report. For more
|
|||
information about code coverage, see the <b>gcov</b> and <b>lcov</b>
|
||||
documentation.
|
||||
</P>
|
||||
<br><a name="SEC22" href="#TOC1">SEE ALSO</a><br>
|
||||
<br><a name="SEC22" href="#TOC1">SUPPORT FOR FUZZERS</a><br>
|
||||
<P>
|
||||
There is a special option for use by people who want to run fuzzing tests on
|
||||
PCRE2:
|
||||
<pre>
|
||||
--enable-fuzz-support
|
||||
</pre>
|
||||
At present this applies only to the 8-bit library. If set, it causes an extra
|
||||
library called libpcre2-fuzzsupport.a to be built, but not installed. This
|
||||
contains a single function called LLVMFuzzerTestOneInput() whose arguments are
|
||||
a pointer to a string and the length of the string. When called, this function
|
||||
tries to compile the string as a pattern, and if that succeeds, to match it.
|
||||
This is done both with no options and with some random options bits that are
|
||||
generated from the string. Setting --enable-fuzz-support also causes a binary
|
||||
called <b>pcre2fuzzcheck</b> to be created. This is normally run under valgrind
|
||||
or used when PCRE2 is compiled with address sanitizing enabled. It calls the
|
||||
fuzzing function and outputs information about what it is doing. The input strings
|
||||
are specified by arguments: if an argument starts with "=" the rest of it is a
|
||||
literal input string. Otherwise, it is assumed to be a file name, and the
|
||||
contents of the file are the test string.
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
|
||||
<P>
|
||||
<b>pcre2api</b>(3), <b>pcre2-config</b>(3).
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">AUTHOR</a><br>
|
||||
<br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
|
||||
<P>
|
||||
Philip Hazel
|
||||
<br>
|
||||
|
@ -510,9 +535,9 @@ University Computing Service
|
|||
Cambridge, England.
|
||||
<br>
|
||||
</P>
|
||||
<br><a name="SEC24" href="#TOC1">REVISION</a><br>
|
||||
<br><a name="SEC25" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 01 April 2016
|
||||
Last updated: 01 November 2016
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -57,11 +57,20 @@ two callout points:
|
|||
</pre>
|
||||
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
|
||||
automatically inserts callouts, all with number 255, before each item in the
|
||||
pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
||||
pattern except for immediately before or after a callout item in the pattern.
|
||||
For example, if PCRE2_AUTO_CALLOUT is used with the pattern
|
||||
<pre>
|
||||
A(?C3)B
|
||||
</pre>
|
||||
it is processed as if it were
|
||||
<pre>
|
||||
(?C255)A(?C3)B(?C255)
|
||||
</pre>
|
||||
Here is a more complicated example:
|
||||
<pre>
|
||||
A(\d{2}|--)
|
||||
</pre>
|
||||
it is processed as if it were
|
||||
With PCRE2_AUTO_CALLOUT, this pattern is processed as if it were
|
||||
<br>
|
||||
<br>
|
||||
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
|
||||
|
@ -107,10 +116,10 @@ with PCRE2_ANCHORED and PCRE2_AUTO_CALLOUT and then applied to the string
|
|||
No match
|
||||
</pre>
|
||||
This indicates that when matching [bc] fails, there is no backtracking into a+
|
||||
and therefore the callouts that would be taken for the backtracks do not occur.
|
||||
You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS to
|
||||
<b>pcre2_compile()</b>, or starting the pattern with (*NO_AUTO_POSSESS). In this
|
||||
case, the output changes to this:
|
||||
(because it is being treated as a++) and therefore the callouts that would be
|
||||
taken for the backtracks do not occur. You can disable the auto-possessify
|
||||
feature by passing PCRE2_NO_AUTO_POSSESS to <b>pcre2_compile()</b>, or starting
|
||||
the pattern with (*NO_AUTO_POSSESS). In this case, the output changes to this:
|
||||
<pre>
|
||||
--->aaaa
|
||||
+0 ^ a+
|
||||
|
@ -235,8 +244,8 @@ Fields for numerical callouts
|
|||
<P>
|
||||
For a numerical callout, <i>callout_string</i> is NULL, and <i>callout_number</i>
|
||||
contains the number of the callout, in the range 0-255. This is the number
|
||||
that follows (?C for manual callouts; it is 255 for automatically generated
|
||||
callouts.
|
||||
that follows (?C for callouts that are part of the pattern; it is 255 for
|
||||
automatically generated callouts.
|
||||
</P>
|
||||
<br><b>
|
||||
Fields for string callouts
|
||||
|
@ -310,10 +319,15 @@ the next item to be matched.
|
|||
</P>
|
||||
<P>
|
||||
The <i>next_item_length</i> field contains the length of the next item to be
|
||||
matched in the pattern string. When the callout immediately precedes an
|
||||
alternation bar, a closing parenthesis, or the end of the pattern, the length
|
||||
is zero. When the callout precedes an opening parenthesis, the length is that
|
||||
of the entire subpattern.
|
||||
processed in the pattern string. When the callout is at the end of the pattern,
|
||||
the length is zero. When the callout precedes an opening parenthesis, the
|
||||
length includes meta characters that follow the parenthesis. For example, in a
|
||||
callout before an assertion such as (?=ab) the length is 3. For an
|
||||
alternation bar or a closing parenthesis, the length is one, unless a closing
|
||||
parenthesis is followed by a quantifier, in which case its length is included.
|
||||
(This changed in release 10.23. In earlier releases, before an opening
|
||||
parenthesis the length was that of the entire subpattern, and before an
|
||||
alternation bar or a closing parenthesis the length was zero.)
|
||||
</P>
|
||||
<P>
|
||||
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
|
||||
|
@ -399,9 +413,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC8" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 23 March 2015
|
||||
Last updated: 29 September 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -107,7 +107,7 @@ processed as anchored at the point where they are tested.
|
|||
one that is backtracked onto acts. For example, in the pattern
|
||||
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||
same as PCRE2, but there are examples where it differs.
|
||||
same as PCRE2, but there are cases where it differs.
|
||||
</P>
|
||||
<P>
|
||||
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||
|
@ -123,7 +123,7 @@ the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
|||
13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern
|
||||
names is not as general as Perl's. This is a consequence of the fact that PCRE2
|
||||
works internally just with numbers, using an external table to translate
|
||||
between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b)B),
|
||||
between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)),
|
||||
where the two capturing parentheses have the same number but different names,
|
||||
is not supported, and causes an error at compile time. If it were allowed, it
|
||||
would not be possible to distinguish which parentheses matched, because both
|
||||
|
@ -131,10 +131,11 @@ names map to capturing subpattern number 1. To avoid this confusing situation,
|
|||
an error is given at compile time.
|
||||
</P>
|
||||
<P>
|
||||
14. Perl recognizes comments in some places that PCRE2 does not, for example,
|
||||
between the ( and ? at the start of a subpattern. If the /x modifier is set,
|
||||
Perl allows white space between ( and ? (though current Perls warn that this is
|
||||
deprecated) but PCRE2 never does, even if the PCRE2_EXTENDED option is set.
|
||||
14. Perl used to recognize comments in some places that PCRE2 does not, for
|
||||
example, between the ( and ? at the start of a subpattern. If the /x modifier
|
||||
is set, Perl allowed white space between ( and ? though the latest Perls give
|
||||
an error (for a while it was just deprecated). There may still be some cases
|
||||
where Perl behaves differently.
|
||||
</P>
|
||||
<P>
|
||||
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||
|
@ -161,42 +162,47 @@ each alternative branch of a lookbehind assertion can match a different length
|
|||
of string. Perl requires them all to have the same length.
|
||||
<br>
|
||||
<br>
|
||||
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
|
||||
(b) From PCRE2 10.23, back references to groups of fixed length are supported
|
||||
in lookbehinds, provided that there is no possibility of referencing a
|
||||
non-unique number or name. Perl does not support backreferences in lookbehinds.
|
||||
<br>
|
||||
<br>
|
||||
(c) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
|
||||
meta-character matches only at the very end of the string.
|
||||
<br>
|
||||
<br>
|
||||
(c) A backslash followed by a letter with no special meaning is faulted. (Perl
|
||||
(d) A backslash followed by a letter with no special meaning is faulted. (Perl
|
||||
can be made to issue a warning.)
|
||||
<br>
|
||||
<br>
|
||||
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||
(e) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||
inverted, that is, by default they are not greedy, but if followed by a
|
||||
question mark they are.
|
||||
<br>
|
||||
<br>
|
||||
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
|
||||
(f) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
|
||||
only at the first matching position in the subject string.
|
||||
<br>
|
||||
<br>
|
||||
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
|
||||
(g) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
|
||||
PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents.
|
||||
<br>
|
||||
<br>
|
||||
(g) The \R escape sequence can be restricted to match only CR, LF, or CRLF
|
||||
(h) The \R escape sequence can be restricted to match only CR, LF, or CRLF
|
||||
by the PCRE2_BSR_ANYCRLF option.
|
||||
<br>
|
||||
<br>
|
||||
(h) The callout facility is PCRE2-specific.
|
||||
(i) The callout facility is PCRE2-specific.
|
||||
<br>
|
||||
<br>
|
||||
(i) The partial matching facility is PCRE2-specific.
|
||||
(j) The partial matching facility is PCRE2-specific.
|
||||
<br>
|
||||
<br>
|
||||
(j) The alternative matching function (<b>pcre2_dfa_match()</b> matches in a
|
||||
(k) The alternative matching function (<b>pcre2_dfa_match()</b>) matches in a
|
||||
different way and is not Perl-compatible.
|
||||
<br>
|
||||
<br>
|
||||
(k) PCRE2 recognizes some special sequences such as (*CR) at the start of
|
||||
(l) PCRE2 recognizes some special sequences such as (*CR) at the start of
|
||||
a pattern that set overall options that cannot be changed within the pattern.
|
||||
</P>
|
||||
<br><b>
|
||||
|
@ -214,9 +220,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 15 March 2015
|
||||
Last updated: 18 October 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -80,11 +80,19 @@ span line boundaries. What defines a line boundary is controlled by the
|
|||
</P>
|
||||
<P>
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by a parameter that can be set by the <b>--buffer-size</b> option.
|
||||
The default value for this parameter is specified when <b>pcre2grep</b> is
|
||||
built, with the default default being 20K. A block of memory three times this
|
||||
size is used (to allow for buffering "before" and "after" lines). An error
|
||||
occurs if a line overflows the buffer.
|
||||
controlled by parameters that can be set by the <b>--buffer-size</b> and
|
||||
<b>--max-buffer-size</b> options. The first of these sets the size of buffer
|
||||
that is obtained at the start of processing. If an input file contains very
|
||||
long lines, a larger buffer may be needed; this is handled by automatically
|
||||
extending the buffer, up to the limit specified by <b>--max-buffer-size</b>. The
|
||||
default values for these parameters are specified when <b>pcre2grep</b> is
|
||||
built, with the default defaults being 20K and 1M respectively. An error occurs
|
||||
if a line is too long and the buffer can no longer be expanded.
|
||||
</P>
|
||||
<P>
|
||||
The block of memory that is actually used is three times the "buffer size", to
|
||||
allow for buffering "before" and "after" lines. If the buffer size is too
|
||||
small, fewer than requested "before" and "after" lines may be output.
|
||||
</P>
|
||||
<P>
|
||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
||||
|
@ -155,12 +163,13 @@ processing of patterns and file names that start with hyphens.
|
|||
</P>
|
||||
<P>
|
||||
<b>-A</b> <i>number</i>, <b>--after-context=</b><i>number</i>
|
||||
Output <i>number</i> lines of context after each matching line. If file names
|
||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||
colon for the context lines. A line containing "--" is output between each
|
||||
group of lines, unless they are in fact contiguous in the input file. The value
|
||||
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
||||
guarantees to have up to 8K of following text available for context output.
|
||||
Output up to <i>number</i> lines of context after each matching line. Fewer
|
||||
lines are output if the next match or the end of the file is reached, or if the
|
||||
processing buffer size has been set too small. If file names and/or line
|
||||
numbers are being output, a hyphen separator is used instead of a colon for the
|
||||
context lines. A line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The value of <i>number</i>
|
||||
is expected to be relatively small. When <b>-c</b> is used, <b>-A</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>-a</b>, <b>--text</b>
|
||||
|
@ -169,12 +178,14 @@ Treat binary files as text. This is equivalent to
|
|||
</P>
|
||||
<P>
|
||||
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||
Output <i>number</i> lines of context before each matching line. If file names
|
||||
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||
colon for the context lines. A line containing "--" is output between each
|
||||
group of lines, unless they are in fact contiguous in the input file. The value
|
||||
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
||||
guarantees to have up to 8K of preceding text available for context output.
|
||||
Output up to <i>number</i> lines of context before each matching line. Fewer
|
||||
lines are output if the previous match or the start of the file is within
|
||||
<i>number</i> lines, or if the processing buffer size has been set too small. If
|
||||
file names and/or line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contiguous in the input
|
||||
file. The value of <i>number</i> is expected to be relatively small. When
|
||||
<b>-c</b> is used, <b>-B</b> is ignored.
|
||||
</P>
|
||||
<P>
|
||||
<b>--binary-files=</b><i>word</i>
|
||||
|
@ -191,8 +202,9 @@ return code.
|
|||
</P>
|
||||
<P>
|
||||
<b>--buffer-size=</b><i>number</i>
|
||||
Set the parameter that controls how much memory is used for buffering files
|
||||
that are being scanned.
|
||||
Set the parameter that controls how much memory is obtained at the start of
|
||||
processing for buffering files that are being scanned. See also
|
||||
<b>--max-buffer-size</b> below.
|
||||
</P>
|
||||
<P>
|
||||
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
|
||||
|
@ -202,14 +214,16 @@ This is equivalent to setting both <b>-A</b> and <b>-B</b> to the same value.
|
|||
<P>
|
||||
<b>-c</b>, <b>--count</b>
|
||||
Do not output lines from the files that are being scanned; instead output the
|
||||
number of matches (or non-matches if <b>-v</b> is used) that would otherwise
|
||||
have caused lines to be shown. By default, this count is the same as the number
|
||||
of suppressed lines, but if the <b>-M</b> (multiline) option is used (without
|
||||
<b>-v</b>), there may be more suppressed lines than the number of matches.
|
||||
number of lines that would have been shown, either because they matched, or, if
|
||||
<b>-v</b> is set, because they failed to match. By default, this count is
|
||||
exactly the same as the number of lines that would have been output, but if the
|
||||
<b>-M</b> (multiline) option is used (without <b>-v</b>), there may be more
|
||||
suppressed lines than the count (that is, the number of matches).
|
||||
<br>
|
||||
<br>
|
||||
If no lines are selected, the number zero is output. If several files are
|
||||
being scanned, a count is output for each of them. However, if the
|
||||
being scanned, a count is output for each of them and the <b>-t</b> option can
|
||||
be used to cause a total to be output at the end. However, if the
|
||||
<b>--files-with-matches</b> option is also used, only those files whose counts
|
||||
are greater than zero are listed. When <b>-c</b> is used, the <b>-A</b>,
|
||||
<b>-B</b>, and <b>-C</b> options are ignored.
|
||||
|
@ -232,11 +246,12 @@ just one, in order to colour them all.
|
|||
<br>
|
||||
<br>
|
||||
The colour that is used can be specified by setting the environment variable
|
||||
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
|
||||
string of two numbers, separated by a semicolon. They are copied directly into
|
||||
the control string for setting colour on a terminal, so it is your
|
||||
responsibility to ensure that they make sense. If neither of the environment
|
||||
variables is set, the default is "1;31", which gives red.
|
||||
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. If neither of these is set,
|
||||
<b>pcre2grep</b> looks for GREP_COLOUR or GREP_COLOR. The value of the variable
|
||||
should be a string of two numbers, separated by a semicolon. They are copied
|
||||
directly into the control string for setting colour on a terminal, so it is
|
||||
your responsibility to ensure that they make sense. If none of these
|
||||
environment variables is set, the default is "1;31", which gives red.
|
||||
</P>
|
||||
<P>
|
||||
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
|
||||
|
@ -321,18 +336,18 @@ files; it does not apply to patterns specified by any of the <b>--include</b> or
|
|||
</P>
|
||||
<P>
|
||||
<b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
|
||||
Read patterns from the file, one per line, and match them against
|
||||
each line of input. What constitutes a newline when reading the file is the
|
||||
operating system's default. The <b>--newline</b> option has no effect on this
|
||||
option. Trailing white space is removed from each line, and blank lines are
|
||||
ignored. An empty file contains no patterns and therefore matches nothing. See
|
||||
also the comments about multiple patterns versus a single pattern with
|
||||
alternatives in the description of <b>-e</b> above.
|
||||
Read patterns from the file, one per line, and match them against each line of
|
||||
input. What constitutes a newline when reading the file is the operating
|
||||
system's default. The <b>--newline</b> option has no effect on this option.
|
||||
Trailing white space is removed from each line, and blank lines are ignored. An
|
||||
empty file contains no patterns and therefore matches nothing. See also the
|
||||
comments about multiple patterns versus a single pattern with alternatives in
|
||||
the description of <b>-e</b> above.
|
||||
<br>
|
||||
<br>
|
||||
If this option is given more than once, all the specified files are
|
||||
read. A data line is output if any of the patterns match it. A file name can
|
||||
be given as "-" to refer to the standard input. When <b>-f</b> is used, patterns
|
||||
If this option is given more than once, all the specified files are read. A
|
||||
data line is output if any of the patterns match it. A file name can be given
|
||||
as "-" to refer to the standard input. When <b>-f</b> is used, patterns
|
||||
specified on the command line using <b>-e</b> may also be present; they are
|
||||
tested before the file's patterns. However, no other pattern is taken from the
|
||||
command line; all arguments are treated as the names of paths to be searched.
|
||||
|
@ -502,22 +517,24 @@ There are no short forms for these options. The default settings are specified
|
|||
when the PCRE2 library is compiled, with the default default being 10 million.
|
||||
</P>
|
||||
<P>
|
||||
<b>--max-buffer-size=</b><i>number</i>
|
||||
This limits the expansion of the processing buffer, whose initial size can be
|
||||
set by <b>--buffer-size</b>. The maximum buffer size is silently forced to be no
|
||||
smaller than the starting buffer size.
|
||||
</P>
|
||||
<P>
|
||||
<b>-M</b>, <b>--multiline</b>
|
||||
Allow patterns to match more than one line. When this option is given, patterns
|
||||
may usefully contain literal newline characters and internal occurrences of ^
|
||||
and $ characters. The output for a successful match may consist of more than
|
||||
one line. The first is the line in which the match started, and the last is the
|
||||
line in which the match ended. If the matched string ends with a newline
|
||||
sequence the output ends at the end of that line.
|
||||
<br>
|
||||
<br>
|
||||
When this option is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and continue on one or
|
||||
more subsequent lines. However, <b>pcre2grep</b> still processes the input line
|
||||
by line. Once a match has been handled, scanning restarts at the beginning of
|
||||
the next line, just as it does when <b>-M</b> is not present. This means that it
|
||||
is possible for the second or subsequent lines in a multiline match to be
|
||||
output again as part of another match.
|
||||
Allow patterns to match more than one line. When this option is set, the PCRE2
|
||||
library is called in "multiline" mode. This allows a matched string to extend
|
||||
past the end of a line and continue on one or more subsequent lines. Patterns
|
||||
used with <b>-M</b> may usefully contain literal newline characters and internal
|
||||
occurrences of ^ and $ characters. The output for a successful match may
|
||||
consist of more than one line. The first line is the line in which the match
|
||||
started, and the last line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the end of that line.
|
||||
If <b>-v</b> is set, none of the lines in a multi-line match are output. Once a
|
||||
match has been handled, scanning restarts at the beginning of the line after
|
||||
the one in which the match ended.
|
||||
<br>
|
||||
<br>
|
||||
The newline sequence that separates multiple lines must be matched as part of
|
||||
|
@ -533,11 +550,8 @@ well as possibly handling a two-character newline sequence.
|
|||
<br>
|
||||
<br>
|
||||
There is a limit to the number of lines that can be matched, imposed by the way
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. However,
|
||||
<b>pcre2grep</b> ensures that at least 8K characters or the rest of the file
|
||||
(whichever is the shorter) are available for forward matching, and similarly
|
||||
the previous 8K characters (or all the previous characters, if fewer than 8K)
|
||||
are guaranteed to be available for lookbehind assertions. The <b>-M</b> option
|
||||
that <b>pcre2grep</b> buffers the input file as it scans it. With a sufficiently
|
||||
large processing buffer, this should not be a problem, but the <b>-M</b> option
|
||||
does not work when input is read line by line (see <b>--line-buffered</b>).
|
||||
</P>
|
||||
<P>
|
||||
|
@ -585,12 +599,13 @@ It should never be needed in normal use.
|
|||
Show only the part of the line that matched a pattern instead of the whole
|
||||
line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
|
||||
<b>-C</b> options are ignored. If there is more than one match in a line, each
|
||||
of them is shown separately. If <b>-o</b> is combined with <b>-v</b> (invert the
|
||||
sense of the match to find non-matching lines), no output is generated, but the
|
||||
return code is set appropriately. If the matched portion of the line is empty,
|
||||
nothing is output unless the file name or line number are being printed, in
|
||||
which case they are shown on an otherwise empty line. This option is mutually
|
||||
exclusive with <b>--file-offsets</b> and <b>--line-offsets</b>.
|
||||
of them is shown separately, on a separate line of output. If <b>-o</b> is
|
||||
combined with <b>-v</b> (invert the sense of the match to find non-matching
|
||||
lines), no output is generated, but the return code is set appropriately. If
|
||||
the matched portion of the line is empty, nothing is output unless the file
|
||||
name or line number are being printed, in which case they are shown on an
|
||||
otherwise empty line. This option is mutually exclusive with
|
||||
<b>--file-offsets</b> and <b>--line-offsets</b>.
|
||||
</P>
|
||||
<P>
|
||||
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||
|
@ -604,10 +619,11 @@ capturing parentheses do not exist in the pattern, or were not set in the
|
|||
match, nothing is output unless the file name or line number are being output.
|
||||
<br>
|
||||
<br>
|
||||
If this option is given multiple times, multiple substrings are output, in the
|
||||
order the options are given. For example, -o3 -o1 -o3 causes the substrings
|
||||
matched by capturing parentheses 3 and 1 and then 3 again to be output. By
|
||||
default, there is no separator (but see the next option).
|
||||
If this option is given multiple times, multiple substrings are output for each
|
||||
match, in the order the options are given, and all on one line. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator (but see the next
|
||||
option).
|
||||
</P>
|
||||
<P>
|
||||
<b>--om-separator</b>=<i>text</i>
|
||||
|
@ -638,6 +654,18 @@ quietly skipped. However, the return code is still 2, even if matches were
|
|||
found in other files.
|
||||
</P>
|
||||
<P>
|
||||
<b>-t</b>, <b>--total-count</b>
|
||||
This option is useful when scanning more than one file. If used on its own,
|
||||
<b>-t</b> suppresses all output except for a grand total number of matching
|
||||
lines (or non-matching lines if <b>-v</b> is used) in all the files. If <b>-t</b>
|
||||
is used with <b>-c</b>, a grand total is output except when the previous output
|
||||
is just one line. In other words, it is not output when just one file's count
|
||||
is listed. If file names are being output, the grand total is preceded by
|
||||
"TOTAL:". Otherwise, it appears as just another number. The <b>-t</b> option is
|
||||
ignored when used with <b>-L</b> (list files without matches), because the grand
|
||||
total would always be zero.
|
||||
</P>
|
||||
<P>
|
||||
<b>-u</b>, <b>--utf-8</b>
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||
|
@ -665,11 +693,12 @@ specified by any of the <b>--include</b> or <b>--exclude</b> options.
|
|||
<P>
|
||||
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
|
||||
Force the patterns to be anchored (each must start matching at the beginning of
|
||||
a line) and in addition, require them to match entire lines. This is equivalent
|
||||
to having ^ and $ characters at the start and end of each alternative top-level
|
||||
branch in every pattern. This option applies only to the patterns that are
|
||||
matched against the contents of files; it does not apply to patterns specified
|
||||
by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||
a line) and in addition, require them to match entire lines. In multiline mode
|
||||
the match may be more than one line. This is equivalent to having \A and \Z
|
||||
characters at the start and end of each alternative top-level branch in every
|
||||
pattern. This option applies only to the patterns that are matched against the
|
||||
contents of files; it does not apply to patterns specified by any of the
|
||||
<b>--include</b> or <b>--exclude</b> options.
|
||||
</P>
|
||||
<br><a name="SEC6" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||
<P>
|
||||
|
@ -831,7 +860,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC15" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 19 June 2016
|
||||
Last updated: 31 October 2016
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -61,14 +61,10 @@ The maximum length of a lookbehind assertion is 65535 characters.
|
|||
There is no limit to the number of parenthesized subpatterns, but there can be
|
||||
no more than 65535 capturing subpatterns. There is, however, a limit to the
|
||||
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
|
||||
order to limit the amount of system stack used at compile time. The limit can
|
||||
be specified when PCRE2 is built; the default is 250.
|
||||
</P>
|
||||
<P>
|
||||
There is a limit to the number of forward references to subsequent subpatterns
|
||||
of around 200,000. Repeated forward references with fixed upper limits, for
|
||||
example, (?2){0,100} when subpattern number 2 is to the right, are included in
|
||||
the count. There is no limit to the number of backward references.
|
||||
order to limit the amount of system stack used at compile time. The default
|
||||
limit can be specified when PCRE2 is built; the default default is 250. An
|
||||
application can change this limit by calling pcre2_set_parens_nest_limit() to
|
||||
set the limit in a compile context.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of name for a named subpattern is 32 code units, and the
|
||||
|
@ -76,7 +72,12 @@ maximum number of named subpatterns is 10000.
|
|||
</P>
|
||||
<P>
|
||||
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb
|
||||
is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries.
|
||||
is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
|
||||
32-bit libraries.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a string argument to a callout is the largest number a
|
||||
32-bit unsigned integer can hold.
|
||||
</P>
|
||||
<br><b>
|
||||
AUTHOR
|
||||
|
@ -93,9 +94,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 05 November 2015
|
||||
Last updated: 26 October 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -379,32 +379,31 @@ case letter, it is converted to upper case. Then bit 6 of the character (hex
|
|||
40) is inverted. Thus \cA to \cZ become hex 01 to hex 1A (A is 41, Z is 5A),
|
||||
but \c{ becomes hex 3B ({ is 7B), and \c; becomes hex 7B (; is 3B). If the
|
||||
code unit following \c has a value less than 32 or greater than 126, a
|
||||
compile-time error occurs. This locks out non-printable ASCII characters in all
|
||||
modes.
|
||||
compile-time error occurs.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is compiled in EBCDIC mode, \a, \e, \f, \n, \r, and \t
|
||||
generate the appropriate EBCDIC code values. The \c escape is processed
|
||||
as specified for Perl in the <b>perlebcdic</b> document. The only characters
|
||||
that are allowed after \c are A-Z, a-z, or one of @, [, \, ], ^, _, or ?. Any
|
||||
other character provokes a compile-time error. The sequence \@ encodes
|
||||
character code 0; the letters (in either case) encode characters 1-26 (hex 01
|
||||
to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex 1F), and
|
||||
\? becomes either 255 (hex FF) or 95 (hex 5F).
|
||||
other character provokes a compile-time error. The sequence \c@ encodes
|
||||
character code 0; after \c the letters (in either case) encode characters 1-26
|
||||
(hex 01 to hex 1A); [, \, ], ^, and _ encode characters 27-31 (hex 1B to hex
|
||||
1F), and \c? becomes either 255 (hex FF) or 95 (hex 5F).
|
||||
</P>
|
||||
<P>
|
||||
Thus, apart from \?, these escapes generate the same character code values as
|
||||
Thus, apart from \c?, these escapes generate the same character code values as
|
||||
they do in an ASCII environment, though the meanings of the values mostly
|
||||
differ. For example, \G always generates code value 7, which is BEL in ASCII
|
||||
differ. For example, \cG always generates code value 7, which is BEL in ASCII
|
||||
but DEL in EBCDIC.
|
||||
</P>
|
||||
<P>
|
||||
The sequence \? generates DEL (127, hex 7F) in an ASCII environment, but
|
||||
The sequence \c? generates DEL (127, hex 7F) in an ASCII environment, but
|
||||
because 127 is not a control character in EBCDIC, Perl makes it generate the
|
||||
APC character. Unfortunately, there are several variants of EBCDIC. In most of
|
||||
them the APC character has the value 255 (hex FF), but in the one Perl calls
|
||||
POSIX-BC its value is 95 (hex 5F). If certain other characters have POSIX-BC
|
||||
values, PCRE2 makes \? generate 95; otherwise it generates 255.
|
||||
values, PCRE2 makes \c? generate 95; otherwise it generates 255.
|
||||
</P>
|
||||
<P>
|
||||
After \0 up to two further octal digits are read. If there are fewer than two
|
||||
|
@ -526,9 +525,9 @@ by code point, as described in the previous section.
|
|||
Absolute and relative back references
|
||||
</b><br>
|
||||
<P>
|
||||
The sequence \g followed by an unsigned or a negative number, optionally
|
||||
enclosed in braces, is an absolute or relative back reference. A named back
|
||||
reference can be coded as \g{name}. Back references are discussed
|
||||
The sequence \g followed by a signed or unsigned number, optionally enclosed
|
||||
in braces, is an absolute or relative back reference. A named back reference
|
||||
can be coded as \g{name}. Back references are discussed
|
||||
<a href="#backreferences">later,</a>
|
||||
following the discussion of
|
||||
<a href="#subpattern">parenthesized subpatterns.</a>
|
||||
|
@ -1326,13 +1325,32 @@ whatever setting of the PCRE2_DOTALL and PCRE2_MULTILINE options is used. A
|
|||
class such as [^a] always matches one of these characters.
|
||||
</P>
|
||||
<P>
|
||||
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v,
|
||||
\V, \w, and \W may appear in a character class, and add the characters that
|
||||
they match to the class. For example, [\dABCDEF] matches any hexadecimal
|
||||
digit. In UTF modes, the PCRE2_UCP option affects the meanings of \d, \s, \w
|
||||
and their upper case partners, just as it does when they appear outside a
|
||||
character class, as described in the section entitled
|
||||
<a href="#genericchartypes">"Generic character types"</a>
|
||||
above. The escape sequence \b has a different meaning inside a character
|
||||
class; it matches the backspace character. The sequences \B, \N, \R, and \X
|
||||
are not special inside a character class. Like any other unrecognized escape
|
||||
sequences, they cause an error.
|
||||
</P>
|
||||
<P>
|
||||
The minus (hyphen) character can be used to specify a range of characters in a
|
||||
character class. For example, [d-m] matches any letter between d and m,
|
||||
inclusive. If a minus character is required in a class, it must be escaped with
|
||||
a backslash or appear in a position where it cannot be interpreted as
|
||||
indicating a range, typically as the first or last character in the class, or
|
||||
immediately after a range. For example, [b-d-z] matches letters in the range b
|
||||
to d, a hyphen character, or z.
|
||||
indicating a range, typically as the first or last character in the class,
|
||||
or immediately after a range. For example, [b-d-z] matches letters in the range
|
||||
b to d, a hyphen character, or z.
|
||||
</P>
|
||||
<P>
|
||||
Perl treats a hyphen as a literal if it appears before a POSIX class (see
|
||||
below) or a character type escape such as \d, but gives a warning in its
|
||||
warning mode, as this is most likely a user error. As PCRE2 has no facility for
|
||||
warning, an error is given in these cases.
|
||||
</P>
|
||||
<P>
|
||||
It is not possible to have the literal character "]" as the end character of a
|
||||
|
@ -1344,12 +1362,6 @@ followed by two other characters. The octal or hexadecimal representation of
|
|||
"]" can also be used to end a range.
|
||||
</P>
|
||||
<P>
|
||||
An error is generated if a POSIX character class (see below) or an escape
|
||||
sequence other than one that defines a single character appears at a point
|
||||
where a range ending character is expected. For example, [z-\xff] is valid,
|
||||
but [A-\d] and [A-[:digit:]] are not.
|
||||
</P>
|
||||
<P>
|
||||
Ranges normally include all code points between the start and end characters,
|
||||
inclusive. They can also be used for code points specified numerically, for
|
||||
example [\000-\037]. Ranges can include any characters that are valid for the
|
||||
|
@ -1372,19 +1384,6 @@ tables for a French locale are in use, [\xc8-\xcb] matches accented E
|
|||
characters in both cases.
|
||||
</P>
|
||||
<P>
|
||||
The character escape sequences \d, \D, \h, \H, \p, \P, \s, \S, \v,
|
||||
\V, \w, and \W may appear in a character class, and add the characters that
|
||||
they match to the class. For example, [\dABCDEF] matches any hexadecimal
|
||||
digit. In UTF modes, the PCRE2_UCP option affects the meanings of \d, \s, \w
|
||||
and their upper case partners, just as it does when they appear outside a
|
||||
character class, as described in the section entitled
|
||||
<a href="#genericchartypes">"Generic character types"</a>
|
||||
above. The escape sequence \b has a different meaning inside a character
|
||||
class; it matches the backspace character. The sequences \B, \N, \R, and \X
|
||||
are not special inside a character class. Like any other unrecognized escape
|
||||
sequences, they cause an error.
|
||||
</P>
|
||||
<P>
|
||||
A circumflex can conveniently be used with the upper case character types to
|
||||
specify a more restricted set of characters than the matching lower case type.
|
||||
For example, the class [^\W_] matches any letter or digit, but not underscore,
|
||||
|
@ -1552,13 +1551,8 @@ respectively.
|
|||
<P>
|
||||
When one of these option changes occurs at top level (that is, not inside
|
||||
subpattern parentheses), the change applies to the remainder of the pattern
|
||||
that follows. If the change is placed right at the start of a pattern, PCRE2
|
||||
extracts it into the global options (and it will therefore show up in data
|
||||
extracted by the <b>pcre2_pattern_info()</b> function).
|
||||
</P>
|
||||
<P>
|
||||
An option change within a subpattern (see below for a description of
|
||||
subpatterns) affects only that part of the subpattern that follows it, so
|
||||
that follows. An option change within a subpattern (see below for a description
|
||||
of subpatterns) affects only that part of the subpattern that follows it, so
|
||||
<pre>
|
||||
(a(?i)b)c
|
||||
</pre>
|
||||
|
@ -2093,9 +2087,9 @@ subpattern is possible using named parentheses (see below).
|
|||
</P>
|
||||
<P>
|
||||
Another way of avoiding the ambiguity inherent in the use of digits following a
|
||||
backslash is to use the \g escape sequence. This escape must be followed by an
|
||||
unsigned number or a negative number, optionally enclosed in braces. These
|
||||
examples are all identical:
|
||||
backslash is to use the \g escape sequence. This escape must be followed by a
|
||||
signed or unsigned number, optionally enclosed in braces. These examples are
|
||||
all identical:
|
||||
<pre>
|
||||
(ring), \1
|
||||
(ring), \g1
|
||||
|
@ -2103,8 +2097,7 @@ examples are all identical:
|
|||
</pre>
|
||||
An unsigned number specifies an absolute reference without the ambiguity that
|
||||
is present in the older syntax. It is also useful when literal digits follow
|
||||
the reference. A negative number is a relative reference. Consider this
|
||||
example:
|
||||
the reference. A signed number is a relative reference. Consider this example:
|
||||
<pre>
|
||||
(abc(def)ghi)\g{-1}
|
||||
</pre>
|
||||
|
@ -2115,6 +2108,11 @@ can be helpful in long patterns, and also in patterns that are created by
|
|||
joining together fragments that contain references within themselves.
|
||||
</P>
|
||||
<P>
|
||||
The sequence \g{+1} is a reference to the next capturing subpattern. This kind
|
||||
of forward reference can be useful in patterns that repeat. Perl does not
|
||||
support the use of + in this way.
|
||||
</P>
|
||||
<P>
|
||||
A back reference matches whatever actually matched the capturing subpattern in
|
||||
the current subject string, rather than anything matching the subpattern
|
||||
itself (see
|
||||
|
@ -2214,6 +2212,14 @@ capturing is carried out only for positive assertions. (Perl sometimes, but not
|
|||
always, does do capturing in negative assertions.)
|
||||
</P>
|
||||
<P>
|
||||
WARNING: If a positive assertion containing one or more capturing subpatterns
|
||||
succeeds, but failure to match later in the pattern causes backtracking over
|
||||
this assertion, the captures within the assertion are reset only if no higher
|
||||
numbered captures are already set. This is, unfortunately, a fundamental
|
||||
limitation of the current implementation; it may get removed in a future
|
||||
reworking.
|
||||
</P>
|
||||
<P>
|
||||
For compatibility with Perl, most assertion subpatterns may be repeated; though
|
||||
it makes no sense to assert the same thing several times, the side effect of
|
||||
capturing parentheses may occasionally be useful. However, an assertion that
|
||||
|
@ -2310,18 +2316,31 @@ match. If there are insufficient characters before the current position, the
|
|||
assertion fails.
|
||||
</P>
|
||||
<P>
|
||||
In a UTF mode, PCRE2 does not allow the \C escape (which matches a single code
|
||||
unit even in a UTF mode) to appear in lookbehind assertions, because it makes
|
||||
it impossible to calculate the length of the lookbehind. The \X and \R
|
||||
escapes, which can match different numbers of code units, are also not
|
||||
permitted.
|
||||
In UTF-8 and UTF-16 modes, PCRE2 does not allow the \C escape (which matches a
|
||||
single code unit even in a UTF mode) to appear in lookbehind assertions,
|
||||
because it makes it impossible to calculate the length of the lookbehind. The
|
||||
\X and \R escapes, which can match different numbers of code units, are never
|
||||
permitted in lookbehinds.
|
||||
</P>
|
||||
<P>
|
||||
<a href="#subpatternsassubroutines">"Subroutine"</a>
|
||||
calls (see below) such as (?2) or (?&X) are permitted in lookbehinds, as long
|
||||
as the subpattern matches a fixed-length string.
|
||||
<a href="#recursion">Recursion,</a>
|
||||
however, is not supported.
|
||||
as the subpattern matches a fixed-length string. However,
|
||||
<a href="#recursion">recursion,</a>
|
||||
that is, a "subroutine" call into a group that is already active,
|
||||
is not supported.
|
||||
</P>
|
||||
<P>
|
||||
Perl does not support back references in lookbehinds. PCRE2 does support them,
|
||||
but only if certain conditions are met. The PCRE2_MATCH_UNSET_BACKREF option
|
||||
must not be set, there must be no use of (?| in the pattern (it creates
|
||||
duplicate subpattern numbers), and if the back reference is by name, the name
|
||||
must be unique. Of course, the referenced subpattern must itself be of fixed
|
||||
length. The following pattern matches words containing at least two characters
|
||||
that begin and end with the same character:
|
||||
<pre>
|
||||
\b(\w)\w++(?<=\1)
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
Possessive quantifiers can be used in conjunction with lookbehind assertions to
|
||||
|
@ -2459,7 +2478,9 @@ Checking for a used subpattern by name
|
|||
<P>
|
||||
Perl uses the syntax (?(<name>)...) or (?('name')...) to test for a used
|
||||
subpattern by name. For compatibility with earlier versions of PCRE1, which had
|
||||
this facility before Perl, the syntax (?(name)...) is also recognized.
|
||||
this facility before Perl, the syntax (?(name)...) is also recognized. Note,
|
||||
however, that undelimited names consisting of the letter R followed by digits
|
||||
are ambiguous (see the following section).
|
||||
</P>
|
||||
<P>
|
||||
Rewriting the above example to use a named subpattern gives this:
|
||||
|
@ -2474,30 +2495,52 @@ matched.
|
|||
Checking for pattern recursion
|
||||
</b><br>
|
||||
<P>
|
||||
If the condition is the string (R), and there is no subpattern with the name R,
|
||||
the condition is true if a recursive call to the whole pattern or any
|
||||
subpattern has been made. If digits or a name preceded by ampersand follow the
|
||||
letter R, for example:
|
||||
"Recursion" in this sense refers to any subroutine-like call from one part of
|
||||
the pattern to another, whether or not it is actually recursive. See the
|
||||
sections entitled
|
||||
<a href="#recursion">"Recursive patterns"</a>
|
||||
and
|
||||
<a href="#subpatternsassubroutines">"Subpatterns as subroutines"</a>
|
||||
below for details of recursion and subpattern calls.
|
||||
</P>
|
||||
<P>
|
||||
If a condition is the string (R), and there is no subpattern with the name R,
|
||||
the condition is true if matching is currently in a recursion or subroutine
|
||||
call to the whole pattern or any subpattern. If digits follow the letter R, and
|
||||
there is no subpattern with that name, the condition is true if the most recent
|
||||
call is into a subpattern with the given number, which must exist somewhere in
|
||||
the overall pattern. This is a contrived example that is equivalent to a+b:
|
||||
<pre>
|
||||
(?(R3)...) or (?(R&name)...)
|
||||
((?(R1)a+|(?1)b))
|
||||
</pre>
|
||||
the condition is true if the most recent recursion is into a subpattern whose
|
||||
number or name is given. This condition does not check the entire recursion
|
||||
stack. If the name used in a condition of this kind is a duplicate, the test is
|
||||
applied to all subpatterns of the same name, and is true if any one of them is
|
||||
the most recent recursion.
|
||||
However, in both cases, if there is a subpattern with a matching name, the
|
||||
condition tests for its being set, as described in the section above, instead
|
||||
of testing for recursion. For example, creating a group with the name R1 by
|
||||
adding (?<R1>) to the above pattern completely changes its meaning.
|
||||
</P>
|
||||
<P>
|
||||
If a name preceded by ampersand follows the letter R, for example:
|
||||
<pre>
|
||||
(?(R&name)...)
|
||||
</pre>
|
||||
the condition is true if the most recent recursion is into a subpattern of that
|
||||
name (which must exist within the pattern).
|
||||
</P>
|
||||
<P>
|
||||
This condition does not check the entire recursion stack. It tests only the
|
||||
current level. If the name used in a condition of this kind is a duplicate, the
|
||||
test is applied to all subpatterns of the same name, and is true if any one of
|
||||
them is the most recent recursion.
|
||||
</P>
|
||||
<P>
|
||||
At "top level", all these recursion test conditions are false.
|
||||
<a href="#recursion">The syntax for recursive patterns</a>
|
||||
is described below.
|
||||
<a name="subdefine"></a></P>
|
||||
<br><b>
|
||||
Defining subpatterns for use by reference only
|
||||
</b><br>
|
||||
<P>
|
||||
If the condition is the string (DEFINE), and there is no subpattern with the
|
||||
name DEFINE, the condition is always false. In this case, there may be only one
|
||||
If the condition is the string (DEFINE), the condition is always false, even if
|
||||
there is a group with the name DEFINE. In this case, there may be only one
|
||||
alternative in the subpattern. It is always skipped if control reaches this
|
||||
point in the pattern; the idea of DEFINE is that it can be used to define
|
||||
subroutines that can be referenced from elsewhere. (The use of
|
||||
|
@ -2965,12 +3008,22 @@ depending on whether or not a name is present.
|
|||
By default, for compatibility with Perl, a name is any sequence of characters
|
||||
that does not include a closing parenthesis. The name is not processed in
|
||||
any way, and it is not possible to include a closing parenthesis in the name.
|
||||
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
|
||||
is applied to verb names and only an unescaped closing parenthesis terminates
|
||||
the name. A closing parenthesis can be included in a name either as \) or
|
||||
between \Q and \E. If the PCRE2_EXTENDED option is set, unescaped whitespace
|
||||
in verb names is skipped and #-comments are recognized, exactly as in the rest
|
||||
of the pattern.
|
||||
This can be changed by setting the PCRE2_ALT_VERBNAMES option, but the result
|
||||
is no longer Perl-compatible.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2_ALT_VERBNAMES is set, backslash processing is applied to verb names
|
||||
and only an unescaped closing parenthesis terminates the name. However, the
|
||||
only backslash items that are permitted are \Q, \E, and sequences such as
|
||||
\x{100} that define character code points. Character type escapes such as \d
|
||||
are faulted.
|
||||
</P>
|
||||
<P>
|
||||
A closing parenthesis can be included in a name either as \) or between \Q
|
||||
and \E. In addition to backslash processing, if the PCRE2_EXTENDED option is
|
||||
also set, unescaped whitespace in verb names is skipped, and #-comments are
|
||||
recognized, exactly as in the rest of the pattern. PCRE2_EXTENDED does not
|
||||
affect verb names unless PCRE2_ALT_VERBNAMES is also set.
|
||||
</P>
|
||||
<P>
|
||||
The maximum length of a name is 255 in the 8-bit library and 65535 in the
|
||||
|
@ -3393,7 +3446,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 20 June 2016
|
||||
Last updated: 23 October 2016
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -492,6 +492,9 @@ Each top-level branch of a look behind must be of a fixed length.
|
|||
\n reference by number (can be ambiguous)
|
||||
\gn reference by number
|
||||
\g{n} reference by number
|
||||
\g+n relative reference by number (PCRE2 extension)
|
||||
\g-n relative reference by number
|
||||
\g{+n} relative reference by number (PCRE2 extension)
|
||||
\g{-n} relative reference by number
|
||||
\k<name> reference by name (Perl)
|
||||
\k'name' reference by name (Perl)
|
||||
|
@ -530,14 +533,17 @@ Each top-level branch of a look behind must be of a fixed length.
|
|||
(?(-n) relative reference condition
|
||||
(?(<name>) named reference condition (Perl)
|
||||
(?('name') named reference condition (Perl)
|
||||
(?(name) named reference condition (PCRE2)
|
||||
(?(name) named reference condition (PCRE2, deprecated)
|
||||
(?(R) overall recursion condition
|
||||
(?(Rn) specific group recursion condition
|
||||
(?(R&name) specific recursion condition
|
||||
(?(Rn) specific numbered group recursion condition
|
||||
(?(R&name) specific named group recursion condition
|
||||
(?(DEFINE) define subpattern for reference
|
||||
(?(VERSION[>]=n.m) test PCRE2 version
|
||||
(?(assert) assertion condition
|
||||
</PRE>
|
||||
</pre>
|
||||
Note the ambiguity of (?(R) and (?(Rn) which might be named reference
|
||||
conditions or recursion tests. Such a condition is interpreted as a reference
|
||||
condition if the relevant named group exists.
|
||||
</P>
|
||||
<br><a name="SEC23" href="#TOC1">BACKTRACKING CONTROL</a><br>
|
||||
<P>
|
||||
|
@ -589,9 +595,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC27" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 16 October 2015
|
||||
Last updated: 28 September 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -615,6 +615,7 @@ about the pattern:
|
|||
pushcopy push a copy onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
tables=[0|1|2] select internal tables
|
||||
use_length do not zero-terminate the pattern
|
||||
utf8_input treat input as UTF-8
|
||||
</pre>
|
||||
The effects of these modifiers are described in the following sections.
|
||||
|
@ -698,6 +699,18 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
|||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying the pattern's length
|
||||
</b><br>
|
||||
<P>
|
||||
By default, patterns are passed to the compiling functions as zero-terminated
|
||||
strings. When using the POSIX wrapper API, there is no other option. However,
|
||||
when using PCRE2's native API, patterns can be passed by length instead of
|
||||
being zero-terminated. The <b>use_length</b> modifier causes this to happen.
|
||||
Using a length happens automatically (whether or not <b>use_length</b> is set)
|
||||
when <b>hex</b> is set, because patterns specified in hexadecimal may contain
|
||||
binary zeros.
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying pattern characters in hexadecimal
|
||||
</b><br>
|
||||
<P>
|
||||
|
@ -720,10 +733,10 @@ the delimiter within a substring. The <b>hex</b> and <b>expand</b> modifiers are
|
|||
mutually exclusive.
|
||||
</P>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
||||
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||
patterns specified with the <b>hex</b> modifier, the actual length of the
|
||||
pattern is passed.
|
||||
The POSIX API cannot be used with patterns specified in hexadecimal because
|
||||
they may contain binary zeros, which conflicts with <b>regcomp()</b>'s
|
||||
requirement for a zero-terminated string. Such patterns are always passed to
|
||||
<b>pcre2_compile()</b> as a string with a length, not as zero-terminated.
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying wide characters in 16-bit and 32-bit modes
|
||||
|
@ -1753,7 +1766,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 02 August 2016
|
||||
Last updated: 04 November 2016
|
||||
<br>
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -94,6 +94,9 @@ in the library.
|
|||
<tr><td><a href="pcre2_code_copy.html">pcre2_code_copy</a></td>
|
||||
<td> Copy a compiled pattern</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_copy_with_tables.html">pcre2_code_copy_with_tables</a></td>
|
||||
<td> Copy a compiled pattern and its character tables</td></tr>
|
||||
|
||||
<tr><td><a href="pcre2_code_free.html">pcre2_code_free</a></td>
|
||||
<td> Free a compiled pattern</td></tr>
|
||||
|
||||
|
|
1425
doc/pcre2.txt
1425
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_CODE_COPY 3 "26 February 2016" "PCRE2 10.22"
|
||||
.TH PCRE2_CODE_COPY 3 "22 November 2016" "PCRE2 10.23"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -16,8 +16,9 @@ PCRE2 - Perl-compatible regular expressions (revised API)
|
|||
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||
any memory used by the JIT compiler. Without a subsequent call to
|
||||
\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching. The
|
||||
yield of the function is NULL if \fIcode\fP is NULL or if sufficient memory
|
||||
cannot be obtained.
|
||||
pointer to the character tables is copied, not the tables themselves (see
|
||||
\fBpcre2_code_copy_with_tables()\fP). The yield of the function is NULL if
|
||||
\fIcode\fP is NULL or if sufficient memory cannot be obtained.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
.TH PCRE2_CODE_COPY_WITH_TABLES 3 "22 November 2016" "PCRE2 10.23"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
.rs
|
||||
.sp
|
||||
.B #include <pcre2.h>
|
||||
.PP
|
||||
.nf
|
||||
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||
.fi
|
||||
.
|
||||
.SH DESCRIPTION
|
||||
.rs
|
||||
.sp
|
||||
This function makes a copy of the memory used for a compiled pattern, excluding
|
||||
any memory used by the JIT compiler. Without a subsequent call to
|
||||
\fBpcre2_jit_compile()\fP, the copy can be used only for non-JIT matching.
|
||||
Unlike \fBpcre2_code_copy()\fP, a separate copy of the character tables is also
|
||||
made, with the new code pointing to it. This memory will be automatically freed
|
||||
when \fBpcre2_code_free()\fP is called. The yield of the function is NULL if
|
||||
\fIcode\fP is NULL or if sufficient memory cannot be obtained.
|
||||
.P
|
||||
There is a complete description of the PCRE2 native API in the
|
||||
.\" HREF
|
||||
\fBpcre2api\fP
|
||||
.\"
|
||||
page and a description of the POSIX API in the
|
||||
.\" HREF
|
||||
\fBpcre2posix\fP
|
||||
.\"
|
||||
page.
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "30 September 2016" "PCRE2 10.23"
|
||||
.TH PCRE2API 3 "22 November 2016" "PCRE2 10.23"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -235,6 +235,8 @@ document for an overview of all the PCRE2 documentation.
|
|||
.nf
|
||||
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
||||
.sp
|
||||
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||
.sp
|
||||
.B int pcre2_get_error_message(int \fIerrorcode\fP, PCRE2_UCHAR *\fIbuffer\fP,
|
||||
.B " PCRE2_SIZE \fIbufflen\fP);"
|
||||
.sp
|
||||
|
@ -509,8 +511,9 @@ If JIT is being used, but the JIT compilation is not being done immediately,
|
|||
(perhaps waiting to see if the pattern is used often enough) similar logic is
|
||||
required. JIT compilation updates a pointer within the compiled code block, so
|
||||
a thread must gain unique write access to the pointer before calling
|
||||
\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP can be used
|
||||
to obtain a private copy of the compiled code.
|
||||
\fBpcre2_jit_compile()\fP. Alternatively, \fBpcre2_code_copy()\fP or
|
||||
\fBpcre2_code_copy_with_tables()\fP can be used to obtain a private copy of the
|
||||
compiled code.
|
||||
.
|
||||
.
|
||||
.SS "Context blocks"
|
||||
|
@ -1027,6 +1030,8 @@ zero.
|
|||
.B void pcre2_code_free(pcre2_code *\fIcode\fP);
|
||||
.sp
|
||||
.B pcre2_code *pcre2_code_copy(const pcre2_code *\fIcode\fP);
|
||||
.sp
|
||||
.B pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *\fIcode\fP);
|
||||
.fi
|
||||
.P
|
||||
The \fBpcre2_compile()\fP function compiles a pattern into an internal form.
|
||||
|
@ -1049,9 +1054,24 @@ below),
|
|||
.\"
|
||||
the JIT information cannot be copied (because it is position-dependent).
|
||||
The new copy can initially be used only for non-JIT matching, though it can be
|
||||
passed to \fBpcre2_jit_compile()\fP if required. The \fBpcre2_code_copy()\fP
|
||||
function provides a way for individual threads in a multithreaded application
|
||||
to acquire a private copy of shared compiled code.
|
||||
passed to \fBpcre2_jit_compile()\fP if required.
|
||||
.P
|
||||
The \fBpcre2_code_copy()\fP function provides a way for individual threads in a
|
||||
multithreaded application to acquire a private copy of shared compiled code.
|
||||
However, it does not make a copy of the character tables used by the compiled
|
||||
pattern; the new pattern code points to the same tables as the original code.
|
||||
(See
|
||||
.\" HTML <a href="#localesupport">
|
||||
.\" </a>
|
||||
"Locale Support"
|
||||
.\"
|
||||
below for details of these character tables.) In many applications the same
|
||||
tables are used throughout, so this behaviour is appropriate. Nevertheless,
|
||||
there are occasions when a copy of a compiled pattern and the relevant tables
|
||||
are needed. The \fBpcre2_code_copy_with_tables()\fP function provides this facility.
|
||||
Copies of both the code and the tables are made, with the new code pointing to
|
||||
the new tables. The memory for the new tables is automatically freed when
|
||||
\fBpcre2_code_free()\fP is called for the new copy of the compiled code.
|
||||
.P
|
||||
NOTE: When one of the matching functions is called, pointers to the compiled
|
||||
pattern and the subject string are set in the match data block so that they can
|
||||
|
@ -3299,6 +3319,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 September 2016
|
||||
Last updated: 22 November 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -51,103 +51,115 @@ DESCRIPTION
|
|||
boundary is controlled by the -N (--newline) option.
|
||||
|
||||
The amount of memory used for buffering files that are being scanned is
|
||||
controlled by a parameter that can be set by the --buffer-size option.
|
||||
The default value for this parameter is specified when pcre2grep is
|
||||
built, with the default default being 20K. A block of memory three
|
||||
times this size is used (to allow for buffering "before" and "after"
|
||||
lines). An error occurs if a line overflows the buffer.
|
||||
controlled by parameters that can be set by the --buffer-size and
|
||||
--max-buffer-size options. The first of these sets the size of buffer
|
||||
that is obtained at the start of processing. If an input file contains
|
||||
very long lines, a larger buffer may be needed; this is handled by
|
||||
automatically extending the buffer, up to the limit specified by --max-
|
||||
buffer-size. The default values for these parameters are specified when
|
||||
pcre2grep is built, with the default defaults being 20K and 1M respec-
|
||||
tively. An error occurs if a line is too long and the buffer can no
|
||||
longer be expanded.
|
||||
|
||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the
|
||||
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
||||
The block of memory that is actually used is three times the "buffer
|
||||
size", to allow for buffering "before" and "after" lines. If the buffer
|
||||
size is too small, fewer than requested "before" and "after" lines may
|
||||
be output.
|
||||
|
||||
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the
|
||||
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
||||
pattern (specified by the use of -e and/or -f), each pattern is applied
|
||||
to each line in the order in which they are defined, except that all
|
||||
to each line in the order in which they are defined, except that all
|
||||
the -e patterns are tried before the -f patterns.
|
||||
|
||||
By default, as soon as one pattern matches a line, no further patterns
|
||||
By default, as soon as one pattern matches a line, no further patterns
|
||||
are considered. However, if --colour (or --color) is used to colour the
|
||||
matching substrings, or if --only-matching, --file-offsets, or --line-
|
||||
offsets is used to output only the part of the line that matched
|
||||
matching substrings, or if --only-matching, --file-offsets, or --line-
|
||||
offsets is used to output only the part of the line that matched
|
||||
(either shown literally, or as an offset), scanning resumes immediately
|
||||
following the match, so that further matches on the same line can be
|
||||
found. If there are multiple patterns, they are all tried on the
|
||||
remainder of the line, but patterns that follow the one that matched
|
||||
following the match, so that further matches on the same line can be
|
||||
found. If there are multiple patterns, they are all tried on the
|
||||
remainder of the line, but patterns that follow the one that matched
|
||||
are not tried on the earlier part of the line.
|
||||
|
||||
This behaviour means that the order in which multiple patterns are
|
||||
specified can affect the output when one of the above options is used.
|
||||
This is no longer the same behaviour as GNU grep, which now manages to
|
||||
display earlier matches for later patterns (as long as there is no
|
||||
This behaviour means that the order in which multiple patterns are
|
||||
specified can affect the output when one of the above options is used.
|
||||
This is no longer the same behaviour as GNU grep, which now manages to
|
||||
display earlier matches for later patterns (as long as there is no
|
||||
overlap).
|
||||
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
Patterns that can match an empty string are accepted, but empty string
|
||||
matches are never recognized. An example is the pattern
|
||||
"(super)?(man)?", in which all components are optional. This pattern
|
||||
finds all occurrences of both "super" and "man"; the output differs
|
||||
from matching with "super|man" when only the matching substrings are
|
||||
"(super)?(man)?", in which all components are optional. This pattern
|
||||
finds all occurrences of both "super" and "man"; the output differs
|
||||
from matching with "super|man" when only the matching substrings are
|
||||
being shown.
|
||||
|
||||
If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses
|
||||
If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses
|
||||
the value to set a locale when calling the PCRE2 library. The --locale
|
||||
option can be used to override this.
|
||||
|
||||
|
||||
SUPPORT FOR COMPRESSED FILES
|
||||
|
||||
It is possible to compile pcre2grep so that it uses libz or libbz2 to
|
||||
read files whose names end in .gz or .bz2, respectively. You can find
|
||||
It is possible to compile pcre2grep so that it uses libz or libbz2 to
|
||||
read files whose names end in .gz or .bz2, respectively. You can find
|
||||
out whether your binary has support for one or both of these file types
|
||||
by running it with the --help option. If the appropriate support is not
|
||||
present, files are treated as plain text. The standard input is always
|
||||
present, files are treated as plain text. The standard input is always
|
||||
so treated.
|
||||
|
||||
|
||||
BINARY FILES
|
||||
|
||||
By default, a file that contains a binary zero byte within the first
|
||||
1024 bytes is identified as a binary file, and is processed specially.
|
||||
(GNU grep also identifies binary files in this manner.) See the
|
||||
--binary-files option for a means of changing the way binary files are
|
||||
By default, a file that contains a binary zero byte within the first
|
||||
1024 bytes is identified as a binary file, and is processed specially.
|
||||
(GNU grep also identifies binary files in this manner.) See the
|
||||
--binary-files option for a means of changing the way binary files are
|
||||
handled.
|
||||
|
||||
|
||||
OPTIONS
|
||||
|
||||
The order in which some of the options appear can affect the output.
|
||||
For example, both the -h and -l options affect the printing of file
|
||||
names. Whichever comes later in the command line will be the one that
|
||||
takes effect. Similarly, except where noted below, if an option is
|
||||
given twice, the later setting is used. Numerical values for options
|
||||
may be followed by K or M, to signify multiplication by 1024 or
|
||||
The order in which some of the options appear can affect the output.
|
||||
For example, both the -h and -l options affect the printing of file
|
||||
names. Whichever comes later in the command line will be the one that
|
||||
takes effect. Similarly, except where noted below, if an option is
|
||||
given twice, the later setting is used. Numerical values for options
|
||||
may be followed by K or M, to signify multiplication by 1024 or
|
||||
1024*1024 respectively.
|
||||
|
||||
-- This terminates the list of options. It is useful if the next
|
||||
item on the command line starts with a hyphen but is not an
|
||||
option. This allows for the processing of patterns and file
|
||||
item on the command line starts with a hyphen but is not an
|
||||
option. This allows for the processing of patterns and file
|
||||
names that start with hyphens.
|
||||
|
||||
-A number, --after-context=number
|
||||
Output number lines of context after each matching line. If
|
||||
file names and/or line numbers are being output, a hyphen
|
||||
separator is used instead of a colon for the context lines. A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. However,
|
||||
pcre2grep guarantees to have up to 8K of following text
|
||||
available for context output.
|
||||
Output up to number lines of context after each matching
|
||||
line. Fewer lines are output if the next match or the end of
|
||||
the file is reached, or if the processing buffer size has
|
||||
been set too small. If file names and/or line numbers are
|
||||
being output, a hyphen separator is used instead of a colon
|
||||
for the context lines. A line containing "--" is output
|
||||
between each group of lines, unless they are in fact contigu-
|
||||
ous in the input file. The value of number is expected to be
|
||||
relatively small. When -c is used, -A is ignored.
|
||||
|
||||
-a, --text
|
||||
Treat binary files as text. This is equivalent to --binary-
|
||||
files=text.
|
||||
|
||||
-B number, --before-context=number
|
||||
Output number lines of context before each matching line. If
|
||||
file names and/or line numbers are being output, a hyphen
|
||||
separator is used instead of a colon for the context lines. A
|
||||
line containing "--" is output between each group of lines,
|
||||
unless they are in fact contiguous in the input file. The
|
||||
value of number is expected to be relatively small. However,
|
||||
pcre2grep guarantees to have up to 8K of preceding text
|
||||
available for context output.
|
||||
Output up to number lines of context before each matching
|
||||
line. Fewer lines are output if the previous match or the
|
||||
start of the file is within number lines, or if the process-
|
||||
ing buffer size has been set too small. If file names and/or
|
||||
line numbers are being output, a hyphen separator is used
|
||||
instead of a colon for the context lines. A line containing
|
||||
"--" is output between each group of lines, unless they are
|
||||
in fact contiguous in the input file. The value of number is
|
||||
expected to be relatively small. When -c is used, -B is
|
||||
ignored.
|
||||
|
||||
--binary-files=word
|
||||
Specify how binary files are to be processed. If the word is
|
||||
|
@ -164,54 +176,58 @@ OPTIONS
|
|||
any output or affecting the return code.
|
||||
|
||||
--buffer-size=number
|
||||
Set the parameter that controls how much memory is used for
|
||||
buffering files that are being scanned.
|
||||
Set the parameter that controls how much memory is obtained
|
||||
at the start of processing for buffering files that are being
|
||||
scanned. See also --max-buffer-size below.
|
||||
|
||||
-C number, --context=number
|
||||
Output number lines of context both before and after each
|
||||
matching line. This is equivalent to setting both -A and -B
|
||||
Output number lines of context both before and after each
|
||||
matching line. This is equivalent to setting both -A and -B
|
||||
to the same value.
|
||||
|
||||
-c, --count
|
||||
Do not output lines from the files that are being scanned;
|
||||
instead output the number of matches (or non-matches if -v is
|
||||
used) that would otherwise have caused lines to be shown. By
|
||||
default, this count is the same as the number of suppressed
|
||||
lines, but if the -M (multiline) option is used (without -v),
|
||||
there may be more suppressed lines than the number of
|
||||
matches.
|
||||
Do not output lines from the files that are being scanned;
|
||||
instead output the number of lines that would have been
|
||||
shown, either because they matched, or, if -v is set, because
|
||||
they failed to match. By default, this count is exactly the
|
||||
same as the number of lines that would have been output, but
|
||||
if the -M (multiline) option is used (without -v), there may
|
||||
be more suppressed lines than the count (that is, the number
|
||||
of matches).
|
||||
|
||||
If no lines are selected, the number zero is output. If sev-
|
||||
eral files are being scanned, a count is output for each
|
||||
of them. However, if the --files-with-matches option is also
|
||||
used, only those files whose counts are greater than zero are
|
||||
listed. When -c is used, the -A, -B, and -C options are
|
||||
ignored.
|
||||
of them and the -t option can be used to cause a total to be
|
||||
output at the end. However, if the --files-with-matches
|
||||
option is also used, only those files whose counts are
|
||||
greater than zero are listed. When -c is used, the -A, -B,
|
||||
and -C options are ignored.
|
||||
|
||||
--colour, --color
|
||||
If this option is given without any data, it is equivalent to
|
||||
"--colour=auto". If data is required, it must be given in
|
||||
"--colour=auto". If data is required, it must be given in
|
||||
the same shell item, separated by an equals sign.
|
||||
|
||||
--colour=value, --color=value
|
||||
This option specifies under what circumstances the parts of a
|
||||
line that matched a pattern should be coloured in the output.
|
||||
By default, the output is not coloured. The value (which is
|
||||
optional, see above) may be "never", "always", or "auto". In
|
||||
the latter case, colouring happens only if the standard out-
|
||||
put is connected to a terminal. More resources are used when
|
||||
By default, the output is not coloured. The value (which is
|
||||
optional, see above) may be "never", "always", or "auto". In
|
||||
the latter case, colouring happens only if the standard out-
|
||||
put is connected to a terminal. More resources are used when
|
||||
colouring is enabled, because pcre2grep has to search for all
|
||||
possible matches in a line, not just one, in order to colour
|
||||
possible matches in a line, not just one, in order to colour
|
||||
them all.
|
||||
|
||||
The colour that is used can be specified by setting the envi-
|
||||
ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The
|
||||
value of this variable should be a string of two numbers,
|
||||
separated by a semicolon. They are copied directly into the
|
||||
control string for setting colour on a terminal, so it is
|
||||
your responsibility to ensure that they make sense. If nei-
|
||||
ther of the environment variables is set, the default is
|
||||
"1;31", which gives red.
|
||||
ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. If nei-
|
||||
ther of these is set, pcre2grep looks for GREP_COLOUR or
|
||||
GREP_COLOR. The value of the variable should be a string of
|
||||
two numbers, separated by a semicolon. They are copied
|
||||
directly into the control string for setting colour on a ter-
|
||||
minal, so it is your responsibility to ensure that they make
|
||||
sense. If neither of the environment variables is set, the
|
||||
default is "1;31", which gives red.
|
||||
|
||||
-D action, --devices=action
|
||||
If an input path is not a regular file or a directory,
|
||||
|
@ -299,12 +315,12 @@ OPTIONS
|
|||
Read patterns from the file, one per line, and match them
|
||||
against each line of input. What constitutes a newline when
|
||||
reading the file is the operating system's default. The
|
||||
--newline option has no effect on this option. Trailing white
|
||||
space is removed from each line, and blank lines are ignored.
|
||||
An empty file contains no patterns and therefore matches
|
||||
nothing. See also the comments about multiple patterns versus
|
||||
a single pattern with alternatives in the description of -e
|
||||
above.
|
||||
--newline option has no effect on this option. Trailing
|
||||
white space is removed from each line, and blank lines are
|
||||
ignored. An empty file contains no patterns and therefore
|
||||
matches nothing. See also the comments about multiple pat-
|
||||
terns versus a single pattern with alternatives in the
|
||||
description of -e above.
|
||||
|
||||
If this option is given more than once, all the specified
|
||||
files are read. A data line is output if any of the patterns
|
||||
|
@ -482,102 +498,101 @@ OPTIONS
|
|||
tings are specified when the PCRE2 library is compiled, with
|
||||
the default default being 10 million.
|
||||
|
||||
--max-buffer-size=number
|
||||
This limits the expansion of the processing buffer, whose
|
||||
initial size can be set by --buffer-size. The maximum buffer
|
||||
size is silently forced to be no smaller than the starting
|
||||
buffer size.
|
||||
|
||||
-M, --multiline
|
||||
Allow patterns to match more than one line. When this option
|
||||
is given, patterns may usefully contain literal newline char-
|
||||
acters and internal occurrences of ^ and $ characters. The
|
||||
output for a successful match may consist of more than one
|
||||
line. The first is the line in which the match started, and
|
||||
the last is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence the output ends at the
|
||||
end of that line.
|
||||
Allow patterns to match more than one line. When this option
|
||||
is set, the PCRE2 library is called in "multiline" mode. This
|
||||
allows a matched string to extend past the end of a line and
|
||||
continue on one or more subsequent lines. Patterns used with
|
||||
-M may usefully contain literal newline characters and inter-
|
||||
nal occurrences of ^ and $ characters. The output for a suc-
|
||||
cessful match may consist of more than one line. The first
|
||||
line is the line in which the match started, and the last
|
||||
line is the line in which the match ended. If the matched
|
||||
string ends with a newline sequence, the output ends at the
|
||||
end of that line. If -v is set, none of the lines in a
|
||||
multi-line match are output. Once a match has been handled,
|
||||
scanning restarts at the beginning of the line after the one
|
||||
in which the match ended.
|
||||
|
||||
When this option is set, the PCRE2 library is called in "mul-
|
||||
tiline" mode. This allows a matched string to extend past the
|
||||
end of a line and continue on one or more subsequent lines.
|
||||
However, pcre2grep still processes the input line by line.
|
||||
Once a match has been handled, scanning restarts at the
|
||||
beginning of the next line, just as it does when -M is not
|
||||
present. This means that it is possible for the second or
|
||||
subsequent lines in a multiline match to be output again as
|
||||
part of another match.
|
||||
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
The newline sequence that separates multiple lines must be
|
||||
matched as part of the pattern. For example, to find the
|
||||
phrase "regular expression" in a file where "regular" might
|
||||
be at the end of a line and "expression" at the start of the
|
||||
next line, you could use this command:
|
||||
|
||||
pcre2grep -M 'regular\s+expression' <file>
|
||||
|
||||
The \s escape sequence matches any white space character,
|
||||
including newlines, and is followed by + so as to match
|
||||
trailing white space on the first line as well as possibly
|
||||
The \s escape sequence matches any white space character,
|
||||
including newlines, and is followed by + so as to match
|
||||
trailing white space on the first line as well as possibly
|
||||
handling a two-character newline sequence.
|
||||
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. However, pcre2grep ensures that at least 8K
|
||||
characters or the rest of the file (whichever is the shorter)
|
||||
are available for forward matching, and similarly the previ-
|
||||
ous 8K characters (or all the previous characters, if fewer
|
||||
than 8K) are guaranteed to be available for lookbehind asser-
|
||||
tions. The -M option does not work when input is read line by
|
||||
line (see --line-buffered.)
|
||||
There is a limit to the number of lines that can be matched,
|
||||
imposed by the way that pcre2grep buffers the input file as
|
||||
it scans it. With a sufficiently large processing buffer,
|
||||
this should not be a problem, but the -M option does not work
|
||||
when input is read line by line (see --line-buffered.)
|
||||
|
||||
-N newline-type, --newline=newline-type
|
||||
The PCRE2 library supports five different conventions for
|
||||
indicating the ends of lines. They are the single-character
|
||||
sequences CR (carriage return) and LF (linefeed), the two-
|
||||
character sequence CRLF, an "anycrlf" convention, which rec-
|
||||
ognizes any of the preceding three types, and an "any" con-
|
||||
The PCRE2 library supports five different conventions for
|
||||
indicating the ends of lines. They are the single-character
|
||||
sequences CR (carriage return) and LF (linefeed), the two-
|
||||
character sequence CRLF, an "anycrlf" convention, which rec-
|
||||
ognizes any of the preceding three types, and an "any" con-
|
||||
vention, in which any Unicode line ending sequence is assumed
|
||||
to end a line. The Unicode sequences are the three just men-
|
||||
tioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator,
|
||||
to end a line. The Unicode sequences are the three just men-
|
||||
tioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||
U+000C), NEL (next line, U+0085), LS (line separator,
|
||||
U+2028), and PS (paragraph separator, U+2029).
|
||||
|
||||
When the PCRE2 library is built, a default line-ending
|
||||
sequence is specified. This is normally the standard
|
||||
When the PCRE2 library is built, a default line-ending
|
||||
sequence is specified. This is normally the standard
|
||||
sequence for the operating system. Unless otherwise specified
|
||||
by this option, pcre2grep uses the library's default. The
|
||||
by this option, pcre2grep uses the library's default. The
|
||||
possible values for this option are CR, LF, CRLF, ANYCRLF, or
|
||||
ANY. This makes it possible to use pcre2grep to scan files
|
||||
ANY. This makes it possible to use pcre2grep to scan files
|
||||
that have come from other environments without having to mod-
|
||||
ify their line endings. If the data that is being scanned
|
||||
does not agree with the convention set by this option,
|
||||
pcre2grep may behave in strange ways. Note that this option
|
||||
does not apply to files specified by the -f, --exclude-from,
|
||||
or --include-from options, which are expected to use the
|
||||
ify their line endings. If the data that is being scanned
|
||||
does not agree with the convention set by this option,
|
||||
pcre2grep may behave in strange ways. Note that this option
|
||||
does not apply to files specified by the -f, --exclude-from,
|
||||
or --include-from options, which are expected to use the
|
||||
operating system's standard newline sequence.
|
||||
|
||||
-n, --line-number
|
||||
Precede each output line by its line number in the file, fol-
|
||||
lowed by a colon for matching lines or a hyphen for context
|
||||
lowed by a colon for matching lines or a hyphen for context
|
||||
lines. If the file name is also being output, it precedes the
|
||||
line number. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by its line
|
||||
line number. When the -M option causes a pattern to match
|
||||
more than one line, only the first is preceded by its line
|
||||
number. This option is forced if --line-offsets is used.
|
||||
|
||||
--no-jit If the PCRE2 library is built with support for just-in-time
|
||||
--no-jit If the PCRE2 library is built with support for just-in-time
|
||||
compiling (which speeds up matching), pcre2grep automatically
|
||||
makes use of this, unless it was explicitly disabled at build
|
||||
time. This option can be used to disable the use of JIT at
|
||||
run time. It is provided for testing and working round prob-
|
||||
time. This option can be used to disable the use of JIT at
|
||||
run time. It is provided for testing and working round prob-
|
||||
lems. It should never be needed in normal use.
|
||||
|
||||
-o, --only-matching
|
||||
Show only the part of the line that matched a pattern instead
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately.
|
||||
If -o is combined with -v (invert the sense of the match to
|
||||
find non-matching lines), no output is generated, but the
|
||||
return code is set appropriately. If the matched portion of
|
||||
the line is empty, nothing is output unless the file name or
|
||||
line number are being printed, in which case they are shown
|
||||
on an otherwise empty line. This option is mutually exclusive
|
||||
with --file-offsets and --line-offsets.
|
||||
of the whole line. In this mode, no context is shown. That
|
||||
is, the -A, -B, and -C options are ignored. If there is more
|
||||
than one match in a line, each of them is shown separately,
|
||||
on a separate line of output. If -o is combined with -v
|
||||
(invert the sense of the match to find non-matching lines),
|
||||
no output is generated, but the return code is set appropri-
|
||||
ately. If the matched portion of the line is empty, nothing
|
||||
is output unless the file name or line number are being
|
||||
printed, in which case they are shown on an otherwise empty
|
||||
line. This option is mutually exclusive with --file-offsets
|
||||
and --line-offsets.
|
||||
|
||||
-onumber, --only-matching=number
|
||||
Show only the part of the line that matched the capturing
|
||||
|
@ -593,65 +608,80 @@ OPTIONS
|
|||
put.
|
||||
|
||||
If this option is given multiple times, multiple substrings
|
||||
are output, in the order the options are given. For example,
|
||||
-o3 -o1 -o3 causes the substrings matched by capturing paren-
|
||||
theses 3 and 1 and then 3 again to be output. By default,
|
||||
there is no separator (but see the next option).
|
||||
are output for each match, in the order the options are
|
||||
given, and all on one line. For example, -o3 -o1 -o3 causes
|
||||
the substrings matched by capturing parentheses 3 and 1 and
|
||||
then 3 again to be output. By default, there is no separator
|
||||
(but see the next option).
|
||||
|
||||
--om-separator=text
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
Specify a separating string for multiple occurrences of -o.
|
||||
The default is an empty string. Separating strings are never
|
||||
coloured.
|
||||
|
||||
-q, --quiet
|
||||
Work quietly, that is, display nothing except error messages.
|
||||
The exit status indicates whether or not any matches were
|
||||
The exit status indicates whether or not any matches were
|
||||
found.
|
||||
|
||||
-r, --recursive
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to
|
||||
If any given path is a directory, recursively scan the files
|
||||
it contains, taking note of any --include and --exclude set-
|
||||
tings. By default, a directory is read as a normal file; in
|
||||
some operating systems this gives an immediate end-of-file.
|
||||
This option is a shorthand for setting the -d option to
|
||||
"recurse".
|
||||
|
||||
--recursion-limit=number
|
||||
See --match-limit above.
|
||||
|
||||
-s, --no-messages
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
Suppress error messages about non-existent or unreadable
|
||||
files. Such files are quietly skipped. However, the return
|
||||
code is still 2, even if matches were found in other files.
|
||||
|
||||
-t, --total-count
|
||||
This option is useful when scanning more than one file. If
|
||||
used on its own, -t suppresses all output except for a grand
|
||||
total number of matching lines (or non-matching lines if -v
|
||||
is used) in all the files. If -t is used with -c, a grand
|
||||
total is output except when the previous output is just one
|
||||
line. In other words, it is not output when just one file's
|
||||
count is listed. If file names are being output, the grand
|
||||
total is preceded by "TOTAL:". Otherwise, it appears as just
|
||||
another number. The -t option is ignored when used with -L
|
||||
(list files without matches), because the grand total would
|
||||
always be zero.
|
||||
|
||||
-u, --utf-8
|
||||
Operate in UTF-8 mode. This option is available only if PCRE2
|
||||
has been compiled with UTF-8 support. All patterns (including
|
||||
those for any --exclude and --include options) and all sub-
|
||||
ject lines that are scanned must be valid strings of UTF-8
|
||||
those for any --exclude and --include options) and all sub-
|
||||
ject lines that are scanned must be valid strings of UTF-8
|
||||
characters.
|
||||
|
||||
-V, --version
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
Write the version numbers of pcre2grep and the PCRE2 library
|
||||
to the standard output and then exit. Anything else on the
|
||||
command line is ignored.
|
||||
|
||||
-v, --invert-match
|
||||
Invert the sense of the match, so that lines which do not
|
||||
Invert the sense of the match, so that lines which do not
|
||||
match any of the patterns are the ones that are found.
|
||||
|
||||
-w, --word-regex, --word-regexp
|
||||
Force the patterns to match only whole words. This is equiva-
|
||||
lent to having \b at the start and end of the pattern. This
|
||||
option applies only to the patterns that are matched against
|
||||
the contents of files; it does not apply to patterns speci-
|
||||
lent to having \b at the start and end of the pattern. This
|
||||
option applies only to the patterns that are matched against
|
||||
the contents of files; it does not apply to patterns speci-
|
||||
fied by any of the --include or --exclude options.
|
||||
|
||||
-x, --line-regex, --line-regexp
|
||||
Force the patterns to be anchored (each must start matching
|
||||
at the beginning of a line) and in addition, require them to
|
||||
match entire lines. This is equivalent to having ^ and $
|
||||
characters at the start and end of each alternative top-level
|
||||
Force the patterns to be anchored (each must start matching
|
||||
at the beginning of a line) and in addition, require them to
|
||||
match entire lines. In multiline mode the match may be more
|
||||
than one line. This is equivalent to having \A and \Z charac-
|
||||
ters at the start and end of each alternative top-level
|
||||
branch in every pattern. This option applies only to the pat-
|
||||
terns that are matched against the contents of files; it does
|
||||
not apply to patterns specified by any of the --include or
|
||||
|
@ -822,5 +852,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 19 June 2016
|
||||
Last updated: 31 October 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -465,7 +465,9 @@ PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
|||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_code_free(pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||
*pcre2_code_copy(const pcre2_code *);
|
||||
*pcre2_code_copy(const pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||
*pcre2_code_copy_with_tables(const pcre2_code *);
|
||||
|
||||
|
||||
/* Functions that give information about a compiled pattern. */
|
||||
|
@ -629,6 +631,7 @@ pcre2_compile are called by application code. */
|
|||
|
||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
||||
#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_)
|
||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||
|
|
|
@ -465,7 +465,9 @@ PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
|||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_code_free(pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||
*pcre2_code_copy(const pcre2_code *);
|
||||
*pcre2_code_copy(const pcre2_code *); \
|
||||
PCRE2_EXP_DECL pcre2_code PCRE2_CALL_CONVENTION \
|
||||
*pcre2_code_copy_with_tables(const pcre2_code *);
|
||||
|
||||
|
||||
/* Functions that give information about a compiled pattern. */
|
||||
|
@ -629,6 +631,7 @@ pcre2_compile are called by application code. */
|
|||
|
||||
#define pcre2_callout_enumerate PCRE2_SUFFIX(pcre2_callout_enumerate_)
|
||||
#define pcre2_code_copy PCRE2_SUFFIX(pcre2_code_copy_)
|
||||
#define pcre2_code_copy_with_tables PCRE2_SUFFIX(pcre2_code_copy_with_tables_)
|
||||
#define pcre2_code_free PCRE2_SUFFIX(pcre2_code_free_)
|
||||
#define pcre2_compile PCRE2_SUFFIX(pcre2_compile_)
|
||||
#define pcre2_compile_context_copy PCRE2_SUFFIX(pcre2_compile_context_copy_)
|
||||
|
|
|
@ -1042,6 +1042,45 @@ return newcode;
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Copy compiled code and character tables *
|
||||
*************************************************/
|
||||
|
||||
/* Compiled JIT code cannot be copied, so the new compiled block has no
|
||||
associated JIT data. This version of code_copy also makes a separate copy of
|
||||
the character tables. */
|
||||
|
||||
/* Duplicate a compiled pattern and give the duplicate its own private copy of
the character tables.

Argument:  code   the compiled pattern to copy; may be NULL
Returns:   the newly allocated copy, or NULL if code is NULL or if either of
           the two allocations fails

Both allocations are made with the memory functions held in the original
block's memctl. */
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
|
||||
pcre2_code_copy_with_tables(const pcre2_code *code)
|
||||
{
|
||||
PCRE2_SIZE* ref_count;
|
||||
pcre2_code *newcode;
|
||||
uint8_t *newtables;
|
||||
|
||||
if (code == NULL) return NULL;
|
||||
/* First duplicate the whole compiled block, byte for byte. */
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
|
||||
if (newcode == NULL) return NULL;
|
||||
memcpy(newcode, code, code->blocksize);
|
||||
/* The copy starts with no JIT data attached. */
newcode->executable_jit = NULL;
|
||||
|
||||
/* Allocate room for the tables plus one trailing PCRE2_SIZE, which is used
below as a reference count. */
newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE),
|
||||
code->memctl.memory_data);
|
||||
if (newtables == NULL)
|
||||
{
|
||||
/* Do not leak the block copied above when the tables cannot be allocated. */
code->memctl.free((void *)newcode, code->memctl.memory_data);
|
||||
return NULL;
|
||||
}
|
||||
memcpy(newtables, code->tables, tables_length);
|
||||
/* The reference count lives immediately after the table bytes; the new
tables have exactly one user, the new code block. */
ref_count = (PCRE2_SIZE *)(newtables + tables_length);
|
||||
*ref_count = 1;
|
||||
|
||||
newcode->tables = newtables;
|
||||
/* NOTE(review): PCRE2_DEREF_TABLES presumably tells the code-freeing function
to decrement the count above and release the tables - confirm against
pcre2_code_free(). */
newcode->flags |= PCRE2_DEREF_TABLES;
|
||||
return newcode;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Free compiled code *
|
||||
*************************************************/
|
||||
|
|
|
@ -427,15 +427,13 @@ so many of them that they are split into two fields. */
|
|||
#define CTL_NULLCONTEXT 0x00200000u
|
||||
#define CTL_POSIX 0x00400000u
|
||||
#define CTL_POSIX_NOSUB 0x00800000u
|
||||
#define CTL_PUSH 0x01000000u
|
||||
#define CTL_PUSHCOPY 0x02000000u
|
||||
#define CTL_STARTCHAR 0x04000000u
|
||||
#define CTL_USE_LENGTH 0x08000000u /* Same word as HEXPAT */
|
||||
#define CTL_UTF8_INPUT 0x10000000u
|
||||
#define CTL_ZERO_TERMINATE 0x20000000u
|
||||
|
||||
#define CTL_NL_SET 0x40000000u /* Informational */
|
||||
#define CTL_BSR_SET 0x80000000u /* Informational */
|
||||
#define CTL_PUSH 0x01000000u /* These three must be */
|
||||
#define CTL_PUSHCOPY 0x02000000u /* all in the same */
|
||||
#define CTL_PUSHTABLESCOPY 0x04000000u /* word. */
|
||||
#define CTL_STARTCHAR 0x08000000u
|
||||
#define CTL_USE_LENGTH 0x10000000u /* Same word as HEXPAT */
|
||||
#define CTL_UTF8_INPUT 0x20000000u
|
||||
#define CTL_ZERO_TERMINATE 0x40000000u
|
||||
|
||||
/* Second control word */
|
||||
|
||||
|
@ -444,6 +442,9 @@ so many of them that they are split into two fields. */
|
|||
#define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000004u
|
||||
#define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000008u
|
||||
|
||||
#define CTL_NL_SET 0x40000000u /* Informational */
|
||||
#define CTL_BSR_SET 0x80000000u /* Informational */
|
||||
|
||||
/* Combinations */
|
||||
|
||||
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
|
||||
|
@ -607,7 +608,8 @@ static modstruct modlist[] = {
|
|||
{ "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) },
|
||||
{ "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) },
|
||||
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
|
||||
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
|
||||
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
|
||||
{ "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) },
|
||||
{ "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) },
|
||||
{ "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) },
|
||||
{ "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) },
|
||||
|
@ -651,10 +653,10 @@ static modstruct modlist[] = {
|
|||
|
||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
|
||||
CTL_BINCODE|CTL_CALLOUT_INFO|CTL_FULLBINCODE|CTL_HEXPAT|CTL_INFO| \
|
||||
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY|CTL_BSR_SET|CTL_NL_SET| \
|
||||
CTL_JITVERIFY|CTL_MEMORY|CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY| \
|
||||
CTL_USE_LENGTH)
|
||||
|
||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (0)
|
||||
#define PUSH_SUPPORTED_COMPILE_CONTROLS2 (CTL_BSR_SET|CTL_NL_SET)
|
||||
|
||||
/* Controls that apply only at compile time with 'push'. */
|
||||
|
||||
|
@ -664,7 +666,7 @@ static modstruct modlist[] = {
|
|||
/* Controls that are forbidden with #pop or #popcopy. */
|
||||
|
||||
#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_POSIX_NOSUB|CTL_PUSH| \
|
||||
CTL_PUSHCOPY|CTL_USE_LENGTH)
|
||||
CTL_PUSHCOPY|CTL_PUSHTABLESCOPY|CTL_USE_LENGTH)
|
||||
|
||||
/* Pattern controls that are mutually exclusive. At present these are all in
|
||||
the first control word. Note that CTL_POSIX_NOSUB is always accompanied by
|
||||
|
@ -674,6 +676,7 @@ static uint32_t exclusive_pat_controls[] = {
|
|||
CTL_POSIX | CTL_HEXPAT,
|
||||
CTL_POSIX | CTL_PUSH,
|
||||
CTL_POSIX | CTL_PUSHCOPY,
|
||||
CTL_POSIX | CTL_PUSHTABLESCOPY,
|
||||
CTL_POSIX | CTL_USE_LENGTH,
|
||||
CTL_EXPAND | CTL_HEXPAT };
|
||||
|
||||
|
@ -973,6 +976,14 @@ are supported. */
|
|||
else \
|
||||
a = (void *)pcre2_code_copy_32(G(b,32))
|
||||
|
||||
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
a = (void *)pcre2_code_copy_with_tables_8(G(b,8)); \
|
||||
else if (test_mode == PCRE16_MODE) \
|
||||
a = (void *)pcre2_code_copy_with_tables_16(G(b,16)); \
|
||||
else \
|
||||
a = (void *)pcre2_code_copy_with_tables_32(G(b,32))
|
||||
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
if (test_mode == PCRE8_MODE) \
|
||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g); \
|
||||
|
@ -1436,6 +1447,12 @@ the three different cases. */
|
|||
else \
|
||||
a = (void *)G(pcre2_code_copy_,BITTWO)(G(b,BITTWO))
|
||||
|
||||
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
a = (void *)G(pcre2_code_copy_with_tables_,BITONE)(G(b,BITONE)); \
|
||||
else \
|
||||
a = (void *)G(pcre2_code_copy_with_tables_,BITTWO)(G(b,BITTWO))
|
||||
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
|
||||
G(a,BITONE) = G(pcre2_compile_,BITONE)(G(b,BITONE),c,d,e,f,g); \
|
||||
|
@ -1773,6 +1790,7 @@ the three different cases. */
|
|||
(int (*)(struct pcre2_callout_enumerate_block_8 *, void *))b,c)
|
||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,8) = pcre2_code_copy_8(b)
|
||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_8(G(b,8))
|
||||
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_8(G(b,8))
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,g)
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -1868,6 +1886,7 @@ the three different cases. */
|
|||
(int (*)(struct pcre2_callout_enumerate_block_16 *, void *))b,c)
|
||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,16) = pcre2_code_copy_16(b)
|
||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_16(G(b,16))
|
||||
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_16(G(b,16))
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,g)
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -1963,6 +1982,7 @@ the three different cases. */
|
|||
(int (*)(struct pcre2_callout_enumerate_block_32 *, void *))b,c)
|
||||
#define PCRE2_CODE_COPY_FROM_VOID(a,b) G(a,32) = pcre2_code_copy_32(b)
|
||||
#define PCRE2_CODE_COPY_TO_VOID(a,b) a = (void *)pcre2_code_copy_32(G(b,32))
|
||||
#define PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(a,b) a = (void *)pcre2_code_copy_with_tables_32(G(b,32))
|
||||
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
|
||||
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,g)
|
||||
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
|
||||
|
@ -3435,8 +3455,8 @@ for (;;)
|
|||
#else
|
||||
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
||||
#endif
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_BSR_SET;
|
||||
else dctl->control &= ~CTL_BSR_SET;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_BSR_SET;
|
||||
else dctl->control2 &= ~CTL_BSR_SET;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -3445,8 +3465,8 @@ for (;;)
|
|||
else if (len == 7 && strncmpic(pp, (const uint8_t *)"unicode", 7) == 0)
|
||||
*((uint16_t *)field) = PCRE2_BSR_UNICODE;
|
||||
else goto INVALID_VALUE;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_BSR_SET;
|
||||
else dctl->control |= CTL_BSR_SET;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_BSR_SET;
|
||||
else dctl->control2 |= CTL_BSR_SET;
|
||||
}
|
||||
pp = ep;
|
||||
break;
|
||||
|
@ -3513,14 +3533,14 @@ for (;;)
|
|||
if (i == 0)
|
||||
{
|
||||
*((uint16_t *)field) = NEWLINE_DEFAULT;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control &= ~CTL_NL_SET;
|
||||
else dctl->control &= ~CTL_NL_SET;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 &= ~CTL_NL_SET;
|
||||
else dctl->control2 &= ~CTL_NL_SET;
|
||||
}
|
||||
else
|
||||
{
|
||||
*((uint16_t *)field) = i;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control |= CTL_NL_SET;
|
||||
else dctl->control |= CTL_NL_SET;
|
||||
if (ctx == CTX_PAT || ctx == CTX_DEFPAT) pctl->control2 |= CTL_NL_SET;
|
||||
else dctl->control2 |= CTL_NL_SET;
|
||||
}
|
||||
pp = ep;
|
||||
break;
|
||||
|
@ -3691,7 +3711,7 @@ Returns: nothing
|
|||
static void
|
||||
show_controls(uint32_t controls, uint32_t controls2, const char *before)
|
||||
{
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
|
||||
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
|
||||
|
@ -3699,7 +3719,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
|||
((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "",
|
||||
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
|
||||
((controls & CTL_BINCODE) != 0)? " bincode" : "",
|
||||
((controls & CTL_BSR_SET) != 0)? " bsr" : "",
|
||||
((controls2 & CTL_BSR_SET) != 0)? " bsr" : "",
|
||||
((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "",
|
||||
((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "",
|
||||
((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "",
|
||||
|
@ -3715,12 +3735,13 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s
|
|||
((controls & CTL_JITVERIFY) != 0)? " jitverify" : "",
|
||||
((controls & CTL_MARK) != 0)? " mark" : "",
|
||||
((controls & CTL_MEMORY) != 0)? " memory" : "",
|
||||
((controls & CTL_NL_SET) != 0)? " newline" : "",
|
||||
((controls2 & CTL_NL_SET) != 0)? " newline" : "",
|
||||
((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "",
|
||||
((controls & CTL_POSIX) != 0)? " posix" : "",
|
||||
((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "",
|
||||
((controls & CTL_PUSH) != 0)? " push" : "",
|
||||
((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "",
|
||||
((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "",
|
||||
((controls & CTL_STARTCHAR) != 0)? " startchar" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "",
|
||||
((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "",
|
||||
|
@ -4061,7 +4082,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
|||
|
||||
if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
|
||||
|
||||
if ((pat_patctl.control & CTL_BSR_SET) != 0 ||
|
||||
if ((pat_patctl.control2 & CTL_BSR_SET) != 0 ||
|
||||
(FLD(compiled_code, flags) & PCRE2_BSR_SET) != 0)
|
||||
fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)?
|
||||
"any Unicode newline" : "CR, LF, or CRLF");
|
||||
|
@ -4930,7 +4951,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
|
|||
/* Handle compiling via the native interface. Controls that act later are
|
||||
ignored with "push". Replacements are locked out. */
|
||||
|
||||
if ((pat_patctl.control & (CTL_PUSH|CTL_PUSHCOPY)) != 0)
|
||||
if ((pat_patctl.control & (CTL_PUSH|CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0)
|
||||
{
|
||||
if (pat_patctl.replacement[0] != 0)
|
||||
{
|
||||
|
@ -5031,7 +5052,7 @@ if (test_mode == PCRE32_MODE && pbuffer32 != NULL)
|
|||
appropriate default newline setting, local_newline_default will be non-zero. We
|
||||
use this if there is no explicit newline modifier. */
|
||||
|
||||
if ((pat_patctl.control & CTL_NL_SET) == 0 && local_newline_default != 0)
|
||||
if ((pat_patctl.control2 & CTL_NL_SET) == 0 && local_newline_default != 0)
|
||||
{
|
||||
SETFLD(pat_context, newline_convention, local_newline_default);
|
||||
}
|
||||
|
@ -5163,7 +5184,7 @@ if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
|
|||
/* If an explicit newline modifier was given, set the information flag in the
|
||||
pattern so that it is preserved over push/pop. */
|
||||
|
||||
if ((pat_patctl.control & CTL_NL_SET) != 0)
|
||||
if ((pat_patctl.control2 & CTL_NL_SET) != 0)
|
||||
{
|
||||
SETFLD(compiled_code, flags, FLD(compiled_code, flags) | PCRE2_NL_SET);
|
||||
}
|
||||
|
@ -5191,17 +5212,25 @@ if ((pat_patctl.control & CTL_PUSH) != 0)
|
|||
SET(compiled_code, NULL);
|
||||
}
|
||||
|
||||
/* The "pushcopy" control is similar, but pushes a copy of the pattern. This
|
||||
tests the pcre2_code_copy() function. */
|
||||
/* The "pushcopy" and "pushtablescopy" controls are similar to "push", but push
|
||||
a copy of the pattern; "pushtablescopy" also copies its character tables. They
|
||||
test the pcre2_code_copy() and pcre2_code_copy_with_tables() functions. */
|
||||
|
||||
if ((pat_patctl.control & CTL_PUSHCOPY) != 0)
|
||||
if ((pat_patctl.control & (CTL_PUSHCOPY|CTL_PUSHTABLESCOPY)) != 0)
|
||||
{
|
||||
if (patstacknext >= PATSTACKSIZE)
|
||||
{
|
||||
fprintf(outfile, "** Too many pushed patterns (max %d)\n", PATSTACKSIZE);
|
||||
return PR_ABEND;
|
||||
}
|
||||
PCRE2_CODE_COPY_TO_VOID(patstack[patstacknext++], compiled_code);
|
||||
if ((pat_patctl.control & CTL_PUSHCOPY) != 0)
|
||||
{
|
||||
PCRE2_CODE_COPY_TO_VOID(patstack[patstacknext++], compiled_code);
|
||||
}
|
||||
else
|
||||
{
|
||||
PCRE2_CODE_COPY_WITH_TABLES_TO_VOID(patstack[patstacknext++],
|
||||
compiled_code); }
|
||||
}
|
||||
|
||||
return PR_OK;
|
||||
|
|
|
@ -88,4 +88,13 @@
|
|||
|
||||
#pop should give an error
|
||||
|
||||
/abcd/pushtablescopy
|
||||
abcd
|
||||
|
||||
#popcopy
|
||||
abcd
|
||||
|
||||
#pop
|
||||
abcd
|
||||
|
||||
# End of testinput20
|
||||
|
|
|
@ -135,4 +135,16 @@ Serialization failed: error -30: patterns do not all use the same character tabl
|
|||
#pop should give an error
|
||||
** Can't pop off an empty stack
|
||||
|
||||
/abcd/pushtablescopy
|
||||
abcd
|
||||
0: abcd
|
||||
|
||||
#popcopy
|
||||
abcd
|
||||
0: abcd
|
||||
|
||||
#pop
|
||||
abcd
|
||||
0: abcd
|
||||
|
||||
# End of testinput20
|
||||
|
|
Loading…
Reference in New Issue