Tidy pcre2demo.c
This commit is contained in:
parent
6c1c817438
commit
4e67c0c9e9
|
@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
|
|||
modifier had this effect. That option is now ignored when the POSIX API is in
|
||||
use.
|
||||
|
||||
8. Minor tidies to the pcre2demo.c sample program, including more comments
|
||||
about its 8-bit-ness.
|
||||
|
||||
|
||||
Version 10.21 12-January-2016
|
||||
-----------------------------
|
||||
|
|
|
@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
|
|||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||
were followed by ?: but named parentheses can still be used for capturing (and
|
||||
they acquire numbers in the usual way). There is no equivalent of this option
|
||||
in Perl.
|
||||
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||
references or recursion/subroutine calls) may only refer to named groups,
|
||||
though the reference can be by name or by number.
|
||||
<pre>
|
||||
PCRE2_NO_AUTO_POSSESS
|
||||
</pre>
|
||||
|
@ -3121,9 +3123,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 16 December 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -56,9 +59,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\n", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\n");
|
||||
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\n");
|
||||
|
@ -371,7 +383,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\r' &&
|
||||
subject[start_offset + 1] == '\n')
|
||||
|
@ -417,7 +429,7 @@ for (;;)
|
|||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\n");
|
||||
|
|
|
@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
|
|||
<a href="#lookbehind">(described below)</a>
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
|
||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
</P>
|
||||
|
|
|
@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
|
|||
expression 8-bit library. See the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation for a description of PCRE2's native API, which contains much
|
||||
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
|
||||
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
|
||||
and 32-bit libraries.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
|
|||
replacement library. Other POSIX options are not even defined.
|
||||
</P>
|
||||
<P>
|
||||
There are also some other options that are not defined by POSIX. These have
|
||||
been added at the request of users who want to make use of certain
|
||||
PCRE2-specific features via the POSIX calling interface.
|
||||
There are also some options that are not defined by POSIX. These have been
|
||||
added at the request of users who want to make use of certain PCRE2-specific
|
||||
features via the POSIX calling interface.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
||||
|
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
|
|||
<pre>
|
||||
REG_NOSUB
|
||||
</pre>
|
||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
|
||||
for compilation to the native function. In addition, when a pattern that is
|
||||
compiled with this flag is passed to <b>regexec()</b> for matching, the
|
||||
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
|
||||
are returned.
|
||||
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
|
||||
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
|
||||
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
|
||||
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
|
||||
because it disables the use of back references.
|
||||
<pre>
|
||||
REG_UCP
|
||||
</pre>
|
||||
|
@ -241,11 +241,12 @@ mutually exclusive; the error REG_INVARG is returned.
|
|||
<P>
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
||||
<b>regexec()</b> are ignored.
|
||||
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
|
||||
</P>
|
||||
<P>
|
||||
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
|
||||
no data about any matched strings is returned.
|
||||
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
|
||||
(unless REG_STARTEND is set); in both these cases no data about any matched
|
||||
strings is returned.
|
||||
</P>
|
||||
<P>
|
||||
Otherwise, the portion of the string that was matched, and also any captured
|
||||
|
@ -290,9 +291,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 29 November 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
|
|||
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
||||
</P>
|
||||
<P>
|
||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
||||
regular expression that is its first argument, and matches it against the
|
||||
subject string in its second argument. No PCRE2 options are set, and default
|
||||
character tables are used. If matching succeeds, the program outputs the
|
||||
portion of the subject that matched, together with the contents of any captured
|
||||
substrings.
|
||||
The demonstration program compiles the regular expression that is its
|
||||
first argument, and matches it against the subject string in its second
|
||||
argument. No PCRE2 options are set, and default character tables are used. If
|
||||
matching succeeds, the program outputs the portion of the subject that matched,
|
||||
together with the contents of any captured substrings.
|
||||
</P>
|
||||
<P>
|
||||
If the -g option is given on the command line, the program then goes on to
|
||||
|
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
|
|||
an empty string. Comments in the code explain what is going on.
|
||||
</P>
|
||||
<P>
|
||||
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
|
||||
library. It handles strings and characters that are stored in 8-bit code units.
|
||||
By default, one character corresponds to one code unit, but if the pattern
|
||||
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||
where characters may occupy multiple code units.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2 is installed in the standard include and library directories for your
|
||||
operating system, you should be able to compile the demonstration program using
|
||||
this command:
|
||||
a command like this:
|
||||
<pre>
|
||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
</pre>
|
||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||
<i>/usr/local</i>, you can compile the demonstration program using a command
|
||||
like this:
|
||||
<pre>
|
||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
Once you have compiled and linked the demonstration program, you can run simple
|
||||
tests like this:
|
||||
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||
</pre>
|
||||
Once you have built the demonstration program, you can run simple tests like
|
||||
this:
|
||||
<pre>
|
||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||
</pre>
|
||||
Note that there is a much more comprehensive test program, called
|
||||
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
||||
which supports many more facilities for testing regular expressions using the
|
||||
PCRE2 libraries. The
|
||||
which supports many more facilities for testing regular expressions using all
|
||||
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||
installed). The
|
||||
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
||||
program is provided as a simple coding example.
|
||||
program is provided as a relatively simple coding example.
|
||||
</P>
|
||||
<P>
|
||||
If you try to run
|
||||
|
@ -73,7 +77,7 @@ If you try to run
|
|||
when PCRE2 is not installed in the standard library directory, you may get an
|
||||
error like this on some operating systems (e.g. Solaris):
|
||||
<pre>
|
||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
||||
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||
</pre>
|
||||
This is caused by the way shared library support works on those systems. You
|
||||
need to add
|
||||
|
@ -97,9 +101,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 20 October 2014
|
||||
Last updated: 02 February 2016
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -98,10 +98,11 @@ further data is read.
|
|||
</P>
|
||||
<P>
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
||||
for backslash escapes, which makes it possible to include any data value.
|
||||
characters in <b>pcre2test</b> input files. There is a facility for specifying
|
||||
some or all of a pattern's characters as hexadecimal pairs, thus making it
|
||||
possible to include binary zeroes in a pattern for testing purposes. Subject
|
||||
lines are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||
<P>
|
||||
|
@ -559,7 +560,7 @@ about the pattern:
|
|||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
hex unquoted characters are hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
|
@ -570,6 +571,7 @@ about the pattern:
|
|||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
posix_nosub use the POSIX API with REG_NOSUB
|
||||
push push compiled pattern onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
tables=[0|1|2] select internal tables
|
||||
|
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
|||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
Specifying pattern characters in hexadecimal
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>hex</b> modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted between
|
||||
pairs. For example:
|
||||
The <b>hex</b> modifier specifies that the characters of the pattern, except for
|
||||
substrings enclosed in single or double quotes, are to be interpreted as pairs
|
||||
of hexadecimal digits. This feature is provided as a way of creating patterns
|
||||
that contain binary zeros and other non-printing characters. White space is
|
||||
permitted between pairs of digits. For example, this pattern contains three
|
||||
characters:
|
||||
<pre>
|
||||
/ab 32 59/hex
|
||||
</pre>
|
||||
This feature is provided as a way of creating patterns that contain binary zero
|
||||
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
||||
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
||||
actual length of the pattern is passed.
|
||||
Parts of such a pattern are taken literally if quoted. This pattern contains
|
||||
nine characters, only two of which are specified in hexadecimal:
|
||||
<pre>
|
||||
/ab "literal" 32/hex
|
||||
</pre>
|
||||
Either single or double quotes may be used. There is no way of including
|
||||
the delimiter within a substring.
|
||||
</P>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
||||
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||
patterns specified with the <b>hex</b> modifier, the actual length of the
|
||||
pattern is passed.
|
||||
</P>
|
||||
<br><b>
|
||||
Generating long repetitive patterns
|
||||
|
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
|
|||
Using the POSIX wrapper API
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
|
||||
wrapper API rather than its native API. This supports only the 8-bit library.
|
||||
Note that it does not imply POSIX matching semantics; for more detail see the
|
||||
The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
|
||||
PCRE2 via the POSIX wrapper API rather than its native API. When
|
||||
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
|
||||
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
|
||||
it does not imply POSIX matching semantics; for more detail see the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
documentation. When the POSIX API is being used, the following pattern
|
||||
modifiers set options for the <b>regcomp()</b> function:
|
||||
documentation. The following pattern modifiers set options for the
|
||||
<b>regcomp()</b> function:
|
||||
<pre>
|
||||
caseless REG_ICASE
|
||||
multiline REG_NEWLINE
|
||||
no_auto_capture REG_NOSUB
|
||||
dotall REG_DOTALL )
|
||||
ungreedy REG_UNGREEDY ) These options are not part of
|
||||
ucp REG_UCP ) the POSIX standard
|
||||
|
@ -847,7 +861,8 @@ large buffer is used.
|
|||
</P>
|
||||
<P>
|
||||
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
below. All other modifiers are either ignored, with a warning message, or cause
|
||||
an error.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing the stack guard feature
|
||||
|
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
|
|||
wrapper API to be used, the only option-setting modifiers that have any effect
|
||||
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
||||
Any other modifiers cause an error.
|
||||
The other modifiers are ignored, with a warning message.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting match controls
|
||||
|
@ -1001,7 +1016,10 @@ pattern.
|
|||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
</pre>
|
||||
The effects of these modifiers are described in the following sections.
|
||||
The effects of these modifiers are described in the following sections. When
|
||||
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
|
||||
and <b>ovector</b> subject modifiers work as described below. All other
|
||||
modifiers are either ignored, with a warning message, or cause an error.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing more text
|
||||
|
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
|
|||
modifier list containing only
|
||||
<a href="#controlmodifiers">control modifiers</a>
|
||||
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
||||
<b>posix</b>, and <b>push</b> are not allowed, nor are any
|
||||
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
|
||||
<a href="#optionmodifiers">option-setting modifiers.</a>
|
||||
The JIT modifiers are, however, permitted. Here is an example that saves and
|
||||
reloads two patterns.
|
||||
|
@ -1660,9 +1678,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 12 December 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -1326,7 +1326,10 @@ COMPILING A PATTERN
|
|||
theses in the pattern. Any opening parenthesis that is not followed by
|
||||
? behaves as if it were followed by ?: but named parentheses can still
|
||||
be used for capturing (and they acquire numbers in the usual way).
|
||||
There is no equivalent of this option in Perl.
|
||||
There is no equivalent of this option in Perl. Note that, if this
|
||||
option is set, references to capturing groups (back references or
|
||||
recursion/subroutine calls) may only refer to named groups, though the
|
||||
reference can be by name or by number.
|
||||
|
||||
PCRE2_NO_AUTO_POSSESS
|
||||
|
||||
|
@ -3055,8 +3058,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 16 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
Last updated: 31 January 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -6231,7 +6234,7 @@ MATCHING A SINGLE CODE UNIT
|
|||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||
below) in a UTF mode, because this would make it impossible to calcu-
|
||||
late the length of the lookbehind. Neither the alternative matching
|
||||
function pcre2_dfa_match() not the JIT optimizer support \C in a UTF
|
||||
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
|
||||
mode. The former gives a match-time error; the latter fails to optimize
|
||||
and so the match is always run using the interpreter.
|
||||
|
||||
|
@ -8460,7 +8463,7 @@ DESCRIPTION
|
|||
This set of functions provides a POSIX-style API for the PCRE2 regular
|
||||
expression 8-bit library. See the pcre2api documentation for a descrip-
|
||||
tion of PCRE2's native API, which contains much additional functional-
|
||||
ity. There is no POSIX-style wrapper for PCRE2's 16-bit and 32-bit
|
||||
ity. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit
|
||||
libraries.
|
||||
|
||||
The functions described here are just wrapper functions that ultimately
|
||||
|
@ -8478,8 +8481,8 @@ DESCRIPTION
|
|||
easier to slot in PCRE2 as a replacement library. Other POSIX options
|
||||
are not even defined.
|
||||
|
||||
There are also some other options that are not defined by POSIX. These
|
||||
have been added at the request of users who want to make use of certain
|
||||
There are also some options that are not defined by POSIX. These have
|
||||
been added at the request of users who want to make use of certain
|
||||
PCRE2-specific features via the POSIX calling interface.
|
||||
|
||||
When PCRE2 is called via these functions, it is only the API that is
|
||||
|
@ -8530,11 +8533,11 @@ COMPILING A PATTERN
|
|||
|
||||
REG_NOSUB
|
||||
|
||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is
|
||||
passed for compilation to the native function. In addition, when a pat-
|
||||
tern that is compiled with this flag is passed to regexec() for match-
|
||||
ing, the nmatch and pmatch arguments are ignored, and no captured
|
||||
strings are returned.
|
||||
When a pattern that is compiled with this flag is passed to regexec()
|
||||
for matching, the nmatch and pmatch arguments are ignored, and no cap-
|
||||
tured strings are returned. Versions of the PCRE2 library prior to 10.22
|
||||
used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no
|
||||
longer happens because it disables the use of back references.
|
||||
|
||||
REG_UCP
|
||||
|
||||
|
@ -8653,17 +8656,18 @@ MATCHING A PATTERN
|
|||
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any
|
||||
matched strings is returned. The nmatch and pmatch arguments of
|
||||
regexec() are ignored.
|
||||
regexec() are ignored (except possibly as input for REG_STARTEND).
|
||||
|
||||
If the value of nmatch is zero, or if the value pmatch is NULL, no data
|
||||
about any matched strings is returned.
|
||||
The value of nmatch may be zero, and the value of pmatch may be NULL
|
||||
(unless REG_STARTEND is set); in both these cases no data about any
|
||||
matched strings is returned.
|
||||
|
||||
Otherwise,the portion of the string that was matched, and also any cap-
|
||||
tured substrings, are returned via the pmatch argument, which points to
|
||||
an array of nmatch structures of type regmatch_t, containing the mem-
|
||||
bers rm_so and rm_eo. These contain the byte offset to the first char-
|
||||
acter of each substring and the offset to the first character after the
|
||||
end of each substring, respectively. The 0th element of the vector
|
||||
Otherwise, the portion of the string that was matched, and also any
|
||||
captured substrings, are returned via the pmatch argument, which points
|
||||
to an array of nmatch structures of type regmatch_t, containing the
|
||||
members rm_so and rm_eo. These contain the byte offset to the first
|
||||
character of each substring and the offset to the first character after
|
||||
the end of each substring, respectively. The 0th element of the vector
|
||||
relates to the entire portion of string that was matched; subsequent
|
||||
elements relate to the capturing subpatterns of the regular expression.
|
||||
Unused entries in the array have both structure members set to -1.
|
||||
|
@ -8702,8 +8706,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 29 November 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
Last updated: 31 January 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -8722,12 +8726,12 @@ PCRE2 SAMPLE PROGRAM
|
|||
documentation. If you do not have a copy of the PCRE2 distribution, you
|
||||
can save this listing to re-create the contents of pcre2demo.c.
|
||||
|
||||
The demonstration program, which uses the PCRE2 8-bit library, compiles
|
||||
the regular expression that is its first argument, and matches it
|
||||
against the subject string in its second argument. No PCRE2 options are
|
||||
set, and default character tables are used. If matching succeeds, the
|
||||
program outputs the portion of the subject that matched, together with
|
||||
the contents of any captured substrings.
|
||||
The demonstration program compiles the regular expression that is its
|
||||
first argument, and matches it against the subject string in its second
|
||||
argument. No PCRE2 options are set, and default character tables are
|
||||
used. If matching succeeds, the program outputs the portion of the sub-
|
||||
ject that matched, together with the contents of any captured sub-
|
||||
strings.
|
||||
|
||||
If the -g option is given on the command line, the program then goes on
|
||||
to check for further matches of the same regular expression in the same
|
||||
|
@ -8735,38 +8739,45 @@ PCRE2 SAMPLE PROGRAM
|
|||
bility of matching an empty string. Comments in the code explain what
|
||||
is going on.
|
||||
|
||||
The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit
|
||||
library. It handles strings and characters that are stored in 8-bit
|
||||
code units. By default, one character corresponds to one code unit,
|
||||
but if the pattern starts with "(*UTF)", both it and the subject are
|
||||
treated as UTF-8 strings, where characters may occupy multiple code
|
||||
units.
|
||||
|
||||
If PCRE2 is installed in the standard include and library directories
|
||||
for your operating system, you should be able to compile the demonstra-
|
||||
tion program using this command:
|
||||
tion program using a command like this:
|
||||
|
||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
|
||||
If PCRE2 is installed elsewhere, you may need to add additional options
|
||||
to the command line. For example, on a Unix-like system that has PCRE2
|
||||
installed in /usr/local, you can compile the demonstration program
|
||||
using a command like this:
|
||||
|
||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \
|
||||
cc -o pcre2demo -I/usr/local/include pcre2demo.c \
|
||||
-L/usr/local/lib -lpcre2-8
|
||||
|
||||
|
||||
Once you have compiled and linked the demonstration program, you can
|
||||
run simple tests like this:
|
||||
Once you have built the demonstration program, you can run simple tests
|
||||
like this:
|
||||
|
||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||
|
||||
Note that there is a much more comprehensive test program, called
|
||||
pcre2test, which supports many more facilities for testing regular
|
||||
expressions using the PCRE2 libraries. The pcre2demo program is pro-
|
||||
vided as a simple coding example.
|
||||
expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit,
|
||||
though not all three need be installed). The pcre2demo program is pro-
|
||||
vided as a relatively simple coding example.
|
||||
|
||||
If you try to run pcre2demo when PCRE2 is not installed in the standard
|
||||
library directory, you may get an error like this on some operating
|
||||
systems (e.g. Solaris):
|
||||
|
||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or
|
||||
directory
|
||||
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file
|
||||
or directory
|
||||
|
||||
This is caused by the way shared library support works on those sys-
|
||||
tems. You need to add
|
||||
|
@ -8785,8 +8796,8 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 20 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 02 February 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3)
|
||||
|
||||
|
|
|
@ -20,28 +20,31 @@
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -56,9 +59,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\en", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\en");
|
||||
printf("Exactly two arguments required: a regex and a subject string\en");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\en"); else
|
||||
if (namecount == 0) printf("No named substrings\en"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\en");
|
||||
|
@ -371,7 +383,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\er' &&
|
||||
subject[start_offset + 1] == '\en')
|
||||
|
@ -417,7 +429,7 @@ for (;;)
|
|||
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\en"); else
|
||||
if (namecount == 0) printf("No named substrings\en"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\en");
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
|
||||
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 SAMPLE PROGRAM"
|
||||
|
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
|
|||
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
||||
.P
|
||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
||||
regular expression that is its first argument, and matches it against the
|
||||
subject string in its second argument. No PCRE2 options are set, and default
|
||||
character tables are used. If matching succeeds, the program outputs the
|
||||
portion of the subject that matched, together with the contents of any captured
|
||||
substrings.
|
||||
The demonstration program compiles the regular expression that is its
|
||||
first argument, and matches it against the subject string in its second
|
||||
argument. No PCRE2 options are set, and default character tables are used. If
|
||||
matching succeeds, the program outputs the portion of the subject that matched,
|
||||
together with the contents of any captured substrings.
|
||||
.P
|
||||
If the -g option is given on the command line, the program then goes on to
|
||||
check for further matches of the same regular expression in the same subject
|
||||
string. The logic is a little bit tricky because of the possibility of matching
|
||||
an empty string. Comments in the code explain what is going on.
|
||||
.P
|
||||
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
|
||||
library. It handles strings and characters that are stored in 8-bit code units.
|
||||
By default, one character corresponds to one code unit, but if the pattern
|
||||
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||
where characters may occupy multiple code units.
|
||||
.P
|
||||
If PCRE2 is installed in the standard include and library directories for your
|
||||
operating system, you should be able to compile the demonstration program using
|
||||
this command:
|
||||
a command like this:
|
||||
.sp
|
||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
.sp
|
||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||
|
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
|
|||
like this:
|
||||
.sp
|
||||
.\" JOINSH
|
||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||
-L/usr/local/lib -lpcre2-8
|
||||
.sp
|
||||
.P
|
||||
Once you have compiled and linked the demonstration program, you can run simple
|
||||
tests like this:
|
||||
Once you have built the demonstration program, you can run simple tests like
|
||||
this:
|
||||
.sp
|
||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||
|
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
|
|||
.\" HREF
|
||||
\fBpcre2test\fP,
|
||||
.\"
|
||||
which supports many more facilities for testing regular expressions using the
|
||||
PCRE2 libraries. The
|
||||
which supports many more facilities for testing regular expressions using all
|
||||
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||
installed). The
|
||||
.\" HREF
|
||||
\fBpcre2demo\fP
|
||||
.\"
|
||||
program is provided as a simple coding example.
|
||||
program is provided as a relatively simple coding example.
|
||||
.P
|
||||
If you try to run
|
||||
.\" HREF
|
||||
|
@ -65,7 +70,7 @@ If you try to run
|
|||
when PCRE2 is not installed in the standard library directory, you may get an
|
||||
error like this on some operating systems (e.g. Solaris):
|
||||
.sp
|
||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
||||
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||
.sp
|
||||
This is caused by the way shared library support works on those systems. You
|
||||
need to add
|
||||
|
@ -89,6 +94,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 02 February 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -67,10 +67,10 @@ INPUT ENCODING
|
|||
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in pcre2test input files. There is a facility for specifying
|
||||
a pattern's characters as hexadecimal pairs, thus making it possible to
|
||||
include binary zeroes in a pattern for testing purposes. Subject lines
|
||||
are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
some or all of a pattern's characters as hexadecimal pairs, thus making
|
||||
it possible to include binary zeroes in a pattern for testing purposes.
|
||||
Subject lines are processed for backslash escapes, which makes it pos-
|
||||
sible to include any data value.
|
||||
|
||||
|
||||
COMMAND LINE OPTIONS
|
||||
|
@ -505,7 +505,7 @@ PATTERN MODIFIERS
|
|||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
hex unquoted characters are hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
|
@ -516,6 +516,7 @@ PATTERN MODIFIERS
|
|||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
posix_nosub use the POSIX API with REG_NOSUB
|
||||
push push compiled pattern onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
tables=[0|1|2] select internal tables
|
||||
|
@ -591,19 +592,30 @@ PATTERN MODIFIERS
|
|||
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||
default values).
|
||||
|
||||
Specifying a pattern in hex
|
||||
Specifying pattern characters in hexadecimal
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
between pairs. For example:
|
||||
The hex modifier specifies that the characters of the pattern, except
|
||||
for substrings enclosed in single or double quotes, are to be inter-
|
||||
preted as pairs of hexadecimal digits. This feature is provided as a
|
||||
way of creating patterns that contain binary zeros and other non-print-
|
||||
ing characters. White space is permitted between pairs of digits. For
|
||||
example, this pattern contains three characters:
|
||||
|
||||
/ab 32 59/hex
|
||||
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero and other non-printing characters. By default, pcre2test
|
||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||
hexadecimal, the actual length of the pattern is passed.
|
||||
Parts of such a pattern are taken literally if quoted. This pattern
|
||||
contains nine characters, only two of which are specified in hexadeci-
|
||||
mal:
|
||||
|
||||
/ab "literal" 32/hex
|
||||
|
||||
Either single or double quotes may be used. There is no way of includ-
|
||||
ing the delimiter within a substring.
|
||||
|
||||
By default, pcre2test passes patterns as zero-terminated strings to
|
||||
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
|
||||
for patterns specified with the hex modifier, the actual length of the
|
||||
pattern is passed.
|
||||
|
||||
Generating long repetitive patterns
|
||||
|
||||
|
@ -732,16 +744,16 @@ PATTERN MODIFIERS
|
|||
|
||||
Using the POSIX wrapper API
|
||||
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
library. Note that it does not imply POSIX matching semantics; for
|
||||
more detail see the pcre2posix documentation. When the POSIX API is
|
||||
being used, the following pattern modifiers set options for the reg-
|
||||
comp() function:
|
||||
The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
|
||||
the POSIX wrapper API rather than its native API. When posix_nosub is
|
||||
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
|
||||
wrapper supports only the 8-bit library. Note that it does not imply
|
||||
POSIX matching semantics; for more detail see the pcre2posix documenta-
|
||||
tion. The following pattern modifiers set options for the regcomp()
|
||||
function:
|
||||
|
||||
caseless REG_ICASE
|
||||
multiline REG_NEWLINE
|
||||
no_auto_capture REG_NOSUB
|
||||
dotall REG_DOTALL )
|
||||
ungreedy REG_UNGREEDY ) These options are not part of
|
||||
ucp REG_UCP ) the POSIX standard
|
||||
|
@ -758,7 +770,8 @@ PATTERN MODIFIERS
|
|||
been set, a large buffer is used.
|
||||
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
below. All other modifiers are either ignored, with a warning message,
|
||||
or cause an error.
|
||||
|
||||
Testing the stack guard feature
|
||||
|
||||
|
@ -855,7 +868,7 @@ SUBJECT MODIFIERS
|
|||
wrapper API to be used, the only option-setting modifiers that have any
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
Any other modifiers cause an error.
|
||||
The other modifiers are ignored, with a warning message.
|
||||
|
||||
Setting match controls
|
||||
|
||||
|
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
|
|||
zero_terminate pass the subject as zero-terminated
|
||||
|
||||
The effects of these modifiers are described in the following sections.
|
||||
When matching via the POSIX wrapper API, the aftertext, allaftertext,
|
||||
and ovector subject modifiers work as described below. All other modi-
|
||||
fiers are either ignored, with a warning message, or cause an error.
|
||||
|
||||
Showing more text
|
||||
|
||||
|
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, and push are not allowed, nor are any option-
|
||||
setting modifiers. The JIT modifiers are, however permitted. Here is
|
||||
an example that saves and reloads two patterns.
|
||||
particular, hex, posix, posix_nosub, and push are not allowed, nor are
|
||||
any option-setting modifiers. The JIT modifiers are, however, permit-
|
||||
ted. Here is an example that saves and reloads two patterns.
|
||||
|
||||
/abc/push
|
||||
/xyz/push
|
||||
|
@ -1505,5 +1521,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 12 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
Last updated: 31 January 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
|
|
|
@ -3,28 +3,31 @@
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -39,9 +42,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -62,19 +70,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -89,14 +97,18 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\n", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
|
@ -105,7 +117,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\n");
|
||||
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -184,7 +196,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\n");
|
||||
|
@ -354,7 +366,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\r' &&
|
||||
subject[start_offset + 1] == '\n')
|
||||
|
@ -400,7 +412,7 @@ for (;;)
|
|||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\n");
|
||||
|
|
Loading…
Reference in New Issue