Tidy pcre2demo.c
This commit is contained in:
parent
6c1c817438
commit
4e67c0c9e9
|
@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
|
||||||
modifier had this effect. That option is now ignored when the POSIX API is in
|
modifier had this effect. That option is now ignored when the POSIX API is in
|
||||||
use.
|
use.
|
||||||
|
|
||||||
|
8. Minor tidies to the pcre2demo.c sample program, including more comments
|
||||||
|
about its 8-bit-ness.
|
||||||
|
|
||||||
|
|
||||||
Version 10.21 12-January-2016
|
Version 10.21 12-January-2016
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
|
@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
|
||||||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||||
were followed by ?: but named parentheses can still be used for capturing (and
|
were followed by ?: but named parentheses can still be used for capturing (and
|
||||||
they acquire numbers in the usual way). There is no equivalent of this option
|
they acquire numbers in the usual way). There is no equivalent of this option
|
||||||
in Perl.
|
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||||
|
references or recursion/subroutine calls) may only refer to named groups,
|
||||||
|
though the reference can be by name or by number.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NO_AUTO_POSSESS
|
PCRE2_NO_AUTO_POSSESS
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -3121,9 +3123,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 December 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -56,9 +59,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\n", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\n");
|
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
@ -371,7 +383,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\r' &&
|
subject[start_offset] == '\r' &&
|
||||||
subject[start_offset + 1] == '\n')
|
subject[start_offset + 1] == '\n')
|
||||||
|
@ -417,7 +429,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
|
|
@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
|
||||||
<a href="#lookbehind">(described below)</a>
|
<a href="#lookbehind">(described below)</a>
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind. Neither the alternative matching function
|
the lookbehind. Neither the alternative matching function
|
||||||
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
|
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||||
former gives a match-time error; the latter fails to optimize and so the match
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
is always run using the interpreter.
|
is always run using the interpreter.
|
||||||
</P>
|
</P>
|
||||||
|
|
|
@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
|
||||||
expression 8-bit library. See the
|
expression 8-bit library. See the
|
||||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
documentation for a description of PCRE2's native API, which contains much
|
documentation for a description of PCRE2's native API, which contains much
|
||||||
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
|
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
|
||||||
and 32-bit libraries.
|
and 32-bit libraries.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
|
||||||
replacement library. Other POSIX options are not even defined.
|
replacement library. Other POSIX options are not even defined.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There are also some other options that are not defined by POSIX. These have
|
There are also some options that are not defined by POSIX. These have been
|
||||||
been added at the request of users who want to make use of certain
|
added at the request of users who want to make use of certain PCRE2-specific
|
||||||
PCRE2-specific features via the POSIX calling interface.
|
features via the POSIX calling interface.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
||||||
|
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
|
||||||
<pre>
|
<pre>
|
||||||
REG_NOSUB
|
REG_NOSUB
|
||||||
</pre>
|
</pre>
|
||||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
|
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
|
||||||
for compilation to the native function. In addition, when a pattern that is
|
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
|
||||||
compiled with this flag is passed to <b>regexec()</b> for matching, the
|
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
|
||||||
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
|
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
|
||||||
are returned.
|
because it disables the use of back references.
|
||||||
<pre>
|
<pre>
|
||||||
REG_UCP
|
REG_UCP
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -241,11 +241,12 @@ mutually exclusive; the error REG_INVARG is returned.
|
||||||
<P>
|
<P>
|
||||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||||
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
||||||
<b>regexec()</b> are ignored.
|
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
|
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
|
||||||
no data about any matched strings is returned.
|
(unless REG_STARTEND is set); in both these cases no data about any matched
|
||||||
|
strings is returned.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Otherwise, the portion of the string that was matched, and also any captured
|
Otherwise, the portion of the string that was matched, and also any captured
|
||||||
|
@ -290,9 +291,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 29 November 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||||
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
The demonstration program compiles the regular expression that is its
|
||||||
regular expression that is its first argument, and matches it against the
|
first argument, and matches it against the subject string in its second
|
||||||
subject string in its second argument. No PCRE2 options are set, and default
|
argument. No PCRE2 options are set, and default character tables are used. If
|
||||||
character tables are used. If matching succeeds, the program outputs the
|
matching succeeds, the program outputs the portion of the subject that matched,
|
||||||
portion of the subject that matched, together with the contents of any captured
|
together with the contents of any captured substrings.
|
||||||
substrings.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the -g option is given on the command line, the program then goes on to
|
If the -g option is given on the command line, the program then goes on to
|
||||||
|
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
|
||||||
an empty string. Comments in the code explain what is going on.
|
an empty string. Comments in the code explain what is going on.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
|
||||||
|
library. It handles strings and characters that are stored in 8-bit code units.
|
||||||
|
By default, one character corresponds to one code unit, but if the pattern
|
||||||
|
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||||
|
where characters may occupy multiple code units.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
If PCRE2 is installed in the standard include and library directories for your
|
If PCRE2 is installed in the standard include and library directories for your
|
||||||
operating system, you should be able to compile the demonstration program using
|
operating system, you should be able to compile the demonstration program using
|
||||||
this command:
|
a command like this:
|
||||||
<pre>
|
<pre>
|
||||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||||
</pre>
|
</pre>
|
||||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
<i>/usr/local</i>, you can compile the demonstration program using a command
|
<i>/usr/local</i>, you can compile the demonstration program using a command
|
||||||
like this:
|
like this:
|
||||||
<pre>
|
<pre>
|
||||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||||
|
</pre>
|
||||||
</PRE>
|
Once you have built the demonstration program, you can run simple tests like
|
||||||
</P>
|
this:
|
||||||
<P>
|
|
||||||
Once you have compiled and linked the demonstration program, you can run simple
|
|
||||||
tests like this:
|
|
||||||
<pre>
|
<pre>
|
||||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||||
</pre>
|
</pre>
|
||||||
Note that there is a much more comprehensive test program, called
|
Note that there is a much more comprehensive test program, called
|
||||||
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
||||||
which supports many more facilities for testing regular expressions using the
|
which supports many more facilities for testing regular expressions using all
|
||||||
PCRE2 libraries. The
|
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||||
|
installed). The
|
||||||
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
||||||
program is provided as a simple coding example.
|
program is provided as a relatively simple coding example.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you try to run
|
If you try to run
|
||||||
|
@ -73,7 +77,7 @@ If you try to run
|
||||||
when PCRE2 is not installed in the standard library directory, you may get an
|
when PCRE2 is not installed in the standard library directory, you may get an
|
||||||
error like this on some operating systems (e.g. Solaris):
|
error like this on some operating systems (e.g. Solaris):
|
||||||
<pre>
|
<pre>
|
||||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||||
</pre>
|
</pre>
|
||||||
This is caused by the way shared library support works on those systems. You
|
This is caused by the way shared library support works on those systems. You
|
||||||
need to add
|
need to add
|
||||||
|
@ -97,9 +101,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 20 October 2014
|
Last updated: 02 February 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -98,10 +98,11 @@ further data is read.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
For maximum portability, therefore, it is safest to avoid non-printing
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
characters in <b>pcre2test</b> input files. There is a facility for specifying
|
||||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
some or all of a pattern's characters as hexadecimal pairs, thus making it
|
||||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
possible to include binary zeroes in a pattern for testing purposes. Subject
|
||||||
for backslash escapes, which makes it possible to include any data value.
|
lines are processed for backslash escapes, which makes it possible to include
|
||||||
|
any data value.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -559,7 +560,7 @@ about the pattern:
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex unquoted characters are hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
|
@ -570,6 +571,7 @@ about the pattern:
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
|
posix_nosub use the POSIX API with REG_NOSUB
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Specifying a pattern in hex
|
Specifying pattern characters in hexadecimal
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>hex</b> modifier specifies that the characters of the pattern are to be
|
The <b>hex</b> modifier specifies that the characters of the pattern, except for
|
||||||
interpreted as pairs of hexadecimal digits. White space is permitted between
|
substrings enclosed in single or double quotes, are to be interpreted as pairs
|
||||||
pairs. For example:
|
of hexadecimal digits. This feature is provided as a way of creating patterns
|
||||||
|
that contain binary zeros and other non-printing characters. White space is
|
||||||
|
permitted between pairs of digits. For example, this pattern contains three
|
||||||
|
characters:
|
||||||
<pre>
|
<pre>
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
</pre>
|
</pre>
|
||||||
This feature is provided as a way of creating patterns that contain binary zero
|
Parts of such a pattern are taken literally if quoted. This pattern contains
|
||||||
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
nine characters, only two of which are specified in hexadecimal:
|
||||||
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
<pre>
|
||||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
/ab "literal" 32/hex
|
||||||
actual length of the pattern is passed.
|
</pre>
|
||||||
|
Either single or double quotes may be used. There is no way of including
|
||||||
|
the delimiter within a substring.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
||||||
|
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||||
|
patterns specified with the <b>hex</b> modifier, the actual length of the
|
||||||
|
pattern is passed.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Generating long repetitive patterns
|
Generating long repetitive patterns
|
||||||
|
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
|
||||||
Using the POSIX wrapper API
|
Using the POSIX wrapper API
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
|
The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
|
||||||
wrapper API rather than its native API. This supports only the 8-bit library.
|
PCRE2 via the POSIX wrapper API rather than its native API. When
|
||||||
Note that it does not imply POSIX matching semantics; for more detail see the
|
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
|
||||||
|
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
|
||||||
|
it does not imply POSIX matching semantics; for more detail see the
|
||||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||||
documentation. When the POSIX API is being used, the following pattern
|
documentation. The following pattern modifiers set options for the
|
||||||
modifiers set options for the <b>regcomp()</b> function:
|
<b>regcomp()</b> function:
|
||||||
<pre>
|
<pre>
|
||||||
caseless REG_ICASE
|
caseless REG_ICASE
|
||||||
multiline REG_NEWLINE
|
multiline REG_NEWLINE
|
||||||
no_auto_capture REG_NOSUB
|
|
||||||
dotall REG_DOTALL )
|
dotall REG_DOTALL )
|
||||||
ungreedy REG_UNGREEDY ) These options are not part of
|
ungreedy REG_UNGREEDY ) These options are not part of
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
|
@ -847,7 +861,8 @@ large buffer is used.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
||||||
below. All other modifiers cause an error.
|
below. All other modifiers are either ignored, with a warning message, or cause
|
||||||
|
an error.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing the stack guard feature
|
Testing the stack guard feature
|
||||||
|
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
|
||||||
wrapper API to be used, the only option-setting modifiers that have any effect
|
wrapper API to be used, the only option-setting modifiers that have any effect
|
||||||
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
||||||
Any other modifiers cause an error.
|
The other modifiers are ignored, with a warning message.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
@ -1001,7 +1016,10 @@ pattern.
|
||||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
zero_terminate pass the subject as zero-terminated
|
zero_terminate pass the subject as zero-terminated
|
||||||
</pre>
|
</pre>
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections. When
|
||||||
|
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
|
||||||
|
and <b>ovector</b> subject modifiers work as described below. All other
|
||||||
|
modifiers are either ignored, with a warning message, or cause an error.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Showing more text
|
Showing more text
|
||||||
|
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
|
||||||
modifier list containing only
|
modifier list containing only
|
||||||
<a href="#controlmodifiers">control modifiers</a>
|
<a href="#controlmodifiers">control modifiers</a>
|
||||||
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
||||||
<b>posix</b>, and <b>push</b> are not allowed, nor are any
|
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
|
||||||
<a href="#optionmodifiers">option-setting modifiers.</a>
|
<a href="#optionmodifiers">option-setting modifiers.</a>
|
||||||
The JIT modifiers are, however, permitted. Here is an example that saves and
|
The JIT modifiers are, however, permitted. Here is an example that saves and
|
||||||
reloads two patterns.
|
reloads two patterns.
|
||||||
|
@ -1660,9 +1678,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 12 December 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -1326,7 +1326,10 @@ COMPILING A PATTERN
|
||||||
theses in the pattern. Any opening parenthesis that is not followed by
|
theses in the pattern. Any opening parenthesis that is not followed by
|
||||||
? behaves as if it were followed by ?: but named parentheses can still
|
? behaves as if it were followed by ?: but named parentheses can still
|
||||||
be used for capturing (and they acquire numbers in the usual way).
|
be used for capturing (and they acquire numbers in the usual way).
|
||||||
There is no equivalent of this option in Perl.
|
There is no equivalent of this option in Perl. Note that, if this
|
||||||
|
option is set, references to capturing groups (back references or
|
||||||
|
recursion/subroutine calls) may only refer to named groups, though the
|
||||||
|
reference can be by name or by number.
|
||||||
|
|
||||||
PCRE2_NO_AUTO_POSSESS
|
PCRE2_NO_AUTO_POSSESS
|
||||||
|
|
||||||
|
@ -3055,8 +3058,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 16 December 2015
|
Last updated: 31 January 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -6231,7 +6234,7 @@ MATCHING A SINGLE CODE UNIT
|
||||||
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
PCRE2 does not allow \C to appear in lookbehind assertions (described
|
||||||
below) in a UTF mode, because this would make it impossible to calcu-
|
below) in a UTF mode, because this would make it impossible to calcu-
|
||||||
late the length of the lookbehind. Neither the alternative matching
|
late the length of the lookbehind. Neither the alternative matching
|
||||||
function pcre2_dfa_match() not the JIT optimizer support \C in a UTF
|
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
|
||||||
mode. The former gives a match-time error; the latter fails to optimize
|
mode. The former gives a match-time error; the latter fails to optimize
|
||||||
and so the match is always run using the interpreter.
|
and so the match is always run using the interpreter.
|
||||||
|
|
||||||
|
@ -8460,7 +8463,7 @@ DESCRIPTION
|
||||||
This set of functions provides a POSIX-style API for the PCRE2 regular
|
This set of functions provides a POSIX-style API for the PCRE2 regular
|
||||||
expression 8-bit library. See the pcre2api documentation for a descrip-
|
expression 8-bit library. See the pcre2api documentation for a descrip-
|
||||||
tion of PCRE2's native API, which contains much additional functional-
|
tion of PCRE2's native API, which contains much additional functional-
|
||||||
ity. There is no POSIX-style wrapper for PCRE2's 16-bit and 32-bit
|
ity. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit
|
||||||
libraries.
|
libraries.
|
||||||
|
|
||||||
The functions described here are just wrapper functions that ultimately
|
The functions described here are just wrapper functions that ultimately
|
||||||
|
@ -8478,8 +8481,8 @@ DESCRIPTION
|
||||||
easier to slot in PCRE2 as a replacement library. Other POSIX options
|
easier to slot in PCRE2 as a replacement library. Other POSIX options
|
||||||
are not even defined.
|
are not even defined.
|
||||||
|
|
||||||
There are also some other options that are not defined by POSIX. These
|
There are also some options that are not defined by POSIX. These have
|
||||||
have been added at the request of users who want to make use of certain
|
been added at the request of users who want to make use of certain
|
||||||
PCRE2-specific features via the POSIX calling interface.
|
PCRE2-specific features via the POSIX calling interface.
|
||||||
|
|
||||||
When PCRE2 is called via these functions, it is only the API that is
|
When PCRE2 is called via these functions, it is only the API that is
|
||||||
|
@ -8530,11 +8533,11 @@ COMPILING A PATTERN
|
||||||
|
|
||||||
REG_NOSUB
|
REG_NOSUB
|
||||||
|
|
||||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is
|
When a pattern that is compiled with this flag is passed to regexec()
|
||||||
passed for compilation to the native function. In addition, when a pat-
|
for matching, the nmatch and pmatch arguments are ignored, and no cap-
|
||||||
tern that is compiled with this flag is passed to regexec() for match-
|
tured strings are returned. Versions of the PCRE2 library prior to 10.22
|
||||||
ing, the nmatch and pmatch arguments are ignored, and no captured
|
used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no
|
||||||
strings are returned.
|
longer happens because it disables the use of back references.
|
||||||
|
|
||||||
REG_UCP
|
REG_UCP
|
||||||
|
|
||||||
|
@ -8653,17 +8656,18 @@ MATCHING A PATTERN
|
||||||
|
|
||||||
If the pattern was compiled with the REG_NOSUB flag, no data about any
|
If the pattern was compiled with the REG_NOSUB flag, no data about any
|
||||||
matched strings is returned. The nmatch and pmatch arguments of
|
matched strings is returned. The nmatch and pmatch arguments of
|
||||||
regexec() are ignored.
|
regexec() are ignored (except possibly as input for REG_STARTEND).
|
||||||
|
|
||||||
If the value of nmatch is zero, or if the value pmatch is NULL, no data
|
The value of nmatch may be zero, and the value of pmatch may be NULL
|
||||||
about any matched strings is returned.
|
(unless REG_STARTEND is set); in both these cases no data about any
|
||||||
|
matched strings is returned.
|
||||||
|
|
||||||
Otherwise,the portion of the string that was matched, and also any cap-
|
Otherwise, the portion of the string that was matched, and also any
|
||||||
tured substrings, are returned via the pmatch argument, which points to
|
captured substrings, are returned via the pmatch argument, which points
|
||||||
an array of nmatch structures of type regmatch_t, containing the mem-
|
to an array of nmatch structures of type regmatch_t, containing the
|
||||||
bers rm_so and rm_eo. These contain the byte offset to the first char-
|
members rm_so and rm_eo. These contain the byte offset to the first
|
||||||
acter of each substring and the offset to the first character after the
|
character of each substring and the offset to the first character after
|
||||||
end of each substring, respectively. The 0th element of the vector
|
the end of each substring, respectively. The 0th element of the vector
|
||||||
relates to the entire portion of string that was matched; subsequent
|
relates to the entire portion of string that was matched; subsequent
|
||||||
elements relate to the capturing subpatterns of the regular expression.
|
elements relate to the capturing subpatterns of the regular expression.
|
||||||
Unused entries in the array have both structure members set to -1.
|
Unused entries in the array have both structure members set to -1.
|
||||||
|
@ -8702,8 +8706,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 29 November 2015
|
Last updated: 31 January 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -8722,12 +8726,12 @@ PCRE2 SAMPLE PROGRAM
|
||||||
documentation. If you do not have a copy of the PCRE2 distribution, you
|
documentation. If you do not have a copy of the PCRE2 distribution, you
|
||||||
can save this listing to re-create the contents of pcre2demo.c.
|
can save this listing to re-create the contents of pcre2demo.c.
|
||||||
|
|
||||||
The demonstration program, which uses the PCRE2 8-bit library, compiles
|
The demonstration program compiles the regular expression that is its
|
||||||
the regular expression that is its first argument, and matches it
|
first argument, and matches it against the subject string in its second
|
||||||
against the subject string in its second argument. No PCRE2 options are
|
argument. No PCRE2 options are set, and default character tables are
|
||||||
set, and default character tables are used. If matching succeeds, the
|
used. If matching succeeds, the program outputs the portion of the sub-
|
||||||
program outputs the portion of the subject that matched, together with
|
ject that matched, together with the contents of any captured sub-
|
||||||
the contents of any captured substrings.
|
strings.
|
||||||
|
|
||||||
If the -g option is given on the command line, the program then goes on
|
If the -g option is given on the command line, the program then goes on
|
||||||
to check for further matches of the same regular expression in the same
|
to check for further matches of the same regular expression in the same
|
||||||
|
@ -8735,38 +8739,45 @@ PCRE2 SAMPLE PROGRAM
|
||||||
bility of matching an empty string. Comments in the code explain what
|
bility of matching an empty string. Comments in the code explain what
|
||||||
is going on.
|
is going on.
|
||||||
|
|
||||||
|
The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit
|
||||||
|
library. It handles strings and characters that are stored in 8-bit
|
||||||
|
code units. By default, one character corresponds to one code unit,
|
||||||
|
but if the pattern starts with "(*UTF)", both it and the subject are
|
||||||
|
treated as UTF-8 strings, where characters may occupy multiple code
|
||||||
|
units.
|
||||||
|
|
||||||
If PCRE2 is installed in the standard include and library directories
|
If PCRE2 is installed in the standard include and library directories
|
||||||
for your operating system, you should be able to compile the demonstra-
|
for your operating system, you should be able to compile the demonstra-
|
||||||
tion program using this command:
|
tion program using a command like this:
|
||||||
|
|
||||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||||
|
|
||||||
If PCRE2 is installed elsewhere, you may need to add additional options
|
If PCRE2 is installed elsewhere, you may need to add additional options
|
||||||
to the command line. For example, on a Unix-like system that has PCRE2
|
to the command line. For example, on a Unix-like system that has PCRE2
|
||||||
installed in /usr/local, you can compile the demonstration program
|
installed in /usr/local, you can compile the demonstration program
|
||||||
using a command like this:
|
using a command like this:
|
||||||
|
|
||||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \
|
cc -o pcre2demo -I/usr/local/include pcre2demo.c \
|
||||||
-L/usr/local/lib -lpcre2-8
|
-L/usr/local/lib -lpcre2-8
|
||||||
|
|
||||||
|
Once you have built the demonstration program, you can run simple tests
|
||||||
Once you have compiled and linked the demonstration program, you can
|
like this:
|
||||||
run simple tests like this:
|
|
||||||
|
|
||||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||||
|
|
||||||
Note that there is a much more comprehensive test program, called
|
Note that there is a much more comprehensive test program, called
|
||||||
pcre2test, which supports many more facilities for testing regular
|
pcre2test, which supports many more facilities for testing regular
|
||||||
expressions using the PCRE2 libraries. The pcre2demo program is pro-
|
expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit,
|
||||||
vided as a simple coding example.
|
though not all three need be installed). The pcre2demo program is pro-
|
||||||
|
vided as a relatively simple coding example.
|
||||||
|
|
||||||
If you try to run pcre2demo when PCRE2 is not installed in the standard
|
If you try to run pcre2demo when PCRE2 is not installed in the standard
|
||||||
library directory, you may get an error like this on some operating
|
library directory, you may get an error like this on some operating
|
||||||
systems (e.g. Solaris):
|
systems (e.g. Solaris):
|
||||||
|
|
||||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or
|
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file
|
||||||
directory
|
or directory
|
||||||
|
|
||||||
This is caused by the way shared library support works on those sys-
|
This is caused by the way shared library support works on those sys-
|
||||||
tems. You need to add
|
tems. You need to add
|
||||||
|
@ -8785,8 +8796,8 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 20 October 2014
|
Last updated: 02 February 2016
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3)
|
PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3)
|
||||||
|
|
||||||
|
|
|
@ -20,28 +20,31 @@
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -56,9 +59,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\en", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\en");
|
printf("Exactly two arguments required: a regex and a subject string\en");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
|
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\en"); else
|
if (namecount == 0) printf("No named substrings\en"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\en");
|
printf("Named substrings\en");
|
||||||
|
@ -371,7 +383,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\er' &&
|
subject[start_offset] == '\er' &&
|
||||||
subject[start_offset + 1] == '\en')
|
subject[start_offset + 1] == '\en')
|
||||||
|
@ -417,7 +429,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\en"); else
|
if (namecount == 0) printf("No named substrings\en"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\en");
|
printf("Named substrings\en");
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
|
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 SAMPLE PROGRAM"
|
.SH "PCRE2 SAMPLE PROGRAM"
|
||||||
|
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
|
||||||
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||||
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
||||||
.P
|
.P
|
||||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
The demonstration program compiles the regular expression that is its
|
||||||
regular expression that is its first argument, and matches it against the
|
first argument, and matches it against the subject string in its second
|
||||||
subject string in its second argument. No PCRE2 options are set, and default
|
argument. No PCRE2 options are set, and default character tables are used. If
|
||||||
character tables are used. If matching succeeds, the program outputs the
|
matching succeeds, the program outputs the portion of the subject that matched,
|
||||||
portion of the subject that matched, together with the contents of any captured
|
together with the contents of any captured substrings.
|
||||||
substrings.
|
|
||||||
.P
|
.P
|
||||||
If the -g option is given on the command line, the program then goes on to
|
If the -g option is given on the command line, the program then goes on to
|
||||||
check for further matches of the same regular expression in the same subject
|
check for further matches of the same regular expression in the same subject
|
||||||
string. The logic is a little bit tricky because of the possibility of matching
|
string. The logic is a little bit tricky because of the possibility of matching
|
||||||
an empty string. Comments in the code explain what is going on.
|
an empty string. Comments in the code explain what is going on.
|
||||||
.P
|
.P
|
||||||
|
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
|
||||||
|
library. It handles strings and characters that are stored in 8-bit code units.
|
||||||
|
By default, one character corresponds to one code unit, but if the pattern
|
||||||
|
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||||
|
where characters may occupy multiple code units.
|
||||||
|
.P
|
||||||
If PCRE2 is installed in the standard include and library directories for your
|
If PCRE2 is installed in the standard include and library directories for your
|
||||||
operating system, you should be able to compile the demonstration program using
|
operating system, you should be able to compile the demonstration program using
|
||||||
this command:
|
a command like this:
|
||||||
.sp
|
.sp
|
||||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||||
.sp
|
.sp
|
||||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
|
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
like this:
|
like this:
|
||||||
.sp
|
.sp
|
||||||
.\" JOINSH
|
.\" JOINSH
|
||||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||||
-L/usr/local/lib -lpcre2-8
|
-L/usr/local/lib -lpcre2-8
|
||||||
.sp
|
.sp
|
||||||
.P
|
Once you have built the demonstration program, you can run simple tests like
|
||||||
Once you have compiled and linked the demonstration program, you can run simple
|
this:
|
||||||
tests like this:
|
|
||||||
.sp
|
.sp
|
||||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||||
|
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2test\fP,
|
\fBpcre2test\fP,
|
||||||
.\"
|
.\"
|
||||||
which supports many more facilities for testing regular expressions using the
|
which supports many more facilities for testing regular expressions using all
|
||||||
PCRE2 libraries. The
|
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||||
|
installed). The
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2demo\fP
|
\fBpcre2demo\fP
|
||||||
.\"
|
.\"
|
||||||
program is provided as a simple coding example.
|
program is provided as a relatively simple coding example.
|
||||||
.P
|
.P
|
||||||
If you try to run
|
If you try to run
|
||||||
.\" HREF
|
.\" HREF
|
||||||
|
@ -65,7 +70,7 @@ If you try to run
|
||||||
when PCRE2 is not installed in the standard library directory, you may get an
|
when PCRE2 is not installed in the standard library directory, you may get an
|
||||||
error like this on some operating systems (e.g. Solaris):
|
error like this on some operating systems (e.g. Solaris):
|
||||||
.sp
|
.sp
|
||||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||||
.sp
|
.sp
|
||||||
This is caused by the way shared library support works on those systems. You
|
This is caused by the way shared library support works on those systems. You
|
||||||
need to add
|
need to add
|
||||||
|
@ -89,6 +94,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 20 October 2014
|
Last updated: 02 February 2016
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -67,10 +67,10 @@ INPUT ENCODING
|
||||||
|
|
||||||
For maximum portability, therefore, it is safest to avoid non-printing
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
characters in pcre2test input files. There is a facility for specifying
|
characters in pcre2test input files. There is a facility for specifying
|
||||||
a pattern's characters as hexadecimal pairs, thus making it possible to
|
some or all of a pattern's characters as hexadecimal pairs, thus making
|
||||||
include binary zeroes in a pattern for testing purposes. Subject lines
|
it possible to include binary zeroes in a pattern for testing purposes.
|
||||||
are processed for backslash escapes, which makes it possible to include
|
Subject lines are processed for backslash escapes, which makes it pos-
|
||||||
any data value.
|
sible to include any data value.
|
||||||
|
|
||||||
|
|
||||||
COMMAND LINE OPTIONS
|
COMMAND LINE OPTIONS
|
||||||
|
@ -505,7 +505,7 @@ PATTERN MODIFIERS
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex unquoted characters are hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
|
@ -516,6 +516,7 @@ PATTERN MODIFIERS
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
|
posix_nosub use the POSIX API with REG_NOSUB
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
@ -591,19 +592,30 @@ PATTERN MODIFIERS
|
||||||
testing that pcre2_compile() behaves correctly in this case (it uses
|
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
|
|
||||||
Specifying a pattern in hex
|
Specifying pattern characters in hexadecimal
|
||||||
|
|
||||||
The hex modifier specifies that the characters of the pattern are to be
|
The hex modifier specifies that the characters of the pattern, except
|
||||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
for substrings enclosed in single or double quotes, are to be inter-
|
||||||
between pairs. For example:
|
preted as pairs of hexadecimal digits. This feature is provided as a
|
||||||
|
way of creating patterns that contain binary zeros and other non-print-
|
||||||
|
ing characters. White space is permitted between pairs of digits. For
|
||||||
|
example, this pattern contains three characters:
|
||||||
|
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
|
|
||||||
This feature is provided as a way of creating patterns that contain
|
Parts of such a pattern are taken literally if quoted. This pattern
|
||||||
binary zero and other non-printing characters. By default, pcre2test
|
contains nine characters, only two of which are specified in hexadeci-
|
||||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
mal:
|
||||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
|
||||||
hexadecimal, the actual length of the pattern is passed.
|
/ab "literal" 32/hex
|
||||||
|
|
||||||
|
Either single or double quotes may be used. There is no way of includ-
|
||||||
|
ing the delimiter within a substring.
|
||||||
|
|
||||||
|
By default, pcre2test passes patterns as zero-terminated strings to
|
||||||
|
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
|
||||||
|
for patterns specified with the hex modifier, the actual length of the
|
||||||
|
pattern is passed.
|
||||||
|
|
||||||
Generating long repetitive patterns
|
Generating long repetitive patterns
|
||||||
|
|
||||||
|
@ -732,16 +744,16 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Using the POSIX wrapper API
|
Using the POSIX wrapper API
|
||||||
|
|
||||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
|
||||||
per API rather than its native API. This supports only the 8-bit
|
the POSIX wrapper API rather than its native API. When posix_nosub is
|
||||||
library. Note that it does not imply POSIX matching semantics; for
|
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
|
||||||
more detail see the pcre2posix documentation. When the POSIX API is
|
wrapper supports only the 8-bit library. Note that it does not imply
|
||||||
being used, the following pattern modifiers set options for the reg-
|
POSIX matching semantics; for more detail see the pcre2posix documenta-
|
||||||
comp() function:
|
tion. The following pattern modifiers set options for the regcomp()
|
||||||
|
function:
|
||||||
|
|
||||||
caseless REG_ICASE
|
caseless REG_ICASE
|
||||||
multiline REG_NEWLINE
|
multiline REG_NEWLINE
|
||||||
no_auto_capture REG_NOSUB
|
|
||||||
dotall REG_DOTALL )
|
dotall REG_DOTALL )
|
||||||
ungreedy REG_UNGREEDY ) These options are not part of
|
ungreedy REG_UNGREEDY ) These options are not part of
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
|
@ -758,7 +770,8 @@ PATTERN MODIFIERS
|
||||||
been set, a large buffer is used.
|
been set, a large buffer is used.
|
||||||
|
|
||||||
The aftertext and allaftertext subject modifiers work as described
|
The aftertext and allaftertext subject modifiers work as described
|
||||||
below. All other modifiers cause an error.
|
below. All other modifiers are either ignored, with a warning message,
|
||||||
|
or cause an error.
|
||||||
|
|
||||||
Testing the stack guard feature
|
Testing the stack guard feature
|
||||||
|
|
||||||
|
@ -855,7 +868,7 @@ SUBJECT MODIFIERS
|
||||||
wrapper API to be used, the only option-setting modifiers that have any
|
wrapper API to be used, the only option-setting modifiers that have any
|
||||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||||
Any other modifiers cause an error.
|
The other modifiers are ignored, with a warning message.
|
||||||
|
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
|
||||||
|
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
|
||||||
zero_terminate pass the subject as zero-terminated
|
zero_terminate pass the subject as zero-terminated
|
||||||
|
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
When matching via the POSIX wrapper API, the aftertext, allaftertext,
|
||||||
|
and ovector subject modifiers work as described below. All other modi-
|
||||||
|
fiers are either ignored, with a warning message, or cause an error.
|
||||||
|
|
||||||
Showing more text
|
Showing more text
|
||||||
|
|
||||||
|
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
|
||||||
matched with the pattern, terminated as usual by an empty line or end
|
matched with the pattern, terminated as usual by an empty line or end
|
||||||
of file. This command may be followed by a modifier list containing
|
of file. This command may be followed by a modifier list containing
|
||||||
only control modifiers that act after a pattern has been compiled. In
|
only control modifiers that act after a pattern has been compiled. In
|
||||||
particular, hex, posix, and push are not allowed, nor are any option-
|
particular, hex, posix, posix_nosub, and push are not allowed, nor are
|
||||||
setting modifiers. The JIT modifiers are, however permitted. Here is
|
any option-setting modifiers. The JIT modifiers are, however, permit-
|
||||||
an example that saves and reloads two patterns.
|
ted. Here is an example that saves and reloads two patterns.
|
||||||
|
|
||||||
/abc/push
|
/abc/push
|
||||||
/xyz/push
|
/xyz/push
|
||||||
|
@ -1505,5 +1521,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 12 December 2015
|
Last updated: 31 January 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
|
|
|
@ -3,28 +3,31 @@
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -39,9 +42,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -62,19 +70,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -89,14 +97,18 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\n", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
else break;
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,7 +117,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\n");
|
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,7 +196,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
@ -354,7 +366,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\r' &&
|
subject[start_offset] == '\r' &&
|
||||||
subject[start_offset + 1] == '\n')
|
subject[start_offset + 1] == '\n')
|
||||||
|
@ -400,7 +412,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
|
Loading…
Reference in New Issue