Tidy pcre2demo.c
This commit is contained in:
parent
6c1c817438
commit
4e67c0c9e9
|
@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
|
||||||
modifier had this effect. That option is now ignored when the POSIX API is in
|
modifier had this effect. That option is now ignored when the POSIX API is in
|
||||||
use.
|
use.
|
||||||
|
|
||||||
|
8. Minor tidies to the pcre2demo.c sample program, including more comments
|
||||||
|
about its 8-bit-ness.
|
||||||
|
|
||||||
|
|
||||||
Version 10.21 12-January-2016
|
Version 10.21 12-January-2016
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
|
@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
|
||||||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||||
were followed by ?: but named parentheses can still be used for capturing (and
|
were followed by ?: but named parentheses can still be used for capturing (and
|
||||||
they acquire numbers in the usual way). There is no equivalent of this option
|
they acquire numbers in the usual way). There is no equivalent of this option
|
||||||
in Perl.
|
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||||
|
references or recursion/subroutine calls) may only refer to named groups,
|
||||||
|
though the reference can be by name or by number.
|
||||||
<pre>
|
<pre>
|
||||||
PCRE2_NO_AUTO_POSSESS
|
PCRE2_NO_AUTO_POSSESS
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -3121,9 +3123,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 December 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -56,9 +59,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
else break;
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\n", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* After the options, we require exactly two arguments, which are the pattern,
|
/* After the options, we require exactly two arguments, which are the pattern,
|
||||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\n");
|
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
uint32_t options = 0; /* Normally no options */
|
uint32_t options = 0; /* Normally no options */
|
||||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||||
|
|
||||||
/* If the previous match was for an empty string, we are finished if we are
|
/* If the previous match was for an empty string, we are finished if we are
|
||||||
at the end of the subject. Otherwise, arrange to run another match at the
|
at the end of the subject. Otherwise, arrange to run another match at the
|
||||||
|
@ -371,7 +383,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\r' &&
|
subject[start_offset] == '\r' &&
|
||||||
subject[start_offset + 1] == '\n')
|
subject[start_offset + 1] == '\n')
|
||||||
|
@ -417,7 +429,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
|
|
@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
|
||||||
<a href="#lookbehind">(described below)</a>
|
<a href="#lookbehind">(described below)</a>
|
||||||
in a UTF mode, because this would make it impossible to calculate the length of
|
in a UTF mode, because this would make it impossible to calculate the length of
|
||||||
the lookbehind. Neither the alternative matching function
|
the lookbehind. Neither the alternative matching function
|
||||||
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
|
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
|
||||||
former gives a match-time error; the latter fails to optimize and so the match
|
former gives a match-time error; the latter fails to optimize and so the match
|
||||||
is always run using the interpreter.
|
is always run using the interpreter.
|
||||||
</P>
|
</P>
|
||||||
|
|
|
@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
|
||||||
expression 8-bit library. See the
|
expression 8-bit library. See the
|
||||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
documentation for a description of PCRE2's native API, which contains much
|
documentation for a description of PCRE2's native API, which contains much
|
||||||
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
|
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
|
||||||
and 32-bit libraries.
|
and 32-bit libraries.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
|
||||||
replacement library. Other POSIX options are not even defined.
|
replacement library. Other POSIX options are not even defined.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
There are also some other options that are not defined by POSIX. These have
|
There are also some options that are not defined by POSIX. These have been
|
||||||
been added at the request of users who want to make use of certain
|
added at the request of users who want to make use of certain PCRE2-specific
|
||||||
PCRE2-specific features via the POSIX calling interface.
|
features via the POSIX calling interface.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
||||||
|
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
|
||||||
<pre>
|
<pre>
|
||||||
REG_NOSUB
|
REG_NOSUB
|
||||||
</pre>
|
</pre>
|
||||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
|
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
|
||||||
for compilation to the native function. In addition, when a pattern that is
|
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
|
||||||
compiled with this flag is passed to <b>regexec()</b> for matching, the
|
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
|
||||||
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
|
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
|
||||||
are returned.
|
because it disables the use of back references.
|
||||||
<pre>
|
<pre>
|
||||||
REG_UCP
|
REG_UCP
|
||||||
</pre>
|
</pre>
|
||||||
|
@ -241,14 +241,15 @@ mutually exclusive; the error REG_INVARG is returned.
|
||||||
<P>
|
<P>
|
||||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||||
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
||||||
<b>regexec()</b> are ignored.
|
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
|
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
|
||||||
no data about any matched strings is returned.
|
(unless REG_STARTEND is set); in both these cases no data about any matched
|
||||||
|
strings is returned.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Otherwise,the portion of the string that was matched, and also any captured
|
Otherwise, the portion of the string that was matched, and also any captured
|
||||||
substrings, are returned via the <i>pmatch</i> argument, which points to an
|
substrings, are returned via the <i>pmatch</i> argument, which points to an
|
||||||
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
|
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
|
||||||
members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first
|
members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first
|
||||||
|
@ -290,9 +291,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 29 November 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||||
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
The demonstration program compiles the regular expression that is its
|
||||||
regular expression that is its first argument, and matches it against the
|
first argument, and matches it against the subject string in its second
|
||||||
subject string in its second argument. No PCRE2 options are set, and default
|
argument. No PCRE2 options are set, and default character tables are used. If
|
||||||
character tables are used. If matching succeeds, the program outputs the
|
matching succeeds, the program outputs the portion of the subject that matched,
|
||||||
portion of the subject that matched, together with the contents of any captured
|
together with the contents of any captured substrings.
|
||||||
substrings.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If the -g option is given on the command line, the program then goes on to
|
If the -g option is given on the command line, the program then goes on to
|
||||||
|
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
|
||||||
an empty string. Comments in the code explain what is going on.
|
an empty string. Comments in the code explain what is going on.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
|
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
|
||||||
|
library. It handles strings and characters that are stored in 8-bit code units.
|
||||||
|
By default, one character corresponds to one code unit, but if the pattern
|
||||||
|
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||||
|
where characters may occupy multiple code units.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
If PCRE2 is installed in the standard include and library directories for your
|
If PCRE2 is installed in the standard include and library directories for your
|
||||||
operating system, you should be able to compile the demonstration program using
|
operating system, you should be able to compile the demonstration program using
|
||||||
this command:
|
a command like this:
|
||||||
<pre>
|
<pre>
|
||||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||||
</pre>
|
</pre>
|
||||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
<i>/usr/local</i>, you can compile the demonstration program using a command
|
<i>/usr/local</i>, you can compile the demonstration program using a command
|
||||||
like this:
|
like this:
|
||||||
<pre>
|
<pre>
|
||||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||||
|
</pre>
|
||||||
</PRE>
|
Once you have built the demonstration program, you can run simple tests like
|
||||||
</P>
|
this:
|
||||||
<P>
|
|
||||||
Once you have compiled and linked the demonstration program, you can run simple
|
|
||||||
tests like this:
|
|
||||||
<pre>
|
<pre>
|
||||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||||
</pre>
|
</pre>
|
||||||
Note that there is a much more comprehensive test program, called
|
Note that there is a much more comprehensive test program, called
|
||||||
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
||||||
which supports many more facilities for testing regular expressions using the
|
which supports many more facilities for testing regular expressions using all
|
||||||
PCRE2 libraries. The
|
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||||
|
installed). The
|
||||||
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
||||||
program is provided as a simple coding example.
|
program is provided as a relatively simple coding example.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you try to run
|
If you try to run
|
||||||
|
@ -73,7 +77,7 @@ If you try to run
|
||||||
when PCRE2 is not installed in the standard library directory, you may get an
|
when PCRE2 is not installed in the standard library directory, you may get an
|
||||||
error like this on some operating systems (e.g. Solaris):
|
error like this on some operating systems (e.g. Solaris):
|
||||||
<pre>
|
<pre>
|
||||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||||
</pre>
|
</pre>
|
||||||
This is caused by the way shared library support works on those systems. You
|
This is caused by the way shared library support works on those systems. You
|
||||||
need to add
|
need to add
|
||||||
|
@ -97,9 +101,9 @@ Cambridge, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 20 October 2014
|
Last updated: 02 February 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
|
@ -98,10 +98,11 @@ further data is read.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
For maximum portability, therefore, it is safest to avoid non-printing
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
characters in <b>pcre2test</b> input files. There is a facility for specifying
|
||||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
some or all of a pattern's characters as hexadecimal pairs, thus making it
|
||||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
possible to include binary zeroes in a pattern for testing purposes. Subject
|
||||||
for backslash escapes, which makes it possible to include any data value.
|
lines are processed for backslash escapes, which makes it possible to include
|
||||||
|
any data value.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||||
<P>
|
<P>
|
||||||
|
@ -559,7 +560,7 @@ about the pattern:
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex unquoted characters are hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
|
@ -570,6 +571,7 @@ about the pattern:
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
|
posix_nosub use the POSIX API with REG_NOSUB
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Specifying a pattern in hex
|
Specifying pattern characters in hexadecimal
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>hex</b> modifier specifies that the characters of the pattern are to be
|
The <b>hex</b> modifier specifies that the characters of the pattern, except for
|
||||||
interpreted as pairs of hexadecimal digits. White space is permitted between
|
substrings enclosed in single or double quotes, are to be interpreted as pairs
|
||||||
pairs. For example:
|
of hexadecimal digits. This feature is provided as a way of creating patterns
|
||||||
|
that contain binary zeros and other non-printing characters. White space is
|
||||||
|
permitted between pairs of digits. For example, this pattern contains three
|
||||||
|
characters:
|
||||||
<pre>
|
<pre>
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
</pre>
|
</pre>
|
||||||
This feature is provided as a way of creating patterns that contain binary zero
|
Parts of such a pattern are taken literally if quoted. This pattern contains
|
||||||
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
nine characters, only two of which are specified in hexadecimal:
|
||||||
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
<pre>
|
||||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
/ab "literal" 32/hex
|
||||||
actual length of the pattern is passed.
|
</pre>
|
||||||
|
Either single or double quotes may be used. There is no way of including
|
||||||
|
the delimiter within a substring.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
||||||
|
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||||
|
patterns specified with the <b>hex</b> modifier, the actual length of the
|
||||||
|
pattern is passed.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Generating long repetitive patterns
|
Generating long repetitive patterns
|
||||||
|
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
|
||||||
Using the POSIX wrapper API
|
Using the POSIX wrapper API
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
|
The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
|
||||||
wrapper API rather than its native API. This supports only the 8-bit library.
|
PCRE2 via the POSIX wrapper API rather than its native API. When
|
||||||
Note that it does not imply POSIX matching semantics; for more detail see the
|
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
|
||||||
|
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
|
||||||
|
it does not imply POSIX matching semantics; for more detail see the
|
||||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||||
documentation. When the POSIX API is being used, the following pattern
|
documentation. The following pattern modifiers set options for the
|
||||||
modifiers set options for the <b>regcomp()</b> function:
|
<b>regcomp()</b> function:
|
||||||
<pre>
|
<pre>
|
||||||
caseless REG_ICASE
|
caseless REG_ICASE
|
||||||
multiline REG_NEWLINE
|
multiline REG_NEWLINE
|
||||||
no_auto_capture REG_NOSUB
|
|
||||||
dotall REG_DOTALL )
|
dotall REG_DOTALL )
|
||||||
ungreedy REG_UNGREEDY ) These options are not part of
|
ungreedy REG_UNGREEDY ) These options are not part of
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
|
@ -847,7 +861,8 @@ large buffer is used.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
||||||
below. All other modifiers cause an error.
|
below. All other modifiers are either ignored, with a warning message, or cause
|
||||||
|
an error.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Testing the stack guard feature
|
Testing the stack guard feature
|
||||||
|
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
|
||||||
wrapper API to be used, the only option-setting modifiers that have any effect
|
wrapper API to be used, the only option-setting modifiers that have any effect
|
||||||
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
||||||
Any other modifiers cause an error.
|
The other modifiers are ignored, with a warning message.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
@ -1001,7 +1016,10 @@ pattern.
|
||||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
zero_terminate pass the subject as zero-terminated
|
zero_terminate pass the subject as zero-terminated
|
||||||
</pre>
|
</pre>
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections. When
|
||||||
|
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
|
||||||
|
and <b>ovector</b> subject modifiers work as described below. All other
|
||||||
|
modifiers are either ignored, with a warning message, or cause an error.
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
Showing more text
|
Showing more text
|
||||||
|
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
|
||||||
modifier list containing only
|
modifier list containing only
|
||||||
<a href="#controlmodifiers">control modifiers</a>
|
<a href="#controlmodifiers">control modifiers</a>
|
||||||
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
||||||
<b>posix</b>, and <b>push</b> are not allowed, nor are any
|
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
|
||||||
<a href="#optionmodifiers">option-setting modifiers.</a>
|
<a href="#optionmodifiers">option-setting modifiers.</a>
|
||||||
The JIT modifiers are, however, permitted. Here is an example that saves and
|
The JIT modifiers are, however, permitted. Here is an example that saves and
|
||||||
reloads two patterns.
|
reloads two patterns.
|
||||||
|
@ -1660,9 +1678,9 @@ Cambridge, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 12 December 2015
|
Last updated: 31 January 2016
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2015 University of Cambridge.
|
Copyright © 1997-2016 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
<p>
|
<p>
|
||||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
|
1549
doc/pcre2.txt
1549
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -20,28 +20,31 @@
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -56,9 +59,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
else break;
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\en", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* After the options, we require exactly two arguments, which are the pattern,
|
/* After the options, we require exactly two arguments, which are the pattern,
|
||||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\en");
|
printf("Exactly two arguments required: a regex and a subject string\en");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
|
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\en"); else
|
if (namecount == 0) printf("No named substrings\en"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\en");
|
printf("Named substrings\en");
|
||||||
|
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
uint32_t options = 0; /* Normally no options */
|
uint32_t options = 0; /* Normally no options */
|
||||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||||
|
|
||||||
/* If the previous match was for an empty string, we are finished if we are
|
/* If the previous match was for an empty string, we are finished if we are
|
||||||
at the end of the subject. Otherwise, arrange to run another match at the
|
at the end of the subject. Otherwise, arrange to run another match at the
|
||||||
|
@ -371,7 +383,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\er' &&
|
subject[start_offset] == '\er' &&
|
||||||
subject[start_offset + 1] == '\en')
|
subject[start_offset + 1] == '\en')
|
||||||
|
@ -417,7 +429,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\en"); else
|
if (namecount == 0) printf("No named substrings\en"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\en");
|
printf("Named substrings\en");
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
|
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 SAMPLE PROGRAM"
|
.SH "PCRE2 SAMPLE PROGRAM"
|
||||||
|
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
|
||||||
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||||
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
||||||
.P
|
.P
|
||||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
The demonstration program compiles the regular expression that is its
|
||||||
regular expression that is its first argument, and matches it against the
|
first argument, and matches it against the subject string in its second
|
||||||
subject string in its second argument. No PCRE2 options are set, and default
|
argument. No PCRE2 options are set, and default character tables are used. If
|
||||||
character tables are used. If matching succeeds, the program outputs the
|
matching succeeds, the program outputs the portion of the subject that matched,
|
||||||
portion of the subject that matched, together with the contents of any captured
|
together with the contents of any captured substrings.
|
||||||
substrings.
|
|
||||||
.P
|
.P
|
||||||
If the -g option is given on the command line, the program then goes on to
|
If the -g option is given on the command line, the program then goes on to
|
||||||
check for further matches of the same regular expression in the same subject
|
check for further matches of the same regular expression in the same subject
|
||||||
string. The logic is a little bit tricky because of the possibility of matching
|
string. The logic is a little bit tricky because of the possibility of matching
|
||||||
an empty string. Comments in the code explain what is going on.
|
an empty string. Comments in the code explain what is going on.
|
||||||
.P
|
.P
|
||||||
|
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
|
||||||
|
library. It handles strings and characters that are stored in 8-bit code units.
|
||||||
|
By default, one character corresponds to one code unit, but if the pattern
|
||||||
|
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||||
|
where characters may occupy multiple code units.
|
||||||
|
.P
|
||||||
If PCRE2 is installed in the standard include and library directories for your
|
If PCRE2 is installed in the standard include and library directories for your
|
||||||
operating system, you should be able to compile the demonstration program using
|
operating system, you should be able to compile the demonstration program using
|
||||||
this command:
|
a command like this:
|
||||||
.sp
|
.sp
|
||||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||||
.sp
|
.sp
|
||||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
|
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||||
like this:
|
like this:
|
||||||
.sp
|
.sp
|
||||||
.\" JOINSH
|
.\" JOINSH
|
||||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||||
-L/usr/local/lib -lpcre2-8
|
-L/usr/local/lib -lpcre2-8
|
||||||
.sp
|
.sp
|
||||||
.P
|
Once you have built the demonstration program, you can run simple tests like
|
||||||
Once you have compiled and linked the demonstration program, you can run simple
|
this:
|
||||||
tests like this:
|
|
||||||
.sp
|
.sp
|
||||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||||
|
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2test\fP,
|
\fBpcre2test\fP,
|
||||||
.\"
|
.\"
|
||||||
which supports many more facilities for testing regular expressions using the
|
which supports many more facilities for testing regular expressions using all
|
||||||
PCRE2 libraries. The
|
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||||
|
installed). The
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2demo\fP
|
\fBpcre2demo\fP
|
||||||
.\"
|
.\"
|
||||||
program is provided as a simple coding example.
|
program is provided as a relatively simple coding example.
|
||||||
.P
|
.P
|
||||||
If you try to run
|
If you try to run
|
||||||
.\" HREF
|
.\" HREF
|
||||||
|
@ -65,7 +70,7 @@ If you try to run
|
||||||
when PCRE2 is not installed in the standard library directory, you may get an
|
when PCRE2 is not installed in the standard library directory, you may get an
|
||||||
error like this on some operating systems (e.g. Solaris):
|
error like this on some operating systems (e.g. Solaris):
|
||||||
.sp
|
.sp
|
||||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||||
.sp
|
.sp
|
||||||
This is caused by the way shared library support works on those systems. You
|
This is caused by the way shared library support works on those systems. You
|
||||||
need to add
|
need to add
|
||||||
|
@ -89,6 +94,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 20 October 2014
|
Last updated: 02 February 2016
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -67,10 +67,10 @@ INPUT ENCODING
|
||||||
|
|
||||||
For maximum portability, therefore, it is safest to avoid non-printing
|
For maximum portability, therefore, it is safest to avoid non-printing
|
||||||
characters in pcre2test input files. There is a facility for specifying
|
characters in pcre2test input files. There is a facility for specifying
|
||||||
a pattern's characters as hexadecimal pairs, thus making it possible to
|
some or all of a pattern's characters as hexadecimal pairs, thus making
|
||||||
include binary zeroes in a pattern for testing purposes. Subject lines
|
it possible to include binary zeroes in a pattern for testing purposes.
|
||||||
are processed for backslash escapes, which makes it possible to include
|
Subject lines are processed for backslash escapes, which makes it pos-
|
||||||
any data value.
|
sible to include any data value.
|
||||||
|
|
||||||
|
|
||||||
COMMAND LINE OPTIONS
|
COMMAND LINE OPTIONS
|
||||||
|
@ -505,7 +505,7 @@ PATTERN MODIFIERS
|
||||||
debug same as info,fullbincode
|
debug same as info,fullbincode
|
||||||
fullbincode show binary code with lengths
|
fullbincode show binary code with lengths
|
||||||
/I info show info about compiled pattern
|
/I info show info about compiled pattern
|
||||||
hex pattern is coded in hexadecimal
|
hex unquoted characters are hexadecimal
|
||||||
jit[=<number>] use JIT
|
jit[=<number>] use JIT
|
||||||
jitfast use JIT fast path
|
jitfast use JIT fast path
|
||||||
jitverify verify JIT use
|
jitverify verify JIT use
|
||||||
|
@ -516,6 +516,7 @@ PATTERN MODIFIERS
|
||||||
null_context compile with a NULL context
|
null_context compile with a NULL context
|
||||||
parens_nest_limit=<n> set maximum parentheses depth
|
parens_nest_limit=<n> set maximum parentheses depth
|
||||||
posix use the POSIX API
|
posix use the POSIX API
|
||||||
|
posix_nosub use the POSIX API with REG_NOSUB
|
||||||
push push compiled pattern onto the stack
|
push push compiled pattern onto the stack
|
||||||
stackguard=<number> test the stackguard feature
|
stackguard=<number> test the stackguard feature
|
||||||
tables=[0|1|2] select internal tables
|
tables=[0|1|2] select internal tables
|
||||||
|
@ -591,59 +592,70 @@ PATTERN MODIFIERS
|
||||||
testing that pcre2_compile() behaves correctly in this case (it uses
|
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||||
default values).
|
default values).
|
||||||
|
|
||||||
Specifying a pattern in hex
|
Specifying pattern characters in hexadecimal
|
||||||
|
|
||||||
The hex modifier specifies that the characters of the pattern are to be
|
The hex modifier specifies that the characters of the pattern, except
|
||||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
for substrings enclosed in single or double quotes, are to be inter-
|
||||||
between pairs. For example:
|
preted as pairs of hexadecimal digits. This feature is provided as a
|
||||||
|
way of creating patterns that contain binary zeros and other non-print-
|
||||||
|
ing characters. White space is permitted between pairs of digits. For
|
||||||
|
example, this pattern contains three characters:
|
||||||
|
|
||||||
/ab 32 59/hex
|
/ab 32 59/hex
|
||||||
|
|
||||||
This feature is provided as a way of creating patterns that contain
|
Parts of such a pattern are taken literally if quoted. This pattern
|
||||||
binary zero and other non-printing characters. By default, pcre2test
|
contains nine characters, only two of which are specified in hexadeci-
|
||||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
mal:
|
||||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
|
||||||
hexadecimal, the actual length of the pattern is passed.
|
/ab "literal" 32/hex
|
||||||
|
|
||||||
|
Either single or double quotes may be used. There is no way of includ-
|
||||||
|
ing the delimiter within a substring.
|
||||||
|
|
||||||
|
By default, pcre2test passes patterns as zero-terminated strings to
|
||||||
|
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
|
||||||
|
for patterns specified with the hex modifier, the actual length of the
|
||||||
|
pattern is passed.
|
||||||
|
|
||||||
Generating long repetitive patterns
|
Generating long repetitive patterns
|
||||||
|
|
||||||
Some tests use long patterns that are very repetitive. Instead of cre-
|
Some tests use long patterns that are very repetitive. Instead of cre-
|
||||||
ating a very long input line for such a pattern, you can use a special
|
ating a very long input line for such a pattern, you can use a special
|
||||||
repetition feature, similar to the one described for subject lines
|
repetition feature, similar to the one described for subject lines
|
||||||
above. If the expand modifier is present on a pattern, parts of the
|
above. If the expand modifier is present on a pattern, parts of the
|
||||||
pattern that have the form
|
pattern that have the form
|
||||||
|
|
||||||
\[<characters>]{<count>}
|
\[<characters>]{<count>}
|
||||||
|
|
||||||
are expanded before the pattern is passed to pcre2_compile(). For exam-
|
are expanded before the pattern is passed to pcre2_compile(). For exam-
|
||||||
ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
|
ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
|
||||||
cannot be nested. An initial "\[" sequence is recognized only if "]{"
|
cannot be nested. An initial "\[" sequence is recognized only if "]{"
|
||||||
followed by decimal digits and "}" is found later in the pattern. If
|
followed by decimal digits and "}" is found later in the pattern. If
|
||||||
not, the characters remain in the pattern unaltered.
|
not, the characters remain in the pattern unaltered.
|
||||||
|
|
||||||
If part of an expanded pattern looks like an expansion, but is really
|
If part of an expanded pattern looks like an expansion, but is really
|
||||||
part of the actual pattern, unwanted expansion can be avoided by giving
|
part of the actual pattern, unwanted expansion can be avoided by giving
|
||||||
two values in the quantifier. For example, \[AB]{6000,6000} is not rec-
|
two values in the quantifier. For example, \[AB]{6000,6000} is not rec-
|
||||||
ognized as an expansion item.
|
ognized as an expansion item.
|
||||||
|
|
||||||
If the info modifier is set on an expanded pattern, the result of the
|
If the info modifier is set on an expanded pattern, the result of the
|
||||||
expansion is included in the information that is output.
|
expansion is included in the information that is output.
|
||||||
|
|
||||||
JIT compilation
|
JIT compilation
|
||||||
|
|
||||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||||
details. JIT compiling happens, optionally, after a pattern has been
|
details. JIT compiling happens, optionally, after a pattern has been
|
||||||
successfully compiled into an internal form. The JIT compiler converts
|
successfully compiled into an internal form. The JIT compiler converts
|
||||||
this to optimized machine code. It needs to know whether the match-time
|
this to optimized machine code. It needs to know whether the match-time
|
||||||
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
||||||
because different code is generated for the different cases. See the
|
because different code is generated for the different cases. See the
|
||||||
partial modifier in "Subject Modifiers" below for details of how these
|
partial modifier in "Subject Modifiers" below for details of how these
|
||||||
options are specified for each match attempt.
|
options are specified for each match attempt.
|
||||||
|
|
||||||
JIT compilation is requested by the /jit pattern modifier, which may
|
JIT compilation is requested by the /jit pattern modifier, which may
|
||||||
optionally be followed by an equals sign and a number in the range 0 to
|
optionally be followed by an equals sign and a number in the range 0 to
|
||||||
7. The three bits that make up the number specify which of the three
|
7. The three bits that make up the number specify which of the three
|
||||||
JIT operating modes are to be compiled:
|
JIT operating modes are to be compiled:
|
||||||
|
|
||||||
1 compile JIT code for non-partial matching
|
1 compile JIT code for non-partial matching
|
||||||
|
@ -660,31 +672,31 @@ PATTERN MODIFIERS
|
||||||
6 soft and hard partial matching only
|
6 soft and hard partial matching only
|
||||||
7 all three modes
|
7 all three modes
|
||||||
|
|
||||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||||
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
||||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||||
plete match; the options enable the possibility of a partial match, but
|
plete match; the options enable the possibility of a partial match, but
|
||||||
do not require it. Note also that if you request JIT compilation only
|
do not require it. Note also that if you request JIT compilation only
|
||||||
for partial matching (for example, /jit=2) but do not set the partial
|
for partial matching (for example, /jit=2) but do not set the partial
|
||||||
modifier on a subject line, that match will not use JIT code because
|
modifier on a subject line, that match will not use JIT code because
|
||||||
none was compiled for non-partial matching.
|
none was compiled for non-partial matching.
|
||||||
|
|
||||||
If JIT compilation is successful, the compiled JIT code will automati-
|
If JIT compilation is successful, the compiled JIT code will automati-
|
||||||
cally be used when an appropriate type of match is run, except when
|
cally be used when an appropriate type of match is run, except when
|
||||||
incompatible run-time options are specified. For more details, see the
|
incompatible run-time options are specified. For more details, see the
|
||||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||||
of setting the size of the JIT stack.
|
of setting the size of the JIT stack.
|
||||||
|
|
||||||
If the jitfast modifier is specified, matching is done using the JIT
|
If the jitfast modifier is specified, matching is done using the JIT
|
||||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||||
ity checks that are done by pcre2_match(), and of course does not work
|
ity checks that are done by pcre2_match(), and of course does not work
|
||||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||||
is assumed.
|
is assumed.
|
||||||
|
|
||||||
If the jitverify modifier is specified, information about the compiled
|
If the jitverify modifier is specified, information about the compiled
|
||||||
pattern shows whether JIT compilation was or was not successful. If
|
pattern shows whether JIT compilation was or was not successful. If
|
||||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||||
the first output line after a match or non-match when JIT-compiled code
|
the first output line after a match or non-match when JIT-compiled code
|
||||||
was actually used in the match.
|
was actually used in the match.
|
||||||
|
|
||||||
|
@ -695,18 +707,18 @@ PATTERN MODIFIERS
|
||||||
/pattern/locale=fr_FR
|
/pattern/locale=fr_FR
|
||||||
|
|
||||||
The given locale is set, pcre2_maketables() is called to build a set of
|
The given locale is set, pcre2_maketables() is called to build a set of
|
||||||
character tables for the locale, and this is then passed to pcre2_com-
|
character tables for the locale, and this is then passed to pcre2_com-
|
||||||
pile() when compiling the regular expression. The same tables are used
|
pile() when compiling the regular expression. The same tables are used
|
||||||
when matching the following subject lines. The /locale modifier applies
|
when matching the following subject lines. The /locale modifier applies
|
||||||
only to the pattern on which it appears, but can be given in a #pattern
|
only to the pattern on which it appears, but can be given in a #pattern
|
||||||
command if a default is needed. Setting a locale and alternate charac-
|
command if a default is needed. Setting a locale and alternate charac-
|
||||||
ter tables are mutually exclusive.
|
ter tables are mutually exclusive.
|
||||||
|
|
||||||
Showing pattern memory
|
Showing pattern memory
|
||||||
|
|
||||||
The /memory modifier causes the size in bytes of the memory used to
|
The /memory modifier causes the size in bytes of the memory used to
|
||||||
hold the compiled pattern to be output. This does not include the size
|
hold the compiled pattern to be output. This does not include the size
|
||||||
of the pcre2_code block; it is just the actual compiled data. If the
|
of the pcre2_code block; it is just the actual compiled data. If the
|
||||||
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
||||||
compiled code is also output. Here is an example:
|
compiled code is also output. Here is an example:
|
||||||
|
|
||||||
|
@ -717,31 +729,31 @@ PATTERN MODIFIERS
|
||||||
|
|
||||||
Limiting nested parentheses
|
Limiting nested parentheses
|
||||||
|
|
||||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||||
parentheses in a pattern. Breaching the limit causes a compilation
|
parentheses in a pattern. Breaching the limit causes a compilation
|
||||||
error. The default for the library is set when PCRE2 is built, but
|
error. The default for the library is set when PCRE2 is built, but
|
||||||
pcre2test sets its own default of 220, which is required for running
|
pcre2test sets its own default of 220, which is required for running
|
||||||
the standard test suite.
|
the standard test suite.
|
||||||
|
|
||||||
Limiting the pattern length
|
Limiting the pattern length
|
||||||
|
|
||||||
The max_pattern_length modifier sets a limit, in code units, to the
|
The max_pattern_length modifier sets a limit, in code units, to the
|
||||||
length of pattern that pcre2_compile() will accept. Breaching the limit
|
length of pattern that pcre2_compile() will accept. Breaching the limit
|
||||||
causes a compilation error. The default is the largest number a
|
causes a compilation error. The default is the largest number a
|
||||||
PCRE2_SIZE variable can hold (essentially unlimited).
|
PCRE2_SIZE variable can hold (essentially unlimited).
|
||||||
|
|
||||||
Using the POSIX wrapper API
|
Using the POSIX wrapper API
|
||||||
|
|
||||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
|
||||||
per API rather than its native API. This supports only the 8-bit
|
the POSIX wrapper API rather than its native API. When posix_nosub is
|
||||||
library. Note that it does not imply POSIX matching semantics; for
|
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
|
||||||
more detail see the pcre2posix documentation. When the POSIX API is
|
wrapper supports only the 8-bit library. Note that it does not imply
|
||||||
being used, the following pattern modifiers set options for the reg-
|
POSIX matching semantics; for more detail see the pcre2posix documenta-
|
||||||
comp() function:
|
tion. The following pattern modifiers set options for the regcomp()
|
||||||
|
function:
|
||||||
|
|
||||||
caseless REG_ICASE
|
caseless REG_ICASE
|
||||||
multiline REG_NEWLINE
|
multiline REG_NEWLINE
|
||||||
no_auto_capture REG_NOSUB
|
|
||||||
dotall REG_DOTALL )
|
dotall REG_DOTALL )
|
||||||
ungreedy REG_UNGREEDY ) These options are not part of
|
ungreedy REG_UNGREEDY ) These options are not part of
|
||||||
ucp REG_UCP ) the POSIX standard
|
ucp REG_UCP ) the POSIX standard
|
||||||
|
@ -758,23 +770,24 @@ PATTERN MODIFIERS
|
||||||
been set, a large buffer is used.
|
been set, a large buffer is used.
|
||||||
|
|
||||||
The aftertext and allaftertext subject modifiers work as described
|
The aftertext and allaftertext subject modifiers work as described
|
||||||
below. All other modifiers cause an error.
|
below. All other modifiers are either ignored, with a warning message,
|
||||||
|
or cause an error.
|
||||||
|
|
||||||
Testing the stack guard feature
|
Testing the stack guard feature
|
||||||
|
|
||||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||||
pile_recursion_guard(), a function that is provided to enable stack
|
pile_recursion_guard(), a function that is provided to enable stack
|
||||||
availability to be checked during compilation (see the pcre2api docu-
|
availability to be checked during compilation (see the pcre2api docu-
|
||||||
mentation for details). If the number specified by the modifier is
|
mentation for details). If the number specified by the modifier is
|
||||||
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
||||||
up a callback from pcre2_compile() to a local function. The argument it
|
up a callback from pcre2_compile() to a local function. The argument it
|
||||||
receives is the current nesting parenthesis depth; if this is greater
|
receives is the current nesting parenthesis depth; if this is greater
|
||||||
than the value given by the modifier, non-zero is returned, causing the
|
than the value given by the modifier, non-zero is returned, causing the
|
||||||
compilation to be aborted.
|
compilation to be aborted.
|
||||||
|
|
||||||
Using alternative character tables
|
Using alternative character tables
|
||||||
|
|
||||||
The value specified for the /tables modifier must be one of the digits
|
The value specified for the /tables modifier must be one of the digits
|
||||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||||
haviour with different character tables. The digit specifies the tables
|
haviour with different character tables. The digit specifies the tables
|
||||||
|
@ -785,15 +798,15 @@ PATTERN MODIFIERS
|
||||||
pcre2_chartables.c.dist
|
pcre2_chartables.c.dist
|
||||||
2 a set of tables defining ISO 8859 characters
|
2 a set of tables defining ISO 8859 characters
|
||||||
|
|
||||||
In table 2, some characters whose codes are greater than 128 are iden-
|
In table 2, some characters whose codes are greater than 128 are iden-
|
||||||
tified as letters, digits, spaces, etc. Setting alternate character
|
tified as letters, digits, spaces, etc. Setting alternate character
|
||||||
tables and a locale are mutually exclusive.
|
tables and a locale are mutually exclusive.
|
||||||
|
|
||||||
Setting certain match controls
|
Setting certain match controls
|
||||||
|
|
||||||
The following modifiers are really subject modifiers, and are described
|
The following modifiers are really subject modifiers, and are described
|
||||||
below. However, they may be included in a pattern's modifier list, in
|
below. However, they may be included in a pattern's modifier list, in
|
||||||
which case they are applied to every subject line that is processed
|
which case they are applied to every subject line that is processed
|
||||||
with that pattern. They may not appear in #pattern commands. These mod-
|
with that pattern. They may not appear in #pattern commands. These mod-
|
||||||
ifiers do not affect the compilation process.
|
ifiers do not affect the compilation process.
|
||||||
|
|
||||||
|
@ -810,20 +823,20 @@ PATTERN MODIFIERS
|
||||||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||||
|
|
||||||
These modifiers may not appear in a #pattern command. If you want them
|
These modifiers may not appear in a #pattern command. If you want them
|
||||||
as defaults, set them in a #subject command.
|
as defaults, set them in a #subject command.
|
||||||
|
|
||||||
Saving a compiled pattern
|
Saving a compiled pattern
|
||||||
|
|
||||||
When a pattern with the push modifier is successfully compiled, it is
|
When a pattern with the push modifier is successfully compiled, it is
|
||||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||||
next line to contain a new pattern (or a command) instead of a subject
|
next line to contain a new pattern (or a command) instead of a subject
|
||||||
line. This facility is used when saving compiled patterns to a file, as
|
line. This facility is used when saving compiled patterns to a file, as
|
||||||
described in the section entitled "Saving and restoring compiled pat-
|
described in the section entitled "Saving and restoring compiled pat-
|
||||||
terns" below. The push modifier is incompatible with compilation modi-
|
terns" below. The push modifier is incompatible with compilation modi-
|
||||||
fiers such as global that act at match time. Any that are specified are
|
fiers such as global that act at match time. Any that are specified are
|
||||||
ignored, with a warning message, except for replace, which causes an
|
ignored, with a warning message, except for replace, which causes an
|
||||||
error. Note that jitverify, which is allowed, does not carry through
|
error. Note that jitverify, which is allowed, does not carry through
|
||||||
to any subsequent matching that uses this pattern.
|
to any subsequent matching that uses this pattern.
|
||||||
|
|
||||||
|
|
||||||
|
@ -834,7 +847,7 @@ SUBJECT MODIFIERS
|
||||||
|
|
||||||
Setting match options
|
Setting match options
|
||||||
|
|
||||||
The following modifiers set options for pcre2_match() or
|
The following modifiers set options for pcre2_match() or
|
||||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||||
|
|
||||||
anchored set PCRE2_ANCHORED
|
anchored set PCRE2_ANCHORED
|
||||||
|
@ -848,20 +861,20 @@ SUBJECT MODIFIERS
|
||||||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||||
|
|
||||||
The partial matching modifiers are provided with abbreviations because
|
The partial matching modifiers are provided with abbreviations because
|
||||||
they appear frequently in tests.
|
they appear frequently in tests.
|
||||||
|
|
||||||
If the /posix modifier was present on the pattern, causing the POSIX
|
If the /posix modifier was present on the pattern, causing the POSIX
|
||||||
wrapper API to be used, the only option-setting modifiers that have any
|
wrapper API to be used, the only option-setting modifiers that have any
|
||||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||||
Any other modifiers cause an error.
|
The other modifiers are ignored, with a warning message.
|
||||||
|
|
||||||
Setting match controls
|
Setting match controls
|
||||||
|
|
||||||
The following modifiers affect the matching process or request addi-
|
The following modifiers affect the matching process or request addi-
|
||||||
tional information. Some of them may also be specified on a pattern
|
tional information. Some of them may also be specified on a pattern
|
||||||
line (see above), in which case they apply to every subject line that
|
line (see above), in which case they apply to every subject line that
|
||||||
is matched against that pattern.
|
is matched against that pattern.
|
||||||
|
|
||||||
aftertext show text after match
|
aftertext show text after match
|
||||||
|
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
|
||||||
zero_terminate pass the subject as zero-terminated
|
zero_terminate pass the subject as zero-terminated
|
||||||
|
|
||||||
The effects of these modifiers are described in the following sections.
|
The effects of these modifiers are described in the following sections.
|
||||||
|
When matching via the POSIX wrapper API, the aftertext, allaftertext,
|
||||||
|
and ovector subject modifiers work as described below. All other modi-
|
||||||
|
fiers are either ignored, with a warning message, or cause an error.
|
||||||
|
|
||||||
Showing more text
|
Showing more text
|
||||||
|
|
||||||
|
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
|
||||||
matched with the pattern, terminated as usual by an empty line or end
|
matched with the pattern, terminated as usual by an empty line or end
|
||||||
of file. This command may be followed by a modifier list containing
|
of file. This command may be followed by a modifier list containing
|
||||||
only control modifiers that act after a pattern has been compiled. In
|
only control modifiers that act after a pattern has been compiled. In
|
||||||
particular, hex, posix, and push are not allowed, nor are any option-
|
particular, hex, posix, posix_nosub, and push are not allowed, nor are
|
||||||
setting modifiers. The JIT modifiers are, however, permitted. Here is
|
any option-setting modifiers. The JIT modifiers are, however, permit-
|
||||||
an example that saves and reloads two patterns.
|
ted. Here is an example that saves and reloads two patterns.
|
||||||
|
|
||||||
/abc/push
|
/abc/push
|
||||||
/xyz/push
|
/xyz/push
|
||||||
|
@ -1505,5 +1521,5 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 12 December 2015
|
Last updated: 31 January 2016
|
||||||
Copyright (c) 1997-2015 University of Cambridge.
|
Copyright (c) 1997-2016 University of Cambridge.
|
||||||
|
|
|
@ -3,28 +3,31 @@
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* This is a demonstration program to illustrate a straightforward way of
|
/* This is a demonstration program to illustrate a straightforward way of
|
||||||
calling the PCRE2 regular expression library from a C program. See the
|
using the PCRE2 regular expression library from a C program. See the
|
||||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||||
incompatible with the original PCRE API.
|
incompatible with the original PCRE API.
|
||||||
|
|
||||||
There are actually three libraries, each supporting a different code unit
|
There are actually three libraries, each supporting a different code unit
|
||||||
width. This demonstration program uses the 8-bit library.
|
width. This demonstration program uses the 8-bit library. The default is to
|
||||||
|
process each code unit as a separate character, but if the pattern begins with
|
||||||
|
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||||
|
characters may occupy multiple code units.
|
||||||
|
|
||||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||||
libraries, you should be able to compile this program using this command:
|
libraries, you should be able to compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||||
compile this program using this command:
|
compile this program using this command:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||||
|
|
||||||
If you do not have pkg-config, you may have to use this:
|
If you do not have pkg-config, you may have to use something like this:
|
||||||
|
|
||||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||||
|
|
||||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||||
|
@ -39,9 +42,14 @@ the following line. */
|
||||||
|
|
||||||
/* #define PCRE2_STATIC */
|
/* #define PCRE2_STATIC */
|
||||||
|
|
||||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||||
only one code unit width, it makes it possible to use generic function names
|
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||||
such as pcre2_compile(). */
|
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||||
|
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||||
|
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||||
|
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||||
|
characters, the code for handling the table of named substrings will still need
|
||||||
|
to be modified. */
|
||||||
|
|
||||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
@ -62,19 +70,19 @@ int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
pcre2_code *re;
|
pcre2_code *re;
|
||||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||||
PCRE2_SPTR name_table;
|
PCRE2_SPTR name_table;
|
||||||
|
|
||||||
int crlf_is_newline;
|
int crlf_is_newline;
|
||||||
int errornumber;
|
int errornumber;
|
||||||
int find_all;
|
int find_all;
|
||||||
int i;
|
int i;
|
||||||
int namecount;
|
|
||||||
int name_entry_size;
|
|
||||||
int rc;
|
int rc;
|
||||||
int utf8;
|
int utf8;
|
||||||
|
|
||||||
uint32_t option_bits;
|
uint32_t option_bits;
|
||||||
|
uint32_t namecount;
|
||||||
|
uint32_t name_entry_size;
|
||||||
uint32_t newline;
|
uint32_t newline;
|
||||||
|
|
||||||
PCRE2_SIZE erroroffset;
|
PCRE2_SIZE erroroffset;
|
||||||
|
@ -89,15 +97,19 @@ pcre2_match_data *match_data;
|
||||||
* First, sort out the command line. There is only one possible option at *
|
* First, sort out the command line. There is only one possible option at *
|
||||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||||
* if the -g option is present. Apart from that, there must be exactly two *
|
* if the -g option is present. *
|
||||||
* arguments. *
|
|
||||||
**************************************************************************/
|
**************************************************************************/
|
||||||
|
|
||||||
find_all = 0;
|
find_all = 0;
|
||||||
for (i = 1; i < argc; i++)
|
for (i = 1; i < argc; i++)
|
||||||
{
|
{
|
||||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||||
else break;
|
else if (argv[i][0] == '-')
|
||||||
|
{
|
||||||
|
printf("Unrecognised option %s\n", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
else break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* After the options, we require exactly two arguments, which are the pattern,
|
/* After the options, we require exactly two arguments, which are the pattern,
|
||||||
|
@ -105,7 +117,7 @@ and the subject string. */
|
||||||
|
|
||||||
if (argc - i != 2)
|
if (argc - i != 2)
|
||||||
{
|
{
|
||||||
printf("Two arguments required: a regex and a subject string\n");
|
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,7 +196,7 @@ if (rc < 0)
|
||||||
stored. */
|
stored. */
|
||||||
|
|
||||||
ovector = pcre2_get_ovector_pointer(match_data);
|
ovector = pcre2_get_ovector_pointer(match_data);
|
||||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
|
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
|
||||||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||||
&namecount); /* where to put the answer */
|
&namecount); /* where to put the answer */
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr;
|
PCRE2_SPTR tabptr;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
@ -313,8 +325,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
uint32_t options = 0; /* Normally no options */
|
uint32_t options = 0; /* Normally no options */
|
||||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||||
|
|
||||||
/* If the previous match was for an empty string, we are finished if we are
|
/* If the previous match was for an empty string, we are finished if we are
|
||||||
at the end of the subject. Otherwise, arrange to run another match at the
|
at the end of the subject. Otherwise, arrange to run another match at the
|
||||||
|
@ -354,7 +366,7 @@ for (;;)
|
||||||
{
|
{
|
||||||
if (options == 0) break; /* All matches found */
|
if (options == 0) break; /* All matches found */
|
||||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||||
if (crlf_is_newline && /* If CRLF is newline & */
|
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||||
subject[start_offset] == '\r' &&
|
subject[start_offset] == '\r' &&
|
||||||
subject[start_offset + 1] == '\n')
|
subject[start_offset + 1] == '\n')
|
||||||
|
@ -400,7 +412,7 @@ for (;;)
|
||||||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (namecount <= 0) printf("No named substrings\n"); else
|
if (namecount == 0) printf("No named substrings\n"); else
|
||||||
{
|
{
|
||||||
PCRE2_SPTR tabptr = name_table;
|
PCRE2_SPTR tabptr = name_table;
|
||||||
printf("Named substrings\n");
|
printf("Named substrings\n");
|
||||||
|
|
Loading…
Reference in New Issue