Tidy pcre2demo.c
This commit is contained in:
parent
6c1c817438
commit
4e67c0c9e9
|
@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
|
|||
modifier had this effect. That option is now ignored when the POSIX API is in
|
||||
use.
|
||||
|
||||
8. Minor tidies to the pcre2demo.c sample program, including more comments
|
||||
about its 8-bit-ness.
|
||||
|
||||
|
||||
Version 10.21 12-January-2016
|
||||
-----------------------------
|
||||
|
|
|
@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
|
|||
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
|
||||
were followed by ?: but named parentheses can still be used for capturing (and
|
||||
they acquire numbers in the usual way). There is no equivalent of this option
|
||||
in Perl.
|
||||
in Perl. Note that, if this option is set, references to capturing groups (back
|
||||
references or recursion/subroutine calls) may only refer to named groups,
|
||||
though the reference can be by name or by number.
|
||||
<pre>
|
||||
PCRE2_NO_AUTO_POSSESS
|
||||
</pre>
|
||||
|
@ -3121,9 +3123,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 16 December 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -56,9 +59,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else break;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\n", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
/* After the options, we require exactly two arguments, which are the pattern,
|
||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\n");
|
||||
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\n");
|
||||
|
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
|||
|
||||
for (;;)
|
||||
{
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
|
||||
/* If the previous match was for an empty string, we are finished if we are
|
||||
at the end of the subject. Otherwise, arrange to run another match at the
|
||||
|
@ -371,7 +383,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\r' &&
|
||||
subject[start_offset + 1] == '\n')
|
||||
|
@ -417,7 +429,7 @@ for (;;)
|
|||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\n");
|
||||
|
|
|
@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
|
|||
<a href="#lookbehind">(described below)</a>
|
||||
in a UTF mode, because this would make it impossible to calculate the length of
|
||||
the lookbehind. Neither the alternative matching function
|
||||
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
|
||||
<b>pcre2_dfa_match()</b> nor the JIT optimizer supports \C in a UTF mode. The
|
||||
former gives a match-time error; the latter fails to optimize and so the match
|
||||
is always run using the interpreter.
|
||||
</P>
|
||||
|
|
|
@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
|
|||
expression 8-bit library. See the
|
||||
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||
documentation for a description of PCRE2's native API, which contains much
|
||||
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
|
||||
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
|
||||
and 32-bit libraries.
|
||||
</P>
|
||||
<P>
|
||||
|
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
|
|||
replacement library. Other POSIX options are not even defined.
|
||||
</P>
|
||||
<P>
|
||||
There are also some other options that are not defined by POSIX. These have
|
||||
been added at the request of users who want to make use of certain
|
||||
PCRE2-specific features via the POSIX calling interface.
|
||||
There are also some options that are not defined by POSIX. These have been
|
||||
added at the request of users who want to make use of certain PCRE2-specific
|
||||
features via the POSIX calling interface.
|
||||
</P>
|
||||
<P>
|
||||
When PCRE2 is called via these functions, it is only the API that is POSIX-like
|
||||
|
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
|
|||
<pre>
|
||||
REG_NOSUB
|
||||
</pre>
|
||||
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
|
||||
for compilation to the native function. In addition, when a pattern that is
|
||||
compiled with this flag is passed to <b>regexec()</b> for matching, the
|
||||
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
|
||||
are returned.
|
||||
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
|
||||
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
|
||||
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
|
||||
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
|
||||
because it disables the use of back references.
|
||||
<pre>
|
||||
REG_UCP
|
||||
</pre>
|
||||
|
@ -241,14 +241,15 @@ mutually exclusive; the error REG_INVARG is returned.
|
|||
<P>
|
||||
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
|
||||
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
|
||||
<b>regexec()</b> are ignored.
|
||||
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
|
||||
</P>
|
||||
<P>
|
||||
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
|
||||
no data about any matched strings is returned.
|
||||
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
|
||||
(unless REG_STARTEND is set); in both these cases no data about any matched
|
||||
strings is returned.
|
||||
</P>
|
||||
<P>
|
||||
Otherwise,the portion of the string that was matched, and also any captured
|
||||
Otherwise, the portion of the string that was matched, and also any captured
|
||||
substrings, are returned via the <i>pmatch</i> argument, which points to an
|
||||
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
|
||||
members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first
|
||||
|
@ -290,9 +291,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 29 November 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
|
|||
save this listing to re-create the contents of <i>pcre2demo.c</i>.
|
||||
</P>
|
||||
<P>
|
||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
||||
regular expression that is its first argument, and matches it against the
|
||||
subject string in its second argument. No PCRE2 options are set, and default
|
||||
character tables are used. If matching succeeds, the program outputs the
|
||||
portion of the subject that matched, together with the contents of any captured
|
||||
substrings.
|
||||
The demonstration program compiles the regular expression that is its
|
||||
first argument, and matches it against the subject string in its second
|
||||
argument. No PCRE2 options are set, and default character tables are used. If
|
||||
matching succeeds, the program outputs the portion of the subject that matched,
|
||||
together with the contents of any captured substrings.
|
||||
</P>
|
||||
<P>
|
||||
If the -g option is given on the command line, the program then goes on to
|
||||
|
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
|
|||
an empty string. Comments in the code explain what is going on.
|
||||
</P>
|
||||
<P>
|
||||
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
|
||||
library. It handles strings and characters that are stored in 8-bit code units.
|
||||
By default, one character corresponds to one code unit, but if the pattern
|
||||
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||
where characters may occupy multiple code units.
|
||||
</P>
|
||||
<P>
|
||||
If PCRE2 is installed in the standard include and library directories for your
|
||||
operating system, you should be able to compile the demonstration program using
|
||||
this command:
|
||||
a command like this:
|
||||
<pre>
|
||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
</pre>
|
||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||
<i>/usr/local</i>, you can compile the demonstration program using a command
|
||||
like this:
|
||||
<pre>
|
||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||
|
||||
</PRE>
|
||||
</P>
|
||||
<P>
|
||||
Once you have compiled and linked the demonstration program, you can run simple
|
||||
tests like this:
|
||||
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
|
||||
</pre>
|
||||
Once you have built the demonstration program, you can run simple tests like
|
||||
this:
|
||||
<pre>
|
||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||
</pre>
|
||||
Note that there is a much more comprehensive test program, called
|
||||
<a href="pcre2test.html"><b>pcre2test</b>,</a>
|
||||
which supports many more facilities for testing regular expressions using the
|
||||
PCRE2 libraries. The
|
||||
which supports many more facilities for testing regular expressions using all
|
||||
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||
installed). The
|
||||
<a href="pcre2demo.html"><b>pcre2demo</b></a>
|
||||
program is provided as a simple coding example.
|
||||
program is provided as a relatively simple coding example.
|
||||
</P>
|
||||
<P>
|
||||
If you try to run
|
||||
|
@ -73,7 +77,7 @@ If you try to run
|
|||
when PCRE2 is not installed in the standard library directory, you may get an
|
||||
error like this on some operating systems (e.g. Solaris):
|
||||
<pre>
|
||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
||||
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||
</pre>
|
||||
This is caused by the way shared library support works on those systems. You
|
||||
need to add
|
||||
|
@ -97,9 +101,9 @@ Cambridge, England.
|
|||
REVISION
|
||||
</b><br>
|
||||
<P>
|
||||
Last updated: 20 October 2014
|
||||
Last updated: 02 February 2016
|
||||
<br>
|
||||
Copyright © 1997-2014 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
|
@ -98,10 +98,11 @@ further data is read.
|
|||
</P>
|
||||
<P>
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in <b>pcre2test</b> input files. There is a facility for specifying a
|
||||
pattern's characters as hexadecimal pairs, thus making it possible to include
|
||||
binary zeroes in a pattern for testing purposes. Subject lines are processed
|
||||
for backslash escapes, which makes it possible to include any data value.
|
||||
characters in <b>pcre2test</b> input files. There is a facility for specifying
|
||||
some or all of a pattern's characters as hexadecimal pairs, thus making it
|
||||
possible to include binary zeroes in a pattern for testing purposes. Subject
|
||||
lines are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
</P>
|
||||
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
|
||||
<P>
|
||||
|
@ -559,7 +560,7 @@ about the pattern:
|
|||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
hex unquoted characters are hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
|
@ -570,6 +571,7 @@ about the pattern:
|
|||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
posix_nosub use the POSIX API with REG_NOSUB
|
||||
push push compiled pattern onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
tables=[0|1|2] select internal tables
|
||||
|
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
|
|||
default values).
|
||||
</P>
|
||||
<br><b>
|
||||
Specifying a pattern in hex
|
||||
Specifying pattern characters in hexadecimal
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>hex</b> modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted between
|
||||
pairs. For example:
|
||||
The <b>hex</b> modifier specifies that the characters of the pattern, except for
|
||||
substrings enclosed in single or double quotes, are to be interpreted as pairs
|
||||
of hexadecimal digits. This feature is provided as a way of creating patterns
|
||||
that contain binary zeros and other non-printing characters. White space is
|
||||
permitted between pairs of digits. For example, this pattern contains three
|
||||
characters:
|
||||
<pre>
|
||||
/ab 32 59/hex
|
||||
</pre>
|
||||
This feature is provided as a way of creating patterns that contain binary zero
|
||||
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
|
||||
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
|
||||
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
|
||||
actual length of the pattern is passed.
|
||||
Parts of such a pattern are taken literally if quoted. This pattern contains
|
||||
nine characters, only two of which are specified in hexadecimal:
|
||||
<pre>
|
||||
/ab "literal" 32/hex
|
||||
</pre>
|
||||
Either single or double quotes may be used. There is no way of including
|
||||
the delimiter within a substring.
|
||||
</P>
|
||||
<P>
|
||||
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
|
||||
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
|
||||
patterns specified with the <b>hex</b> modifier, the actual length of the
|
||||
pattern is passed.
|
||||
</P>
|
||||
<br><b>
|
||||
Generating long repetitive patterns
|
||||
|
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
|
|||
Using the POSIX wrapper API
|
||||
</b><br>
|
||||
<P>
|
||||
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
|
||||
wrapper API rather than its native API. This supports only the 8-bit library.
|
||||
Note that it does not imply POSIX matching semantics; for more detail see the
|
||||
The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
|
||||
PCRE2 via the POSIX wrapper API rather than its native API. When
|
||||
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
|
||||
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
|
||||
it does not imply POSIX matching semantics; for more detail see the
|
||||
<a href="pcre2posix.html"><b>pcre2posix</b></a>
|
||||
documentation. When the POSIX API is being used, the following pattern
|
||||
modifiers set options for the <b>regcomp()</b> function:
|
||||
documentation. The following pattern modifiers set options for the
|
||||
<b>regcomp()</b> function:
|
||||
<pre>
|
||||
caseless REG_ICASE
|
||||
multiline REG_NEWLINE
|
||||
no_auto_capture REG_NOSUB
|
||||
dotall REG_DOTALL )
|
||||
ungreedy REG_UNGREEDY ) These options are not part of
|
||||
ucp REG_UCP ) the POSIX standard
|
||||
|
@ -847,7 +861,8 @@ large buffer is used.
|
|||
</P>
|
||||
<P>
|
||||
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
below. All other modifiers are either ignored, with a warning message, or cause
|
||||
an error.
|
||||
</P>
|
||||
<br><b>
|
||||
Testing the stack guard feature
|
||||
|
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
|
|||
wrapper API to be used, the only option-setting modifiers that have any effect
|
||||
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
|
||||
Any other modifiers cause an error.
|
||||
The other modifiers are ignored, with a warning message.
|
||||
</P>
|
||||
<br><b>
|
||||
Setting match controls
|
||||
|
@ -1001,7 +1016,10 @@ pattern.
|
|||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
zero_terminate pass the subject as zero-terminated
|
||||
</pre>
|
||||
The effects of these modifiers are described in the following sections.
|
||||
The effects of these modifiers are described in the following sections. When
|
||||
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
|
||||
and <b>ovector</b> subject modifiers work as described below. All other
|
||||
modifiers are either ignored, with a warning message, or cause an error.
|
||||
</P>
|
||||
<br><b>
|
||||
Showing more text
|
||||
|
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
|
|||
modifier list containing only
|
||||
<a href="#controlmodifiers">control modifiers</a>
|
||||
that act after a pattern has been compiled. In particular, <b>hex</b>,
|
||||
<b>posix</b>, and <b>push</b> are not allowed, nor are any
|
||||
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
|
||||
<a href="#optionmodifiers">option-setting modifiers.</a>
|
||||
The JIT modifiers are, however, permitted. Here is an example that saves and
|
||||
reloads two patterns.
|
||||
|
@ -1660,9 +1678,9 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 12 December 2015
|
||||
Last updated: 31 January 2016
|
||||
<br>
|
||||
Copyright © 1997-2015 University of Cambridge.
|
||||
Copyright © 1997-2016 University of Cambridge.
|
||||
<br>
|
||||
<p>
|
||||
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||
|
|
1549
doc/pcre2.txt
1549
doc/pcre2.txt
File diff suppressed because it is too large
Load Diff
|
@ -20,28 +20,31 @@
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -56,9 +59,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -79,19 +87,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else break;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\en", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
/* After the options, we require exactly two arguments, which are the pattern,
|
||||
|
@ -122,7 +134,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\en");
|
||||
printf("Exactly two arguments required: a regex and a subject string\en");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -201,7 +213,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\en", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\en"); else
|
||||
if (namecount == 0) printf("No named substrings\en"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\en");
|
||||
|
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
|||
|
||||
for (;;)
|
||||
{
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
|
||||
/* If the previous match was for an empty string, we are finished if we are
|
||||
at the end of the subject. Otherwise, arrange to run another match at the
|
||||
|
@ -371,7 +383,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\er' &&
|
||||
subject[start_offset + 1] == '\en')
|
||||
|
@ -417,7 +429,7 @@ for (;;)
|
|||
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\en"); else
|
||||
if (namecount == 0) printf("No named substrings\en"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\en");
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
|
||||
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 SAMPLE PROGRAM"
|
||||
|
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
|
|||
documentation. If you do not have a copy of the PCRE2 distribution, you can
|
||||
save this listing to re-create the contents of \fIpcre2demo.c\fP.
|
||||
.P
|
||||
The demonstration program, which uses the PCRE2 8-bit library, compiles the
|
||||
regular expression that is its first argument, and matches it against the
|
||||
subject string in its second argument. No PCRE2 options are set, and default
|
||||
character tables are used. If matching succeeds, the program outputs the
|
||||
portion of the subject that matched, together with the contents of any captured
|
||||
substrings.
|
||||
The demonstration program compiles the regular expression that is its
|
||||
first argument, and matches it against the subject string in its second
|
||||
argument. No PCRE2 options are set, and default character tables are used. If
|
||||
matching succeeds, the program outputs the portion of the subject that matched,
|
||||
together with the contents of any captured substrings.
|
||||
.P
|
||||
If the -g option is given on the command line, the program then goes on to
|
||||
check for further matches of the same regular expression in the same subject
|
||||
string. The logic is a little bit tricky because of the possibility of matching
|
||||
an empty string. Comments in the code explain what is going on.
|
||||
.P
|
||||
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
|
||||
library. It handles strings and characters that are stored in 8-bit code units.
|
||||
By default, one character corresponds to one code unit, but if the pattern
|
||||
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
|
||||
where characters may occupy multiple code units.
|
||||
.P
|
||||
If PCRE2 is installed in the standard include and library directories for your
|
||||
operating system, you should be able to compile the demonstration program using
|
||||
this command:
|
||||
a command like this:
|
||||
.sp
|
||||
gcc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
cc -o pcre2demo pcre2demo.c -lpcre2-8
|
||||
.sp
|
||||
If PCRE2 is installed elsewhere, you may need to add additional options to the
|
||||
command line. For example, on a Unix-like system that has PCRE2 installed in
|
||||
|
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
|
|||
like this:
|
||||
.sp
|
||||
.\" JOINSH
|
||||
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||
-L/usr/local/lib -lpcre2-8
|
||||
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
|
||||
-L/usr/local/lib -lpcre2-8
|
||||
.sp
|
||||
.P
|
||||
Once you have compiled and linked the demonstration program, you can run simple
|
||||
tests like this:
|
||||
Once you have built the demonstration program, you can run simple tests like
|
||||
this:
|
||||
.sp
|
||||
./pcre2demo 'cat|dog' 'the cat sat on the mat'
|
||||
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
|
||||
|
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
|
|||
.\" HREF
|
||||
\fBpcre2test\fP,
|
||||
.\"
|
||||
which supports many more facilities for testing regular expressions using the
|
||||
PCRE2 libraries. The
|
||||
which supports many more facilities for testing regular expressions using all
|
||||
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
|
||||
installed). The
|
||||
.\" HREF
|
||||
\fBpcre2demo\fP
|
||||
.\"
|
||||
program is provided as a simple coding example.
|
||||
program is provided as a relatively simple coding example.
|
||||
.P
|
||||
If you try to run
|
||||
.\" HREF
|
||||
|
@ -65,7 +70,7 @@ If you try to run
|
|||
when PCRE2 is not installed in the standard library directory, you may get an
|
||||
error like this on some operating systems (e.g. Solaris):
|
||||
.sp
|
||||
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
|
||||
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
|
||||
.sp
|
||||
This is caused by the way shared library support works on those systems. You
|
||||
need to add
|
||||
|
@ -89,6 +94,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 20 October 2014
|
||||
Copyright (c) 1997-2014 University of Cambridge.
|
||||
Last updated: 02 February 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -67,10 +67,10 @@ INPUT ENCODING
|
|||
|
||||
For maximum portability, therefore, it is safest to avoid non-printing
|
||||
characters in pcre2test input files. There is a facility for specifying
|
||||
a pattern's characters as hexadecimal pairs, thus making it possible to
|
||||
include binary zeroes in a pattern for testing purposes. Subject lines
|
||||
are processed for backslash escapes, which makes it possible to include
|
||||
any data value.
|
||||
some or all of a pattern's characters as hexadecimal pairs, thus making
|
||||
it possible to include binary zeroes in a pattern for testing purposes.
|
||||
Subject lines are processed for backslash escapes, which makes it pos-
|
||||
sible to include any data value.
|
||||
|
||||
|
||||
COMMAND LINE OPTIONS
|
||||
|
@ -505,7 +505,7 @@ PATTERN MODIFIERS
|
|||
debug same as info,fullbincode
|
||||
fullbincode show binary code with lengths
|
||||
/I info show info about compiled pattern
|
||||
hex pattern is coded in hexadecimal
|
||||
hex unquoted characters are hexadecimal
|
||||
jit[=<number>] use JIT
|
||||
jitfast use JIT fast path
|
||||
jitverify verify JIT use
|
||||
|
@ -516,6 +516,7 @@ PATTERN MODIFIERS
|
|||
null_context compile with a NULL context
|
||||
parens_nest_limit=<n> set maximum parentheses depth
|
||||
posix use the POSIX API
|
||||
posix_nosub use the POSIX API with REG_NOSUB
|
||||
push push compiled pattern onto the stack
|
||||
stackguard=<number> test the stackguard feature
|
||||
tables=[0|1|2] select internal tables
|
||||
|
@ -591,59 +592,70 @@ PATTERN MODIFIERS
|
|||
testing that pcre2_compile() behaves correctly in this case (it uses
|
||||
default values).
|
||||
|
||||
Specifying a pattern in hex
|
||||
Specifying pattern characters in hexadecimal
|
||||
|
||||
The hex modifier specifies that the characters of the pattern are to be
|
||||
interpreted as pairs of hexadecimal digits. White space is permitted
|
||||
between pairs. For example:
|
||||
The hex modifier specifies that the characters of the pattern, except
|
||||
for substrings enclosed in single or double quotes, are to be inter-
|
||||
preted as pairs of hexadecimal digits. This feature is provided as a
|
||||
way of creating patterns that contain binary zeros and other non-print-
|
||||
ing characters. White space is permitted between pairs of digits. For
|
||||
example, this pattern contains three characters:
|
||||
|
||||
/ab 32 59/hex
|
||||
|
||||
This feature is provided as a way of creating patterns that contain
|
||||
binary zero and other non-printing characters. By default, pcre2test
|
||||
passes patterns as zero-terminated strings to pcre2_compile(), giving
|
||||
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
|
||||
hexadecimal, the actual length of the pattern is passed.
|
||||
Parts of such a pattern are taken literally if quoted. This pattern
|
||||
contains nine characters, only two of which are specified in hexadeci-
|
||||
mal:
|
||||
|
||||
/ab "literal" 32/hex
|
||||
|
||||
Either single or double quotes may be used. There is no way of includ-
|
||||
ing the delimiter within a substring.
|
||||
|
||||
By default, pcre2test passes patterns as zero-terminated strings to
|
||||
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
|
||||
for patterns specified with the hex modifier, the actual length of the
|
||||
pattern is passed.
|
||||
|
||||
Generating long repetitive patterns
|
||||
|
||||
Some tests use long patterns that are very repetitive. Instead of cre-
|
||||
ating a very long input line for such a pattern, you can use a special
|
||||
repetition feature, similar to the one described for subject lines
|
||||
above. If the expand modifier is present on a pattern, parts of the
|
||||
Some tests use long patterns that are very repetitive. Instead of cre-
|
||||
ating a very long input line for such a pattern, you can use a special
|
||||
repetition feature, similar to the one described for subject lines
|
||||
above. If the expand modifier is present on a pattern, parts of the
|
||||
pattern that have the form
|
||||
|
||||
\[<characters>]{<count>}
|
||||
|
||||
are expanded before the pattern is passed to pcre2_compile(). For exam-
|
||||
ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
|
||||
cannot be nested. An initial "\[" sequence is recognized only if "]{"
|
||||
followed by decimal digits and "}" is found later in the pattern. If
|
||||
cannot be nested. An initial "\[" sequence is recognized only if "]{"
|
||||
followed by decimal digits and "}" is found later in the pattern. If
|
||||
not, the characters remain in the pattern unaltered.
|
||||
|
||||
If part of an expanded pattern looks like an expansion, but is really
|
||||
If part of an expanded pattern looks like an expansion, but is really
|
||||
part of the actual pattern, unwanted expansion can be avoided by giving
|
||||
two values in the quantifier. For example, \[AB]{6000,6000} is not rec-
|
||||
ognized as an expansion item.
|
||||
|
||||
If the info modifier is set on an expanded pattern, the result of the
|
||||
If the info modifier is set on an expanded pattern, the result of the
|
||||
expansion is included in the information that is output.
|
||||
|
||||
JIT compilation
|
||||
|
||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||
details. JIT compiling happens, optionally, after a pattern has been
|
||||
successfully compiled into an internal form. The JIT compiler converts
|
||||
Just-in-time (JIT) compiling is a heavyweight optimization that can
|
||||
greatly speed up pattern matching. See the pcre2jit documentation for
|
||||
details. JIT compiling happens, optionally, after a pattern has been
|
||||
successfully compiled into an internal form. The JIT compiler converts
|
||||
this to optimized machine code. It needs to know whether the match-time
|
||||
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
|
||||
because different code is generated for the different cases. See the
|
||||
partial modifier in "Subject Modifiers" below for details of how these
|
||||
because different code is generated for the different cases. See the
|
||||
partial modifier in "Subject Modifiers" below for details of how these
|
||||
options are specified for each match attempt.
|
||||
|
||||
JIT compilation is requested by the /jit pattern modifier, which may
|
||||
JIT compilation is requested by the /jit pattern modifier, which may
|
||||
optionally be followed by an equals sign and a number in the range 0 to
|
||||
7. The three bits that make up the number specify which of the three
|
||||
7. The three bits that make up the number specify which of the three
|
||||
JIT operating modes are to be compiled:
|
||||
|
||||
1 compile JIT code for non-partial matching
|
||||
|
@ -660,31 +672,31 @@ PATTERN MODIFIERS
|
|||
6 soft and hard partial matching only
|
||||
7 all three modes
|
||||
|
||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||
If no number is given, 7 is assumed. The phrase "partial matching"
|
||||
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
|
||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
|
||||
plete match; the options enable the possibility of a partial match, but
|
||||
do not require it. Note also that if you request JIT compilation only
|
||||
for partial matching (for example, /jit=2) but do not set the partial
|
||||
modifier on a subject line, that match will not use JIT code because
|
||||
do not require it. Note also that if you request JIT compilation only
|
||||
for partial matching (for example, /jit=2) but do not set the partial
|
||||
modifier on a subject line, that match will not use JIT code because
|
||||
none was compiled for non-partial matching.
|
||||
|
||||
If JIT compilation is successful, the compiled JIT code will automati-
|
||||
cally be used when an appropriate type of match is run, except when
|
||||
incompatible run-time options are specified. For more details, see the
|
||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||
If JIT compilation is successful, the compiled JIT code will automati-
|
||||
cally be used when an appropriate type of match is run, except when
|
||||
incompatible run-time options are specified. For more details, see the
|
||||
pcre2jit documentation. See also the jitstack modifier below for a way
|
||||
of setting the size of the JIT stack.
|
||||
|
||||
If the jitfast modifier is specified, matching is done using the JIT
|
||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||
ity checks that are done by pcre2_match(), and of course does not work
|
||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||
If the jitfast modifier is specified, matching is done using the JIT
|
||||
"fast path" interface, pcre2_jit_match(), which skips some of the san-
|
||||
ity checks that are done by pcre2_match(), and of course does not work
|
||||
when JIT is not supported. If jitfast is specified without jit, jit=7
|
||||
is assumed.
|
||||
|
||||
If the jitverify modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
If the jitverify modifier is specified, information about the compiled
|
||||
pattern shows whether JIT compilation was or was not successful. If
|
||||
jitverify is specified without jit, jit=7 is assumed. If JIT compila-
|
||||
tion is successful when jitverify is set, the text "(JIT)" is added to
|
||||
the first output line after a match or non match when JIT-compiled code
|
||||
was actually used in the match.
|
||||
|
||||
|
@ -695,18 +707,18 @@ PATTERN MODIFIERS
|
|||
/pattern/locale=fr_FR
|
||||
|
||||
The given locale is set, pcre2_maketables() is called to build a set of
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
character tables for the locale, and this is then passed to pcre2_com-
|
||||
pile() when compiling the regular expression. The same tables are used
|
||||
when matching the following subject lines. The /locale modifier applies
|
||||
only to the pattern on which it appears, but can be given in a #pattern
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
command if a default is needed. Setting a locale and alternate charac-
|
||||
ter tables are mutually exclusive.
|
||||
|
||||
Showing pattern memory
|
||||
|
||||
The /memory modifier causes the size in bytes of the memory used to
|
||||
hold the compiled pattern to be output. This does not include the size
|
||||
of the pcre2_code block; it is just the actual compiled data. If the
|
||||
The /memory modifier causes the size in bytes of the memory used to
|
||||
hold the compiled pattern to be output. This does not include the size
|
||||
of the pcre2_code block; it is just the actual compiled data. If the
|
||||
pattern is subsequently passed to the JIT compiler, the size of the JIT
|
||||
compiled code is also output. Here is an example:
|
||||
|
||||
|
@ -717,31 +729,31 @@ PATTERN MODIFIERS
|
|||
|
||||
Limiting nested parentheses
|
||||
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
error. The default for the library is set when PCRE2 is built, but
|
||||
pcre2test sets its own default of 220, which is required for running
|
||||
The parens_nest_limit modifier sets a limit on the depth of nested
|
||||
parentheses in a pattern. Breaching the limit causes a compilation
|
||||
error. The default for the library is set when PCRE2 is built, but
|
||||
pcre2test sets its own default of 220, which is required for running
|
||||
the standard test suite.
|
||||
|
||||
Limiting the pattern length
|
||||
|
||||
The max_pattern_length modifier sets a limit, in code units, to the
|
||||
The max_pattern_length modifier sets a limit, in code units, to the
|
||||
length of pattern that pcre2_compile() will accept. Breaching the limit
|
||||
causes a compilation error. The default is the largest number a
|
||||
causes a compilation error. The default is the largest number a
|
||||
PCRE2_SIZE variable can hold (essentially unlimited).
|
||||
|
||||
Using the POSIX wrapper API
|
||||
|
||||
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
|
||||
per API rather than its native API. This supports only the 8-bit
|
||||
library. Note that it does not imply POSIX matching semantics; for
|
||||
more detail see the pcre2posix documentation. When the POSIX API is
|
||||
being used, the following pattern modifiers set options for the reg-
|
||||
comp() function:
|
||||
The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
|
||||
the POSIX wrapper API rather than its native API. When posix_nosub is
|
||||
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
|
||||
wrapper supports only the 8-bit library. Note that it does not imply
|
||||
POSIX matching semantics; for more detail see the pcre2posix documenta-
|
||||
tion. The following pattern modifiers set options for the regcomp()
|
||||
function:
|
||||
|
||||
caseless REG_ICASE
|
||||
multiline REG_NEWLINE
|
||||
posix_nosub REG_NOSUB
|
||||
dotall REG_DOTALL )
|
||||
ungreedy REG_UNGREEDY ) These options are not part of
|
||||
ucp REG_UCP ) the POSIX standard
|
||||
|
@ -758,23 +770,24 @@ PATTERN MODIFIERS
|
|||
been set, a large buffer is used.
|
||||
|
||||
The aftertext and allaftertext subject modifiers work as described
|
||||
below. All other modifiers cause an error.
|
||||
below. All other modifiers are either ignored, with a warning message,
|
||||
or cause an error.
|
||||
|
||||
Testing the stack guard feature
|
||||
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
The /stackguard modifier is used to test the use of pcre2_set_com-
|
||||
pile_recursion_guard(), a function that is provided to enable stack
|
||||
availability to be checked during compilation (see the pcre2api docu-
|
||||
mentation for details). If the number specified by the modifier is
|
||||
greater than zero, pcre2_set_compile_recursion_guard() is called to set
|
||||
up callback from pcre2_compile() to a local function. The argument it
|
||||
receives is the current nesting parenthesis depth; if this is greater
|
||||
up a callback from pcre2_compile() to a local function. The argument it
|
||||
receives is the current nesting parenthesis depth; if this is greater
|
||||
than the value given by the modifier, non-zero is returned, causing the
|
||||
compilation to be aborted.
|
||||
|
||||
Using alternative character tables
|
||||
|
||||
The value specified for the /tables modifier must be one of the digits
|
||||
The value specified for the /tables modifier must be one of the digits
|
||||
0, 1, or 2. It causes a specific set of built-in character tables to be
|
||||
passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
|
||||
haviour with different character tables. The digit specifies the tables
|
||||
|
@ -785,15 +798,15 @@ PATTERN MODIFIERS
|
|||
pcre2_chartables.c.dist
|
||||
2 a set of tables defining ISO 8859 characters
|
||||
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
In table 2, some characters whose codes are greater than 128 are iden-
|
||||
tified as letters, digits, spaces, etc. Setting alternate character
|
||||
tables and a locale are mutually exclusive.
|
||||
|
||||
Setting certain match controls
|
||||
|
||||
The following modifiers are really subject modifiers, and are described
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
below. However, they may be included in a pattern's modifier list, in
|
||||
which case they are applied to every subject line that is processed
|
||||
with that pattern. They may not appear in #pattern commands. These mod-
|
||||
ifiers do not affect the compilation process.
|
||||
|
||||
|
@ -810,20 +823,20 @@ PATTERN MODIFIERS
|
|||
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
|
||||
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
|
||||
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
These modifiers may not appear in a #pattern command. If you want them
|
||||
as defaults, set them in a #subject command.
|
||||
|
||||
Saving a compiled pattern
|
||||
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
When a pattern with the push modifier is successfully compiled, it is
|
||||
pushed onto a stack of compiled patterns, and pcre2test expects the
|
||||
next line to contain a new pattern (or a command) instead of a subject
|
||||
line. This facility is used when saving compiled patterns to a file, as
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
described in the section entitled "Saving and restoring compiled pat-
|
||||
terns" below. The push modifier is incompatible with compilation modi-
|
||||
fiers such as global that act at match time. Any that are specified are
|
||||
ignored, with a warning message, except for replace, which causes an
|
||||
error. Note that, jitverify, which is allowed, does not carry through
|
||||
ignored, with a warning message, except for replace, which causes an
|
||||
error. Note that jitverify, which is allowed, does not carry through
|
||||
to any subsequent matching that uses this pattern.
|
||||
|
||||
|
||||
|
@ -834,7 +847,7 @@ SUBJECT MODIFIERS
|
|||
|
||||
Setting match options
|
||||
|
||||
The following modifiers set options for pcre2_match() or
|
||||
The following modifiers set options for pcre2_match() or
|
||||
pcre2_dfa_match(). See pcre2api for a description of their effects.
|
||||
|
||||
anchored set PCRE2_ANCHORED
|
||||
|
@ -848,20 +861,20 @@ SUBJECT MODIFIERS
|
|||
partial_hard (or ph) set PCRE2_PARTIAL_HARD
|
||||
partial_soft (or ps) set PCRE2_PARTIAL_SOFT
|
||||
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
The partial matching modifiers are provided with abbreviations because
|
||||
they appear frequently in tests.
|
||||
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
If the /posix modifier was present on the pattern, causing the POSIX
|
||||
wrapper API to be used, the only option-setting modifiers that have any
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
Any other modifiers cause an error.
|
||||
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
|
||||
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
|
||||
The other modifiers are ignored, with a warning message.
|
||||
|
||||
Setting match controls
|
||||
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
The following modifiers affect the matching process or request addi-
|
||||
tional information. Some of them may also be specified on a pattern
|
||||
line (see above), in which case they apply to every subject line that
|
||||
is matched against that pattern.
|
||||
|
||||
aftertext show text after match
|
||||
|
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
|
|||
zero_terminate pass the subject as zero-terminated
|
||||
|
||||
The effects of these modifiers are described in the following sections.
|
||||
When matching via the POSIX wrapper API, the aftertext, allaftertext,
|
||||
and ovector subject modifiers work as described below. All other modi-
|
||||
fiers are either ignored, with a warning message, or cause an error.
|
||||
|
||||
Showing more text
|
||||
|
||||
|
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
|
|||
matched with the pattern, terminated as usual by an empty line or end
|
||||
of file. This command may be followed by a modifier list containing
|
||||
only control modifiers that act after a pattern has been compiled. In
|
||||
particular, hex, posix, and push are not allowed, nor are any option-
|
||||
setting modifiers. The JIT modifiers are, however permitted. Here is
|
||||
an example that saves and reloads two patterns.
|
||||
particular, hex, posix, posix_nosub, and push are not allowed, nor are
|
||||
any option-setting modifiers. The JIT modifiers are, however, permit-
|
||||
ted. Here is an example that saves and reloads two patterns.
|
||||
|
||||
/abc/push
|
||||
/xyz/push
|
||||
|
@ -1505,5 +1521,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 12 December 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
Last updated: 31 January 2016
|
||||
Copyright (c) 1997-2016 University of Cambridge.
|
||||
|
|
|
@ -3,28 +3,31 @@
|
|||
*************************************************/
|
||||
|
||||
/* This is a demonstration program to illustrate a straightforward way of
|
||||
calling the PCRE2 regular expression library from a C program. See the
|
||||
using the PCRE2 regular expression library from a C program. See the
|
||||
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
|
||||
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
|
||||
incompatible with the original PCRE API.
|
||||
|
||||
There are actually three libraries, each supporting a different code unit
|
||||
width. This demonstration program uses the 8-bit library.
|
||||
width. This demonstration program uses the 8-bit library. The default is to
|
||||
process each code unit as a separate character, but if the pattern begins with
|
||||
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
|
||||
characters may occupy multiple code units.
|
||||
|
||||
In Unix-like environments, if PCRE2 is installed in your standard system
|
||||
libraries, you should be able to compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
|
||||
|
||||
If PCRE2 is not installed in a standard place, it is likely to be installed
|
||||
with support for the pkg-config mechanism. If you have pkg-config, you can
|
||||
compile this program using this command:
|
||||
|
||||
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
|
||||
|
||||
If you do not have pkg-config, you may have to use this:
|
||||
If you do not have pkg-config, you may have to use something like this:
|
||||
|
||||
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
|
||||
-R/usr/local/lib -lpcre2-8 -o pcre2demo
|
||||
|
||||
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
|
||||
|
@ -39,9 +42,14 @@ the following line. */
|
|||
|
||||
/* #define PCRE2_STATIC */
|
||||
|
||||
/* This macro must be defined before including pcre2.h. For a program that uses
|
||||
only one code unit width, it makes it possible to use generic function names
|
||||
such as pcre2_compile(). */
|
||||
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
|
||||
For a program that uses only one code unit width, setting it to 8, 16, or 32
|
||||
makes it possible to use generic function names such as pcre2_compile(). Note
|
||||
that just changing 8 to 16 (for example) is not sufficient to convert this
|
||||
program to process 16-bit characters. Even in a fully 16-bit environment, where
|
||||
string-handling functions such as strcmp() and printf() work with 16-bit
|
||||
characters, the code for handling the table of named substrings will still need
|
||||
to be modified. */
|
||||
|
||||
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||
|
||||
|
@ -62,19 +70,19 @@ int main(int argc, char **argv)
|
|||
{
|
||||
pcre2_code *re;
|
||||
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
|
||||
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
|
||||
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
|
||||
PCRE2_SPTR name_table;
|
||||
|
||||
int crlf_is_newline;
|
||||
int errornumber;
|
||||
int find_all;
|
||||
int i;
|
||||
int namecount;
|
||||
int name_entry_size;
|
||||
int rc;
|
||||
int utf8;
|
||||
|
||||
uint32_t option_bits;
|
||||
uint32_t namecount;
|
||||
uint32_t name_entry_size;
|
||||
uint32_t newline;
|
||||
|
||||
PCRE2_SIZE erroroffset;
|
||||
|
@ -89,15 +97,19 @@ pcre2_match_data *match_data;
|
|||
* First, sort out the command line. There is only one possible option at *
|
||||
* the moment, "-g" to request repeated matching to find all occurrences, *
|
||||
* like Perl's /g option. We set the variable find_all to a non-zero value *
|
||||
* if the -g option is present. Apart from that, there must be exactly two *
|
||||
* arguments. *
|
||||
* if the -g option is present. *
|
||||
**************************************************************************/
|
||||
|
||||
find_all = 0;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
if (strcmp(argv[i], "-g") == 0) find_all = 1;
|
||||
else break;
|
||||
else if (argv[i][0] == '-')
|
||||
{
|
||||
printf("Unrecognised option %s\n", argv[i]);
|
||||
return 1;
|
||||
}
|
||||
else break;
|
||||
}
|
||||
|
||||
/* After the options, we require exactly two arguments, which are the pattern,
|
||||
|
@ -105,7 +117,7 @@ and the subject string. */
|
|||
|
||||
if (argc - i != 2)
|
||||
{
|
||||
printf("Two arguments required: a regex and a subject string\n");
|
||||
printf("Exactly two arguments required: a regex and a subject string\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -184,7 +196,7 @@ if (rc < 0)
|
|||
stored. */
|
||||
|
||||
ovector = pcre2_get_ovector_pointer(match_data);
|
||||
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
|
||||
printf("Match succeeded at offset %d\n", (int)ovector[0]);
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
|
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
|
|||
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
|
||||
&namecount); /* where to put the answer */
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr;
|
||||
printf("Named substrings\n");
|
||||
|
@ -313,8 +325,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
|
|||
|
||||
for (;;)
|
||||
{
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
uint32_t options = 0; /* Normally no options */
|
||||
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
|
||||
|
||||
/* If the previous match was for an empty string, we are finished if we are
|
||||
at the end of the subject. Otherwise, arrange to run another match at the
|
||||
|
@ -354,7 +366,7 @@ for (;;)
|
|||
{
|
||||
if (options == 0) break; /* All matches found */
|
||||
ovector[1] = start_offset + 1; /* Advance one code unit */
|
||||
if (crlf_is_newline && /* If CRLF is newline & */
|
||||
if (crlf_is_newline && /* If CRLF is a newline & */
|
||||
start_offset < subject_length - 1 && /* we are at CRLF, */
|
||||
subject[start_offset] == '\r' &&
|
||||
subject[start_offset + 1] == '\n')
|
||||
|
@ -400,7 +412,7 @@ for (;;)
|
|||
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
|
||||
}
|
||||
|
||||
if (namecount <= 0) printf("No named substrings\n"); else
|
||||
if (namecount == 0) printf("No named substrings\n"); else
|
||||
{
|
||||
PCRE2_SPTR tabptr = name_table;
|
||||
printf("Named substrings\n");
|
||||
|
|
Loading…
Reference in New Issue