Tidy pcre2demo.c

This commit is contained in:
Philip.Hazel 2016-02-02 16:25:47 +00:00
parent 6c1c817438
commit 4e67c0c9e9
12 changed files with 1116 additions and 1020 deletions

View File

@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
modifier had this effect. That option is now ignored when the POSIX API is in
use.
8. Minor tidies to the pcre2demo.c sample program, including more comments
about its 8-bit-ness.
Version 10.21 12-January-2016
-----------------------------

View File

@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
the pattern. Any opening parenthesis that is not followed by ? behaves as if it
were followed by ?: but named parentheses can still be used for capturing (and
they acquire numbers in the usual way). There is no equivalent of this option
in Perl.
in Perl. Note that, if this option is set, references to capturing groups (back
references or recursion/subroutine calls) may only refer to named groups,
though the reference can be by name or by number.
<pre>
PCRE2_NO_AUTO_POSSESS
</pre>
@ -3121,9 +3123,9 @@ Cambridge, England.
</P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P>
Last updated: 16 December 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i &lt; argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
}
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\n");
printf("Exactly two arguments required: a regex and a subject string\n");
return 1;
}
@ -201,7 +213,7 @@ if (rc &lt; 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
printf("Match succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&amp;namecount); /* where to put the answer */
if (namecount &lt;= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
@ -371,7 +383,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
if (crlf_is_newline &amp;&amp; /* If CRLF is a newline &amp; */
start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
subject[start_offset] == '\r' &amp;&amp;
subject[start_offset + 1] == '\n')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
if (namecount &lt;= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");

View File

@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The
<b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter.
</P>

View File

@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the
<a href="pcre2api.html"><b>pcre2api</b></a>
documentation for a description of PCRE2's native API, which contains much
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit
additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries.
</P>
<P>
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
replacement library. Other POSIX options are not even defined.
</P>
<P>
There are also some other options that are not defined by POSIX. These have
been added at the request of users who want to make use of certain
PCRE2-specific features via the POSIX calling interface.
There are also some options that are not defined by POSIX. These have been
added at the request of users who want to make use of certain PCRE2-specific
features via the POSIX calling interface.
</P>
<P>
When PCRE2 is called via these functions, it is only the API that is POSIX-like
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
<pre>
REG_NOSUB
</pre>
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed
for compilation to the native function. In addition, when a pattern that is
compiled with this flag is passed to <b>regexec()</b> for matching, the
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings
are returned.
When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
because it disables the use of back references.
<pre>
REG_UCP
</pre>
@ -241,11 +241,12 @@ mutually exclusive; the error REG_INVARG is returned.
<P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
<b>regexec()</b> are ignored.
<b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
</P>
<P>
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL,
no data about any matched strings is returned.
The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
(unless REG_STARTEND is set); in both these cases no data about any matched
strings is returned.
</P>
<P>
Otherwise, the portion of the string that was matched, and also any captured
@ -290,9 +291,9 @@ Cambridge, England.
</P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P>
Last updated: 29 November 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of <i>pcre2demo.c</i>.
</P>
<P>
The demonstration program, which uses the PCRE2 8-bit library, compiles the
regular expression that is its first argument, and matches it against the
subject string in its second argument. No PCRE2 options are set, and default
character tables are used. If matching succeeds, the program outputs the
portion of the subject that matched, together with the contents of any captured
substrings.
The demonstration program compiles the regular expression that is its
first argument, and matches it against the subject string in its second
argument. No PCRE2 options are set, and default character tables are used. If
matching succeeds, the program outputs the portion of the subject that matched,
together with the contents of any captured substrings.
</P>
<P>
If the -g option is given on the command line, the program then goes on to
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
</P>
<P>
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
</P>
<P>
If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using
this command:
a command like this:
<pre>
gcc -o pcre2demo pcre2demo.c -lpcre2-8
cc -o pcre2demo pcre2demo.c -lpcre2-8
</pre>
If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in
<i>/usr/local</i>, you can compile the demonstration program using a command
like this:
<pre>
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</PRE>
</P>
<P>
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</pre>
Once you have built the demonstration program, you can run simple tests like
this:
<pre>
./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
</pre>
Note that there is a much more comprehensive test program, called
<a href="pcre2test.html"><b>pcre2test</b>,</a>
which supports many more facilities for testing regular expressions using the
PCRE2 libraries. The
which supports many more facilities for testing regular expressions using all
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
<a href="pcre2demo.html"><b>pcre2demo</b></a>
program is provided as a simple coding example.
program is provided as a relatively simple coding example.
</P>
<P>
If you try to run
@ -73,7 +77,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris):
<pre>
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
</pre>
This is caused by the way shared library support works on those systems. You
need to add
@ -97,9 +101,9 @@ Cambridge, England.
REVISION
</b><br>
<P>
Last updated: 20 October 2014
Last updated: 02 February 2016
<br>
Copyright &copy; 1997-2014 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -98,10 +98,11 @@ further data is read.
</P>
<P>
For maximum portability, therefore, it is safest to avoid non-printing
characters in <b>pcre2test</b> input files. There is a facility for specifying a
pattern's characters as hexadecimal pairs, thus making it possible to include
binary zeroes in a pattern for testing purposes. Subject lines are processed
for backslash escapes, which makes it possible to include any data value.
characters in <b>pcre2test</b> input files. There is a facility for specifying
some or all of a pattern's characters as hexadecimal pairs, thus making it
possible to include binary zeroes in a pattern for testing purposes. Subject
lines are processed for backslash escapes, which makes it possible to include
any data value.
</P>
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P>
@ -559,7 +560,7 @@ about the pattern:
debug same as info,fullbincode
fullbincode show binary code with lengths
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
hex unquoted characters are hexadecimal
jit[=&#60;number&#62;] use JIT
jitfast use JIT fast path
jitverify verify JIT use
@ -570,6 +571,7 @@ about the pattern:
null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack
stackguard=&#60;number&#62; test the stackguard feature
tables=[0|1|2] select internal tables
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values).
</P>
<br><b>
Specifying a pattern in hex
Specifying pattern characters in hexadecimal
</b><br>
<P>
The <b>hex</b> modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted between
pairs. For example:
The <b>hex</b> modifier specifies that the characters of the pattern, except for
substrings enclosed in single or double quotes, are to be interpreted as pairs
of hexadecimal digits. This feature is provided as a way of creating patterns
that contain binary zeros and other non-printing characters. White space is
permitted between pairs of digits. For example, this pattern contains three
characters:
<pre>
/ab 32 59/hex
</pre>
This feature is provided as a way of creating patterns that contain binary zero
and other non-printing characters. By default, <b>pcre2test</b> passes patterns
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the
actual length of the pattern is passed.
Parts of such a pattern are taken literally if quoted. This pattern contains
nine characters, only two of which are specified in hexadecimal:
<pre>
/ab "literal" 32/hex
</pre>
Either single or double quotes may be used. There is no way of including
the delimiter within a substring.
</P>
<P>
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
patterns specified with the <b>hex</b> modifier, the actual length of the
pattern is passed.
</P>
<br><b>
Generating long repetitive patterns
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
Using the POSIX wrapper API
</b><br>
<P>
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX
wrapper API rather than its native API. This supports only the 8-bit library.
Note that it does not imply POSIX matching semantics; for more detail see the
The <b>posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
PCRE2 via the POSIX wrapper API rather than its native API. When
<b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
it does not imply POSIX matching semantics; for more detail see the
<a href="pcre2posix.html"><b>pcre2posix</b></a>
documentation. When the POSIX API is being used, the following pattern
modifiers set options for the <b>regcomp()</b> function:
documentation. The following pattern modifiers set options for the
<b>regcomp()</b> function:
<pre>
caseless REG_ICASE
multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard
@ -847,7 +861,8 @@ large buffer is used.
</P>
<P>
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
below. All other modifiers cause an error.
below. All other modifiers are either ignored, with a warning message, or cause
an error.
</P>
<br><b>
Testing the stack guard feature
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any effect
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
Any other modifiers cause an error.
The other modifiers are ignored, with a warning message.
</P>
<br><b>
Setting match controls
@ -1001,7 +1016,10 @@ pattern.
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
zero_terminate pass the subject as zero-terminated
</pre>
The effects of these modifiers are described in the following sections.
The effects of these modifiers are described in the following sections. When
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
and <b>ovector</b> subject modifiers work as described below. All other
modifiers are either ignored, with a warning message, or cause an error.
</P>
<br><b>
Showing more text
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
modifier list containing only
<a href="#controlmodifiers">control modifiers</a>
that act after a pattern has been compiled. In particular, <b>hex</b>,
<b>posix</b>, and <b>push</b> are not allowed, nor are any
<b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
<a href="#optionmodifiers">option-setting modifiers.</a>
The JIT modifiers are, however, permitted. Here is an example that saves and
reloads two patterns.
@ -1660,9 +1678,9 @@ Cambridge, England.
</P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P>
Last updated: 12 December 2015
Last updated: 31 January 2016
<br>
Copyright &copy; 1997-2015 University of Cambridge.
Copyright &copy; 1997-2016 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -1326,7 +1326,10 @@ COMPILING A PATTERN
theses in the pattern. Any opening parenthesis that is not followed by
? behaves as if it were followed by ?: but named parentheses can still
be used for capturing (and they acquire numbers in the usual way).
There is no equivalent of this option in Perl.
There is no equivalent of this option in Perl. Note that, if this
option is set, references to capturing groups (back references or
recursion/subroutine calls) may only refer to named groups, though the
reference can be by name or by number.
PCRE2_NO_AUTO_POSSESS
@ -3055,8 +3058,8 @@ AUTHOR
REVISION
Last updated: 16 December 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 31 January 2016
Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------
@ -6231,7 +6234,7 @@ MATCHING A SINGLE CODE UNIT
PCRE2 does not allow \C to appear in lookbehind assertions (described
below) in a UTF mode, because this would make it impossible to calcu-
late the length of the lookbehind. Neither the alternative matching
function pcre2_dfa_match() not the JIT optimizer support \C in a UTF
function pcre2_dfa_match() nor the JIT optimizer support \C in a UTF
mode. The former gives a match-time error; the latter fails to optimize
and so the match is always run using the interpreter.
@ -8460,7 +8463,7 @@ DESCRIPTION
This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the pcre2api documentation for a descrip-
tion of PCRE2's native API, which contains much additional functional-
ity. There is no POSIX-style wrapper for PCRE2's 16-bit and 32-bit
ity. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit
libraries.
The functions described here are just wrapper functions that ultimately
@ -8478,8 +8481,8 @@ DESCRIPTION
easier to slot in PCRE2 as a replacement library. Other POSIX options
are not even defined.
There are also some other options that are not defined by POSIX. These
have been added at the request of users who want to make use of certain
There are also some options that are not defined by POSIX. These have
been added at the request of users who want to make use of certain
PCRE2-specific features via the POSIX calling interface.
When PCRE2 is called via these functions, it is only the API that is
@ -8530,11 +8533,11 @@ COMPILING A PATTERN
REG_NOSUB
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is
passed for compilation to the native function. In addition, when a pat-
tern that is compiled with this flag is passed to regexec() for match-
ing, the nmatch and pmatch arguments are ignored, and no captured
strings are returned.
When a pattern that is compiled with this flag is passed to regexec()
for matching, the nmatch and pmatch arguments are ignored, and no cap-
tured strings are returned. Versions of the PCRE2 library prior to 10.22
used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no
longer happens because it disables the use of back references.
REG_UCP
@ -8653,17 +8656,18 @@ MATCHING A PATTERN
If the pattern was compiled with the REG_NOSUB flag, no data about any
matched strings is returned. The nmatch and pmatch arguments of
regexec() are ignored.
regexec() are ignored (except possibly as input for REG_STARTEND).
If the value of nmatch is zero, or if the value pmatch is NULL, no data
about any matched strings is returned.
The value of nmatch may be zero, and the value of pmatch may be NULL
(unless REG_STARTEND is set); in both these cases no data about any
matched strings is returned.
Otherwise,the portion of the string that was matched, and also any cap-
tured substrings, are returned via the pmatch argument, which points to
an array of nmatch structures of type regmatch_t, containing the mem-
bers rm_so and rm_eo. These contain the byte offset to the first char-
acter of each substring and the offset to the first character after the
end of each substring, respectively. The 0th element of the vector
Otherwise, the portion of the string that was matched, and also any
captured substrings, are returned via the pmatch argument, which points
to an array of nmatch structures of type regmatch_t, containing the
members rm_so and rm_eo. These contain the byte offset to the first
character of each substring and the offset to the first character after
the end of each substring, respectively. The 0th element of the vector
relates to the entire portion of string that was matched; subsequent
elements relate to the capturing subpatterns of the regular expression.
Unused entries in the array have both structure members set to -1.
@ -8702,8 +8706,8 @@ AUTHOR
REVISION
Last updated: 29 November 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 31 January 2016
Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------
@ -8722,12 +8726,12 @@ PCRE2 SAMPLE PROGRAM
documentation. If you do not have a copy of the PCRE2 distribution, you
can save this listing to re-create the contents of pcre2demo.c.
The demonstration program, which uses the PCRE2 8-bit library, compiles
the regular expression that is its first argument, and matches it
against the subject string in its second argument. No PCRE2 options are
set, and default character tables are used. If matching succeeds, the
program outputs the portion of the subject that matched, together with
the contents of any captured substrings.
The demonstration program compiles the regular expression that is its
first argument, and matches it against the subject string in its second
argument. No PCRE2 options are set, and default character tables are
used. If matching succeeds, the program outputs the portion of the sub-
ject that matched, together with the contents of any captured sub-
strings.
If the -g option is given on the command line, the program then goes on
to check for further matches of the same regular expression in the same
@ -8735,38 +8739,45 @@ PCRE2 SAMPLE PROGRAM
bility of matching an empty string. Comments in the code explain what
is going on.
The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit
code units. By default, one character corresponds to one code unit,
but if the pattern starts with "(*UTF)", both it and the subject are
treated as UTF-8 strings, where characters may occupy multiple code
units.
If PCRE2 is installed in the standard include and library directories
for your operating system, you should be able to compile the demonstra-
tion program using this command:
tion program using a command like this:
gcc -o pcre2demo pcre2demo.c -lpcre2-8
cc -o pcre2demo pcre2demo.c -lpcre2-8
If PCRE2 is installed elsewhere, you may need to add additional options
to the command line. For example, on a Unix-like system that has PCRE2
installed in /usr/local, you can compile the demonstration program
using a command like this:
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \
cc -o pcre2demo -I/usr/local/include pcre2demo.c \
-L/usr/local/lib -lpcre2-8
Once you have compiled and linked the demonstration program, you can
run simple tests like this:
Once you have built the demonstration program, you can run simple tests
like this:
./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
Note that there is a much more comprehensive test program, called
pcre2test, which supports many more facilities for testing regular
expressions using the PCRE2 libraries. The pcre2demo program is pro-
vided as a simple coding example.
expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit,
though not all three need be installed). The pcre2demo program is pro-
vided as a relatively simple coding example.
If you try to run pcre2demo when PCRE2 is not installed in the standard
library directory, you may get an error like this on some operating
systems (e.g. Solaris):
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or
directory
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file
or directory
This is caused by the way shared library support works on those sys-
tems. You need to add
@ -8785,8 +8796,8 @@ AUTHOR
REVISION
Last updated: 20 October 2014
Copyright (c) 1997-2014 University of Cambridge.
Last updated: 02 February 2016
Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------
PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3)

View File

@ -20,28 +20,31 @@
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\en", argv[i]);
return 1;
}
else break;
}
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\en");
printf("Exactly two arguments required: a regex and a subject string\en");
return 1;
}
@ -201,7 +213,7 @@ if (rc < 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
printf("Match succeeded at offset %d\en", (int)ovector[0]);
/*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\en"); else
if (namecount == 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\en");
@ -371,7 +383,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */
if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\er' &&
subject[start_offset + 1] == '\en')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
}
if (namecount <= 0) printf("No named substrings\en"); else
if (namecount == 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\en");

View File

@ -1,4 +1,4 @@
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00"
.TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 SAMPLE PROGRAM"
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of \fIpcre2demo.c\fP.
.P
The demonstration program, which uses the PCRE2 8-bit library, compiles the
regular expression that is its first argument, and matches it against the
subject string in its second argument. No PCRE2 options are set, and default
character tables are used. If matching succeeds, the program outputs the
portion of the subject that matched, together with the contents of any captured
substrings.
The demonstration program compiles the regular expression that is its
first argument, and matches it against the subject string in its second
argument. No PCRE2 options are set, and default character tables are used. If
matching succeeds, the program outputs the portion of the subject that matched,
together with the contents of any captured substrings.
.P
If the -g option is given on the command line, the program then goes on to
check for further matches of the same regular expression in the same subject
string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on.
.P
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
.P
If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using
this command:
a command like this:
.sp
gcc -o pcre2demo pcre2demo.c -lpcre2-8
cc -o pcre2demo pcre2demo.c -lpcre2-8
.sp
If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
like this:
.sp
.\" JOINSH
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e
cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
-L/usr/local/lib -lpcre2-8
.sp
.P
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
Once you have built the demonstration program, you can run simple tests like
this:
.sp
./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
.\" HREF
\fBpcre2test\fP,
.\"
which supports many more facilities for testing regular expressions using the
PCRE2 libraries. The
which supports many more facilities for testing regular expressions using all
three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
.\" HREF
\fBpcre2demo\fP
.\"
program is provided as a simple coding example.
program is provided as a relatively simple coding example.
.P
If you try to run
.\" HREF
@ -65,7 +70,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris):
.sp
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory
ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
.sp
This is caused by the way shared library support works on those systems. You
need to add
@ -89,6 +94,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 20 October 2014
Copyright (c) 1997-2014 University of Cambridge.
Last updated: 02 February 2016
Copyright (c) 1997-2016 University of Cambridge.
.fi

View File

@ -67,10 +67,10 @@ INPUT ENCODING
For maximum portability, therefore, it is safest to avoid non-printing
characters in pcre2test input files. There is a facility for specifying
a pattern's characters as hexadecimal pairs, thus making it possible to
include binary zeroes in a pattern for testing purposes. Subject lines
are processed for backslash escapes, which makes it possible to include
any data value.
some or all of a pattern's characters as hexadecimal pairs, thus making
it possible to include binary zeroes in a pattern for testing purposes.
Subject lines are processed for backslash escapes, which makes it pos-
sible to include any data value.
COMMAND LINE OPTIONS
@ -505,7 +505,7 @@ PATTERN MODIFIERS
debug same as info,fullbincode
fullbincode show binary code with lengths
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
hex unquoted characters are hexadecimal
jit[=<number>] use JIT
jitfast use JIT fast path
jitverify verify JIT use
@ -516,6 +516,7 @@ PATTERN MODIFIERS
null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack
stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables
@ -591,19 +592,30 @@ PATTERN MODIFIERS
testing that pcre2_compile() behaves correctly in this case (it uses
default values).
Specifying a pattern in hex
Specifying pattern characters in hexadecimal
The hex modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted
between pairs. For example:
The hex modifier specifies that the characters of the pattern, except
for substrings enclosed in single or double quotes, are to be inter-
preted as pairs of hexadecimal digits. This feature is provided as a
way of creating patterns that contain binary zeros and other non-print-
ing characters. White space is permitted between pairs of digits. For
example, this pattern contains three characters:
/ab 32 59/hex
This feature is provided as a way of creating patterns that contain
binary zero and other non-printing characters. By default, pcre2test
passes patterns as zero-terminated strings to pcre2_compile(), giving
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
hexadecimal, the actual length of the pattern is passed.
Parts of such a pattern are taken literally if quoted. This pattern
contains nine characters, only two of which are specified in hexadeci-
mal:
/ab "literal" 32/hex
Either single or double quotes may be used. There is no way of includ-
ing the delimiter within a substring.
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
for patterns specified with the hex modifier, the actual length of the
pattern is passed.
Generating long repetitive patterns
@ -732,16 +744,16 @@ PATTERN MODIFIERS
Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap-
per API rather than its native API. This supports only the 8-bit
library. Note that it does not imply POSIX matching semantics; for
more detail see the pcre2posix documentation. When the POSIX API is
being used, the following pattern modifiers set options for the reg-
comp() function:
The posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
the POSIX wrapper API rather than its native API. When posix_nosub is
used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
wrapper supports only the 8-bit library. Note that it does not imply
POSIX matching semantics; for more detail see the pcre2posix documenta-
tion. The following pattern modifiers set options for the regcomp()
function:
caseless REG_ICASE
multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard
@ -758,7 +770,8 @@ PATTERN MODIFIERS
been set, a large buffer is used.
The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error.
below. All other modifiers are either ignored, with a warning message,
or cause an error.
Testing the stack guard feature
@ -855,7 +868,7 @@ SUBJECT MODIFIERS
wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error.
The other modifiers are ignored, with a warning message.
Setting match controls
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
zero_terminate pass the subject as zero-terminated
The effects of these modifiers are described in the following sections.
When matching via the POSIX wrapper API, the aftertext, allaftertext,
and ovector subject modifiers work as described below. All other modi-
fiers are either ignored, with a warning message, or cause an error.
Showing more text
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
matched with the pattern, terminated as usual by an empty line or end
of file. This command may be followed by a modifier list containing
only control modifiers that act after a pattern has been compiled. In
particular, hex, posix, and push are not allowed, nor are any option-
setting modifiers. The JIT modifiers are, however permitted. Here is
an example that saves and reloads two patterns.
particular, hex, posix, posix_nosub, and push are not allowed, nor are
any option-setting modifiers. The JIT modifiers are, however, permit-
ted. Here is an example that saves and reloads two patterns.
/abc/push
/xyz/push
@ -1505,5 +1521,5 @@ AUTHOR
REVISION
Last updated: 12 December 2015
Copyright (c) 1997-2015 University of Cambridge.
Last updated: 31 January 2016
Copyright (c) 1997-2016 University of Cambridge.

View File

@ -3,28 +3,31 @@
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -39,9 +42,14 @@ the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8
@ -62,19 +70,19 @@ int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline;
PCRE2_SIZE erroroffset;
@ -89,14 +97,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
* if the -g option is present. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
}
@ -105,7 +117,7 @@ and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\n");
printf("Exactly two arguments required: a regex and a subject string\n");
return 1;
}
@ -184,7 +196,7 @@ if (rc < 0)
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
printf("Match succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
@ -354,7 +366,7 @@ for (;;)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */
if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\r' &&
subject[start_offset + 1] == '\n')
@ -400,7 +412,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
if (namecount <= 0) printf("No named substrings\n"); else
if (namecount == 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");