Tidy pcre2demo.c

This commit is contained in:
Philip.Hazel 2016-02-02 16:25:47 +00:00
parent 6c1c817438
commit 4e67c0c9e9
12 changed files with 1116 additions and 1020 deletions

View File

@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
modifier had this effect. That option is now ignored when the POSIX API is in modifier had this effect. That option is now ignored when the POSIX API is in
use. use.
8. Minor tidies to the pcre2demo.c sample program, including more comments
about its 8-bit-ness.
Version 10.21 12-January-2016 Version 10.21 12-January-2016
----------------------------- -----------------------------

View File

@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
the pattern. Any opening parenthesis that is not followed by ? behaves as if it the pattern. Any opening parenthesis that is not followed by ? behaves as if it
were followed by ?: but named parentheses can still be used for capturing (and were followed by ?: but named parentheses can still be used for capturing (and
they acquire numbers in the usual way). There is no equivalent of this option they acquire numbers in the usual way). There is no equivalent of this option
in Perl. in Perl. Note that, if this option is set, references to capturing groups (back
references or recursion/subroutine calls) may only refer to named groups,
though the reference can be by name or by number.
<pre> <pre>
PCRE2_NO_AUTO_POSSESS PCRE2_NO_AUTO_POSSESS
</pre> </pre>
@ -3121,9 +3123,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br> <br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 16 December 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i &lt; argc; i++) for (i = 1; i &lt; argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break; else break;
} }
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\n"); printf("Exactly two arguments required: a regex and a subject string\n");
return 1; return 1;
} }
@ -201,7 +213,7 @@ if (rc &lt; 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); printf("Match succeeded at offset %d\n", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&amp;namecount); /* where to put the answer */ &amp;namecount); /* where to put the answer */
if (namecount &lt;= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\n"); printf("Named substrings\n");
@ -371,7 +383,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */ if (crlf_is_newline &amp;&amp; /* If CRLF is a newline &amp; */
start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */ start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
subject[start_offset] == '\r' &amp;&amp; subject[start_offset] == '\r' &amp;&amp;
subject[start_offset + 1] == '\n') subject[start_offset + 1] == '\n')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
} }
if (namecount &lt;= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n"); printf("Named substrings\n");

View File

@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a> <a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. Neither the alternative matching function the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The <b>pcre2_dfa_match()</b> nor the JIT optimizer supports \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter. is always run using the interpreter.
</P> </P>

View File

@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the expression 8-bit library. See the
<a href="pcre2api.html"><b>pcre2api</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
documentation for a description of PCRE2's native API, which contains much documentation for a description of PCRE2's native API, which contains much
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries. and 32-bit libraries.
</P> </P>
<P> <P>
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
replacement library. Other POSIX options are not even defined. replacement library. Other POSIX options are not even defined.
</P> </P>
<P> <P>
There are also some other options that are not defined by POSIX. These have There are also some options that are not defined by POSIX. These have been
been added at the request of users who want to make use of certain added at the request of users who want to make use of certain PCRE2-specific
PCRE2-specific features via the POSIX calling interface. features via the POSIX calling interface.
</P> </P>
<P> <P>
When PCRE2 is called via these functions, it is only the API that is POSIX-like When PCRE2 is called via these functions, it is only the API that is POSIX-like
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
<pre> <pre>
REG_NOSUB REG_NOSUB
</pre> </pre>
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
for compilation to the native function. In addition, when a pattern that is matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
compiled with this flag is passed to <b>regexec()</b> for matching, the captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
are returned. because it disables the use of back references.
<pre> <pre>
REG_UCP REG_UCP
</pre> </pre>
@ -241,11 +241,12 @@ mutually exclusive; the error REG_INVARG is returned.
<P> <P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
<b>regexec()</b> are ignored. <b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
</P> </P>
<P> <P>
If the value of <i>nmatch</i> is zero, or if the value <i>pmatch</i> is NULL, The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
no data about any matched strings is returned. (unless REG_STARTEND is set); in both these cases no data about any matched
strings is returned.
</P> </P>
<P> <P>
Otherwise, the portion of the string that was matched, and also any captured Otherwise, the portion of the string that was matched, and also any captured
@ -290,9 +291,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br> <br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 29 November 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of <i>pcre2demo.c</i>. save this listing to re-create the contents of <i>pcre2demo.c</i>.
</P> </P>
<P> <P>
The demonstration program, which uses the PCRE2 8-bit library, compiles the The demonstration program compiles the regular expression that is its
regular expression that is its first argument, and matches it against the first argument, and matches it against the subject string in its second
subject string in its second argument. No PCRE2 options are set, and default argument. No PCRE2 options are set, and default character tables are used. If
character tables are used. If matching succeeds, the program outputs the matching succeeds, the program outputs the portion of the subject that matched,
portion of the subject that matched, together with the contents of any captured together with the contents of any captured substrings.
substrings.
</P> </P>
<P> <P>
If the -g option is given on the command line, the program then goes on to If the -g option is given on the command line, the program then goes on to
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on. an empty string. Comments in the code explain what is going on.
</P> </P>
<P> <P>
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
</P>
<P>
If PCRE2 is installed in the standard include and library directories for your If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using operating system, you should be able to compile the demonstration program using
this command: a command like this:
<pre> <pre>
gcc -o pcre2demo pcre2demo.c -lpcre2-8 cc -o pcre2demo pcre2demo.c -lpcre2-8
</pre> </pre>
If PCRE2 is installed elsewhere, you may need to add additional options to the If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in command line. For example, on a Unix-like system that has PCRE2 installed in
<i>/usr/local</i>, you can compile the demonstration program using a command <i>/usr/local</i>, you can compile the demonstration program using a command
like this: like this:
<pre> <pre>
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8 cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</pre>
</PRE> Once you have built the demonstration program, you can run simple tests like
</P> this:
<P>
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
<pre> <pre>
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
</pre> </pre>
Note that there is a much more comprehensive test program, called Note that there is a much more comprehensive test program, called
<a href="pcre2test.html"><b>pcre2test</b>,</a> <a href="pcre2test.html"><b>pcre2test</b>,</a>
which supports many more facilities for testing regular expressions using the which supports many more facilities for testing regular expressions using all
PCRE2 libraries. The three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
<a href="pcre2demo.html"><b>pcre2demo</b></a> <a href="pcre2demo.html"><b>pcre2demo</b></a>
program is provided as a simple coding example. program is provided as a relatively simple coding example.
</P> </P>
<P> <P>
If you try to run If you try to run
@ -73,7 +77,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris): error like this on some operating systems (e.g. Solaris):
<pre> <pre>
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
</pre> </pre>
This is caused by the way shared library support works on those systems. You This is caused by the way shared library support works on those systems. You
need to add need to add
@ -97,9 +101,9 @@ Cambridge, England.
REVISION REVISION
</b><br> </b><br>
<P> <P>
Last updated: 20 October 2014 Last updated: 02 February 2016
<br> <br>
Copyright &copy; 1997-2014 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -98,10 +98,11 @@ further data is read.
</P> </P>
<P> <P>
For maximum portability, therefore, it is safest to avoid non-printing For maximum portability, therefore, it is safest to avoid non-printing
characters in <b>pcre2test</b> input files. There is a facility for specifying a characters in <b>pcre2test</b> input files. There is a facility for specifying
pattern's characters as hexadecimal pairs, thus making it possible to include some or all of a pattern's characters as hexadecimal pairs, thus making it
binary zeroes in a pattern for testing purposes. Subject lines are processed possible to include binary zeroes in a pattern for testing purposes. Subject
for backslash escapes, which makes it possible to include any data value. lines are processed for backslash escapes, which makes it possible to include
any data value.
</P> </P>
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br> <br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P> <P>
@ -559,7 +560,7 @@ about the pattern:
debug same as info,fullbincode debug same as info,fullbincode
fullbincode show binary code with lengths fullbincode show binary code with lengths
/I info show info about compiled pattern /I info show info about compiled pattern
hex pattern is coded in hexadecimal hex unquoted characters are hexadecimal
jit[=&#60;number&#62;] use JIT jit[=&#60;number&#62;] use JIT
jitfast use JIT fast path jitfast use JIT fast path
jitverify verify JIT use jitverify verify JIT use
@ -570,6 +571,7 @@ about the pattern:
null_context compile with a NULL context null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack push push compiled pattern onto the stack
stackguard=&#60;number&#62; test the stackguard feature stackguard=&#60;number&#62; test the stackguard feature
tables=[0|1|2] select internal tables tables=[0|1|2] select internal tables
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values). default values).
</P> </P>
<br><b> <br><b>
Specifying a pattern in hex Specifying pattern characters in hexadecimal
</b><br> </b><br>
<P> <P>
The <b>hex</b> modifier specifies that the characters of the pattern are to be The <b>hex</b> modifier specifies that the characters of the pattern, except for
interpreted as pairs of hexadecimal digits. White space is permitted between substrings enclosed in single or double quotes, are to be interpreted as pairs
pairs. For example: of hexadecimal digits. This feature is provided as a way of creating patterns
that contain binary zeros and other non-printing characters. White space is
permitted between pairs of digits. For example, this pattern contains three
characters:
<pre> <pre>
/ab 32 59/hex /ab 32 59/hex
</pre> </pre>
This feature is provided as a way of creating patterns that contain binary zero Parts of such a pattern are taken literally if quoted. This pattern contains
and other non-printing characters. By default, <b>pcre2test</b> passes patterns nine characters, only two of which are specified in hexadecimal:
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as <pre>
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the /ab "literal" 32/hex
actual length of the pattern is passed. </pre>
Either single or double quotes may be used. There is no way of including
the delimiter within a substring.
</P>
<P>
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
patterns specified with the <b>hex</b> modifier, the actual length of the
pattern is passed.
</P> </P>
<br><b> <br><b>
Generating long repetitive patterns Generating long repetitive patterns
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
Using the POSIX wrapper API Using the POSIX wrapper API
</b><br> </b><br>
<P> <P>
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
wrapper API rather than its native API. This supports only the 8-bit library. PCRE2 via the POSIX wrapper API rather than its native API. When
Note that it does not imply POSIX matching semantics; for more detail see the <b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
it does not imply POSIX matching semantics; for more detail see the
<a href="pcre2posix.html"><b>pcre2posix</b></a> <a href="pcre2posix.html"><b>pcre2posix</b></a>
documentation. When the POSIX API is being used, the following pattern documentation. The following pattern modifiers set options for the
modifiers set options for the <b>regcomp()</b> function: <b>regcomp()</b> function:
<pre> <pre>
caseless REG_ICASE caseless REG_ICASE
multiline REG_NEWLINE multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL ) dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard ucp REG_UCP ) the POSIX standard
@ -847,7 +861,8 @@ large buffer is used.
</P> </P>
<P> <P>
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
below. All other modifiers cause an error. below. All other modifiers are either ignored, with a warning message, or cause
an error.
</P> </P>
<br><b> <br><b>
Testing the stack guard feature Testing the stack guard feature
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any effect wrapper API to be used, the only option-setting modifiers that have any effect
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL, are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>. REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
Any other modifiers cause an error. The other modifiers are ignored, with a warning message.
</P> </P>
<br><b> <br><b>
Setting match controls Setting match controls
@ -1001,7 +1016,10 @@ pattern.
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
zero_terminate pass the subject as zero-terminated zero_terminate pass the subject as zero-terminated
</pre> </pre>
The effects of these modifiers are described in the following sections. The effects of these modifiers are described in the following sections. When
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
and <b>ovector</b> subject modifiers work as described below. All other
modifiers are either ignored, with a warning message, or cause an error.
</P> </P>
<br><b> <br><b>
Showing more text Showing more text
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
modifier list containing only modifier list containing only
<a href="#controlmodifiers">control modifiers</a> <a href="#controlmodifiers">control modifiers</a>
that act after a pattern has been compiled. In particular, <b>hex</b>, that act after a pattern has been compiled. In particular, <b>hex</b>,
<b>posix</b>, and <b>push</b> are not allowed, nor are any <b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
<a href="#optionmodifiers">option-setting modifiers.</a> <a href="#optionmodifiers">option-setting modifiers.</a>
The JIT modifiers are, however, permitted. Here is an example that saves and The JIT modifiers are, however, permitted. Here is an example that saves and
reloads two patterns. reloads two patterns.
@ -1660,9 +1678,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br> <br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 12 December 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -1326,7 +1326,10 @@ COMPILING A PATTERN
theses in the pattern. Any opening parenthesis that is not followed by theses in the pattern. Any opening parenthesis that is not followed by
? behaves as if it were followed by ?: but named parentheses can still ? behaves as if it were followed by ?: but named parentheses can still
be used for capturing (and they acquire numbers in the usual way). be used for capturing (and they acquire numbers in the usual way).
There is no equivalent of this option in Perl. There is no equivalent of this option in Perl. Note that, if this
option is set, references to capturing groups (back references or
recursion/subroutine calls) may only refer to named groups, though the
reference can be by name or by number.
PCRE2_NO_AUTO_POSSESS PCRE2_NO_AUTO_POSSESS
@ -3055,8 +3058,8 @@ AUTHOR
REVISION REVISION
Last updated: 16 December 2015 Last updated: 31 January 2016
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -6231,7 +6234,7 @@ MATCHING A SINGLE CODE UNIT
PCRE2 does not allow \C to appear in lookbehind assertions (described PCRE2 does not allow \C to appear in lookbehind assertions (described
below) in a UTF mode, because this would make it impossible to calcu- below) in a UTF mode, because this would make it impossible to calcu-
late the length of the lookbehind. Neither the alternative matching late the length of the lookbehind. Neither the alternative matching
function pcre2_dfa_match() not the JIT optimizer support \C in a UTF function pcre2_dfa_match() nor the JIT optimizer supports \C in a UTF
mode. The former gives a match-time error; the latter fails to optimize mode. The former gives a match-time error; the latter fails to optimize
and so the match is always run using the interpreter. and so the match is always run using the interpreter.
@ -8460,7 +8463,7 @@ DESCRIPTION
This set of functions provides a POSIX-style API for the PCRE2 regular This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the pcre2api documentation for a descrip- expression 8-bit library. See the pcre2api documentation for a descrip-
tion of PCRE2's native API, which contains much additional functional- tion of PCRE2's native API, which contains much additional functional-
ity. There is no POSIX-style wrapper for PCRE2's 16-bit and 32-bit ity. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit
libraries. libraries.
The functions described here are just wrapper functions that ultimately The functions described here are just wrapper functions that ultimately
@ -8478,8 +8481,8 @@ DESCRIPTION
easier to slot in PCRE2 as a replacement library. Other POSIX options easier to slot in PCRE2 as a replacement library. Other POSIX options
are not even defined. are not even defined.
There are also some other options that are not defined by POSIX. These There are also some options that are not defined by POSIX. These have
have been added at the request of users who want to make use of certain been added at the request of users who want to make use of certain
PCRE2-specific features via the POSIX calling interface. PCRE2-specific features via the POSIX calling interface.
When PCRE2 is called via these functions, it is only the API that is When PCRE2 is called via these functions, it is only the API that is
@ -8530,11 +8533,11 @@ COMPILING A PATTERN
REG_NOSUB REG_NOSUB
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is When a pattern that is compiled with this flag is passed to regexec()
passed for compilation to the native function. In addition, when a pat- for matching, the nmatch and pmatch arguments are ignored, and no cap-
tern that is compiled with this flag is passed to regexec() for match- tured strings are returned. Versions of the PCRE2 library prior to 10.22
ing, the nmatch and pmatch arguments are ignored, and no captured used to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no
strings are returned. longer happens because it disables the use of back references.
REG_UCP REG_UCP
@ -8653,17 +8656,18 @@ MATCHING A PATTERN
If the pattern was compiled with the REG_NOSUB flag, no data about any If the pattern was compiled with the REG_NOSUB flag, no data about any
matched strings is returned. The nmatch and pmatch arguments of matched strings is returned. The nmatch and pmatch arguments of
regexec() are ignored. regexec() are ignored (except possibly as input for REG_STARTEND).
If the value of nmatch is zero, or if the value of pmatch is NULL, no data The value of nmatch may be zero, and the value of pmatch may be NULL
about any matched strings is returned. (unless REG_STARTEND is set); in both these cases no data about any
matched strings is returned.
Otherwise,the portion of the string that was matched, and also any cap- Otherwise, the portion of the string that was matched, and also any
tured substrings, are returned via the pmatch argument, which points to captured substrings, are returned via the pmatch argument, which points
an array of nmatch structures of type regmatch_t, containing the mem- to an array of nmatch structures of type regmatch_t, containing the
bers rm_so and rm_eo. These contain the byte offset to the first char- members rm_so and rm_eo. These contain the byte offset to the first
acter of each substring and the offset to the first character after the character of each substring and the offset to the first character after
end of each substring, respectively. The 0th element of the vector the end of each substring, respectively. The 0th element of the vector
relates to the entire portion of string that was matched; subsequent relates to the entire portion of string that was matched; subsequent
elements relate to the capturing subpatterns of the regular expression. elements relate to the capturing subpatterns of the regular expression.
Unused entries in the array have both structure members set to -1. Unused entries in the array have both structure members set to -1.
@ -8702,8 +8706,8 @@ AUTHOR
REVISION REVISION
Last updated: 29 November 2015 Last updated: 31 January 2016
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
@ -8722,12 +8726,12 @@ PCRE2 SAMPLE PROGRAM
documentation. If you do not have a copy of the PCRE2 distribution, you documentation. If you do not have a copy of the PCRE2 distribution, you
can save this listing to re-create the contents of pcre2demo.c. can save this listing to re-create the contents of pcre2demo.c.
The demonstration program, which uses the PCRE2 8-bit library, compiles The demonstration program compiles the regular expression that is its
the regular expression that is its first argument, and matches it first argument, and matches it against the subject string in its second
against the subject string in its second argument. No PCRE2 options are argument. No PCRE2 options are set, and default character tables are
set, and default character tables are used. If matching succeeds, the used. If matching succeeds, the program outputs the portion of the sub-
program outputs the portion of the subject that matched, together with ject that matched, together with the contents of any captured sub-
the contents of any captured substrings. strings.
If the -g option is given on the command line, the program then goes on If the -g option is given on the command line, the program then goes on
to check for further matches of the same regular expression in the same to check for further matches of the same regular expression in the same
@ -8735,38 +8739,45 @@ PCRE2 SAMPLE PROGRAM
bility of matching an empty string. Comments in the code explain what bility of matching an empty string. Comments in the code explain what
is going on. is going on.
The code in pcre2demo.c is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit
code units. By default, one character corresponds to one code unit,
but if the pattern starts with "(*UTF)", both it and the subject are
treated as UTF-8 strings, where characters may occupy multiple code
units.
If PCRE2 is installed in the standard include and library directories If PCRE2 is installed in the standard include and library directories
for your operating system, you should be able to compile the demonstra- for your operating system, you should be able to compile the demonstra-
tion program using this command: tion program using a command like this:
gcc -o pcre2demo pcre2demo.c -lpcre2-8 cc -o pcre2demo pcre2demo.c -lpcre2-8
If PCRE2 is installed elsewhere, you may need to add additional options If PCRE2 is installed elsewhere, you may need to add additional options
to the command line. For example, on a Unix-like system that has PCRE2 to the command line. For example, on a Unix-like system that has PCRE2
installed in /usr/local, you can compile the demonstration program installed in /usr/local, you can compile the demonstration program
using a command like this: using a command like this:
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \ cc -o pcre2demo -I/usr/local/include pcre2demo.c \
-L/usr/local/lib -lpcre2-8 -L/usr/local/lib -lpcre2-8
Once you have built the demonstration program, you can run simple tests
Once you have compiled and linked the demonstration program, you can like this:
run simple tests like this:
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
Note that there is a much more comprehensive test program, called Note that there is a much more comprehensive test program, called
pcre2test, which supports many more facilities for testing regular pcre2test, which supports many more facilities for testing regular
expressions using the PCRE2 libraries. The pcre2demo program is pro- expressions using all three PCRE2 libraries (8-bit, 16-bit, and 32-bit,
vided as a simple coding example. though not all three need be installed). The pcre2demo program is pro-
vided as a relatively simple coding example.
If you try to run pcre2demo when PCRE2 is not installed in the standard If you try to run pcre2demo when PCRE2 is not installed in the standard
library directory, you may get an error like this on some operating library directory, you may get an error like this on some operating
systems (e.g. Solaris): systems (e.g. Solaris):
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file
directory or directory
This is caused by the way shared library support works on those sys- This is caused by the way shared library support works on those sys-
tems. You need to add tems. You need to add
@ -8785,8 +8796,8 @@ AUTHOR
REVISION REVISION
Last updated: 20 October 2014 Last updated: 02 February 2016
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3) PCRE2SERIALIZE(3) Library Functions Manual PCRE2SERIALIZE(3)

View File

@ -20,28 +20,31 @@
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -106,14 +114,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i < argc; i++) for (i = 1; i < argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\en", argv[i]);
return 1;
}
else break; else break;
} }
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\en"); printf("Exactly two arguments required: a regex and a subject string\en");
return 1; return 1;
} }
@ -201,7 +213,7 @@ if (rc < 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]); printf("Match succeeded at offset %d\en", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */ &namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\en"); else if (namecount == 0) printf("No named substrings\en"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\en"); printf("Named substrings\en");
@ -371,7 +383,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */ if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */ start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\er' && subject[start_offset] == '\er' &&
subject[start_offset + 1] == '\en') subject[start_offset + 1] == '\en')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
} }
if (namecount <= 0) printf("No named substrings\en"); else if (namecount == 0) printf("No named substrings\en"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\en"); printf("Named substrings\en");

View File

@ -1,4 +1,4 @@
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00" .TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 SAMPLE PROGRAM" .SH "PCRE2 SAMPLE PROGRAM"
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
documentation. If you do not have a copy of the PCRE2 distribution, you can documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of \fIpcre2demo.c\fP. save this listing to re-create the contents of \fIpcre2demo.c\fP.
.P .P
The demonstration program, which uses the PCRE2 8-bit library, compiles the The demonstration program compiles the regular expression that is its
regular expression that is its first argument, and matches it against the first argument, and matches it against the subject string in its second
subject string in its second argument. No PCRE2 options are set, and default argument. No PCRE2 options are set, and default character tables are used. If
character tables are used. If matching succeeds, the program outputs the matching succeeds, the program outputs the portion of the subject that matched,
portion of the subject that matched, together with the contents of any captured together with the contents of any captured substrings.
substrings.
.P .P
If the -g option is given on the command line, the program then goes on to If the -g option is given on the command line, the program then goes on to
check for further matches of the same regular expression in the same subject check for further matches of the same regular expression in the same subject
string. The logic is a little bit tricky because of the possibility of matching string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on. an empty string. Comments in the code explain what is going on.
.P .P
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
.P
If PCRE2 is installed in the standard include and library directories for your If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using operating system, you should be able to compile the demonstration program using
this command: a command like this:
.sp .sp
gcc -o pcre2demo pcre2demo.c -lpcre2-8 cc -o pcre2demo pcre2demo.c -lpcre2-8
.sp .sp
If PCRE2 is installed elsewhere, you may need to add additional options to the If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in command line. For example, on a Unix-like system that has PCRE2 installed in
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
like this: like this:
.sp .sp
.\" JOINSH .\" JOINSH
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
-L/usr/local/lib -lpcre2-8 -L/usr/local/lib -lpcre2-8
.sp .sp
.P Once you have built the demonstration program, you can run simple tests like
Once you have compiled and linked the demonstration program, you can run simple this:
tests like this:
.sp .sp
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
.\" HREF .\" HREF
\fBpcre2test\fP, \fBpcre2test\fP,
.\" .\"
which supports many more facilities for testing regular expressions using the which supports many more facilities for testing regular expressions using all
PCRE2 libraries. The three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
.\" HREF .\" HREF
\fBpcre2demo\fP \fBpcre2demo\fP
.\" .\"
program is provided as a simple coding example. program is provided as a relatively simple coding example.
.P .P
If you try to run If you try to run
.\" HREF .\" HREF
@ -65,7 +70,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris): error like this on some operating systems (e.g. Solaris):
.sp .sp
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
.sp .sp
This is caused by the way shared library support works on those systems. You This is caused by the way shared library support works on those systems. You
need to add need to add
@ -89,6 +94,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 20 October 2014 Last updated: 02 February 2016
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
.fi .fi

View File

@ -67,10 +67,10 @@ INPUT ENCODING
For maximum portability, therefore, it is safest to avoid non-printing For maximum portability, therefore, it is safest to avoid non-printing
characters in pcre2test input files. There is a facility for specifying characters in pcre2test input files. There is a facility for specifying
a pattern's characters as hexadecimal pairs, thus making it possible to some or all of a pattern's characters as hexadecimal pairs, thus making
include binary zeroes in a pattern for testing purposes. Subject lines it possible to include binary zeroes in a pattern for testing purposes.
are processed for backslash escapes, which makes it possible to include Subject lines are processed for backslash escapes, which makes it pos-
any data value. sible to include any data value.
COMMAND LINE OPTIONS COMMAND LINE OPTIONS
@ -505,7 +505,7 @@ PATTERN MODIFIERS
debug same as info,fullbincode debug same as info,fullbincode
fullbincode show binary code with lengths fullbincode show binary code with lengths
/I info show info about compiled pattern /I info show info about compiled pattern
hex pattern is coded in hexadecimal hex unquoted characters are hexadecimal
jit[=<number>] use JIT jit[=<number>] use JIT
jitfast use JIT fast path jitfast use JIT fast path
jitverify verify JIT use jitverify verify JIT use
@ -516,6 +516,7 @@ PATTERN MODIFIERS
null_context compile with a NULL context null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack push push compiled pattern onto the stack
stackguard=<number> test the stackguard feature stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables tables=[0|1|2] select internal tables
@ -591,19 +592,30 @@ PATTERN MODIFIERS
testing that pcre2_compile() behaves correctly in this case (it uses testing that pcre2_compile() behaves correctly in this case (it uses
default values). default values).
Specifying a pattern in hex Specifying pattern characters in hexadecimal
The hex modifier specifies that the characters of the pattern are to be The hex modifier specifies that the characters of the pattern, except
interpreted as pairs of hexadecimal digits. White space is permitted for substrings enclosed in single or double quotes, are to be inter-
between pairs. For example: preted as pairs of hexadecimal digits. This feature is provided as a
way of creating patterns that contain binary zeros and other non-print-
ing characters. White space is permitted between pairs of digits. For
example, this pattern contains three characters:
/ab 32 59/hex /ab 32 59/hex
This feature is provided as a way of creating patterns that contain Parts of such a pattern are taken literally if quoted. This pattern
binary zero and other non-printing characters. By default, pcre2test contains nine characters, only two of which are specified in hexadeci-
passes patterns as zero-terminated strings to pcre2_compile(), giving mal:
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
hexadecimal, the actual length of the pattern is passed. /ab "literal" 32/hex
Either single or double quotes may be used. There is no way of includ-
ing the delimiter within a substring.
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
for patterns specified with the hex modifier, the actual length of the
pattern is passed.
Generating long repetitive patterns Generating long repetitive patterns
@ -732,16 +744,16 @@ PATTERN MODIFIERS
Using the POSIX wrapper API Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap- The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
per API rather than its native API. This supports only the 8-bit the POSIX wrapper API rather than its native API. When posix_nosub is
library. Note that it does not imply POSIX matching semantics; for used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
more detail see the pcre2posix documentation. When the POSIX API is wrapper supports only the 8-bit library. Note that it does not imply
being used, the following pattern modifiers set options for the reg- POSIX matching semantics; for more detail see the pcre2posix documenta-
comp() function: tion. The following pattern modifiers set options for the regcomp()
function:
caseless REG_ICASE caseless REG_ICASE
multiline REG_NEWLINE multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL ) dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard ucp REG_UCP ) the POSIX standard
@ -758,7 +770,8 @@ PATTERN MODIFIERS
been set, a large buffer is used. been set, a large buffer is used.
The aftertext and allaftertext subject modifiers work as described The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error. below. All other modifiers are either ignored, with a warning message,
or cause an error.
Testing the stack guard feature Testing the stack guard feature
@ -855,7 +868,7 @@ SUBJECT MODIFIERS
wrapper API to be used, the only option-setting modifiers that have any wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL, effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error. The other modifiers are ignored, with a warning message.
Setting match controls Setting match controls
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
zero_terminate pass the subject as zero-terminated zero_terminate pass the subject as zero-terminated
The effects of these modifiers are described in the following sections. The effects of these modifiers are described in the following sections.
When matching via the POSIX wrapper API, the aftertext, allaftertext,
and ovector subject modifiers work as described below. All other modi-
fiers are either ignored, with a warning message, or cause an error.
Showing more text Showing more text
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
matched with the pattern, terminated as usual by an empty line or end matched with the pattern, terminated as usual by an empty line or end
of file. This command may be followed by a modifier list containing of file. This command may be followed by a modifier list containing
only control modifiers that act after a pattern has been compiled. In only control modifiers that act after a pattern has been compiled. In
particular, hex, posix, and push are not allowed, nor are any option- particular, hex, posix, posix_nosub, and push are not allowed, nor are
setting modifiers. The JIT modifiers are, however, permitted. Here is any option-setting modifiers. The JIT modifiers are, however, permit-
an example that saves and reloads two patterns. ted. Here is an example that saves and reloads two patterns.
/abc/push /abc/push
/xyz/push /xyz/push
@ -1505,5 +1521,5 @@ AUTHOR
REVISION REVISION
Last updated: 12 December 2015 Last updated: 31 January 2016
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.

View File

@ -3,28 +3,31 @@
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -39,9 +42,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -62,19 +70,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -89,14 +97,18 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i < argc; i++) for (i = 1; i < argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break; else break;
} }
@ -105,7 +117,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\n"); printf("Exactly two arguments required: a regex and a subject string\n");
return 1; return 1;
} }
@ -184,7 +196,7 @@ if (rc < 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); printf("Match succeeded at offset %d\n", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */ &namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\n"); printf("Named substrings\n");
@ -354,7 +366,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */ if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */ start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\r' && subject[start_offset] == '\r' &&
subject[start_offset + 1] == '\n') subject[start_offset + 1] == '\n')
@ -400,7 +412,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
} }
if (namecount <= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n"); printf("Named substrings\n");