Tidy pcre2demo.c

This commit is contained in:
Philip.Hazel 2016-02-02 16:25:47 +00:00
parent 6c1c817438
commit 4e67c0c9e9
12 changed files with 1116 additions and 1020 deletions

View File

@ -34,6 +34,9 @@ posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture
modifier had this effect. That option is now ignored when the POSIX API is in modifier had this effect. That option is now ignored when the POSIX API is in
use. use.
8. Minor tidies to the pcre2demo.c sample program, including more comments
about its 8-bit-ness.
Version 10.21 12-January-2016 Version 10.21 12-January-2016
----------------------------- -----------------------------

View File

@ -1282,7 +1282,9 @@ If this option is set, it disables the use of numbered capturing parentheses in
the pattern. Any opening parenthesis that is not followed by ? behaves as if it the pattern. Any opening parenthesis that is not followed by ? behaves as if it
were followed by ?: but named parentheses can still be used for capturing (and were followed by ?: but named parentheses can still be used for capturing (and
they acquire numbers in the usual way). There is no equivalent of this option they acquire numbers in the usual way). There is no equivalent of this option
in Perl. in Perl. Note that, if this option is set, references to capturing groups (back
references or recursion/subroutine calls) may only refer to named groups,
though the reference can be by name or by number.
<pre> <pre>
PCRE2_NO_AUTO_POSSESS PCRE2_NO_AUTO_POSSESS
</pre> </pre>
@ -3121,9 +3123,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC40" href="#TOC1">REVISION</a><br> <br><a name="SEC40" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 16 December 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -20,28 +20,31 @@ please consult the man page, in case the conversion went wrong.
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i &lt; argc; i++) for (i = 1; i &lt; argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break; else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
} }
/* After the options, we require exactly two arguments, which are the pattern, /* After the options, we require exactly two arguments, which are the pattern,
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\n"); printf("Exactly two arguments required: a regex and a subject string\n");
return 1; return 1;
} }
@ -201,7 +213,7 @@ if (rc &lt; 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); printf("Match succeeded at offset %d\n", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&amp;namecount); /* where to put the answer */ &amp;namecount); /* where to put the answer */
if (namecount &lt;= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\n"); printf("Named substrings\n");
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;) for (;;)
{ {
uint32_t options = 0; /* Normally no options */ uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are /* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the at the end of the subject. Otherwise, arrange to run another match at the
@ -371,7 +383,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */ if (crlf_is_newline &amp;&amp; /* If CRLF is a newline &amp; */
start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */ start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
subject[start_offset] == '\r' &amp;&amp; subject[start_offset] == '\r' &amp;&amp;
subject[start_offset + 1] == '\n') subject[start_offset + 1] == '\n')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
} }
if (namecount &lt;= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n"); printf("Named substrings\n");

View File

@ -1258,7 +1258,7 @@ PCRE2 does not allow \C to appear in lookbehind assertions
<a href="#lookbehind">(described below)</a> <a href="#lookbehind">(described below)</a>
in a UTF mode, because this would make it impossible to calculate the length of in a UTF mode, because this would make it impossible to calculate the length of
the lookbehind. Neither the alternative matching function the lookbehind. Neither the alternative matching function
<b>pcre2_dfa_match()</b> not the JIT optimizer support \C in a UTF mode. The <b>pcre2_dfa_match()</b> nor the JIT optimizer support \C in a UTF mode. The
former gives a match-time error; the latter fails to optimize and so the match former gives a match-time error; the latter fails to optimize and so the match
is always run using the interpreter. is always run using the interpreter.
</P> </P>

View File

@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE2 regular
expression 8-bit library. See the expression 8-bit library. See the
<a href="pcre2api.html"><b>pcre2api</b></a> <a href="pcre2api.html"><b>pcre2api</b></a>
documentation for a description of PCRE2's native API, which contains much documentation for a description of PCRE2's native API, which contains much
additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit
and 32-bit libraries. and 32-bit libraries.
</P> </P>
<P> <P>
@ -67,9 +67,9 @@ POSIX interface often use it, this makes it easier to slot in PCRE2 as a
replacement library. Other POSIX options are not even defined. replacement library. Other POSIX options are not even defined.
</P> </P>
<P> <P>
There are also some other options that are not defined by POSIX. These have There are also some options that are not defined by POSIX. These have been
been added at the request of users who want to make use of certain added at the request of users who want to make use of certain PCRE2-specific
PCRE2-specific features via the POSIX calling interface. features via the POSIX calling interface.
</P> </P>
<P> <P>
When PCRE2 is called via these functions, it is only the API that is POSIX-like When PCRE2 is called via these functions, it is only the API that is POSIX-like
@ -119,11 +119,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section).
<pre> <pre>
REG_NOSUB REG_NOSUB
</pre> </pre>
The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed When a pattern that is compiled with this flag is passed to <b>regexec()</b> for
for compilation to the native function. In addition, when a pattern that is matching, the <i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no
compiled with this flag is passed to <b>regexec()</b> for matching, the captured strings are returned. Versions of the PCRE2 library prior to 10.22 used
<i>nmatch</i> and <i>pmatch</i> arguments are ignored, and no captured strings to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens
are returned. because it disables the use of back references.
<pre> <pre>
REG_UCP REG_UCP
</pre> </pre>
@ -241,14 +241,15 @@ mutually exclusive; the error REG_INVARG is returned.
<P> <P>
If the pattern was compiled with the REG_NOSUB flag, no data about any matched If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of strings is returned. The <i>nmatch</i> and <i>pmatch</i> arguments of
<b>regexec()</b> are ignored. <b>regexec()</b> are ignored (except possibly as input for REG_STARTEND).
</P> </P>
<P> <P>
If the value of <i>nmatch</i> is zero, or if the value of <i>pmatch</i> is NULL, The value of <i>nmatch</i> may be zero, and the value of <i>pmatch</i> may be NULL
no data about any matched strings is returned. (unless REG_STARTEND is set); in both these cases no data about any matched
strings is returned.
</P> </P>
<P> <P>
Otherwise,the portion of the string that was matched, and also any captured Otherwise, the portion of the string that was matched, and also any captured
substrings, are returned via the <i>pmatch</i> argument, which points to an substrings, are returned via the <i>pmatch</i> argument, which points to an
array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the array of <i>nmatch</i> structures of type <i>regmatch_t</i>, containing the
members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first members <i>rm_so</i> and <i>rm_eo</i>. These contain the byte offset to the first
@ -290,9 +291,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC9" href="#TOC1">REVISION</a><br> <br><a name="SEC9" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 29 November 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -24,12 +24,11 @@ documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of <i>pcre2demo.c</i>. save this listing to re-create the contents of <i>pcre2demo.c</i>.
</P> </P>
<P> <P>
The demonstration program, which uses the PCRE2 8-bit library, compiles the The demonstration program compiles the regular expression that is its
regular expression that is its first argument, and matches it against the first argument, and matches it against the subject string in its second
subject string in its second argument. No PCRE2 options are set, and default argument. No PCRE2 options are set, and default character tables are used. If
character tables are used. If matching succeeds, the program outputs the matching succeeds, the program outputs the portion of the subject that matched,
portion of the subject that matched, together with the contents of any captured together with the contents of any captured substrings.
substrings.
</P> </P>
<P> <P>
If the -g option is given on the command line, the program then goes on to If the -g option is given on the command line, the program then goes on to
@ -38,34 +37,39 @@ string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on. an empty string. Comments in the code explain what is going on.
</P> </P>
<P> <P>
The code in <b>pcre2demo.c</b> is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
</P>
<P>
If PCRE2 is installed in the standard include and library directories for your If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using operating system, you should be able to compile the demonstration program using
this command: a command like this:
<pre> <pre>
gcc -o pcre2demo pcre2demo.c -lpcre2-8 cc -o pcre2demo pcre2demo.c -lpcre2-8
</pre> </pre>
If PCRE2 is installed elsewhere, you may need to add additional options to the If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in command line. For example, on a Unix-like system that has PCRE2 installed in
<i>/usr/local</i>, you can compile the demonstration program using a command <i>/usr/local</i>, you can compile the demonstration program using a command
like this: like this:
<pre> <pre>
gcc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8 cc -o pcre2demo -I/usr/local/include pcre2demo.c -L/usr/local/lib -lpcre2-8
</pre>
</PRE> Once you have built the demonstration program, you can run simple tests like
</P> this:
<P>
Once you have compiled and linked the demonstration program, you can run simple
tests like this:
<pre> <pre>
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
</pre> </pre>
Note that there is a much more comprehensive test program, called Note that there is a much more comprehensive test program, called
<a href="pcre2test.html"><b>pcre2test</b>,</a> <a href="pcre2test.html"><b>pcre2test</b>,</a>
which supports many more facilities for testing regular expressions using the which supports many more facilities for testing regular expressions using all
PCRE2 libraries. The three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
<a href="pcre2demo.html"><b>pcre2demo</b></a> <a href="pcre2demo.html"><b>pcre2demo</b></a>
program is provided as a simple coding example. program is provided as a relatively simple coding example.
</P> </P>
<P> <P>
If you try to run If you try to run
@ -73,7 +77,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris): error like this on some operating systems (e.g. Solaris):
<pre> <pre>
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
</pre> </pre>
This is caused by the way shared library support works on those systems. You This is caused by the way shared library support works on those systems. You
need to add need to add
@ -97,9 +101,9 @@ Cambridge, England.
REVISION REVISION
</b><br> </b><br>
<P> <P>
Last updated: 20 October 2014 Last updated: 02 February 2016
<br> <br>
Copyright &copy; 1997-2014 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

View File

@ -98,10 +98,11 @@ further data is read.
</P> </P>
<P> <P>
For maximum portability, therefore, it is safest to avoid non-printing For maximum portability, therefore, it is safest to avoid non-printing
characters in <b>pcre2test</b> input files. There is a facility for specifying a characters in <b>pcre2test</b> input files. There is a facility for specifying
pattern's characters as hexadecimal pairs, thus making it possible to include some or all of a pattern's characters as hexadecimal pairs, thus making it
binary zeroes in a pattern for testing purposes. Subject lines are processed possible to include binary zeroes in a pattern for testing purposes. Subject
for backslash escapes, which makes it possible to include any data value. lines are processed for backslash escapes, which makes it possible to include
any data value.
</P> </P>
<br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br> <br><a name="SEC4" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P> <P>
@ -559,7 +560,7 @@ about the pattern:
debug same as info,fullbincode debug same as info,fullbincode
fullbincode show binary code with lengths fullbincode show binary code with lengths
/I info show info about compiled pattern /I info show info about compiled pattern
hex pattern is coded in hexadecimal hex unquoted characters are hexadecimal
jit[=&#60;number&#62;] use JIT jit[=&#60;number&#62;] use JIT
jitfast use JIT fast path jitfast use JIT fast path
jitverify verify JIT use jitverify verify JIT use
@ -570,6 +571,7 @@ about the pattern:
null_context compile with a NULL context null_context compile with a NULL context
parens_nest_limit=&#60;n&#62; set maximum parentheses depth parens_nest_limit=&#60;n&#62; set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack push push compiled pattern onto the stack
stackguard=&#60;number&#62; test the stackguard feature stackguard=&#60;number&#62; test the stackguard feature
tables=[0|1|2] select internal tables tables=[0|1|2] select internal tables
@ -655,20 +657,31 @@ testing that <b>pcre2_compile()</b> behaves correctly in this case (it uses
default values). default values).
</P> </P>
<br><b> <br><b>
Specifying a pattern in hex Specifying pattern characters in hexadecimal
</b><br> </b><br>
<P> <P>
The <b>hex</b> modifier specifies that the characters of the pattern are to be The <b>hex</b> modifier specifies that the characters of the pattern, except for
interpreted as pairs of hexadecimal digits. White space is permitted between substrings enclosed in single or double quotes, are to be interpreted as pairs
pairs. For example: of hexadecimal digits. This feature is provided as a way of creating patterns
that contain binary zeros and other non-printing characters. White space is
permitted between pairs of digits. For example, this pattern contains three
characters:
<pre> <pre>
/ab 32 59/hex /ab 32 59/hex
</pre> </pre>
This feature is provided as a way of creating patterns that contain binary zero Parts of such a pattern are taken literally if quoted. This pattern contains
and other non-printing characters. By default, <b>pcre2test</b> passes patterns nine characters, only two of which are specified in hexadecimal:
as zero-terminated strings to <b>pcre2_compile()</b>, giving the length as <pre>
PCRE2_ZERO_TERMINATED. However, for patterns specified in hexadecimal, the /ab "literal" 32/hex
actual length of the pattern is passed. </pre>
Either single or double quotes may be used. There is no way of including
the delimiter within a substring.
</P>
<P>
By default, <b>pcre2test</b> passes patterns as zero-terminated strings to
<b>pcre2_compile()</b>, giving the length as PCRE2_ZERO_TERMINATED. However, for
patterns specified with the <b>hex</b> modifier, the actual length of the
pattern is passed.
</P> </P>
<br><b> <br><b>
Generating long repetitive patterns Generating long repetitive patterns
@ -821,16 +834,17 @@ variable can hold (essentially unlimited).
Using the POSIX wrapper API Using the POSIX wrapper API
</b><br> </b><br>
<P> <P>
The <b>/posix</b> modifier causes <b>pcre2test</b> to call PCRE2 via the POSIX The <b>/posix</b> and <b>posix_nosub</b> modifiers cause <b>pcre2test</b> to call
wrapper API rather than its native API. This supports only the 8-bit library. PCRE2 via the POSIX wrapper API rather than its native API. When
Note that it does not imply POSIX matching semantics; for more detail see the <b>posix_nosub</b> is used, the POSIX option REG_NOSUB is passed to
<b>regcomp()</b>. The POSIX wrapper supports only the 8-bit library. Note that
it does not imply POSIX matching semantics; for more detail see the
<a href="pcre2posix.html"><b>pcre2posix</b></a> <a href="pcre2posix.html"><b>pcre2posix</b></a>
documentation. When the POSIX API is being used, the following pattern documentation. The following pattern modifiers set options for the
modifiers set options for the <b>regcomp()</b> function: <b>regcomp()</b> function:
<pre> <pre>
caseless REG_ICASE caseless REG_ICASE
multiline REG_NEWLINE multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL ) dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard ucp REG_UCP ) the POSIX standard
@ -847,7 +861,8 @@ large buffer is used.
</P> </P>
<P> <P>
The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described The <b>aftertext</b> and <b>allaftertext</b> subject modifiers work as described
below. All other modifiers cause an error. below. All other modifiers are either ignored, with a warning message, or cause
an error.
</P> </P>
<br><b> <br><b>
Testing the stack guard feature Testing the stack guard feature
@ -957,7 +972,7 @@ If the <b>/posix</b> modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any effect wrapper API to be used, the only option-setting modifiers that have any effect
are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL, are <b>notbol</b>, <b>notempty</b>, and <b>noteol</b>, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>. REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to <b>regexec()</b>.
Any other modifiers cause an error. The other modifiers are ignored, with a warning message.
</P> </P>
<br><b> <br><b>
Setting match controls Setting match controls
@ -1001,7 +1016,10 @@ pattern.
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
zero_terminate pass the subject as zero-terminated zero_terminate pass the subject as zero-terminated
</pre> </pre>
The effects of these modifiers are described in the following sections. The effects of these modifiers are described in the following sections. When
matching via the POSIX wrapper API, the <b>aftertext</b>, <b>allaftertext</b>,
and <b>ovector</b> subject modifiers work as described below. All other
modifiers are either ignored, with a warning message, or cause an error.
</P> </P>
<br><b> <br><b>
Showing more text Showing more text
@ -1625,7 +1643,7 @@ usual by an empty line or end of file. This command may be followed by a
modifier list containing only modifier list containing only
<a href="#controlmodifiers">control modifiers</a> <a href="#controlmodifiers">control modifiers</a>
that act after a pattern has been compiled. In particular, <b>hex</b>, that act after a pattern has been compiled. In particular, <b>hex</b>,
<b>posix</b>, and <b>push</b> are not allowed, nor are any <b>posix</b>, <b>posix_nosub</b>, and <b>push</b> are not allowed, nor are any
<a href="#optionmodifiers">option-setting modifiers.</a> <a href="#optionmodifiers">option-setting modifiers.</a>
The JIT modifiers are, however, permitted. Here is an example that saves and The JIT modifiers are, however, permitted. Here is an example that saves and
reloads two patterns. reloads two patterns.
@ -1660,9 +1678,9 @@ Cambridge, England.
</P> </P>
<br><a name="SEC21" href="#TOC1">REVISION</a><br> <br><a name="SEC21" href="#TOC1">REVISION</a><br>
<P> <P>
Last updated: 12 December 2015 Last updated: 31 January 2016
<br> <br>
Copyright &copy; 1997-2015 University of Cambridge. Copyright &copy; 1997-2016 University of Cambridge.
<br> <br>
<p> <p>
Return to the <a href="index.html">PCRE2 index page</a>. Return to the <a href="index.html">PCRE2 index page</a>.

File diff suppressed because it is too large Load Diff

View File

@ -20,28 +20,31 @@
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -56,9 +59,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -79,19 +87,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -106,15 +114,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i < argc; i++) for (i = 1; i < argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break; else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\en", argv[i]);
return 1;
}
else break;
} }
/* After the options, we require exactly two arguments, which are the pattern, /* After the options, we require exactly two arguments, which are the pattern,
@ -122,7 +134,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\en"); printf("Exactly two arguments required: a regex and a subject string\en");
return 1; return 1;
} }
@ -201,7 +213,7 @@ if (rc < 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]); printf("Match succeeded at offset %d\en", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -242,7 +254,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */ &namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\en"); else if (namecount == 0) printf("No named substrings\en"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\en"); printf("Named substrings\en");
@ -330,8 +342,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;) for (;;)
{ {
uint32_t options = 0; /* Normally no options */ uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are /* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the at the end of the subject. Otherwise, arrange to run another match at the
@ -371,7 +383,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */ if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */ start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\er' && subject[start_offset] == '\er' &&
subject[start_offset + 1] == '\en') subject[start_offset + 1] == '\en')
@ -417,7 +429,7 @@ for (;;)
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
} }
if (namecount <= 0) printf("No named substrings\en"); else if (namecount == 0) printf("No named substrings\en"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\en"); printf("Named substrings\en");

View File

@ -1,4 +1,4 @@
.TH PCRE2SAMPLE 3 "20 October 2014" "PCRE2 10.00" .TH PCRE2SAMPLE 3 "02 February 2016" "PCRE2 10.22"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 SAMPLE PROGRAM" .SH "PCRE2 SAMPLE PROGRAM"
@ -13,23 +13,28 @@ distribution. A listing of this program is given in the
documentation. If you do not have a copy of the PCRE2 distribution, you can documentation. If you do not have a copy of the PCRE2 distribution, you can
save this listing to re-create the contents of \fIpcre2demo.c\fP. save this listing to re-create the contents of \fIpcre2demo.c\fP.
.P .P
The demonstration program, which uses the PCRE2 8-bit library, compiles the The demonstration program compiles the regular expression that is its
regular expression that is its first argument, and matches it against the first argument, and matches it against the subject string in its second
subject string in its second argument. No PCRE2 options are set, and default argument. No PCRE2 options are set, and default character tables are used. If
character tables are used. If matching succeeds, the program outputs the matching succeeds, the program outputs the portion of the subject that matched,
portion of the subject that matched, together with the contents of any captured together with the contents of any captured substrings.
substrings.
.P .P
If the -g option is given on the command line, the program then goes on to If the -g option is given on the command line, the program then goes on to
check for further matches of the same regular expression in the same subject check for further matches of the same regular expression in the same subject
string. The logic is a little bit tricky because of the possibility of matching string. The logic is a little bit tricky because of the possibility of matching
an empty string. Comments in the code explain what is going on. an empty string. Comments in the code explain what is going on.
.P .P
The code in \fBpcre2demo.c\fP is an 8-bit program that uses the PCRE2 8-bit
library. It handles strings and characters that are stored in 8-bit code units.
By default, one character corresponds to one code unit, but if the pattern
starts with "(*UTF)", both it and the subject are treated as UTF-8 strings,
where characters may occupy multiple code units.
.P
If PCRE2 is installed in the standard include and library directories for your If PCRE2 is installed in the standard include and library directories for your
operating system, you should be able to compile the demonstration program using operating system, you should be able to compile the demonstration program using
this command: a command like this:
.sp .sp
gcc -o pcre2demo pcre2demo.c -lpcre2-8 cc -o pcre2demo pcre2demo.c -lpcre2-8
.sp .sp
If PCRE2 is installed elsewhere, you may need to add additional options to the If PCRE2 is installed elsewhere, you may need to add additional options to the
command line. For example, on a Unix-like system that has PCRE2 installed in command line. For example, on a Unix-like system that has PCRE2 installed in
@ -37,12 +42,11 @@ command line. For example, on a Unix-like system that has PCRE2 installed in
like this: like this:
.sp .sp
.\" JOINSH .\" JOINSH
gcc -o pcre2demo -I/usr/local/include pcre2demo.c \e cc -o pcre2demo -I/usr/local/include pcre2demo.c \e
-L/usr/local/lib -lpcre2-8 -L/usr/local/lib -lpcre2-8
.sp .sp
.P Once you have built the demonstration program, you can run simple tests like
Once you have compiled and linked the demonstration program, you can run simple this:
tests like this:
.sp .sp
./pcre2demo 'cat|dog' 'the cat sat on the mat' ./pcre2demo 'cat|dog' 'the cat sat on the mat'
./pcre2demo -g 'cat|dog' 'the dog sat on the cat' ./pcre2demo -g 'cat|dog' 'the dog sat on the cat'
@ -51,12 +55,13 @@ Note that there is a much more comprehensive test program, called
.\" HREF .\" HREF
\fBpcre2test\fP, \fBpcre2test\fP,
.\" .\"
which supports many more facilities for testing regular expressions using the which supports many more facilities for testing regular expressions using all
PCRE2 libraries. The three PCRE2 libraries (8-bit, 16-bit, and 32-bit, though not all three need be
installed). The
.\" HREF .\" HREF
\fBpcre2demo\fP \fBpcre2demo\fP
.\" .\"
program is provided as a simple coding example. program is provided as a relatively simple coding example.
.P .P
If you try to run If you try to run
.\" HREF .\" HREF
@ -65,7 +70,7 @@ If you try to run
when PCRE2 is not installed in the standard library directory, you may get an when PCRE2 is not installed in the standard library directory, you may get an
error like this on some operating systems (e.g. Solaris): error like this on some operating systems (e.g. Solaris):
.sp .sp
ld.so.1: a.out: fatal: libpcre2.so.0: open failed: No such file or directory ld.so.1: pcre2demo: fatal: libpcre2-8.so.0: open failed: No such file or directory
.sp .sp
This is caused by the way shared library support works on those systems. You This is caused by the way shared library support works on those systems. You
need to add need to add
@ -89,6 +94,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 20 October 2014 Last updated: 02 February 2016
Copyright (c) 1997-2014 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
.fi .fi

View File

@ -67,10 +67,10 @@ INPUT ENCODING
For maximum portability, therefore, it is safest to avoid non-printing For maximum portability, therefore, it is safest to avoid non-printing
characters in pcre2test input files. There is a facility for specifying characters in pcre2test input files. There is a facility for specifying
a pattern's characters as hexadecimal pairs, thus making it possible to some or all of a pattern's characters as hexadecimal pairs, thus making
include binary zeroes in a pattern for testing purposes. Subject lines it possible to include binary zeroes in a pattern for testing purposes.
are processed for backslash escapes, which makes it possible to include Subject lines are processed for backslash escapes, which makes it pos-
any data value. sible to include any data value.
COMMAND LINE OPTIONS COMMAND LINE OPTIONS
@ -505,7 +505,7 @@ PATTERN MODIFIERS
debug same as info,fullbincode debug same as info,fullbincode
fullbincode show binary code with lengths fullbincode show binary code with lengths
/I info show info about compiled pattern /I info show info about compiled pattern
hex pattern is coded in hexadecimal hex unquoted characters are hexadecimal
jit[=<number>] use JIT jit[=<number>] use JIT
jitfast use JIT fast path jitfast use JIT fast path
jitverify verify JIT use jitverify verify JIT use
@ -516,6 +516,7 @@ PATTERN MODIFIERS
null_context compile with a NULL context null_context compile with a NULL context
parens_nest_limit=<n> set maximum parentheses depth parens_nest_limit=<n> set maximum parentheses depth
posix use the POSIX API posix use the POSIX API
posix_nosub use the POSIX API with REG_NOSUB
push push compiled pattern onto the stack push push compiled pattern onto the stack
stackguard=<number> test the stackguard feature stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables tables=[0|1|2] select internal tables
@ -591,59 +592,70 @@ PATTERN MODIFIERS
testing that pcre2_compile() behaves correctly in this case (it uses testing that pcre2_compile() behaves correctly in this case (it uses
default values). default values).
Specifying a pattern in hex Specifying pattern characters in hexadecimal
The hex modifier specifies that the characters of the pattern are to be The hex modifier specifies that the characters of the pattern, except
interpreted as pairs of hexadecimal digits. White space is permitted for substrings enclosed in single or double quotes, are to be inter-
between pairs. For example: preted as pairs of hexadecimal digits. This feature is provided as a
way of creating patterns that contain binary zeros and other non-print-
ing characters. White space is permitted between pairs of digits. For
example, this pattern contains three characters:
/ab 32 59/hex /ab 32 59/hex
This feature is provided as a way of creating patterns that contain Parts of such a pattern are taken literally if quoted. This pattern
binary zero and other non-printing characters. By default, pcre2test contains nine characters, only two of which are specified in hexadeci-
passes patterns as zero-terminated strings to pcre2_compile(), giving mal:
the length as PCRE2_ZERO_TERMINATED. However, for patterns specified in
hexadecimal, the actual length of the pattern is passed. /ab "literal" 32/hex
Either single or double quotes may be used. There is no way of includ-
ing the delimiter within a substring.
By default, pcre2test passes patterns as zero-terminated strings to
pcre2_compile(), giving the length as PCRE2_ZERO_TERMINATED. However,
for patterns specified with the hex modifier, the actual length of the
pattern is passed.
Generating long repetitive patterns Generating long repetitive patterns
Some tests use long patterns that are very repetitive. Instead of cre- Some tests use long patterns that are very repetitive. Instead of cre-
ating a very long input line for such a pattern, you can use a special ating a very long input line for such a pattern, you can use a special
repetition feature, similar to the one described for subject lines repetition feature, similar to the one described for subject lines
above. If the expand modifier is present on a pattern, parts of the above. If the expand modifier is present on a pattern, parts of the
pattern that have the form pattern that have the form
\[<characters>]{<count>} \[<characters>]{<count>}
are expanded before the pattern is passed to pcre2_compile(). For exam- are expanded before the pattern is passed to pcre2_compile(). For exam-
ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction ple, \[AB]{6000} is expanded to "ABAB..." 6000 times. This construction
cannot be nested. An initial "\[" sequence is recognized only if "]{" cannot be nested. An initial "\[" sequence is recognized only if "]{"
followed by decimal digits and "}" is found later in the pattern. If followed by decimal digits and "}" is found later in the pattern. If
not, the characters remain in the pattern unaltered. not, the characters remain in the pattern unaltered.
If part of an expanded pattern looks like an expansion, but is really If part of an expanded pattern looks like an expansion, but is really
part of the actual pattern, unwanted expansion can be avoided by giving part of the actual pattern, unwanted expansion can be avoided by giving
two values in the quantifier. For example, \[AB]{6000,6000} is not rec- two values in the quantifier. For example, \[AB]{6000,6000} is not rec-
ognized as an expansion item. ognized as an expansion item.
If the info modifier is set on an expanded pattern, the result of the If the info modifier is set on an expanded pattern, the result of the
expansion is included in the information that is output. expansion is included in the information that is output.
JIT compilation JIT compilation
Just-in-time (JIT) compiling is a heavyweight optimization that can Just-in-time (JIT) compiling is a heavyweight optimization that can
greatly speed up pattern matching. See the pcre2jit documentation for greatly speed up pattern matching. See the pcre2jit documentation for
details. JIT compiling happens, optionally, after a pattern has been details. JIT compiling happens, optionally, after a pattern has been
successfully compiled into an internal form. The JIT compiler converts successfully compiled into an internal form. The JIT compiler converts
this to optimized machine code. It needs to know whether the match-time this to optimized machine code. It needs to know whether the match-time
options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used, options PCRE2_PARTIAL_HARD and PCRE2_PARTIAL_SOFT are going to be used,
because different code is generated for the different cases. See the because different code is generated for the different cases. See the
partial modifier in "Subject Modifiers" below for details of how these partial modifier in "Subject Modifiers" below for details of how these
options are specified for each match attempt. options are specified for each match attempt.
JIT compilation is requested by the /jit pattern modifier, which may JIT compilation is requested by the /jit pattern modifier, which may
optionally be followed by an equals sign and a number in the range 0 to optionally be followed by an equals sign and a number in the range 0 to
7. The three bits that make up the number specify which of the three 7. The three bits that make up the number specify which of the three
JIT operating modes are to be compiled: JIT operating modes are to be compiled:
1 compile JIT code for non-partial matching 1 compile JIT code for non-partial matching
@ -660,31 +672,31 @@ PATTERN MODIFIERS
6 soft and hard partial matching only 6 soft and hard partial matching only
7 all three modes 7 all three modes
If no number is given, 7 is assumed. The phrase "partial matching" If no number is given, 7 is assumed. The phrase "partial matching"
means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the means a call to pcre2_match() with either the PCRE2_PARTIAL_SOFT or the
PCRE2_PARTIAL_HARD option set. Note that such a call may return a com- PCRE2_PARTIAL_HARD option set. Note that such a call may return a com-
plete match; the options enable the possibility of a partial match, but plete match; the options enable the possibility of a partial match, but
do not require it. Note also that if you request JIT compilation only do not require it. Note also that if you request JIT compilation only
for partial matching (for example, /jit=2) but do not set the partial for partial matching (for example, /jit=2) but do not set the partial
modifier on a subject line, that match will not use JIT code because modifier on a subject line, that match will not use JIT code because
none was compiled for non-partial matching. none was compiled for non-partial matching.
If JIT compilation is successful, the compiled JIT code will automati- If JIT compilation is successful, the compiled JIT code will automati-
cally be used when an appropriate type of match is run, except when cally be used when an appropriate type of match is run, except when
incompatible run-time options are specified. For more details, see the incompatible run-time options are specified. For more details, see the
pcre2jit documentation. See also the jitstack modifier below for a way pcre2jit documentation. See also the jitstack modifier below for a way
of setting the size of the JIT stack. of setting the size of the JIT stack.
If the jitfast modifier is specified, matching is done using the JIT If the jitfast modifier is specified, matching is done using the JIT
"fast path" interface, pcre2_jit_match(), which skips some of the san- "fast path" interface, pcre2_jit_match(), which skips some of the san-
ity checks that are done by pcre2_match(), and of course does not work ity checks that are done by pcre2_match(), and of course does not work
when JIT is not supported. If jitfast is specified without jit, jit=7 when JIT is not supported. If jitfast is specified without jit, jit=7
is assumed. is assumed.
If the jitverify modifier is specified, information about the compiled If the jitverify modifier is specified, information about the compiled
pattern shows whether JIT compilation was or was not successful. If pattern shows whether JIT compilation was or was not successful. If
jitverify is specified without jit, jit=7 is assumed. If JIT compila- jitverify is specified without jit, jit=7 is assumed. If JIT compila-
tion is successful when jitverify is set, the text "(JIT)" is added to tion is successful when jitverify is set, the text "(JIT)" is added to
the first output line after a match or non-match when JIT-compiled code the first output line after a match or non-match when JIT-compiled code
was actually used in the match. was actually used in the match.
@ -695,18 +707,18 @@ PATTERN MODIFIERS
/pattern/locale=fr_FR /pattern/locale=fr_FR
The given locale is set, pcre2_maketables() is called to build a set of The given locale is set, pcre2_maketables() is called to build a set of
character tables for the locale, and this is then passed to pcre2_com- character tables for the locale, and this is then passed to pcre2_com-
pile() when compiling the regular expression. The same tables are used pile() when compiling the regular expression. The same tables are used
when matching the following subject lines. The /locale modifier applies when matching the following subject lines. The /locale modifier applies
only to the pattern on which it appears, but can be given in a #pattern only to the pattern on which it appears, but can be given in a #pattern
command if a default is needed. Setting a locale and alternate charac- command if a default is needed. Setting a locale and alternate charac-
ter tables are mutually exclusive. ter tables are mutually exclusive.
Showing pattern memory Showing pattern memory
The /memory modifier causes the size in bytes of the memory used to The /memory modifier causes the size in bytes of the memory used to
hold the compiled pattern to be output. This does not include the size hold the compiled pattern to be output. This does not include the size
of the pcre2_code block; it is just the actual compiled data. If the of the pcre2_code block; it is just the actual compiled data. If the
pattern is subsequently passed to the JIT compiler, the size of the JIT pattern is subsequently passed to the JIT compiler, the size of the JIT
compiled code is also output. Here is an example: compiled code is also output. Here is an example:
@ -717,31 +729,31 @@ PATTERN MODIFIERS
Limiting nested parentheses Limiting nested parentheses
The parens_nest_limit modifier sets a limit on the depth of nested The parens_nest_limit modifier sets a limit on the depth of nested
parentheses in a pattern. Breaching the limit causes a compilation parentheses in a pattern. Breaching the limit causes a compilation
error. The default for the library is set when PCRE2 is built, but error. The default for the library is set when PCRE2 is built, but
pcre2test sets its own default of 220, which is required for running pcre2test sets its own default of 220, which is required for running
the standard test suite. the standard test suite.
Limiting the pattern length Limiting the pattern length
The max_pattern_length modifier sets a limit, in code units, to the The max_pattern_length modifier sets a limit, in code units, to the
length of pattern that pcre2_compile() will accept. Breaching the limit length of pattern that pcre2_compile() will accept. Breaching the limit
causes a compilation error. The default is the largest number a causes a compilation error. The default is the largest number a
PCRE2_SIZE variable can hold (essentially unlimited). PCRE2_SIZE variable can hold (essentially unlimited).
Using the POSIX wrapper API Using the POSIX wrapper API
The /posix modifier causes pcre2test to call PCRE2 via the POSIX wrap- The /posix and posix_nosub modifiers cause pcre2test to call PCRE2 via
per API rather than its native API. This supports only the 8-bit the POSIX wrapper API rather than its native API. When posix_nosub is
library. Note that it does not imply POSIX matching semantics; for used, the POSIX option REG_NOSUB is passed to regcomp(). The POSIX
more detail see the pcre2posix documentation. When the POSIX API is wrapper supports only the 8-bit library. Note that it does not imply
being used, the following pattern modifiers set options for the reg- POSIX matching semantics; for more detail see the pcre2posix documenta-
comp() function: tion. The following pattern modifiers set options for the regcomp()
function:
caseless REG_ICASE caseless REG_ICASE
multiline REG_NEWLINE multiline REG_NEWLINE
no_auto_capture REG_NOSUB
dotall REG_DOTALL ) dotall REG_DOTALL )
ungreedy REG_UNGREEDY ) These options are not part of ungreedy REG_UNGREEDY ) These options are not part of
ucp REG_UCP ) the POSIX standard ucp REG_UCP ) the POSIX standard
@ -758,23 +770,24 @@ PATTERN MODIFIERS
been set, a large buffer is used. been set, a large buffer is used.
The aftertext and allaftertext subject modifiers work as described The aftertext and allaftertext subject modifiers work as described
below. All other modifiers cause an error. below. All other modifiers are either ignored, with a warning message,
or cause an error.
Testing the stack guard feature Testing the stack guard feature
The /stackguard modifier is used to test the use of pcre2_set_com- The /stackguard modifier is used to test the use of pcre2_set_com-
pile_recursion_guard(), a function that is provided to enable stack pile_recursion_guard(), a function that is provided to enable stack
availability to be checked during compilation (see the pcre2api docu- availability to be checked during compilation (see the pcre2api docu-
mentation for details). If the number specified by the modifier is mentation for details). If the number specified by the modifier is
greater than zero, pcre2_set_compile_recursion_guard() is called to set greater than zero, pcre2_set_compile_recursion_guard() is called to set
up a callback from pcre2_compile() to a local function. The argument it up a callback from pcre2_compile() to a local function. The argument it
receives is the current nesting parenthesis depth; if this is greater receives is the current nesting parenthesis depth; if this is greater
than the value given by the modifier, non-zero is returned, causing the than the value given by the modifier, non-zero is returned, causing the
compilation to be aborted. compilation to be aborted.
Using alternative character tables Using alternative character tables
The value specified for the /tables modifier must be one of the digits The value specified for the /tables modifier must be one of the digits
0, 1, or 2. It causes a specific set of built-in character tables to be 0, 1, or 2. It causes a specific set of built-in character tables to be
passed to pcre2_compile(). This is used in the PCRE2 tests to check be- passed to pcre2_compile(). This is used in the PCRE2 tests to check be-
haviour with different character tables. The digit specifies the tables haviour with different character tables. The digit specifies the tables
@ -785,15 +798,15 @@ PATTERN MODIFIERS
pcre2_chartables.c.dist pcre2_chartables.c.dist
2 a set of tables defining ISO 8859 characters 2 a set of tables defining ISO 8859 characters
In table 2, some characters whose codes are greater than 128 are iden- In table 2, some characters whose codes are greater than 128 are iden-
tified as letters, digits, spaces, etc. Setting alternate character tified as letters, digits, spaces, etc. Setting alternate character
tables and a locale are mutually exclusive. tables and a locale are mutually exclusive.
Setting certain match controls Setting certain match controls
The following modifiers are really subject modifiers, and are described The following modifiers are really subject modifiers, and are described
below. However, they may be included in a pattern's modifier list, in below. However, they may be included in a pattern's modifier list, in
which case they are applied to every subject line that is processed which case they are applied to every subject line that is processed
with that pattern. They may not appear in #pattern commands. These mod- with that pattern. They may not appear in #pattern commands. These mod-
ifiers do not affect the compilation process. ifiers do not affect the compilation process.
@ -810,20 +823,20 @@ PATTERN MODIFIERS
substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET substitute_unknown_unset use PCRE2_SUBSTITUTE_UNKNOWN_UNSET
substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY
These modifiers may not appear in a #pattern command. If you want them These modifiers may not appear in a #pattern command. If you want them
as defaults, set them in a #subject command. as defaults, set them in a #subject command.
Saving a compiled pattern Saving a compiled pattern
When a pattern with the push modifier is successfully compiled, it is When a pattern with the push modifier is successfully compiled, it is
pushed onto a stack of compiled patterns, and pcre2test expects the pushed onto a stack of compiled patterns, and pcre2test expects the
next line to contain a new pattern (or a command) instead of a subject next line to contain a new pattern (or a command) instead of a subject
line. This facility is used when saving compiled patterns to a file, as line. This facility is used when saving compiled patterns to a file, as
described in the section entitled "Saving and restoring compiled pat- described in the section entitled "Saving and restoring compiled pat-
terns" below. The push modifier is incompatible with compilation modi- terns" below. The push modifier is incompatible with compilation modi-
fiers such as global that act at match time. Any that are specified are fiers such as global that act at match time. Any that are specified are
ignored, with a warning message, except for replace, which causes an ignored, with a warning message, except for replace, which causes an
error. Note that jitverify, which is allowed, does not carry through error. Note that jitverify, which is allowed, does not carry through
to any subsequent matching that uses this pattern. to any subsequent matching that uses this pattern.
@ -834,7 +847,7 @@ SUBJECT MODIFIERS
Setting match options Setting match options
The following modifiers set options for pcre2_match() or The following modifiers set options for pcre2_match() or
pcre2_dfa_match(). See pcre2api for a description of their effects. pcre2_dfa_match(). See pcre2api for a description of their effects.
anchored set PCRE2_ANCHORED anchored set PCRE2_ANCHORED
@ -848,20 +861,20 @@ SUBJECT MODIFIERS
partial_hard (or ph) set PCRE2_PARTIAL_HARD partial_hard (or ph) set PCRE2_PARTIAL_HARD
partial_soft (or ps) set PCRE2_PARTIAL_SOFT partial_soft (or ps) set PCRE2_PARTIAL_SOFT
The partial matching modifiers are provided with abbreviations because The partial matching modifiers are provided with abbreviations because
they appear frequently in tests. they appear frequently in tests.
If the /posix modifier was present on the pattern, causing the POSIX If the /posix modifier was present on the pattern, causing the POSIX
wrapper API to be used, the only option-setting modifiers that have any wrapper API to be used, the only option-setting modifiers that have any
effect are notbol, notempty, and noteol, causing REG_NOTBOL, effect are notbol, notempty, and noteol, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec(). REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to regexec().
Any other modifiers cause an error. The other modifiers are ignored, with a warning message.
Setting match controls Setting match controls
The following modifiers affect the matching process or request addi- The following modifiers affect the matching process or request addi-
tional information. Some of them may also be specified on a pattern tional information. Some of them may also be specified on a pattern
line (see above), in which case they apply to every subject line that line (see above), in which case they apply to every subject line that
is matched against that pattern. is matched against that pattern.
aftertext show text after match aftertext show text after match
@ -898,6 +911,9 @@ SUBJECT MODIFIERS
zero_terminate pass the subject as zero-terminated zero_terminate pass the subject as zero-terminated
The effects of these modifiers are described in the following sections. The effects of these modifiers are described in the following sections.
When matching via the POSIX wrapper API, the aftertext, allaftertext,
and ovector subject modifiers work as described below. All other modi-
fiers are either ignored, with a warning message, or cause an error.
Showing more text Showing more text
@ -1472,9 +1488,9 @@ SAVING AND RESTORING COMPILED PATTERNS
matched with the pattern, terminated as usual by an empty line or end matched with the pattern, terminated as usual by an empty line or end
of file. This command may be followed by a modifier list containing of file. This command may be followed by a modifier list containing
only control modifiers that act after a pattern has been compiled. In only control modifiers that act after a pattern has been compiled. In
particular, hex, posix, and push are not allowed, nor are any option- particular, hex, posix, posix_nosub, and push are not allowed, nor are
setting modifiers. The JIT modifiers are, however, permitted. Here is any option-setting modifiers. The JIT modifiers are, however, permit-
an example that saves and reloads two patterns. ted. Here is an example that saves and reloads two patterns.
/abc/push /abc/push
/xyz/push /xyz/push
@ -1505,5 +1521,5 @@ AUTHOR
REVISION REVISION
Last updated: 12 December 2015 Last updated: 31 January 2016
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.

View File

@ -3,28 +3,31 @@
*************************************************/ *************************************************/
/* This is a demonstration program to illustrate a straightforward way of /* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API. incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.
In Unix-like environments, if PCRE2 is installed in your standard system In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command: libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command: compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this: If you do not have pkg-config, you may have to use something like this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo -R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
@ -39,9 +42,14 @@ the following line. */
/* #define PCRE2_STATIC */ /* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
only one code unit width, it makes it possible to use generic function names For a program that uses only one code unit width, setting it to 8, 16, or 32
such as pcre2_compile(). */ makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
#define PCRE2_CODE_UNIT_WIDTH 8 #define PCRE2_CODE_UNIT_WIDTH 8
@ -62,19 +70,19 @@ int main(int argc, char **argv)
{ {
pcre2_code *re; pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */ PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
PCRE2_SPTR name_table; PCRE2_SPTR name_table;
int crlf_is_newline; int crlf_is_newline;
int errornumber; int errornumber;
int find_all; int find_all;
int i; int i;
int namecount;
int name_entry_size;
int rc; int rc;
int utf8; int utf8;
uint32_t option_bits; uint32_t option_bits;
uint32_t namecount;
uint32_t name_entry_size;
uint32_t newline; uint32_t newline;
PCRE2_SIZE erroroffset; PCRE2_SIZE erroroffset;
@ -89,15 +97,19 @@ pcre2_match_data *match_data;
* First, sort out the command line. There is only one possible option at * * First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, * * the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value * * like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two * * if the -g option is present. *
* arguments. *
**************************************************************************/ **************************************************************************/
find_all = 0; find_all = 0;
for (i = 1; i < argc; i++) for (i = 1; i < argc; i++)
{ {
if (strcmp(argv[i], "-g") == 0) find_all = 1; if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break; else if (argv[i][0] == '-')
{
printf("Unrecognised option %s\n", argv[i]);
return 1;
}
else break;
} }
/* After the options, we require exactly two arguments, which are the pattern, /* After the options, we require exactly two arguments, which are the pattern,
@ -105,7 +117,7 @@ and the subject string. */
if (argc - i != 2) if (argc - i != 2)
{ {
printf("Two arguments required: a regex and a subject string\n"); printf("Exactly two arguments required: a regex and a subject string\n");
return 1; return 1;
} }
@ -184,7 +196,7 @@ if (rc < 0)
stored. */ stored. */
ovector = pcre2_get_ovector_pointer(match_data); ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]); printf("Match succeeded at offset %d\n", (int)ovector[0]);
/************************************************************************* /*************************************************************************
@ -225,7 +237,7 @@ we have to extract the count of named parentheses from the pattern. */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */ &namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr; PCRE2_SPTR tabptr;
printf("Named substrings\n"); printf("Named substrings\n");
@ -313,8 +325,8 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
for (;;) for (;;)
{ {
uint32_t options = 0; /* Normally no options */ uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are /* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the at the end of the subject. Otherwise, arrange to run another match at the
@ -354,7 +366,7 @@ for (;;)
{ {
if (options == 0) break; /* All matches found */ if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */ ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */ if (crlf_is_newline && /* If CRLF is a newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */ start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\r' && subject[start_offset] == '\r' &&
subject[start_offset + 1] == '\n') subject[start_offset + 1] == '\n')
@ -400,7 +412,7 @@ for (;;)
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
} }
if (namecount <= 0) printf("No named substrings\n"); else if (namecount == 0) printf("No named substrings\n"); else
{ {
PCRE2_SPTR tabptr = name_table; PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n"); printf("Named substrings\n");