Documentation scripts

This commit is contained in:
Philip.Hazel 2014-09-23 11:35:51 +00:00
parent a625f0ea01
commit d5495a30f4
19 changed files with 10412 additions and 6 deletions

313
132html Executable file
View File

@ -0,0 +1,313 @@
#! /usr/bin/perl -w
# Script to turn PCRE2 man pages into HTML
# Convert one line of man-page text to its HTML form and return the result.
# The argument is copied, so the caller's string is left untouched. The
# order of the steps matters: angle brackets are escaped first so that the
# tags generated by the later steps are not themselves escaped.
sub do_line {
    my ($text) = @_;

    # Literal angle brackets become numeric character references.
    $text =~ s{<}{&#60;}g;
    $text =~ s{>}{&#62;}g;

    # \fI...\fR (or \fP) is italic; \fB...\fR (or \fP) is bold.
    $text =~ s{\\fI(.*?)\\f[RP]}{<i>$1</i>}g;
    $text =~ s{\\fB(.*?)\\f[RP]}{<b>$1</b>}g;

    # The troff escape \e stands for a single backslash.
    $text =~ s{\\e}{\\}g;

    # "(c)" is turned into the copyright entity only after "Copyright ".
    $text =~ s{(?<=Copyright )\(c\)}{&copy;}g;

    return $text;
}
# Close the current paragraph, if there is one, on the TEMP handle: an open
# literal section is closed first, then the paragraph itself. All three
# state flags ($inpara, $inpre, $wrotetext) are cleared whether or not
# anything was written.
sub end_para {
    if ($inpara) {
        my $closing = $inpre ? "</PRE>\n</P>\n" : "</P>\n";
        print TEMP $closing;
    }
    $inpre     = 0;
    $inpara    = 0;
    $wrotetext = 0;
}
# Begin a fresh paragraph on the TEMP handle. Whatever paragraph is open is
# closed first via end_para(), then <P> is written and $inpara is set.
sub new_para {
    end_para();
    print TEMP "<P>\n";
    $inpara = 1;
}
# Main program: initialise the converter state, then consume leading
# command-line options. These are package globals because the subroutines
# above read and write them.
#   $innf      - inside a .nf/.fi section
#   $inpara    - a <P> is currently open
#   $inpre     - a <pre> is currently open
#   $wrotetext - text has been written since the last paragraph end
#   $toc       - a table of contents was requested with -toc
#   $ref       - number of the next TOC/SEC anchor
($innf, $inpara, $inpre, $wrotetext, $toc) = (0) x 5;
$ref = 1;

# Every leading argument that starts with "-" is removed from @ARGV; only
# "-toc" has any effect.
while (@ARGV && $ARGV[0] =~ /^-/)
  {
  my $option = shift(@ARGV);
  $toc = 1 if $option eq "-toc";
  }
# Initial output to STDOUT: the fixed page header, using the page name that
# remains in $ARGV[0] after option processing.
print <<End ;
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
End

# When a table of contents is wanted, its entries go straight to STDOUT as
# the sections are met, so the list is opened here.
print "<ul>\n" if ($toc);

# The page body is collected in a temporary file and copied to STDOUT at the
# end. The name is predictable, so never follow something another user may
# have planted in /tmp: remove any stale entry (unlink removes a symlink, not
# its target) and then insist on creating the file ourselves with O_EXCL.
use Fcntl qw(O_WRONLY O_CREAT O_EXCL);
unlink("/tmp/$$");
sysopen(TEMP, "/tmp/$$", O_WRONLY | O_CREAT | O_EXCL, 0600)
  || die "Can't open /tmp/$$ for output: $!\n";
# Main conversion loop: read the man page from STDIN one line at a time and
# write the HTML body to TEMP (TOC entries go directly to STDOUT). Several
# branches read ahead on STDIN; each of those reads is checked so that a
# truncated input ends the loop cleanly instead of processing undef.
while (<STDIN>)
  {
  # Handle lines beginning with a dot
  if (/^\./)
    {
    # Some of the PCRE2 man pages used to contain instances of .br. However,
    # they should have all been removed because they cause trouble in some
    # (other) automated systems that translate man pages to HTML. Complain if
    # we find .br or .in (another macro that is deprecated).
    if (/^\.br/ || /^\.in/)
      {
      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
      print STDERR "*** $_\n";
      die "*** Processing abandoned\n";
      }

    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.
    elsif (/^\.nf/)
      {
      $innf = 1;
      }
    elsif (/^\.fi/)
      {
      $innf = 0;
      }

    # Handling .sp is subtle. If it is inside a literal section, do nothing if
    # the next line is a non literal text line; similarly, if not inside a
    # literal section, do nothing if a literal follows, unless we are inside
    # a .nf/.fi section. The point being that the <pre> and </pre> that delimit
    # literal sections will do the spacing. Always skip if no previous output.
    elsif (/^\.sp/)
      {
      if ($wrotetext)
        {
        # Look ahead one line; a trailing .sp at end of input needs no spacing.
        last if !defined($_ = <STDIN>);
        if ($inpre)
          {
          print TEMP "\n" if (/^[\s.]/);
          }
        else
          {
          print TEMP "<br>\n<br>\n" if ($innf || !/^[\s.]/);
          }
        redo; # Now process the lookahead line we just read
        }
      }

    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
      {
      new_para();
      }

    elsif (/^\.SH\s*("?)(.*)\1/)
      {
      # Ignore the NAME section: skip the heading and the line after it.
      if ($2 =~ /^NAME\b/)
        {
        <STDIN>;
        next;
        }
      end_para();
      my($title) = do_line($2);
      if ($toc)
        {
        # The title is passed as a %s argument rather than being spliced into
        # the format, so a "%" in a heading cannot be taken as a conversion.
        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
          $ref, $ref, $title);
        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
          $ref, $title);
        $ref++;
        }
      else
        {
        print TEMP "<br><b>\n$title\n</b><br>\n";
        }
      }

    elsif (/^\.SS\s*("?)(.*)\1/)
      {
      end_para();
      my($title) = do_line($2);
      print TEMP "<br><b>\n$title\n</b><br>\n";
      }

    elsif (/^\.B\s*(.*)/)
      {
      new_para() if (!$inpara);
      $_ = do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<b>$_</b>\n";
      $wrotetext = 1;
      }

    elsif (/^\.I\s*(.*)/)
      {
      new_para() if (!$inpara);
      $_ = do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<i>$_</i>\n";
      $wrotetext = 1;
      }

    # A comment that starts "HREF" takes the next line as a name that
    # is turned into a hyperlink, using the text given, which might be
    # in a special font. If it ends in () or (digits) or punctuation, they
    # aren't part of the link.
    elsif (/^\.\\"\s*HREF/)
      {
      last if !defined($_ = <STDIN>);
      chomp;
      $_ = do_line($_);
      $_ =~ s/\s+$//;
      if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/)
        {
        print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
      else
        {
        # No page name could be extracted: report it and keep the text as
        # plain output rather than emitting a link to ".html".
        print STDERR "*** Cannot derive a link target from HREF text: $_\n";
        print TEMP "$_\n";
        }
      }

    # A comment that starts "HTML" inserts literal HTML (no newline is added)
    elsif (/^\.\\"\s*HTML\s*(.*)/)
      {
      print TEMP $1;
      }

    # A comment that starts < inserts that HTML at the end of the
    # *next* input line - so as not to get a newline between them.
    elsif (/^\.\\"\s*(<.*>)/)
      {
      my($markup) = $1;
      $_ = <STDIN>;
      $_ = "" if !defined($_);   # at end of input still emit the markup
      chomp;
      $_ = do_line($_);
      $_ =~ s/\s+$//;
      print TEMP "$_$markup\n";
      }

    # A comment that starts JOIN joins the next two lines together, with one
    # space between them. Then that line is processed. This is used in some
    # displays where two lines are needed for the "man" version. JOINSH works
    # the same, except that it assumes this is a shell command, so removes
    # continuation backslashes.
    elsif (/^\.\\"\s*JOIN(SH)?/)
      {
      my $is_shell = defined($1);
      my($one,$two);
      $one = <STDIN>;
      $two = <STDIN>;
      last if !defined($one);
      $two = "" if !defined($two);
      $one =~ s/\s*\\e\s*$// if ($is_shell);
      chomp($one);
      $two =~ s/^\s+//;
      $_ = "$one $two";
      redo; # Process the joined lines
      }

    # .EX/.EE are used in the pcredemo page to bracket the entire program,
    # which is unmodified except for turning backslash into "\e".
    elsif (/^\.EX\s*$/)
      {
      print TEMP "<PRE>\n";
      while (<STDIN>)
        {
        last if /^\.EE\s*$/;
        s/\\e/\\/g;
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        print TEMP;
        }
      # Close the literal block so that following output is not preformatted.
      print TEMP "</PRE>\n";
      }

    # Ignore anything not recognized
    next;
    }

  # Line does not begin with a dot. Replace blank lines with new paragraphs
  if (/^\s*$/)
    {
    end_para() if ($wrotetext);
    next;
    }

  # Convert fonts changes and output an ordinary line. Ensure that indented
  # lines are marked as literal.
  $_ = do_line($_);
  new_para() if (!$inpara);
  if (/^\s/)
    {
    if (!$inpre)
      {
      print TEMP "<pre>\n";
      $inpre = 1;
      }
    }
  elsif ($inpre)
    {
    print TEMP "</pre>\n";
    $inpre = 0;
    }

  # Add <br> to the end of a non-literal line if we are within .nf/.fi
  $_ .= "<br>\n" if (!$inpre && $innf);
  print TEMP;
  $wrotetext = 1;
  }
# Finish the table of contents (it was written to STDOUT as sections were
# met), if one was requested.
if ($toc)
  {
  print "</ul>\n";
  }

# The page body was collected in the temporary file. Reopen it for reading
# and copy it to STDOUT line by line.
close(TEMP);
open(TEMP, "/tmp/$$") || die "Can't open /tmp/$$ for input\n";
while (defined(my $line = <TEMP>))
  {
  print $line;
  }
close(TEMP);

# Fixed page footer, then remove the temporary file.
print "<p>\n" .
      "Return to the <a href=\"index.html\">PCRE2 index page</a>.\n" .
      "</p>\n";
unlink("/tmp/$$");
# End

67
CheckMan Executable file
View File

@ -0,0 +1,67 @@
#! /usr/bin/perl
# A script to scan PCRE2's man pages to check for typos in the control
# sequences. I use only a small set of the available repertoire, so it is
# straightforward to check that nothing else has slipped in by mistake. This
# script should be called in the doc directory.
use strict;
use warnings;

# Check every file named on the command line. A message is printed for each
# empty line, each control line that is not in the permitted set, and each
# text line containing a backslash sequence other than \e, \fI, \fB or \fP.
# The exit status is 1 if anything was reported, otherwise 0.
my $yield = 0;

while (scalar(@ARGV) > 0)
  {
  my $line = 0;
  my $file = shift @ARGV;

  # Three-argument open: the name is never interpreted as a mode or a pipe.
  open(my $in, '<', $file) || die "Failed to open $file: $!\n";

  while (<$in>)
    {
    $line++;

    # Messages use print, not printf, so a "%" in a file name is harmless.
    if (/^\s*$/)
      {
      print "Empty line $line of $file\n";
      $yield = 1;
      }

    # Control lines must match one of the permitted forms exactly.
    elsif (/^\./)
      {
      if (!/^\.\s*$|
            ^\.B\s+\S|
            ^\.TH\s\S|
            ^\.SH\s\S|
            ^\.SS\s\S|
            ^\.TP(?:\s?\d+)?\s*$|
            ^\.SM\s*$|
            ^\.br\s*$|
            ^\.rs\s*$|
            ^\.sp\s*$|
            ^\.nf\s*$|
            ^\.fi\s*$|
            ^\.P\s*$|
            ^\.PP\s*$|
            ^\.\\"(?:\ HREF)?\s*$|
            ^\.\\"\sHTML\s<a\shref="[^"]+?">\s*$|
            ^\.\\"\sHTML\s<a\sname="[^"]+?"><\/a>\s*$|
            ^\.\\"\s<\/a>\s*$|
            ^\.\\"\sJOINSH\s*$|
            ^\.\\"\sJOIN\s*$/x
         )
        {
        print "Bad control line $line of $file\n";
        $yield = 1;
        }
      }

    # Ordinary text: a backslash must be followed by "e" or by "f" plus one
    # of I, B or P. A backslash at the very end of a line is also reported,
    # because the newline counts as the following character.
    else
      {
      if (/\\[^ef]|\\f[^IBP]/)
        {
        print "Bad backslash in line $line of $file\n";
        $yield = 1;
        }
      }
    }

  close($in);
  }

exit $yield;
# End

113
CleanTxt Executable file
View File

@ -0,0 +1,113 @@
#! /usr/bin/perl -w
# Script to take the output of nroff -man and remove all the backspacing and
# the page footers and the screen commands etc so that it is more usefully
# readable online. In fact, in the latest nroff, intermediate footers don't
# seem to be generated any more.
# State carried from line to line:
#   $blankcount  - blank lines seen but not yet output
#   $lastwascut  - 1 immediately after a footer chunk has been removed
#   $firstheader - 1 until the first header line has been printed
# $lastprinted (set below) remembers a previously printed line; it stays
# undefined until something has been printed.
$blankcount = 0;
$lastwascut = 0;
$firstheader = 1;
# Input on STDIN; output to STDOUT.
while (<STDIN>)
{
s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
s/.\x8//g; # Remove "char, backspace"
# Handle header lines. Retain only the first one we encounter, but remove
# the blank line that follows. Any others (e.g. at end of document) and the
# following blank line are dropped.
# A header is a line of the form NAME(n) <whitespace> NAME(n), where NAME
# starts with "PCRE", n is 1 or 3, and both halves are identical.
if (/^PCRE(\w*)\(([13])\)\s+PCRE\1\(\2\)$/)
{
if ($firstheader)
{
$firstheader = 0;
print;
$lastprinted = $_;
$lastwascut = 0;
}
$_=<STDIN>; # Remove a blank that follows
next;
}
# Count runs of empty lines
if (/^\s*$/)
{
$blankcount++;
$lastwascut = 0;
next;
}
# If a chunk of lines has been cut out (page footer) and the next line
# has a different indentation, put back one blank line.
if ($lastwascut && $blankcount < 1 && defined($lastprinted))
{
# $a and $b are ordinary package variables here (the same names sort uses);
# they hold the leading whitespace of the two lines being compared.
($a) = $lastprinted =~ /^(\s*)/;
($b) = $_ =~ /^(\s*)/;
$blankcount++ if ($a ne $b);
}
# We get here only when we have a non-blank line in hand. If it was preceded
# by 3 or more blank lines, read the next 3 lines and see if they are blank.
# If so, remove all 7 lines, and remember that we have just done a cut.
if ($blankcount >= 3)
{
# Read three lines of lookahead, cleaned in the same way as the main input.
# Lines missing because the input has ended are treated as empty strings.
for ($i = 0; $i < 3; $i++)
{
$next[$i] = <STDIN>;
$next[$i] = "" if !defined $next[$i];
$next[$i] =~ s/\x1b\[\d+m//g; # Remove screen controls "ESC [ number m"
$next[$i] =~ s/.\x8//g; # Remove "char, backspace"
}
# Cut out chunks of the form <3 blanks><non-blank><3 blanks>
# The current line and the three lookahead lines are simply not printed;
# three of the pending blank lines are discarded and any excess is kept.
if ($next[0] =~ /^\s*$/ &&
$next[1] =~ /^\s*$/ &&
$next[2] =~ /^\s*$/)
{
$blankcount -= 3;
$lastwascut = 1;
}
# Otherwise output the saved blanks, the current, and the next three
# lines. Remember the last printed line.
else
{
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
for ($i = 0; $i < 3; $i++)
{
# NOTE(review): the same substitution was already applied when this line
# was read above, so this one looks redundant - confirm before removing.
$next[$i] =~ s/.\x8//g;
print $next[$i];
# NOTE(review): this records the current line ($_), not $next[$i], as the
# last printed line - confirm that this is what is intended.
$lastprinted = $_;
}
$lastwascut = 0;
$blankcount = 0;
}
}
# This non-blank line is not preceded by 3 or more blank lines. Output
# any blanks there are, and the line. Remember it. Force two blank lines
# before headings.
else
{
# A heading is any line starting in column one, other than "Last updated"
# and "Copyright" lines. Nothing is forced while $lastprinted is undefined,
# that is, before anything has been printed.
$blankcount = 2 if /^\S/ && !/^Last updated/ && !/^Copyright/ &&
defined($lastprinted);
for ($i = 0; $i < $blankcount; $i++) { print "\n"; }
print;
$lastprinted = $_;
$lastwascut = 0;
$blankcount = 0;
}
}
# End

35
Detrail Executable file
View File

@ -0,0 +1,35 @@
#!/usr/bin/perl
# This is a script for removing trailing whitespace from lines in files that
# are listed on the command line.
# Remove trailing whitespace from every newline-terminated line of one file.
#   $_[0] - path of the file to process
# The file is read into memory and rewritten in place only if at least one
# line changed. A carriage return before the newline counts as whitespace,
# so CRLF line endings become LF. A final line with no newline is left as
# it is. Dies if the file cannot be opened or the rewrite cannot be
# completed.
sub detrail {
    my ($file) = @_;
    my $changed = 0;

    # Three-argument open with lexical handles: the name is used exactly as
    # given, even with leading or trailing spaces or characters such as ">".
    open(my $in, '<', $file) || die "Can't open $file for input";
    my @lines = <$in>;
    close($in);

    foreach my $line (@lines) {
        $changed = 1 if $line =~ s/\s+\n$/\n/;
    }

    if ($changed) {
        open(my $out, '>', $file) || die "Can't open $file for output";
        print {$out} @lines;
        # Buffered write errors only show up at close, so check it.
        close($out) || die "Can't finish writing $file: $!";
    }
}
# Main program: process each file named on the command line, in order.
$, = ""; # Output field separator
foreach my $path (@ARGV)
  {
  detrail($path);
  }
# End

265
PrepareRelease Executable file
View File

@ -0,0 +1,265 @@
#!/bin/sh
# Script to prepare the files for building a PCRE2 release. It does some
# processing of the documentation, detrails files, and creates pcre2.h.generic
# and config.h.generic (for use by builders who can't run ./configure).
# You must run this script before running "make dist". If its first argument
# is "doc", it stops after preparing the documentation. There are no other
# arguments. The script makes use of the following files:
# 132html A Perl script that converts a .1 or .3 man page into HTML. It
# "knows" the relevant troff constructs that are used in the PCRE2
# man pages.
# CheckMan A Perl script that checks man pages for typos in the mark up.
# CleanTxt A Perl script that cleans up the output of "nroff -man" by
# removing backspaces and other redundant text so as to produce
# a readable .txt file.
# Detrail A Perl script that removes trailing spaces from files.
# doc/index.html.src
# A file that is copied as index.html into the doc/html directory
# when the HTML documentation is built. It works like this so that
# doc/html can be deleted and re-created from scratch.
# README & NON-AUTOTOOLS-BUILD
# These files are copied into the doc/html directory, with .txt
# extensions so that they can be hyperlinked from the HTML
# documentation, because some people just go to the HTML without
# looking for text files.
# First, sort out the documentation. Remove pcre2demo.3 first because it won't
# pass the markup check (it is created below, using markup that none of the
# other pages use).
# Everything below works inside doc/. Stop at once if it cannot be entered,
# so that the rm and the file globs never run in the wrong directory.
cd doc || exit 1
echo Processing documentation
/bin/rm -f pcre2demo.3

# Check the remaining man pages; give up if the checker reports a problem.
perl ../CheckMan *.1 *.3 || exit 1
# Build the plain-text documentation. pcre2.txt starts with a fixed banner
# and then receives the cleaned-up nroff output of each listed .3 page,
# with a ruled line after every page. The individual function pages are
# not included.
cat >pcre2.txt <<End
-----------------------------------------------------------------------------
This file contains a concatenation of the PCRE2 man pages, converted to plain
text format for ease of searching with a text editor, or for use on systems
that do not have a man page processor. The small individual files that give
synopses of each function in the library have not been included. Neither has
the pcre2demo program. There are separate text files for the pcre2grep and
pcre2test commands.
-----------------------------------------------------------------------------
End

echo "Making pcre2.txt"

# Alternative page list, currently commented out:
#for file in pcre pcre16 pcre32 pcrebuild pcrematching \
# pcrecompat pcrepattern pcresyntax pcrejit pcrepartial \
# pcreprecompile pcreperform pcreposix pcrecpp pcresample \
# pcrelimits pcrestack ; do
for file in pcre2api pcre2callout pcre2unicode
do
  echo " Processing $file.3"
  nroff -c -man "$file.3" >"$file.rawtxt"
  perl ../CleanTxt <"$file.rawtxt" >>pcre2.txt
  /bin/rm "$file.rawtxt"
  echo "------------------------------------------------------------------------------" >>pcre2.txt
  # Two spacer lines follow every page except pcre2sample.
  case "$file" in
    pcre2sample) ;;
    *) { echo " "; echo " "; } >>pcre2.txt ;;
  esac
done

# The three commands: each gets a text file of its own.
# Alternative command list, currently commented out:
# for file in pcre2test pcre2grep pcre-config ; do
for file in pcre2test
do
  echo "Making $file.txt"
  nroff -c -man "$file.1" >"$file.rawtxt"
  perl ../CleanTxt <"$file.rawtxt" >"$file.txt"
  /bin/rm "$file.rawtxt"
done
# Make pcre2demo.3 from the pcre2demo.c source file. The embedded Perl
# program opens pcre2demo.3 itself, so its standard output is not redirected
# here; the output file is therefore created only after the source file has
# been opened successfully. The here-document delimiter is quoted, so the
# shell passes the Perl text through without expanding anything in it.
echo "Making pcre2demo.3"
perl <<"END"
open(IN, "<", "../src/pcre2demo.c") ||
  die "Failed to open ../src/pcre2demo.c: $!\n";
open(OUT, ">", "pcre2demo.3") || die "Failed to open pcre2demo.3: $!\n";

# Define the .EX and .EE macros, then open the example block.
print OUT ".\\\" Start example.\n" .
          ".de EX\n" .
          ". nr mE \\\\n(.f\n" .
          ". nf\n" .
          ". nh\n" .
          ". ft CW\n" .
          "..\n" .
          ".\n" .
          ".\n" .
          ".\\\" End example.\n" .
          ".de EE\n" .
          ". ft \\\\n(mE\n" .
          ". fi\n" .
          ". hy \\\\n(HY\n" .
          "..\n" .
          ".\n" .
          ".EX\n" ;

# Copy the program, turning each backslash into the troff escape \e.
while (<IN>)
  {
  s/\\/\\e/g;
  print OUT;
  }

print OUT ".EE\n";
close(IN);
close(OUT) || die "Failed to write pcre2demo.3: $!\n";
END
if [ $? != 0 ] ; then exit 1; fi
# Make HTML form of the documentation. The html directory is created if it
# is missing and emptied otherwise (-f keeps rm quiet when it is already
# empty), then the fixed files are copied in.
echo "Making HTML documentation"
mkdir -p html || exit 1
/bin/rm -f html/*
cp index.html.src html/index.html
cp ../README html/README.txt
# cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt

# Command pages always get a table of contents. Stop if a conversion fails,
# exactly as for the library pages below.
for file in *.1 ; do
  base=`basename "$file" .1`
  echo " Making $base.html"
  perl ../132html -toc "$base" <"$file" >"html/$base.html"
  if [ $? != 0 ] ; then exit 1; fi
done

# Library pages. Exclude the table of contents for function summaries (any
# page whose name contains an underscore) and for the small pages that have
# only one section. $toc is deliberately unquoted so that an empty value
# passes no argument at all.
for file in *.3 ; do
  base=`basename "$file" .3`
  case "$base" in
    *_* | pcre2sample | pcre2stack | pcre2compat | pcre2limits | \
    pcre2perform | pcre2unicode)
      toc="" ;;
    *)
      toc=-toc ;;
  esac
  echo " Making $base.html"
  perl ../132html $toc "$base" <"$file" >"html/$base.html"
  if [ $? != 0 ] ; then exit 1; fi
done
# Documentation is complete: go back to the top-level directory. When the
# first argument is "doc" that is all that was asked for.
cd ..
echo Documentation done
case "$1" in
  doc) exit ;;
esac

# FIXME pro tem only do docs
exit
# These files are detrailed; do not detrail the test data because there may be
# significant trailing spaces. Do not detrail RunTest.bat, because it has CRLF
# line endings and the detrail script removes all trailing white space. The
# configure files are also omitted from the detrailing. We don't bother with
# those pcre[16|32]_xx files that just define COMPILE_PCRE16 and then #include the
# common file, because they aren't going to change.
files="\
Makefile.am \
Makefile.in \
configure.ac \
README \
LICENCE \
COPYING \
AUTHORS \
NEWS \
NON-UNIX-USE \
NON-AUTOTOOLS-BUILD \
INSTALL \
132html \
CleanTxt \
Detrail \
ChangeLog \
CMakeLists.txt \
RunGrepTest \
RunTest \
pcre-config.in \
libpcre.pc.in \
libpcre16.pc.in \
libpcre32.pc.in \
libpcreposix.pc.in \
libpcrecpp.pc.in \
config.h.in \
pcre_chartables.c.dist \
pcredemo.c \
pcregrep.c \
pcretest.c \
dftables.c \
pcreposix.c \
pcreposix.h \
pcre.h.in \
pcre_internal.h \
pcre_byte_order.c \
pcre_compile.c \
pcre_config.c \
pcre_dfa_exec.c \
pcre_exec.c \
pcre_fullinfo.c \
pcre_get.c \
pcre_globals.c \
pcre_jit_compile.c \
pcre_jit_test.c \
pcre_maketables.c \
pcre_newline.c \
pcre_ord2utf8.c \
pcre16_ord2utf16.c \
pcre32_ord2utf32.c \
pcre_printint.c \
pcre_refcount.c \
pcre_string_utils.c \
pcre_study.c \
pcre_tables.c \
pcre_valid_utf8.c \
pcre_version.c \
pcre_xclass.c \
pcre16_utf16_utils.c \
pcre32_utf32_utils.c \
pcre16_valid_utf16.c \
pcre32_valid_utf32.c \
pcre_scanner.cc \
pcre_scanner.h \
pcre_scanner_unittest.cc \
pcrecpp.cc \
pcrecpp.h \
pcrecpparg.h.in \
pcrecpp_unittest.cc \
pcre_stringpiece.cc \
pcre_stringpiece.h.in \
pcre_stringpiece_unittest.cc \
perltest.pl \
ucp.h \
makevp.bat \
pcre.def \
libpcre.def \
libpcreposix.def"
echo Detrailing
perl ./Detrail $files doc/p* doc/html/*
echo Done
#End

1
doc/html/README.txt Normal file
View File

@ -0,0 +1 @@
This is a placeholder README file for a work in progress.

177
doc/html/index.html Normal file
View File

@ -0,0 +1,177 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE2 documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE2 specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1>
<p>
The HTML documentation for PCRE2 consists of a number of pages that are listed
below in alphabetical order. If you are new to PCRE2, please read the first one
first.
</p>
<table>
<tr><td><a href="pcre2.html">pcre2</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre2-config.html">pcre2-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcre2api.html">pcre2api</a></td>
<td>&nbsp;&nbsp;PCRE2's native API</td></tr>
<tr><td><a href="pcre2build.html">pcre2build</a></td>
<td>&nbsp;&nbsp;Building PCRE2</td></tr>
<tr><td><a href="pcre2callout.html">pcre2callout</a></td>
<td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
<tr><td><a href="pcre2compat.html">pcre2compat</a></td>
<td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
<tr><td><a href="pcre2demo.html">pcre2demo</a></td>
<td>&nbsp;&nbsp;A demonstration C program that uses the PCRE2 library</td></tr>
<tr><td><a href="pcre2grep.html">pcre2grep</a></td>
<td>&nbsp;&nbsp;The <b>pcre2grep</b> command</td></tr>
<tr><td><a href="pcre2jit.html">pcre2jit</a></td>
<td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
<tr><td><a href="pcre2limits.html">pcre2limits</a></td>
<td>&nbsp;&nbsp;Details of size and other limits</td></tr>
<tr><td><a href="pcre2matching.html">pcre2matching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
<tr><td><a href="pcre2partial.html">pcre2partial</a></td>
<td>&nbsp;&nbsp;Using PCRE2 for partial matching</td></tr>
<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td>
<td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE2</td></tr>
<tr><td><a href="pcre2perform.html">pcre2perform</a></td>
<td>&nbsp;&nbsp;Some comments on performance</td></tr>
<tr><td><a href="pcre2posix.html">pcre2posix</a></td>
<td>&nbsp;&nbsp;The POSIX API to the PCRE2 8-bit library</td></tr>
<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td>
<td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
<tr><td><a href="pcre2sample.html">pcre2sample</a></td>
<td>&nbsp;&nbsp;Discussion of the pcre2demo program</td></tr>
<tr><td><a href="pcre2stack.html">pcre2stack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE2's stack usage</td></tr>
<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcre2test.html">pcre2test</a></td>
<td>&nbsp;&nbsp;The <b>pcre2test</b> command for testing PCRE2</td></tr>
<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td>
<td>&nbsp;&nbsp;Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr>
</table>
<p>
There are also individual pages that summarize the interface for each function
in the library. There is a single page for each triple of 8-bit/16-bit/32-bit
functions.
</p>
<table>
<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td>
<td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td>
<td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
<tr><td><a href="pcre2_config.html">pcre2_config</a></td>
<td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td>
<td>&nbsp;&nbsp;Free study data</td></tr>
<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td>
<td>&nbsp;&nbsp;Free extracted substring</td></tr>
<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td>
<td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td>
<td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td>
<td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td>
<td>&nbsp;&nbsp;Find table entries for given string name</td></tr>
<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td>
<td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td>
<td>&nbsp;&nbsp;Fast path interface to JIT matching</td></tr>
<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td>
<td>&nbsp;&nbsp;Create a stack for JIT matching</td></tr>
<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td>
<td>&nbsp;&nbsp;Free a JIT matching stack</td></tr>
<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert compiled pattern to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
<tr><td><a href="pcre2_study.html">pcre2_study</a></td>
<td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert UTF-16 string to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert UTF-32 string to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_version.html">pcre2_version</a></td>
<td>&nbsp;&nbsp;Return PCRE2 version and release date</td></tr>
</table>
</body>
</html>

2659
doc/html/pcre2api.html Normal file

File diff suppressed because it is too large Load Diff

270
doc/html/pcre2callout.html Normal file
View File

@ -0,0 +1,270 @@
<html>
<head>
<title>pcre2callout specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre2callout man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<ul>
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
<li><a name="TOC3" href="#SEC3">MISSING CALLOUTS</a>
<li><a name="TOC4" href="#SEC4">THE CALLOUT INTERFACE</a>
<li><a name="TOC5" href="#SEC5">RETURN VALUES</a>
<li><a name="TOC6" href="#SEC6">AUTHOR</a>
<li><a name="TOC7" href="#SEC7">REVISION</a>
</ul>
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>#include &#60;pcre2.h&#62;</b>
</P>
<P>
<b>int (*pcre2_callout)(pcre2_callout_block *);</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
PCRE2 provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE2 in the middle of pattern matching. The
caller of PCRE2 provides an external function by putting its entry point in
a match context (see <b>pcre2_set_callout()</b> in the
<a href="pcre2api.html"><b>pcre2api</b></a>
documentation).
</P>
<P>
Within a regular expression, (?C) indicates the points at which the external
function is to be called. Different callout points can be identified by putting
a number less than 256 after the letter C. The default value is zero.
For example, this pattern has two callout points:
<pre>
(?C1)abc(?C2)def
</pre>
If the PCRE2_AUTO_CALLOUT option bit is set when a pattern is compiled, PCRE2
automatically inserts callouts, all with number 255, before each item in the
pattern. For example, if PCRE2_AUTO_CALLOUT is used with the pattern
<pre>
A(\d{2}|--)
</pre>
it is processed as if it were
<br>
<br>
(?C255)A(?C255)((?C255)\d{2}(?C255)|(?C255)-(?C255)-(?C255))(?C255)
<br>
<br>
Notice that there is a callout before and after each parenthesis and
alternation bar. If the pattern contains a conditional group whose condition is
an assertion, an automatic callout is inserted immediately before the
condition. Such a callout may also be inserted explicitly, for example:
<pre>
(?(?C9)(?=a)ab|de)
</pre>
This applies only to assertion conditions (because they are themselves
independent groups).
</P>
<P>
Automatic callouts can be used for tracking the progress of pattern matching.
The
<a href="pcre2test.html"><b>pcre2test</b></a>
program has a pattern qualifier (/auto_callout) that sets automatic callouts;
when it is used, the output indicates how the pattern is being matched. This is
useful information when you are trying to optimize the performance of a
particular pattern.
</P>
<br><a name="SEC3" href="#TOC1">MISSING CALLOUTS</a><br>
<P>
You should be aware that, because of optimizations in the way PCRE2 compiles
and matches patterns, callouts sometimes do not happen exactly as you might
expect.
</P>
<P>
At compile time, PCRE2 "auto-possessifies" repeated items when it knows that
what follows cannot be part of the repeat. For example, a+[bc] is compiled as
if it were a++[bc]. The <b>pcre2test</b> output when this pattern is anchored
and then applied with automatic callouts to the string "aaaa" is:
<pre>
---&#62;aaaa
+0 ^ ^
+1 ^ a+
+3 ^ ^ [bc]
No match
</pre>
This indicates that when matching [bc] fails, there is no backtracking into a+
and therefore the callouts that would be taken for the backtracks do not occur.
You can disable the auto-possessify feature by passing PCRE2_NO_AUTO_POSSESS
to <b>pcre2_compile()</b>, or starting the pattern with (*NO_AUTO_POSSESS). If
this is done in <b>pcre2test</b> (using the /no_auto_possess qualifier), the
output changes to this:
<pre>
---&#62;aaaa
+0 ^ ^
+1 ^ a+
+3 ^ ^ [bc]
+3 ^ ^ [bc]
+3 ^ ^ [bc]
+3 ^^ [bc]
No match
</pre>
This time, when matching [bc] fails, the matcher backtracks into a+ and tries
again, repeatedly, until a+ itself fails.
</P>
<P>
Other optimizations that provide fast "no match" results also affect callouts.
For example, if the pattern is
<pre>
ab(?C4)cd
</pre>
PCRE2 knows that any matching string must contain the letter "d". If the
subject string is "abyz", the lack of "d" means that matching doesn't ever
start, and the callout is never reached. However, with "abyd", though the
result is still no match, the callout is obeyed.
</P>
<P>
PCRE2 also knows the minimum length of a matching string, and will immediately
give a "no match" return without actually running a match if the subject is not
long enough, or, for unanchored patterns, if it has been scanned far enough.
</P>
<P>
You can disable these optimizations by passing the PCRE2_NO_START_OPTIMIZE
option to the matching function, or by starting the pattern with
(*NO_START_OPT). This slows down the matching process, but does ensure that
callouts such as the example above are obeyed.
</P>
<br><a name="SEC4" href="#TOC1">THE CALLOUT INTERFACE</a><br>
<P>
During matching, when PCRE2 reaches a callout point, the external function that
is set in the match context is called (if it is set). This applies to both
normal and DFA matching. The only argument to the callout function is a pointer
to a <b>pcre2_callout</b> block. This structure contains the following fields:
<pre>
uint32_t <i>version</i>;
uint32_t <i>callout_number</i>;
uint32_t <i>capture_top</i>;
uint32_t <i>capture_last</i>;
void *<i>callout_data</i>;
PCRE2_SIZE *<i>offset_vector</i>;
PCRE2_SPTR <i>mark</i>;
PCRE2_SPTR <i>subject</i>;
PCRE2_SIZE <i>subject_length</i>;
PCRE2_SIZE <i>start_match</i>;
PCRE2_SIZE <i>current_position</i>;
PCRE2_SIZE <i>pattern_position</i>;
PCRE2_SIZE <i>next_item_length</i>;
</pre>
The <i>version</i> field contains the version number of the block format. The
current version is 0. The version number will change in future if additional
fields are added, but the intention is never to remove any of the existing
fields.
</P>
<P>
The <i>callout_number</i> field contains the number of the callout, as compiled
into the pattern (that is, the number after ?C for manual callouts, and 255 for
automatically generated callouts).
</P>
<P>
The <i>offset_vector</i> field is a pointer to the vector of capturing offsets
(the "ovector") that was passed to the matching function in the match data
block. When <b>pcre2_match()</b> is used, the contents can be inspected, in
order to extract substrings that have been matched so far, in the same way as
for extracting substrings after a match has completed. For the DFA matching
function, this field is not useful.
</P>
<P>
The <i>subject</i> and <i>subject_length</i> fields contain copies of the values
that were passed to the matching function.
</P>
<P>
The <i>start_match</i> field normally contains the offset within the subject at
which the current match attempt started. However, if the escape sequence \K
has been encountered, this value is changed to reflect the modified starting
point. If the pattern is not anchored, the callout function may be called
several times from the same point in the pattern for different starting points
in the subject.
</P>
<P>
The <i>current_position</i> field contains the offset within the subject of the
current match pointer.
</P>
<P>
When <b>pcre2_match()</b> is used, the <i>capture_top</i> field contains one
more than the number of the highest numbered captured substring so far. If no
substrings have been captured, the value of <i>capture_top</i> is one. This is
always the case when the DFA functions are used, because they do not support
captured substrings.
</P>
<P>
The <i>capture_last</i> field contains the number of the most recently captured
substring. However, when a recursion exits, the value reverts to what it was
outside the recursion, as do the values of all captured substrings. If no
substrings have been captured, the value of <i>capture_last</i> is 0. This is
always the case for the DFA matching functions.
</P>
<P>
The <i>callout_data</i> field contains a value that is passed to a matching
function specifically so that it can be passed back in callouts. It is set in
the match context when the callout is set up by calling
<b>pcre2_set_callout()</b> (see the
<a href="pcre2api.html"><b>pcre2api</b></a>
documentation).
</P>
<P>
The <i>pattern_position</i> field contains the offset to the next item to be
matched in the pattern string.
</P>
<P>
The <i>next_item_length</i> field contains the length of the next item to be
matched in the pattern string. When the callout immediately precedes an
alternation bar, a closing parenthesis, or the end of the pattern, the length
is zero. When the callout precedes an opening parenthesis, the length is that
of the entire subpattern.
</P>
<P>
The <i>pattern_position</i> and <i>next_item_length</i> fields are intended to
help in distinguishing between different automatic callouts, which all have the
same callout number. However, they are set for all callouts.
</P>
<P>
In callouts from <b>pcre2_match()</b> the <i>mark</i> field contains a pointer to
the zero-terminated name of the most recently passed (*MARK), (*PRUNE), or
(*THEN) item in the match, or NULL if no such items have been passed. Instances
of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In
callouts from the DFA matching function this field always contains NULL.
</P>
<br><a name="SEC5" href="#TOC1">RETURN VALUES</a><br>
<P>
The external callout function returns an integer to PCRE2. If the value is
zero, matching proceeds as normal. If the value is greater than zero, matching
fails at the current point, but the testing of other matching possibilities
goes ahead, just as if a lookahead assertion had failed. If the value is less
than zero, the match is abandoned, and the matching function returns the
negative value.
</P>
<P>
Negative values should normally be chosen from the set of PCRE2_ERROR_xxx
values. In particular, PCRE2_ERROR_NOMATCH forces a standard "no match"
failure. The error number PCRE2_ERROR_CALLOUT is reserved for use by callout
functions; it will never be used by PCRE2 itself.
</P>
<br><a name="SEC6" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><a name="SEC7" href="#TOC1">REVISION</a><br>
<P>
Last updated: 19 October 2014
<br>
Copyright &copy; 1997-2014 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>

443
doc/html/pcre2demo.html Normal file
View File

@ -0,0 +1,443 @@
<html>
<head>
<title>pcre2demo specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre2demo man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<ul>
</ul>
<PRE>
/*************************************************
* PCRE2 DEMONSTRATION PROGRAM *
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE2 are installed on your system. Only some operating
systems (Solaris is one) use the -R option.
Building under Windows:
If you want to statically link this program against a non-dll .a file, you must
define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
#define PCRE2_CODE_UNIT_WIDTH 8
#include &lt;stdio.h&gt;
#include &lt;string.h&gt;
#include &lt;pcre2.h&gt;
/**************************************************************************
* Here is the program. The API includes the concept of "contexts" for *
* setting up unusual interface requirements for compiling and matching, *
* such as custom memory managers and non-standard newline definitions. *
* This program does not do any of this, so it makes no use of contexts, *
* always passing NULL where a context could be given. *
**************************************************************************/
int main(int argc, char **argv)
{
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t newline;
PCRE2_SIZE erroroffset;
PCRE2_SIZE *ovector;
size_t subject_length;
pcre2_match_data *match_data;
/**************************************************************************
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
**************************************************************************/
find_all = 0;
for (i = 1; i &lt; argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\n");
return 1;
}
/* As pattern and subject are char arguments, they can be straightforwardly
cast to PCRE2_SPTR as we are working in 8-bit code units. */
pattern = (PCRE2_SPTR)argv[i];
subject = (PCRE2_SPTR)argv[i+1];
subject_length = strlen((char *)subject);
/*************************************************************************
* Now we are going to compile the regular expression pattern, and handle *
* any errors that are detected. *
*************************************************************************/
re = pcre2_compile(
pattern, /* the pattern */
-1, /* indicates pattern is zero-terminated */
0, /* default options */
&amp;errornumber, /* for error number */
&amp;erroroffset, /* for error offset */
NULL); /* use default compile context */
/* Compilation failed: print the error message and exit. */
if (re == NULL)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
buffer);
return 1;
}
/*************************************************************************
* If the compilation succeeded, we call PCRE again, in order to do a *
* pattern match against the subject string. This does just ONE match. If *
* further matching is needed, it will be done below. Before running the *
* match we must set up a match_data block for holding the result. *
*************************************************************************/
/* Using this function ensures that the block is exactly the right size for
the number of capturing parentheses in the pattern. */
match_data = pcre2_match_data_create_from_pattern(re, NULL);
rc = pcre2_match(
re, /* the compiled pattern */
subject, /* the subject string */
subject_length, /* the length of the subject */
0, /* start at offset 0 in the subject */
0, /* default options */
match_data, /* block for storing the result */
NULL); /* use default match context */
/* Matching failed: handle error cases */
if (rc &lt; 0)
{
switch(rc)
{
case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
/*
Handle other special cases if you like
*/
default: printf("Matching error %d\n", rc); break;
}
pcre2_match_data_free(match_data); /* Release memory used for the match */
pcre2_code_free(re); /* data and the compiled pattern. */
return 1;
}
/* Match succeeded. Get a pointer to the output vector, where string offsets are
stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\nMatch succeeded at offset %d\n", (int)ovector[0]);
/*************************************************************************
* We have found the first match within the subject string. If the output *
* vector wasn't big enough, say so. Then output any substrings that were *
* captured. *
*************************************************************************/
/* The output vector wasn't big enough. This should not happen, because we used
pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
for (i = 0; i &lt; rc; i++)
{
PCRE2_SPTR substring_start = subject + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
/**************************************************************************
* That concludes the basic part of this demonstration program. We have *
* compiled a pattern, and performed a single match. The code that follows *
* shows first how to access named substrings, and then how to code for *
* repeated matches on the same subject. *
**************************************************************************/
/* See if there are any named substrings, and if so, show them by name. First
we have to extract the count of named parentheses from the pattern. */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&amp;namecount); /* where to put the answer */
if (namecount &lt;= 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\n");
/* Before we can access the substrings, we must extract the table for
translating names to numbers, and the size of each entry in the table. */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMETABLE, /* address of the table */
&amp;name_table); /* where to put the answer */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
&amp;name_entry_size); /* where to put the answer */
/* Now we can scan the table and, for each entry, print the number, the name,
and the substring itself. In the 8-bit library the number is held in two
bytes, most significant first. */
tabptr = name_table;
for (i = 0; i &lt; namecount; i++)
{
int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
(int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
tabptr += name_entry_size;
}
}
/*************************************************************************
* If the "-g" option was given on the command line, we want to continue *
* to search for additional matches in the subject string, in a similar *
* way to the /g option in Perl. This turns out to be trickier than you *
* might think because of the possibility of matching an empty string. *
* What happens is as follows: *
* *
* If the previous match was NOT for an empty string, we can just start *
* the next match at the end of the previous one. *
* *
* If the previous match WAS for an empty string, we can't do that, as it *
* would lead to an infinite loop. Instead, a call of pcre2_match() is *
* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
* first of these tells PCRE2 that an empty string at the start of the *
* subject is not a valid match; other possibilities must be tried. The *
* second flag restricts PCRE2 to one match attempt at the initial string *
* position. If this match succeeds, an alternative to the empty string *
* match has been found, and we can print it and proceed round the loop, *
* advancing by the length of whatever was found. If this match does not *
* succeed, we still stay in the loop, advancing by just one character. *
* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
* more than one byte. *
* *
* However, there is a complication concerned with newlines. When the *
* newline convention is such that CRLF is a valid newline, we must *
* advance by two characters rather than one. The newline convention can *
* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/
if (!find_all) /* Check for -g */
{
pcre2_match_data_free(match_data); /* Release the memory that was used */
pcre2_code_free(re); /* for the match data and the pattern. */
return 0; /* Exit the program. */
}
/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
sequence. First, find the options with which the regex was compiled and extract
the UTF state. */
(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &amp;option_bits);
utf8 = (option_bits &amp; PCRE2_UTF) != 0;
/* Now find the newline convention and see whether CRLF is a valid newline
sequence. */
(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &amp;newline);
crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
newline == PCRE2_NEWLINE_CRLF ||
newline == PCRE2_NEWLINE_ANYCRLF;
/* Loop for second and subsequent matches */
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
same point to see if a non-empty match can be found. */
if (ovector[0] == ovector[1])
{
if (ovector[0] == subject_length) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
/* Run the next matching operation */
rc = pcre2_match(
re, /* the compiled pattern */
subject, /* the subject string */
subject_length, /* the length of the subject */
start_offset, /* starting offset in the subject */
options, /* options */
match_data, /* block for storing the result */
NULL); /* use default match context */
/* This time, a result of NOMATCH isn't an error. If the value in "options"
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
Perl does: advance the matching position by one character, and continue. We
do this by setting the "end of previous match" offset, because that is picked
up at the top of the loop as the point at which to start again.
There are two complications: (a) When CRLF is a valid newline sequence, and
the current position is just before it, advance by an extra byte. (b)
Otherwise we must ensure that we skip an entire UTF character if we are in
UTF mode. */
if (rc == PCRE2_ERROR_NOMATCH)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline &amp;&amp; /* If CRLF is newline &amp; */
start_offset &lt; subject_length - 1 &amp;&amp; /* we are at CRLF, */
subject[start_offset] == '\r' &amp;&amp;
subject[start_offset + 1] == '\n')
ovector[1] += 1; /* Advance by one more. */
else if (utf8) /* Otherwise, ensure we */
{ /* advance a whole UTF-8 */
while (ovector[1] &lt; subject_length) /* character. */
{
if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
ovector[1] += 1;
}
}
continue; /* Go round the loop again */
}
/* Other matching errors are not recoverable. */
if (rc &lt; 0)
{
printf("Matching error %d\n", rc);
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 1;
}
/* Match succeeded */
printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
/* The match succeeded, but the output vector wasn't big enough. This
should not happen. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
for (i = 0; i &lt; rc; i++)
{
PCRE2_SPTR substring_start = subject + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
if (namecount &lt;= 0) printf("No named substrings\n"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\n");
for (i = 0; i &lt; namecount; i++)
{
int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
(int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
tabptr += name_entry_size;
}
}
} /* End of loop to find second and subsequent matches */
printf("\n");
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 0;
}
/* End of pcre2demo.c */
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>

1199
doc/html/pcre2test.html Normal file

File diff suppressed because it is too large Load Diff

270
doc/html/pcre2unicode.html Normal file
View File

@ -0,0 +1,270 @@
<html>
<head>
<title>pcre2unicode specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcre2unicode man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
<br><b>
UNICODE AND UTF SUPPORT
</b><br>
<P>
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
format (depending on the code unit width). By default, PCRE2 assumes that one
code unit is one character. To process a pattern as a UTF string, where a
character may require more than one code unit, you must call
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
with the PCRE2_UTF option flag, or the pattern must start with the sequence
(*UTF). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF strings instead of
strings of individual one-code-unit characters.
</P>
<P>
If you build PCRE2 with Unicode support, the library will be bigger, but the
additional run time overhead is limited to testing the PCRE2_UTF flag
occasionally, so should not be very much.
</P>
<br><b>
UNICODE PROPERTY SUPPORT
</b><br>
<P>
When PCRE2 is built with Unicode support, the escape sequences \p{..},
\P{..}, and \X can be used. The Unicode properties that can be tested are
limited to the general category properties such as Lu for an upper case letter
or Nd for a decimal number, the Unicode script names such as Arabic or Han, and
the derived properties Any and L&amp;. Full lists are given in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
and
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
documentation. Only the short names for properties are supported. For example,
\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE2 does not support this.
</P>
<br><b>
WIDE CHARACTERS AND UTF MODES
</b><br>
<P>
Codepoints less than 256 can be specified in patterns by either braced or
unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3). Larger
values have to use braced sequences. Unbraced octal code points up to \777 are
also recognized; larger ones can be coded using \o{...}.
</P>
<P>
In UTF modes, repeat quantifiers apply to complete UTF characters, not to
individual code units.
</P>
<P>
In UTF modes, the dot metacharacter matches one UTF character instead of a
single code unit.
</P>
<P>
The escape sequence \C can be used to match a single code unit, in a UTF mode,
but its use can lead to some strange effects because it breaks up multi-unit
characters (see the description of \C in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
documentation). The use of \C is not supported in the alternative matching
function <b>pcre2_dfa_match()</b>, nor is it supported in UTF mode by the JIT
optimization. If JIT optimization is requested for a UTF pattern that contains
\C, it will not succeed, and so the matching will be carried out by the normal
interpretive function.
</P>
<P>
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
characters of any code value, but, by default, the characters that PCRE2
recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with code points less than 256. This remains true even when
PCRE2 is built to include Unicode support, because to do otherwise would slow
down matching in many common cases. Note that this also applies to \b
and \B, because they are defined in terms of \w and \W. If you want
to test for a wider sense of, say, "digit", you can use explicit Unicode
property tests such as \p{Nd}. Alternatively, if you set the PCRE2_UCP option,
the way that the character escapes work is changed so that Unicode properties
are used to determine which characters match. There are more details in the
section on
<a href="pcre2pattern.html#genericchartypes">generic character types</a>
in the
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
documentation.
</P>
<P>
Similarly, characters that match the POSIX named character classes are all
low-valued characters, unless the PCRE2_UCP option is set.
</P>
<P>
However, the special horizontal and vertical white space matching escapes (\h,
\H, \v, and \V) do match all the appropriate Unicode characters, whether or
not PCRE2_UCP is set.
</P>
<P>
Case-insensitive matching in UTF mode makes use of Unicode properties. A few
Unicode characters such as Greek sigma have more than two codepoints that are
case-equivalent, and these are treated as such.
</P>
<br><b>
VALIDITY OF UTF STRINGS
</b><br>
<P>
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions.
If an invalid UTF string is passed, an error return is given.
</P>
<P>
UTF-16 and UTF-32 strings can indicate their endianness by a special code known
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
strings to be in host byte order.
</P>
<P>
The entire string is checked before any other processing takes place. In
addition to checking the format of the string, there is a check to ensure that
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
The so-called "non-character" code points are not excluded because Unicode
corrigendum #9 makes it clear that they should not be.
</P>
<P>
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
where they are used in pairs to encode code points with values greater than
0xFFFF. The code points that are encoded by UTF-16 pairs are available
independently in the UTF-8 and UTF-32 encodings. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and
UTF-32.)
</P>
<P>
In some situations, you may already know that your strings are valid, and
therefore want to skip these checks in order to improve performance, for
example in the case of a long subject string that is being scanned repeatedly.
If you set the PCRE2_NO_UTF_CHECK flag at compile time or at run time, PCRE2
assumes that the pattern or subject it is given (respectively) contains only
valid UTF code unit sequences.
</P>
<P>
Passing PCRE2_NO_UTF_CHECK to <b>pcre2_compile()</b> just disables the check for
the pattern; it does not also apply to subject strings. If you want to disable
the check for a subject string you must pass this option to <b>pcre2_match()</b>
or <b>pcre2_dfa_match()</b>.
</P>
<P>
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
is undefined and your program may crash or loop indefinitely.
<a name="utf8strings"></a></P>
<br><b>
Errors in UTF-8 strings
</b><br>
<P>
The following negative error codes are given for invalid UTF-8 strings:
<pre>
PCRE2_ERROR_UTF8_ERR1
PCRE2_ERROR_UTF8_ERR2
PCRE2_ERROR_UTF8_ERR3
PCRE2_ERROR_UTF8_ERR4
PCRE2_ERROR_UTF8_ERR5
</pre>
The string ends with a truncated UTF-8 character; the code specifies how many
bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
allows for up to 6 bytes, and this is checked first; hence the possibility of
4 or 5 missing bytes.
<pre>
PCRE2_ERROR_UTF8_ERR6
PCRE2_ERROR_UTF8_ERR7
PCRE2_ERROR_UTF8_ERR8
PCRE2_ERROR_UTF8_ERR9
PCRE2_ERROR_UTF8_ERR10
</pre>
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
character do not have the binary value 0b10 (that is, either the most
significant bit is 0, or the next bit is 1).
<pre>
PCRE2_ERROR_UTF8_ERR11
PCRE2_ERROR_UTF8_ERR12
</pre>
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
these code points are excluded by RFC 3629.
<pre>
PCRE2_ERROR_UTF8_ERR13
</pre>
A 4-byte character has a value greater than 0x10ffff; these code points are
excluded by RFC 3629.
<pre>
PCRE2_ERROR_UTF8_ERR14
</pre>
A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
code points is reserved by RFC 3629 for use with UTF-16, and so is excluded
from UTF-8.
<pre>
PCRE2_ERROR_UTF8_ERR15
PCRE2_ERROR_UTF8_ERR16
PCRE2_ERROR_UTF8_ERR17
PCRE2_ERROR_UTF8_ERR18
PCRE2_ERROR_UTF8_ERR19
</pre>
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
value that can be represented by fewer bytes, which is invalid. For example,
the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
one byte.
<pre>
PCRE2_ERROR_UTF8_ERR20
</pre>
The two most significant bits of the first byte of a character have the binary
value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
byte can only validly occur as the second or subsequent byte of a multi-byte
character.
<pre>
PCRE2_ERROR_UTF8_ERR21
</pre>
The first byte of a character has the value 0xfe or 0xff. These values can
never occur in a valid UTF-8 string.
<a name="utf16strings"></a></P>
<br><b>
Errors in UTF-16 strings
</b><br>
<P>
The following negative error codes are given for invalid UTF-16 strings:
<pre>
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate
PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
<a name="utf32strings"></a></PRE>
</P>
<br><b>
Errors in UTF-32 strings
</b><br>
<P>
The following negative error codes are given for invalid UTF-32 strings:
<pre>
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff
</PRE>
</P>
<br><b>
AUTHOR
</b><br>
<P>
Philip Hazel
<br>
University Computing Service
<br>
Cambridge CB2 3QH, England.
<br>
</P>
<br><b>
REVISION
</b><br>
<P>
Last updated: 16 September 2014
<br>
Copyright &copy; 1997-2014 University of Cambridge.
<br>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>

177
doc/index.html.src Normal file
View File

@ -0,0 +1,177 @@
<html>
<!-- This is a manually maintained file that is the root of the HTML version of
the PCRE2 documentation. When the HTML documents are built from the man
page versions, the entire doc/html directory is emptied, this file is then
copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
-->
<head>
<title>PCRE2 specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>Perl-compatible Regular Expressions (revised API: PCRE2)</h1>
<p>
The HTML documentation for PCRE2 consists of a number of pages that are listed
below in alphabetical order. If you are new to PCRE2, please read the first one
first.
</p>
<table>
<tr><td><a href="pcre2.html">pcre2</a></td>
<td>&nbsp;&nbsp;Introductory page</td></tr>
<tr><td><a href="pcre2-config.html">pcre2-config</a></td>
<td>&nbsp;&nbsp;Information about the installation configuration</td></tr>
<tr><td><a href="pcre2api.html">pcre2api</a></td>
<td>&nbsp;&nbsp;PCRE2's native API</td></tr>
<tr><td><a href="pcre2build.html">pcre2build</a></td>
<td>&nbsp;&nbsp;Building PCRE2</td></tr>
<tr><td><a href="pcre2callout.html">pcre2callout</a></td>
<td>&nbsp;&nbsp;The <i>callout</i> facility</td></tr>
<tr><td><a href="pcre2compat.html">pcre2compat</a></td>
<td>&nbsp;&nbsp;Compatibility with Perl</td></tr>
<tr><td><a href="pcre2demo.html">pcre2demo</a></td>
<td>&nbsp;&nbsp;A demonstration C program that uses the PCRE2 library</td></tr>
<tr><td><a href="pcre2grep.html">pcre2grep</a></td>
<td>&nbsp;&nbsp;The <b>pcre2grep</b> command</td></tr>
<tr><td><a href="pcre2jit.html">pcre2jit</a></td>
<td>&nbsp;&nbsp;Discussion of the just-in-time optimization support</td></tr>
<tr><td><a href="pcre2limits.html">pcre2limits</a></td>
<td>&nbsp;&nbsp;Details of size and other limits</td></tr>
<tr><td><a href="pcre2matching.html">pcre2matching</a></td>
<td>&nbsp;&nbsp;Discussion of the two matching algorithms</td></tr>
<tr><td><a href="pcre2partial.html">pcre2partial</a></td>
<td>&nbsp;&nbsp;Using PCRE2 for partial matching</td></tr>
<tr><td><a href="pcre2pattern.html">pcre2pattern</a></td>
<td>&nbsp;&nbsp;Specification of the regular expressions supported by PCRE2</td></tr>
<tr><td><a href="pcre2perform.html">pcre2perform</a></td>
<td>&nbsp;&nbsp;Some comments on performance</td></tr>
<tr><td><a href="pcre2posix.html">pcre2posix</a></td>
<td>&nbsp;&nbsp;The POSIX API to the PCRE2 8-bit library</td></tr>
<tr><td><a href="pcre2precompile.html">pcre2precompile</a></td>
<td>&nbsp;&nbsp;How to save and re-use compiled patterns</td></tr>
<tr><td><a href="pcre2sample.html">pcre2sample</a></td>
<td>&nbsp;&nbsp;Discussion of the pcre2demo program</td></tr>
<tr><td><a href="pcre2stack.html">pcre2stack</a></td>
<td>&nbsp;&nbsp;Discussion of PCRE2's stack usage</td></tr>
<tr><td><a href="pcre2syntax.html">pcre2syntax</a></td>
<td>&nbsp;&nbsp;Syntax quick-reference summary</td></tr>
<tr><td><a href="pcre2test.html">pcre2test</a></td>
<td>&nbsp;&nbsp;The <b>pcre2test</b> command for testing PCRE2</td></tr>
<tr><td><a href="pcre2unicode.html">pcre2unicode</a></td>
<td>&nbsp;&nbsp;Discussion of Unicode and UTF-8/UTF-16/UTF-32 support</td></tr>
</table>
<p>
There are also individual pages that summarize the interface for each function
in the library. There is a single page for each triple of 8-bit/16-bit/32-bit
functions.
</p>
<table>
<tr><td><a href="pcre2_assign_jit_stack.html">pcre2_assign_jit_stack</a></td>
<td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
<tr><td><a href="pcre2_compile.html">pcre2_compile</a></td>
<td>&nbsp;&nbsp;Compile a regular expression</td></tr>
<tr><td><a href="pcre2_compile2.html">pcre2_compile2</a></td>
<td>&nbsp;&nbsp;Compile a regular expression (alternate interface)</td></tr>
<tr><td><a href="pcre2_config.html">pcre2_config</a></td>
<td>&nbsp;&nbsp;Show build-time configuration options</td></tr>
<tr><td><a href="pcre2_copy_named_substring.html">pcre2_copy_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into given buffer</td></tr>
<tr><td><a href="pcre2_copy_substring.html">pcre2_copy_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into given buffer</td></tr>
<tr><td><a href="pcre2_dfa_exec.html">pcre2_dfa_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(DFA algorithm; <i>not</i> Perl compatible)</td></tr>
<tr><td><a href="pcre2_exec.html">pcre2_exec</a></td>
<td>&nbsp;&nbsp;Match a compiled pattern to a subject string
(Perl compatible)</td></tr>
<tr><td><a href="pcre2_free_study.html">pcre2_free_study</a></td>
<td>&nbsp;&nbsp;Free study data</td></tr>
<tr><td><a href="pcre2_free_substring.html">pcre2_free_substring</a></td>
<td>&nbsp;&nbsp;Free extracted substring</td></tr>
<tr><td><a href="pcre2_free_substring_list.html">pcre2_free_substring_list</a></td>
<td>&nbsp;&nbsp;Free list of extracted substrings</td></tr>
<tr><td><a href="pcre2_fullinfo.html">pcre2_fullinfo</a></td>
<td>&nbsp;&nbsp;Extract information about a pattern</td></tr>
<tr><td><a href="pcre2_get_named_substring.html">pcre2_get_named_substring</a></td>
<td>&nbsp;&nbsp;Extract named substring into new memory</td></tr>
<tr><td><a href="pcre2_get_stringnumber.html">pcre2_get_stringnumber</a></td>
<td>&nbsp;&nbsp;Convert captured string name to number</td></tr>
<tr><td><a href="pcre2_get_stringtable_entries.html">pcre2_get_stringtable_entries</a></td>
<td>&nbsp;&nbsp;Find table entries for given string name</td></tr>
<tr><td><a href="pcre2_get_substring.html">pcre2_get_substring</a></td>
<td>&nbsp;&nbsp;Extract numbered substring into new memory</td></tr>
<tr><td><a href="pcre2_get_substring_list.html">pcre2_get_substring_list</a></td>
<td>&nbsp;&nbsp;Extract all substrings into new memory</td></tr>
<tr><td><a href="pcre2_jit_exec.html">pcre2_jit_exec</a></td>
<td>&nbsp;&nbsp;Fast path interface to JIT matching</td></tr>
<tr><td><a href="pcre2_jit_stack_alloc.html">pcre2_jit_stack_alloc</a></td>
<td>&nbsp;&nbsp;Create a stack for JIT matching</td></tr>
<tr><td><a href="pcre2_jit_stack_free.html">pcre2_jit_stack_free</a></td>
<td>&nbsp;&nbsp;Free a JIT matching stack</td></tr>
<tr><td><a href="pcre2_maketables.html">pcre2_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
<tr><td><a href="pcre2_pattern_to_host_byte_order.html">pcre2_pattern_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert compiled pattern to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_refcount.html">pcre2_refcount</a></td>
<td>&nbsp;&nbsp;Maintain reference count in compiled pattern</td></tr>
<tr><td><a href="pcre2_study.html">pcre2_study</a></td>
<td>&nbsp;&nbsp;Study a compiled pattern</td></tr>
<tr><td><a href="pcre2_utf16_to_host_byte_order.html">pcre2_utf16_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert UTF-16 string to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_utf32_to_host_byte_order.html">pcre2_utf32_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert UTF-32 string to host byte order if necessary</td></tr>
<tr><td><a href="pcre2_version.html">pcre2_version</a></td>
<td>&nbsp;&nbsp;Return PCRE2 version and release date</td></tr>
</table>
</body>
</html>

2903
doc/pcre2.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -214,7 +214,7 @@ document for an overview of all the PCRE2 documentation.
.B int pcre2_pattern_info(const pcre2 *\fIcode\fP, uint32_t \fIwhat\fP, void *\fIwhere\fP);
.sp
.B int pcre2_config(uint32_t \fIwhat\fP, void *\fIwhere\fP, PCRE2_SIZE \fIlength\fP);
.sp
.fi
.
.
.SH "PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES"

441
doc/pcre2demo.3 Normal file
View File

@ -0,0 +1,441 @@
.\" Start example.
.\" EX saves the current font (read from the .f register) in number register
.\" mE, then turns off filling and hyphenation and selects the CW font, so
.\" that the text that follows is set verbatim until EE is called.
.de EX
. nr mE \\n(.f
. nf
. nh
. ft CW
..
.
.
.\" End example.
.\" EE undoes EX: it restores the font saved in number register mE, turns
.\" filling back on, and sets the hyphenation mode from the HY register
.\" (presumably defined by the man macro package - confirm).
.de EE
. ft \\n(mE
. fi
. hy \\n(HY
..
.
.EX
/*************************************************
* PCRE2 DEMONSTRATION PROGRAM *
*************************************************/
/* This is a demonstration program to illustrate a straightforward way of
calling the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.
There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library.
In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:
gcc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:
gcc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
If you do not have pkg-config, you may have to use this:
gcc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \e
-R/usr/local/lib -lpcre2-8 -o pcre2demo
Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE2 are installed on your system. Only some operating
systems (Solaris is one) use the -R option.
Building under Windows:
If you want to statically link this program against a non-dll .a file, you must
define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
the following line. */
/* #define PCRE2_STATIC */
/* This macro must be defined before including pcre2.h. For a program that uses
only one code unit width, it makes it possible to use generic function names
such as pcre2_compile(). */
#define PCRE2_CODE_UNIT_WIDTH 8
#include <stdio.h>
#include <string.h>
#include <pcre2.h>
/**************************************************************************
* Here is the program. The API includes the concept of "contexts" for *
* setting up unusual interface requirements for compiling and matching, *
* such as custom memory managers and non-standard newline definitions. *
* This program does not do any of this, so it makes no use of contexts, *
* always passing NULL where a context could be given. *
**************************************************************************/
int main(int argc, char **argv)
{
/* Usage, as parsed below: pcre2demo [-g] pattern subject. Returns 0 when at
least the first match succeeds, and 1 for bad arguments, a compile error, no
first match, or any other matching error. */
pcre2_code *re;
PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
PCRE2_SPTR subject; /* the appropriate width (8, 16, or 32 bits). */
PCRE2_SPTR name_table;
int crlf_is_newline;
int errornumber;
int find_all;
int i;
int namecount;
int name_entry_size;
int rc;
int utf8;
uint32_t option_bits;
uint32_t newline;
PCRE2_SIZE erroroffset;
PCRE2_SIZE *ovector;
size_t subject_length;
pcre2_match_data *match_data;
/**************************************************************************
* First, sort out the command line. There is only one possible option at *
* the moment, "-g" to request repeated matching to find all occurrences, *
* like Perl's /g option. We set the variable find_all to a non-zero value *
* if the -g option is present. Apart from that, there must be exactly two *
* arguments. *
**************************************************************************/
find_all = 0;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-g") == 0) find_all = 1;
else break;
}
/* After the options, we require exactly two arguments, which are the pattern,
and the subject string. */
if (argc - i != 2)
{
printf("Two arguments required: a regex and a subject string\en");
return 1;
}
/* As pattern and subject are char arguments, they can be straightforwardly
cast to PCRE2_SPTR as we are working in 8-bit code units. */
pattern = (PCRE2_SPTR)argv[i];
subject = (PCRE2_SPTR)argv[i+1];
subject_length = strlen((char *)subject);
/*************************************************************************
* Now we are going to compile the regular expression pattern, and handle *
* any errors that are detected. *
*************************************************************************/
re = pcre2_compile(
pattern, /* the pattern */
-1, /* indicates pattern is zero-terminated */
0, /* default options */
&errornumber, /* for error number */
&erroroffset, /* for error offset */
NULL); /* use default compile context */
/* Compilation failed: print the error message and exit. */
if (re == NULL)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
printf("PCRE2 compilation failed at offset %d: %s\en", (int)erroroffset,
buffer);
return 1;
}
/*************************************************************************
* If the compilation succeeded, we call PCRE2 again, in order to do a *
* pattern match against the subject string. This does just ONE match. If *
* further matching is needed, it will be done below. Before running the *
* match we must set up a match_data block for holding the result. *
*************************************************************************/
/* Using this function ensures that the block is exactly the right size for
the number of capturing parentheses in the pattern. */
match_data = pcre2_match_data_create_from_pattern(re, NULL);
rc = pcre2_match(
re, /* the compiled pattern */
subject, /* the subject string */
subject_length, /* the length of the subject */
0, /* start at offset 0 in the subject */
0, /* default options */
match_data, /* block for storing the result */
NULL); /* use default match context */
/* Matching failed: handle error cases */
if (rc < 0)
{
switch(rc)
{
case PCRE2_ERROR_NOMATCH: printf("No match\en"); break;
/*
Handle other special cases if you like
*/
default: printf("Matching error %d\en", rc); break;
}
pcre2_match_data_free(match_data); /* Release memory used for the match */
pcre2_code_free(re); /* data and the compiled pattern. */
return 1;
}
/* Match succeeded. Get a pointer to the output vector, where string offsets
are stored. */
ovector = pcre2_get_ovector_pointer(match_data);
printf("\enMatch succeeded at offset %d\en", (int)ovector[0]);
/*************************************************************************
* We have found the first match within the subject string. If the output *
* vector wasn't big enough, say so. Then output any substrings that were *
* captured. *
*************************************************************************/
/* The output vector wasn't big enough. This should not happen, because we used
pcre2_match_data_create_from_pattern() above. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\en");
/* Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them. */
for (i = 0; i < rc; i++)
{
PCRE2_SPTR substring_start = subject + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
}
/**************************************************************************
* That concludes the basic part of this demonstration program. We have *
* compiled a pattern, and performed a single match. The code that follows *
* shows first how to access named substrings, and then how to code for *
* repeated matches on the same subject. *
**************************************************************************/
/* See if there are any named substrings, and if so, show them by name. First
we have to extract the count of named parentheses from the pattern. */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
&namecount); /* where to put the answer */
if (namecount <= 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr;
printf("Named substrings\en");
/* Before we can access the substrings, we must extract the table for
translating names to numbers, and the size of each entry in the table. */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMETABLE, /* address of the table */
&name_table); /* where to put the answer */
(void)pcre2_pattern_info(
re, /* the compiled pattern */
PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
&name_entry_size); /* where to put the answer */
/* Now we can scan the table and, for each entry, print the number, the name,
and the substring itself. In the 8-bit library the number is held in two
bytes, most significant first. */
tabptr = name_table;
for (i = 0; i < namecount; i++)
{
/* The field width name_entry_size - 3 presumably allows for the two number
bytes and a terminating zero in each entry, leaving the longest name length;
confirm against the PCRE2_INFO_NAMETABLE documentation. */
int n = (tabptr[0] << 8) | tabptr[1];
printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
(int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
tabptr += name_entry_size;
}
}
/*************************************************************************
* If the "-g" option was given on the command line, we want to continue *
* to search for additional matches in the subject string, in a similar *
* way to the /g option in Perl. This turns out to be trickier than you *
* might think because of the possibility of matching an empty string. *
* What happens is as follows: *
* *
* If the previous match was NOT for an empty string, we can just start *
* the next match at the end of the previous one. *
* *
* If the previous match WAS for an empty string, we can't do that, as it *
* would lead to an infinite loop. Instead, a call of pcre2_match() is *
* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
* first of these tells PCRE2 that an empty string at the start of the *
* subject is not a valid match; other possibilities must be tried. The *
* second flag restricts PCRE2 to one match attempt at the initial string *
* position. If this match succeeds, an alternative to the empty string *
* match has been found, and we can print it and proceed round the loop, *
* advancing by the length of whatever was found. If this match does not *
* succeed, we still stay in the loop, advancing by just one character. *
* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
* more than one byte. *
* *
* However, there is a complication concerned with newlines. When the *
* newline convention is such that CRLF is a valid newline, we must *
* advance by two characters rather than one. The newline convention can *
* be set in the regex by (*CR), etc.; if not, we must find the default. *
*************************************************************************/
if (!find_all) /* Check for -g */
{
pcre2_match_data_free(match_data); /* Release the memory that was used */
pcre2_code_free(re); /* for the match data and the pattern. */
return 0; /* Exit the program. */
}
/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
sequence. First, find the options with which the regex was compiled and extract
the UTF state. */
(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
utf8 = (option_bits & PCRE2_UTF) != 0;
/* Now find the newline convention and see whether CRLF is a valid newline
sequence. */
(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
newline == PCRE2_NEWLINE_CRLF ||
newline == PCRE2_NEWLINE_ANYCRLF;
/* Loop for second and subsequent matches */
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
same point to see if a non-empty match can be found. */
if (ovector[0] == ovector[1])
{
if (ovector[0] == subject_length) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
/* Run the next matching operation */
rc = pcre2_match(
re, /* the compiled pattern */
subject, /* the subject string */
subject_length, /* the length of the subject */
start_offset, /* starting offset in the subject */
options, /* options */
match_data, /* block for storing the result */
NULL); /* use default match context */
/* This time, a result of NOMATCH isn't an error. If the value in "options"
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
Perl does: advance the matching position by one character, and continue. We
do this by setting the "end of previous match" offset, because that is picked
up at the top of the loop as the point at which to start again.
There are two complications: (a) When CRLF is a valid newline sequence, and
the current position is just before it, advance by an extra byte. (b)
Otherwise we must ensure that we skip an entire UTF character if we are in
UTF mode. */
if (rc == PCRE2_ERROR_NOMATCH)
{
if (options == 0) break; /* All matches found */
ovector[1] = start_offset + 1; /* Advance one code unit */
if (crlf_is_newline && /* If CRLF is newline & */
start_offset < subject_length - 1 && /* we are at CRLF, */
subject[start_offset] == '\er' &&
subject[start_offset + 1] == '\en')
ovector[1] += 1; /* Advance by one more. */
else if (utf8) /* Otherwise, ensure we */
{ /* advance a whole UTF-8 */
while (ovector[1] < subject_length) /* character. */
{
/* Stop at the first byte that is not of the form 10xxxxxx, that is, not a
UTF-8 continuation byte. */
if ((subject[ovector[1]] & 0xc0) != 0x80) break;
ovector[1] += 1;
}
}
continue; /* Go round the loop again */
}
/* Other matching errors are not recoverable. */
if (rc < 0)
{
printf("Matching error %d\en", rc);
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 1;
}
/* Match succeeded */
printf("\enMatch succeeded again at offset %d\en", (int)ovector[0]);
/* The match succeeded, but the output vector wasn't big enough. This
should not happen. */
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\en");
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
for (i = 0; i < rc; i++)
{
PCRE2_SPTR substring_start = subject + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
printf("%2d: %.*s\en", i, (int)substring_length, (char *)substring_start);
}
if (namecount <= 0) printf("No named substrings\en"); else
{
PCRE2_SPTR tabptr = name_table;
printf("Named substrings\en");
for (i = 0; i < namecount; i++)
{
int n = (tabptr[0] << 8) | tabptr[1];
printf("(%d) %*s: %.*s\en", n, name_entry_size - 3, tabptr + 2,
(int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
tabptr += name_entry_size;
}
}
} /* End of loop to find second and subsequent matches */
printf("\en");
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 0;
}
/* End of pcre2demo.c */
.EE

View File

@ -366,7 +366,7 @@ include a closing square bracket in the characters, code it as \ex5D.
A backslash followed by an equals sign marks the end of the subject string and
the start of a modifier list. For example:
.sp
abc\=notbol,notempty
abc\e=notbol,notempty
.sp
A backslash followed by any other non-alphanumeric character just escapes that
character. A backslash followed by anything else causes an error. However, if
@ -746,7 +746,7 @@ the actual match are indicated in the output by '<' or '>' characters
underneath them. Here is an example:
.sp
/(?<=pqr)abc(?=xyz)/
123pqrabcxyz456\=allusedtext
123pqrabcxyz456\e=allusedtext
0: pqrabcxyz
<<< >>>
.sp
@ -789,7 +789,7 @@ The \fBcopy\fP and \fBget\fP modifiers can be used to test the
They can be given more than once, and each can specify a group name or number,
for example:
.sp
abcd\=copy=1,copy=3,get=G1
abcd\e=copy=1,copy=3,get=G1
.sp
If the \fB#subject\fP command is used to set default copy and get lists, these
can be unset by specifying a negative number for numbered groups and an empty

1073
doc/pcre2test.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -420,4 +420,4 @@ pcre2_code_free(re);
return 0;
}
/* End of pcredemo.c */
/* End of pcre2demo.c */