More draft documentation.
This commit is contained in:
parent
22fafb8a6f
commit
e15b64ef03
42
Makefile.am
42
Makefile.am
|
@ -6,29 +6,31 @@ AM_CPPFLAGS = -I$(builddir)/src -I$(srcdir)/src
|
||||||
|
|
||||||
## Specify the documentation files that are distributed.
|
## Specify the documentation files that are distributed.
|
||||||
|
|
||||||
# FIXME
|
|
||||||
dist_doc_DATA = \
|
dist_doc_DATA = \
|
||||||
AUTHORS \
|
AUTHORS \
|
||||||
COPYING \
|
COPYING \
|
||||||
ChangeLog \
|
ChangeLog \
|
||||||
LICENCE \
|
LICENCE \
|
||||||
README
|
NEWS \
|
||||||
|
README \
|
||||||
# doc/pcre.txt \
|
doc/pcre2.txt \
|
||||||
# doc/pcre-config.txt \
|
doc/pcre2-config.txt \
|
||||||
# doc/pcregrep.txt \
|
doc/pcre2grep.txt \
|
||||||
# doc/pcretest.txt \
|
doc/pcre2test.txt
|
||||||
# NEWS
|
|
||||||
|
|
||||||
# FIXME
|
# FIXME
|
||||||
#dist_html_DATA = \
|
dist_html_DATA = \
|
||||||
# doc/html/NON-AUTOTOOLS-BUILD.txt \
|
doc/html/NON-AUTOTOOLS-BUILD.txt \
|
||||||
# doc/html/README.txt \
|
doc/html/README.txt \
|
||||||
# doc/html/index.html \
|
doc/html/index.html \
|
||||||
# doc/html/pcre-config.html \
|
doc/html/pcre2-config.html \
|
||||||
|
doc/html/pcre2api.html \
|
||||||
|
doc/html/pcre2callout.html \
|
||||||
|
doc/html/pcre2demo.html \
|
||||||
|
doc/html/pcre2test.html \
|
||||||
|
doc/html/pcre2unicode.html
|
||||||
|
|
||||||
# doc/html/pcre.html \
|
# doc/html/pcre.html \
|
||||||
# doc/html/pcre16.html \
|
|
||||||
# doc/html/pcre32.html \
|
|
||||||
# doc/html/pcre_assign_jit_stack.html \
|
# doc/html/pcre_assign_jit_stack.html \
|
||||||
# doc/html/pcre_compile.html \
|
# doc/html/pcre_compile.html \
|
||||||
# doc/html/pcre_compile2.html \
|
# doc/html/pcre_compile2.html \
|
||||||
|
@ -56,11 +58,8 @@ dist_doc_DATA = \
|
||||||
# doc/html/pcre_utf16_to_host_byte_order.html \
|
# doc/html/pcre_utf16_to_host_byte_order.html \
|
||||||
# doc/html/pcre_utf32_to_host_byte_order.html \
|
# doc/html/pcre_utf32_to_host_byte_order.html \
|
||||||
# doc/html/pcre_version.html \
|
# doc/html/pcre_version.html \
|
||||||
# doc/html/pcreapi.html \
|
|
||||||
# doc/html/pcrebuild.html \
|
# doc/html/pcrebuild.html \
|
||||||
# doc/html/pcrecallout.html \
|
|
||||||
# doc/html/pcrecompat.html \
|
# doc/html/pcrecompat.html \
|
||||||
# doc/html/pcredemo.html \
|
|
||||||
# doc/html/pcregrep.html \
|
# doc/html/pcregrep.html \
|
||||||
# doc/html/pcrejit.html \
|
# doc/html/pcrejit.html \
|
||||||
# doc/html/pcrelimits.html \
|
# doc/html/pcrelimits.html \
|
||||||
|
@ -72,18 +71,16 @@ dist_doc_DATA = \
|
||||||
# doc/html/pcreprecompile.html \
|
# doc/html/pcreprecompile.html \
|
||||||
# doc/html/pcresample.html \
|
# doc/html/pcresample.html \
|
||||||
# doc/html/pcrestack.html \
|
# doc/html/pcrestack.html \
|
||||||
# doc/html/pcresyntax.html \
|
# doc/html/pcresyntax.html
|
||||||
# doc/html/pcretest.html \
|
|
||||||
# doc/html/pcreunicode.html
|
|
||||||
|
|
||||||
# FIXME
|
# FIXME
|
||||||
dist_man_MANS = \
|
dist_man_MANS = \
|
||||||
|
doc/pcre2-config.1 \
|
||||||
doc/pcre2api.3 \
|
doc/pcre2api.3 \
|
||||||
doc/pcre2callout.3 \
|
doc/pcre2callout.3 \
|
||||||
doc/pcre2test.1 \
|
doc/pcre2test.1 \
|
||||||
doc/pcre2unicode.3
|
doc/pcre2unicode.3
|
||||||
|
|
||||||
# doc/pcre2-config.1 \
|
|
||||||
# doc/pcre2.3 \
|
# doc/pcre2.3 \
|
||||||
# doc/pcre2-16.3 \
|
# doc/pcre2-16.3 \
|
||||||
# doc/pcre2-32.3 \
|
# doc/pcre2-32.3 \
|
||||||
|
@ -168,7 +165,6 @@ EXTRA_DIST += \
|
||||||
|
|
||||||
EXTRA_DIST += \
|
EXTRA_DIST += \
|
||||||
doc/perltest.txt \
|
doc/perltest.txt \
|
||||||
NON-UNIX-USE \
|
|
||||||
NON-AUTOTOOLS-BUILD \
|
NON-AUTOTOOLS-BUILD \
|
||||||
HACKING
|
HACKING
|
||||||
|
|
||||||
|
|
|
@ -83,8 +83,7 @@ for file in pcre2api pcre2callout pcre2unicode ; do
|
||||||
done
|
done
|
||||||
|
|
||||||
# The three commands
|
# The three commands
|
||||||
for file in pcre2test ; do
|
for file in pcre2test pcre2grep pcre2-config ; do
|
||||||
# for file in pcre2test pcre2grep pcre-config ; do
|
|
||||||
echo Making $file.txt
|
echo Making $file.txt
|
||||||
nroff -c -man $file.1 >$file.rawtxt
|
nroff -c -man $file.1 >$file.rawtxt
|
||||||
perl ../CleanTxt <$file.rawtxt >$file.txt
|
perl ../CleanTxt <$file.rawtxt >$file.txt
|
||||||
|
@ -133,7 +132,7 @@ echo "Making HTML documentation"
|
||||||
/bin/rm html/*
|
/bin/rm html/*
|
||||||
cp index.html.src html/index.html
|
cp index.html.src html/index.html
|
||||||
cp ../README html/README.txt
|
cp ../README html/README.txt
|
||||||
# cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
|
cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
|
||||||
|
|
||||||
for file in *.1 ; do
|
for file in *.1 ; do
|
||||||
base=`basename $file .1`
|
base=`basename $file .1`
|
||||||
|
@ -187,7 +186,6 @@ files="\
|
||||||
COPYING \
|
COPYING \
|
||||||
AUTHORS \
|
AUTHORS \
|
||||||
NEWS \
|
NEWS \
|
||||||
NON-UNIX-USE \
|
|
||||||
NON-AUTOTOOLS-BUILD \
|
NON-AUTOTOOLS-BUILD \
|
||||||
INSTALL \
|
INSTALL \
|
||||||
132html \
|
132html \
|
||||||
|
@ -240,16 +238,6 @@ files="\
|
||||||
pcre32_utf32_utils.c \
|
pcre32_utf32_utils.c \
|
||||||
pcre16_valid_utf16.c \
|
pcre16_valid_utf16.c \
|
||||||
pcre32_valid_utf32.c \
|
pcre32_valid_utf32.c \
|
||||||
pcre_scanner.cc \
|
|
||||||
pcre_scanner.h \
|
|
||||||
pcre_scanner_unittest.cc \
|
|
||||||
pcrecpp.cc \
|
|
||||||
pcrecpp.h \
|
|
||||||
pcrecpparg.h.in \
|
|
||||||
pcrecpp_unittest.cc \
|
|
||||||
pcre_stringpiece.cc \
|
|
||||||
pcre_stringpiece.h.in \
|
|
||||||
pcre_stringpiece_unittest.cc \
|
|
||||||
perltest.pl \
|
perltest.pl \
|
||||||
ucp.h \
|
ucp.h \
|
||||||
makevp.bat \
|
makevp.bat \
|
||||||
|
|
|
@ -0,0 +1,402 @@
|
||||||
|
Building PCRE2 without using autotools
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
This document has been converted from the PCRE1 document, but is not yet
|
||||||
|
complete. I have removed a number of quite old sections about building in
|
||||||
|
various environments, as they applied only to PCRE1 and are probably out of
|
||||||
|
date.
|
||||||
|
|
||||||
|
|
||||||
|
This document contains the following sections:
|
||||||
|
|
||||||
|
General
|
||||||
|
Generic instructions for the PCRE2 C library
|
||||||
|
Building for virtual Pascal
|
||||||
|
Stack size in Windows environments
|
||||||
|
Linking programs in Windows environments
|
||||||
|
Calling conventions in Windows environments
|
||||||
|
Comments about Win32 builds
|
||||||
|
Building PCRE2 on Windows with CMake
|
||||||
|
Testing with RunTest.bat
|
||||||
|
Building PCRE2 on native z/OS and z/VM
|
||||||
|
|
||||||
|
|
||||||
|
GENERAL
|
||||||
|
|
||||||
|
I (Philip Hazel) have no experience of Windows or VMS systems and how their
|
||||||
|
libraries work. The items in the PCRE2 distribution and Makefile that relate to
|
||||||
|
anything other than Linux systems are untested by me.
|
||||||
|
|
||||||
|
The basic PCRE2 library consists entirely of code written in Standard C, and so
|
||||||
|
should compile successfully on any system that has a Standard C compiler and
|
||||||
|
library.
|
||||||
|
|
||||||
|
The PCRE2 distribution includes a "configure" file for use by the
|
||||||
|
configure/make (autotools) build system, as found in many Unix-like
|
||||||
|
environments. The README file contains information about the options for
|
||||||
|
"configure".
|
||||||
|
|
||||||
|
There is also support for CMake, which some users prefer, especially in Windows
|
||||||
|
environments, though it can also be run in Unix-like environments. See the
|
||||||
|
section entitled "Building PCRE2 on Windows with CMake" below.
|
||||||
|
|
||||||
|
Versions of src/config.h and src/pcre2.h are distributed in the PCRE2 tarballs
|
||||||
|
under the names src/config.h.generic and src/pcre2.h.generic. These are
|
||||||
|
provided for those who build PCRE2 without using "configure" or CMake. If you
|
||||||
|
use "configure" or CMake, the .generic versions are not used.
|
||||||
|
|
||||||
|
|
||||||
|
GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
|
||||||
|
|
||||||
|
The following are generic instructions for building the PCRE2 C library "by
|
||||||
|
hand". If you are going to use CMake, this section does not apply to you; you
|
||||||
|
can skip ahead to the CMake section.
|
||||||
|
|
||||||
|
(1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
|
||||||
|
macro settings that it contains to whatever is appropriate for your
|
||||||
|
environment. In particular, you can alter the definition of the NEWLINE
|
||||||
|
macro to specify what character(s) you want to be interpreted as line
|
||||||
|
terminators.
|
||||||
|
|
||||||
|
When you compile any of the PCRE2 modules, you must specify
|
||||||
|
-DHAVE_CONFIG_H to your compiler so that src/config.h is included in the
|
||||||
|
sources.
|
||||||
|
|
||||||
|
An alternative approach is not to edit src/config.h, but to use -D on the
|
||||||
|
compiler command line to make any changes that you need to the
|
||||||
|
configuration options. In this case -DHAVE_CONFIG_H must not be set.
|
||||||
|
|
||||||
|
NOTE: There have been occasions when the way in which certain parameters
|
||||||
|
in src/config.h are used has changed between releases. (In the
|
||||||
|
configure/make world, this is handled automatically.) When upgrading to a
|
||||||
|
new release, you are strongly advised to review src/config.h.generic
|
||||||
|
before re-using what you had previously.
|
||||||
|
|
||||||
|
(2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h.
|
||||||
|
|
||||||
|
(3) EITHER:
|
||||||
|
Copy or rename file src/pcre2_chartables.c.dist as
|
||||||
|
src/pcre2_chartables.c.
|
||||||
|
|
||||||
|
OR:
|
||||||
|
Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
|
||||||
|
if you have set up src/config.h), and then run it with the single
|
||||||
|
argument "src/pcre2_chartables.c". This generates a set of standard
|
||||||
|
character tables and writes them to that file. The tables are generated
|
||||||
|
using the default C locale for your system. If you want to use a locale
|
||||||
|
that is specified by LC_xxx environment variables, add the -L option to
|
||||||
|
the dftables command. You must use this method if you are building on a
|
||||||
|
system that uses EBCDIC code.
|
||||||
|
|
||||||
|
The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
|
||||||
|
specify alternative tables at run time.
|
||||||
|
|
||||||
|
(4) For an 8-bit library, compile the following source files, setting
|
||||||
|
-DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set -DHAVE_CONFIG_H
|
||||||
|
if you have set up src/config.h with your configuration, or else use other
|
||||||
|
-D settings to change the configuration as required.
|
||||||
|
|
||||||
|
pcre2_auto_possess.c
|
||||||
|
pcre2_chartables.c
|
||||||
|
pcre2_compile.c
|
||||||
|
pcre2_config.c
|
||||||
|
pcre2_context.c
|
||||||
|
pcre2_dfa_match.c
|
||||||
|
pcre2_error.c
|
||||||
|
pcre2_jit_compile.c
|
||||||
|
pcre2_jit_match.c
|
||||||
|
pcre2_jit_misc.c
|
||||||
|
pcre2_maketables.c
|
||||||
|
pcre2_match.c
|
||||||
|
pcre2_match_data.c
|
||||||
|
pcre2_newline.c
|
||||||
|
pcre2_ord2utf.c
|
||||||
|
pcre2_pattern_info.c
|
||||||
|
pcre2_string_utils.c
|
||||||
|
pcre2_study.c
|
||||||
|
pcre2_substring.c
|
||||||
|
pcre2_tables.c
|
||||||
|
pcre2_ucd.c
|
||||||
|
pcre2_valid_utf.c
|
||||||
|
pcre2_xclass.c
|
||||||
|
|
||||||
|
Make sure that you include -I. in the compiler command (or equivalent for
|
||||||
|
an unusual compiler) so that all included PCRE2 header files are first
|
||||||
|
sought in the src directory under the current directory. Otherwise you run
|
||||||
|
the risk of picking up a previously-installed file from somewhere else.
|
||||||
|
|
||||||
|
Note that you must compile pcre2_jit_xxx.c, even if you have not defined
|
||||||
|
SUPPORT_JIT in src/config.h, because when JIT support is not configured,
|
||||||
|
dummy functions are compiled. When JIT support IS configured, the JIT
|
||||||
|
sources #include other files from the sljit subdirectory, where there
|
||||||
|
should be 16 files, all of whose names begin with "sljit".
|
||||||
|
|
||||||
|
(5) Now link all the compiled code into an object library in whichever form
|
||||||
|
your system keeps such libraries. This is the basic PCRE2 C 8-bit library.
|
||||||
|
If your system has static and shared libraries, you may have to do this
|
||||||
|
once for each type.
|
||||||
|
|
||||||
|
(6) If you want to build a 16-bit library or 32-bit library (as well as, or
|
||||||
|
instead of the 8-bit library) just supply 16 or 32 as the value of
|
||||||
|
-DPCRE2_CODE_UNIT_WIDTH when you are compiling.
|
||||||
|
|
||||||
|
(7) If you want to build the POSIX wrapper functions (which apply only to the
|
||||||
|
8-bit library), ensure that you have the pcre2posix.h file and then
|
||||||
|
compile pcre2posix.c. Link the result (on its own) as the pcre2posix
|
||||||
|
library.
|
||||||
|
|
||||||
|
(8) The pcre2test program can be linked with any combination of the 8-bit,
|
||||||
|
16-bit and 32-bit libraries (depending on what you selected in
|
||||||
|
src/config.h). Compile pcre2test.c; don't forget -DHAVE_CONFIG_H if
|
||||||
|
necessary, but do NOT define PCRE2_CODE_UNIT_WIDTH. Then link with the
|
||||||
|
appropriate library/ies. If you compiled an 8-bit library, pcre2test also
|
||||||
|
needs the pcre2posix wrapper library.
|
||||||
|
|
||||||
|
(9) Run pcre2test on the testinput files in the testdata directory, and check
|
||||||
|
that the output matches the corresponding testoutput files. There are
|
||||||
|
comments about what each test does in the section entitled "Testing PCRE2"
|
||||||
|
in the README file. If you compiled more than one of the 8-bit, 16-bit and
|
||||||
|
32-bit libraries, you need to run pcre2test with the -16 option to do
|
||||||
|
16-bit tests and with the -32 option to do 32-bit tests.
|
||||||
|
|
||||||
|
Some tests are relevant only when certain build-time options are selected.
|
||||||
|
For example, test 4 is for Unicode support, and will not run if you have
|
||||||
|
built PCRE2 without it. See the comments at the start of each testinput
|
||||||
|
file. If you have a suitable Unix-like shell, the RunTest script will run
|
||||||
|
the appropriate tests for you. The command "RunTest list" will output a
|
||||||
|
list of all the tests.
|
||||||
|
|
||||||
|
Note that the supplied files are in Unix format, with just LF characters
|
||||||
|
as line terminators. You may need to edit them to change this if your
|
||||||
|
system uses a different convention.
|
||||||
|
|
||||||
|
(10) If you have built PCRE2 with SUPPORT_JIT, the JIT features can be tested
|
||||||
|
by running pcre2test with the -jit option. This is done automatically by
|
||||||
|
the RunTest script. You might also like to build and run the freestanding
|
||||||
|
JIT test program, pcre2_jit_test.c.
|
||||||
|
|
||||||
|
(11) If you want to use the pcre2grep command, compile and link pcre2grep.c; it
|
||||||
|
uses only the basic 8-bit PCRE2 library (it does not need the pcre2posix
|
||||||
|
library).
|
||||||
|
|
||||||
|
|
||||||
|
BUILDING FOR VIRTUAL PASCAL
|
||||||
|
|
||||||
|
FIXME FOR PCRE2
|
||||||
|
|
||||||
|
A script for building PCRE2 using Borland's C++ compiler for use with VPASCAL
|
||||||
|
was contributed by Alexander Tokarev. Stefan Weber updated the script and added
|
||||||
|
additional files. The following files in the distribution are for building
|
||||||
|
PCRE2 for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat,
|
||||||
|
pcre2gexp.pas.
|
||||||
|
|
||||||
|
|
||||||
|
STACK SIZE IN WINDOWS ENVIRONMENTS
|
||||||
|
|
||||||
|
The default processor stack size of 1Mb in some Windows environments is too
|
||||||
|
small for matching patterns that need much recursion. In particular, test 2 may
|
||||||
|
fail because of this. Normally, running out of stack causes a crash, but there
|
||||||
|
have been cases where the test program has just died silently. See your linker
|
||||||
|
documentation for how to increase stack size if you experience problems. The
|
||||||
|
Linux default of 8Mb is a reasonable choice for the stack, though even that can
|
||||||
|
be too small for some pattern/subject combinations.
|
||||||
|
|
||||||
|
PCRE2 has a compile configuration option to disable the use of stack for
|
||||||
|
recursion so that heap is used instead. However, pattern matching is
|
||||||
|
significantly slower when this is done. There is more about stack usage in the
|
||||||
|
"pcre2stack" documentation.
|
||||||
|
|
||||||
|
|
||||||
|
LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
|
||||||
|
|
||||||
|
If you want to statically link a program against a PCRE2 library in the form of
|
||||||
|
a non-dll .a file, you must define PCRE2_STATIC before including src/pcre2.h.
|
||||||
|
|
||||||
|
|
||||||
|
CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
|
||||||
|
|
||||||
|
It is possible to compile programs to use different calling conventions using
|
||||||
|
MSVC. Search the web for "calling conventions" for more information. To make it
|
||||||
|
easier to change the calling convention for the exported functions in the
|
||||||
|
PCRE2 library, the macro PCRE2_CALL_CONVENTION is present in all the external
|
||||||
|
definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
|
||||||
|
not set, it defaults to empty; the default calling convention is then used
|
||||||
|
(which is what is wanted most of the time).
|
||||||
|
|
||||||
|
|
||||||
|
COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE2 ON WINDOWS WITH CMAKE")
|
||||||
|
|
||||||
|
There are two ways of building PCRE2 using the "configure, make, make install"
|
||||||
|
paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
|
||||||
|
the same thing; they are completely different from each other. There is also
|
||||||
|
support for building using CMake, which some users find a more straightforward
|
||||||
|
way of building PCRE2 under Windows.
|
||||||
|
|
||||||
|
The MinGW home page (http://www.mingw.org/) says this:
|
||||||
|
|
||||||
|
MinGW: A collection of freely available and freely distributable Windows
|
||||||
|
specific header files and import libraries combined with GNU toolsets that
|
||||||
|
allow one to produce native Windows programs that do not rely on any
|
||||||
|
3rd-party C runtime DLLs.
|
||||||
|
|
||||||
|
The Cygwin home page (http://www.cygwin.com/) says this:
|
||||||
|
|
||||||
|
Cygwin is a Linux-like environment for Windows. It consists of two parts:
|
||||||
|
|
||||||
|
. A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
|
||||||
|
substantial Linux API functionality
|
||||||
|
|
||||||
|
. A collection of tools which provide Linux look and feel.
|
||||||
|
|
||||||
|
On both MinGW and Cygwin, PCRE2 should build correctly using:
|
||||||
|
|
||||||
|
./configure && make && make install
|
||||||
|
|
||||||
|
This should create two libraries called libpcre2-8 and libpcre2-posix. These
|
||||||
|
are independent libraries: when you link with libpcre2-posix you must also link
|
||||||
|
with libpcre2-8, which contains the basic functions.
|
||||||
|
|
||||||
|
Using Cygwin's compiler generates libraries and executables that depend on
|
||||||
|
cygwin1.dll. If a library that is generated this way is distributed,
|
||||||
|
cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
|
||||||
|
licence, this forces not only PCRE2 to be under the GPL, but also the entire
|
||||||
|
application. A distributor who wants to keep their own code proprietary must
|
||||||
|
purchase an appropriate Cygwin licence.
|
||||||
|
|
||||||
|
MinGW has no such restrictions. The MinGW compiler generates a library or
|
||||||
|
executable that can run standalone on Windows without any third party dll or
|
||||||
|
licensing issues.
|
||||||
|
|
||||||
|
But there is more complication:
|
||||||
|
|
||||||
|
If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
|
||||||
|
to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
|
||||||
|
front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
|
||||||
|
gcc and MinGW's gcc). So, a user can:
|
||||||
|
|
||||||
|
. Build native binaries by using MinGW or by getting Cygwin and using
|
||||||
|
-mno-cygwin.
|
||||||
|
|
||||||
|
. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
|
||||||
|
compiler flags.
|
||||||
|
|
||||||
|
The test files that are supplied with PCRE2 are in UNIX format, with LF
|
||||||
|
characters as line terminators. Unless your PCRE2 library uses a default
|
||||||
|
newline option that includes LF as a valid newline, it may be necessary to
|
||||||
|
change the line terminators in the test files to get some of the tests to work.
|
||||||
|
|
||||||
|
|
||||||
|
BUILDING PCRE2 ON WINDOWS WITH CMAKE
|
||||||
|
|
||||||
|
CMake is an alternative configuration facility that can be used instead of
|
||||||
|
"configure". CMake creates project files (make files, solution files, etc.)
|
||||||
|
tailored to numerous development environments, including Visual Studio,
|
||||||
|
Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no
|
||||||
|
spaces in the names for your CMake installation and your PCRE2 source and build
|
||||||
|
directories.
|
||||||
|
|
||||||
|
The following instructions were contributed by a PCRE1 user, but they should
|
||||||
|
also work for PCRE2. If they are not followed exactly, errors may occur. In the
|
||||||
|
event that errors do occur, it is recommended that you delete the CMake cache
|
||||||
|
before attempting to repeat the CMake build process. In the CMake GUI, the
|
||||||
|
cache can be deleted by selecting "File > Delete Cache".
|
||||||
|
|
||||||
|
1. Install the latest CMake version available from http://www.cmake.org/, and
|
||||||
|
ensure that cmake\bin is on your path.
|
||||||
|
|
||||||
|
2. Unzip (retaining folder structure) the PCRE2 source tree into a source
|
||||||
|
directory such as C:\pcre2. You should ensure your local date and time
|
||||||
|
is not earlier than the file dates in your source dir if the release is
|
||||||
|
very new.
|
||||||
|
|
||||||
|
3. Create a new, empty build directory, preferably a subdirectory of the
|
||||||
|
source dir. For example, C:\pcre2\pcre2-xx\build.
|
||||||
|
|
||||||
|
4. Run cmake-gui from the Shell environment of your build tool, for example,
|
||||||
|
Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
|
||||||
|
to start CMake from the Windows Start menu, as this can lead to errors.
|
||||||
|
|
||||||
|
5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and
|
||||||
|
build directories, respectively.
|
||||||
|
|
||||||
|
6. Hit the "Configure" button.
|
||||||
|
|
||||||
|
7. Select the particular IDE / build tool that you are using (Visual
|
||||||
|
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||||
|
|
||||||
|
8. The GUI will then list several configuration options. This is where
|
||||||
|
you can enable Unicode support or other PCRE2 optional features.
|
||||||
|
|
||||||
|
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||||
|
active.
|
||||||
|
|
||||||
|
10. Hit "Generate".
|
||||||
|
|
||||||
|
11. The build directory should now contain a usable build system, be it a
|
||||||
|
solution file for Visual Studio, makefiles for MinGW, etc. Exit from
|
||||||
|
cmake-gui and use the generated build system with your compiler or IDE.
|
||||||
|
E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2
|
||||||
|
solution, select the desired configuration (Debug, or Release, etc.) and
|
||||||
|
build the ALL_BUILD project.
|
||||||
|
|
||||||
|
12. If during configuration with cmake-gui you've elected to build the test
|
||||||
|
programs, you can execute them by building the test project. E.g., for
|
||||||
|
MinGW: "make check"; for Visual Studio build the RUN_TESTS project. The
|
||||||
|
most recent build configuration is targeted by the tests. A summary of
|
||||||
|
test results is presented. Complete test output is subsequently
|
||||||
|
available for review in Testing\Temporary under your build dir.
|
||||||
|
|
||||||
|
|
||||||
|
TESTING WITH RUNTEST.BAT FIXME FIXME NOT YET TESTED/UPDATED FIXME
|
||||||
|
|
||||||
|
If configured with CMake, building the test project ("make check" or building
|
||||||
|
ALL_TESTS in Visual Studio) creates (and runs) pcre2_test.bat (and depending
|
||||||
|
on your configuration options, possibly other test programs) in the build
|
||||||
|
directory. Pcre2_test.bat runs RunTest.Bat with correct source and exe paths.
|
||||||
|
|
||||||
|
For manual testing with RunTest.bat, provided the build dir is a subdirectory
|
||||||
|
of the source directory: Open a command shell window. Chdir to the location
|
||||||
|
of your pcre2test.exe and pcre2grep.exe programs. Call RunTest.bat with
|
||||||
|
"..\RunTest.Bat" or "..\..\RunTest.bat" as appropriate.
|
||||||
|
|
||||||
|
To run only a particular test with RunTest.Bat provide a test number argument.
|
||||||
|
|
||||||
|
Otherwise:
|
||||||
|
|
||||||
|
1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
|
||||||
|
have been created.
|
||||||
|
|
||||||
|
2. Edit RunTest.bat to identify the full or relative location of
|
||||||
|
the pcre2 source (in which the testdata folder resides), e.g.:
|
||||||
|
|
||||||
|
set srcdir=C:\pcre2\pcre2-10.00
|
||||||
|
|
||||||
|
3. In a Windows command environment, chdir to the location of your bat and
|
||||||
|
exe programs.
|
||||||
|
|
||||||
|
4. Run RunTest.bat. Test outputs will automatically be compared to expected
|
||||||
|
results, and discrepancies will be identified in the console output.
|
||||||
|
|
||||||
|
To independently test the just-in-time compiler, run pcre2_jit_test.exe.
|
||||||
|
|
||||||
|
|
||||||
|
BUILDING PCRE2 ON NATIVE Z/OS AND Z/VM
|
||||||
|
|
||||||
|
z/OS and z/VM are operating systems for mainframe computers, produced by IBM.
|
||||||
|
The character code used is EBCDIC, not ASCII or Unicode. In z/OS, UNIX APIs and
|
||||||
|
applications can be supported through UNIX System Services, and in such an
|
||||||
|
environment PCRE2 can be built in the same way as in other systems. However, in
|
||||||
|
native z/OS (without UNIX System Services) and in z/VM, special ports are
|
||||||
|
required. For details, please see this web site:
|
||||||
|
|
||||||
|
http://www.zaconsultants.net
|
||||||
|
|
||||||
|
There is also a mirror here:
|
||||||
|
|
||||||
|
http://www.vsoft-software.com/downloads.html
|
||||||
|
|
||||||
|
The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
|
||||||
|
course.
|
||||||
|
|
||||||
|
==========================
|
||||||
|
Last Updated: 28 September 2014
|
|
@ -1 +1,832 @@
|
||||||
This is a placeholder README file for a work in progress.
|
README file for PCRE2 (Perl-compatible regular expression library)
|
||||||
|
------------------------------------------------------------------
|
||||||
|
|
||||||
|
PCRE2 is a re-implementation of the original PCRE library with an entirely new
|
||||||
|
API. The latest release of PCRE2 is always available in three alternative
|
||||||
|
formats from:
|
||||||
|
|
||||||
|
FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
|
||||||
|
|
||||||
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
|
||||||
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
|
||||||
|
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
|
||||||
|
|
||||||
|
There is a mailing list for discussion about the development of PCRE (both the
|
||||||
|
original and new APIs) at pcre-dev@exim.org. You can access the archives and
|
||||||
|
subscribe or manage your subscription here:
|
||||||
|
|
||||||
|
https://lists.exim.org/mailman/listinfo/pcre-dev
|
||||||
|
|
||||||
|
Please read the NEWS file if you are upgrading from a previous release.
|
||||||
|
The contents of this README file are:
|
||||||
|
|
||||||
|
The PCRE2 APIs
|
||||||
|
Documentation for PCRE2
|
||||||
|
Contributions by users of PCRE2
|
||||||
|
Building PCRE2 on non-Unix-like systems
|
||||||
|
Building PCRE2 without using autotools
|
||||||
|
Building PCRE2 using autotools
|
||||||
|
Retrieving configuration information
|
||||||
|
Shared libraries
|
||||||
|
Cross-compiling using autotools
|
||||||
|
Making new tarballs
|
||||||
|
Testing PCRE2
|
||||||
|
Character tables
|
||||||
|
File manifest
|
||||||
|
|
||||||
|
|
||||||
|
The PCRE2 APIs
|
||||||
|
--------------
|
||||||
|
|
||||||
|
PCRE2 is written in C, and it has its own API. There are three sets of
|
||||||
|
functions, one for the 8-bit library, which processes strings of bytes, one for
|
||||||
|
the 16-bit library, which processes strings of 16-bit values, and one for the
|
||||||
|
32-bit library, which processes strings of 32-bit values. As this is a new API,
|
||||||
|
there are as yet no C++ wrappers.
|
||||||
|
|
||||||
|
The distribution does contain a set of C wrapper functions for the 8-bit
|
||||||
|
library that are based on the POSIX regular expression API (see the pcre2posix
|
||||||
|
man page). These end up in the library called libpcre2posix. Note that this
|
||||||
|
just provides a POSIX calling interface to PCRE2; the regular expressions
|
||||||
|
themselves still follow Perl syntax and semantics. The POSIX API is restricted,
|
||||||
|
and does not give full access to all of PCRE2's facilities.
|
||||||
|
|
||||||
|
The header file for the POSIX-style functions is called pcre2posix.h. The
|
||||||
|
official POSIX name is regex.h, but I did not want to risk possible problems
|
||||||
|
with existing files of that name by distributing it that way. To use PCRE2 with
|
||||||
|
an existing program that uses the POSIX API, pcre2posix.h will have to be
|
||||||
|
renamed or pointed at by a link.
|
||||||
|
|
||||||
|
If you are using the POSIX interface to PCRE2 and there is already a POSIX
|
||||||
|
regex library installed on your system, as well as worrying about the regex.h
|
||||||
|
header file (as mentioned above), you must also take care when linking programs
|
||||||
|
to ensure that they link with PCRE2's libpcre2posix library. Otherwise they may
|
||||||
|
pick up the POSIX functions of the same name from the other library.
|
||||||
|
|
||||||
|
One way of avoiding this confusion is to compile PCRE2 with the addition of
|
||||||
|
-Dregcomp=PCRE2regcomp (and similarly for the other POSIX functions) to the
|
||||||
|
compiler flags (CFLAGS if you are using "configure" -- see below). This has the
|
||||||
|
effect of renaming the functions so that the names no longer clash. Of course,
|
||||||
|
you have to do the same thing for your applications, or write them using the
|
||||||
|
new names.
|
||||||
|
|
||||||
|
|
||||||
|
Documentation for PCRE2
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
If you install PCRE2 in the normal way on a Unix-like system, you will end up
|
||||||
|
with a set of man pages whose names all start with "pcre2". The one that is
|
||||||
|
just called "pcre2" lists all the others. In addition to these man pages, the
|
||||||
|
PCRE2 documentation is supplied in two other forms:
|
||||||
|
|
||||||
|
1. There are files called doc/pcre2.txt, doc/pcre2grep.txt, and
|
||||||
|
doc/pcre2test.txt in the source distribution. The first of these is a
|
||||||
|
concatenation of the text forms of all the section 3 man pages except the
|
||||||
|
listing of pcre2demo.c and those that summarize individual functions. The
|
||||||
|
other two are the text forms of the section 1 man pages for the pcre2grep
|
||||||
|
and pcre2test commands. These text forms are provided for ease of scanning
|
||||||
|
with text editors or similar tools. They are installed in
|
||||||
|
<prefix>/share/doc/pcre2, where <prefix> is the installation prefix
|
||||||
|
(defaulting to /usr/local).
|
||||||
|
|
||||||
|
2. A set of files containing all the documentation in HTML form, hyperlinked
|
||||||
|
in various ways, and rooted in a file called index.html, is distributed in
|
||||||
|
doc/html and installed in <prefix>/share/doc/pcre2/html.
|
||||||
|
|
||||||
|
|
||||||
|
Building PCRE2 on non-Unix-like systems
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
For a non-Unix-like system, please read the comments in the file
|
||||||
|
NON-AUTOTOOLS-BUILD, though if your system supports the use of "configure" and
|
||||||
|
"make" you may be able to build PCRE2 using autotools in the same way as for
|
||||||
|
many Unix-like systems.
|
||||||
|
|
||||||
|
PCRE2 can also be configured using CMake, which can be run in various ways
|
||||||
|
(command line, GUI, etc). This creates Makefiles, solution files, etc. The file
|
||||||
|
NON-AUTOTOOLS-BUILD has information about CMake.
|
||||||
|
|
||||||
|
PCRE2 has been compiled on many different operating systems. It should be
|
||||||
|
straightforward to build PCRE2 on any system that has a Standard C compiler and
|
||||||
|
library, because it uses only Standard C functions.
|
||||||
|
|
||||||
|
|
||||||
|
Building PCRE2 without using autotools
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
The use of autotools (in particular, libtool) is problematic in some
|
||||||
|
environments, even some that are Unix or Unix-like. See the NON-AUTOTOOLS-BUILD
|
||||||
|
file for ways of building PCRE2 without using autotools.
|
||||||
|
|
||||||
|
|
||||||
|
Building PCRE2 using autotools
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
The following instructions assume the use of the widely used "configure; make;
|
||||||
|
make install" (autotools) process.
|
||||||
|
|
||||||
|
To build PCRE2 on a system that supports autotools, first run the "configure"
|
||||||
|
command from the PCRE2 distribution directory, with your current directory set
|
||||||
|
to the directory where you want the files to be created. This command is a
|
||||||
|
standard GNU "autoconf" configuration script, for which generic instructions
|
||||||
|
are supplied in the file INSTALL.
|
||||||
|
|
||||||
|
Most commonly, people build PCRE2 within its own distribution directory, and in
|
||||||
|
this case, on many systems, just running "./configure" is sufficient. However,
|
||||||
|
the usual methods of changing standard defaults are available. For example:
|
||||||
|
|
||||||
|
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
|
||||||
|
|
||||||
|
This command specifies that the C compiler should be run with the flags '-O2
|
||||||
|
-Wall' instead of the default, and that "make install" should install PCRE2
|
||||||
|
under /opt/local instead of the default /usr/local.
|
||||||
|
|
||||||
|
If you want to build in a different directory, just run "configure" with that
|
||||||
|
directory as current. For example, suppose you have unpacked the PCRE2 source
|
||||||
|
into /source/pcre2/pcre2-xxx, but you want to build it in
|
||||||
|
/build/pcre2/pcre2-xxx:
|
||||||
|
|
||||||
|
cd /build/pcre2/pcre2-xxx
|
||||||
|
/source/pcre2/pcre2-xxx/configure
|
||||||
|
|
||||||
|
PCRE2 is written in C and is normally compiled as a C library. However, it is
|
||||||
|
possible to build it as a C++ library, though the provided building apparatus
|
||||||
|
does not have any features to support this.
|
||||||
|
|
||||||
|
There are some optional features that can be included or omitted from the PCRE2
|
||||||
|
library. They are also documented in the pcre2build man page.
|
||||||
|
|
||||||
|
. By default, both shared and static libraries are built. You can change this
|
||||||
|
by adding one of these options to the "configure" command:
|
||||||
|
|
||||||
|
--disable-shared
|
||||||
|
--disable-static
|
||||||
|
|
||||||
|
(See also "Shared libraries" below.)
|
||||||
|
|
||||||
|
. By default, only the 8-bit library is built. If you add --enable-pcre16 to
|
||||||
|
the "configure" command, the 16-bit library is also built. If you add
|
||||||
|
--enable-pcre32 to the "configure" command, the 32-bit library is also built.
|
||||||
|
If you want only the 16-bit or 32-bit library, use --disable-pcre8 to disable
|
||||||
|
building the 8-bit library.
|
||||||
|
|
||||||
|
. If you want to include support for just-in-time compiling, which can give
|
||||||
|
large performance improvements on certain platforms, add --enable-jit to the
|
||||||
|
"configure" command. This support is available only for certain hardware
|
||||||
|
architectures. If you try to enable it on an unsupported architecture, there
|
||||||
|
will be a compile time error. FIXME: NOT YET IMPLEMENTED.
|
||||||
|
|
||||||
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
|
||||||
|
. If you want to make use of the support for UTF-8 Unicode character strings in
|
||||||
|
the 8-bit library, UTF-16 Unicode character strings in the 16-bit library,
|
||||||
|
and UTF-32 Unicode character strings in the 32-bit library, you must add
|
||||||
|
--enable-unicode to the "configure" command. Without it, the code for
|
||||||
|
handling UTF-8, UTF-16 and UTF-32 is not included. It is not possible to
|
||||||
|
configure one library with UTF support and another without in the same
|
||||||
|
configuration.
|
||||||
|
|
||||||
|
Even when --enable-unicode is included, the use of a UTF encoding still has
|
||||||
|
to be enabled by an option at run time. When PCRE2 is compiled with this
|
||||||
|
option, its input can only be either ASCII or UTF-8/16/32, even when running
|
||||||
|
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
||||||
|
--enable-ebcdic at the same time.
|
||||||
|
|
||||||
|
When --enable-unicode is specified, as well as supporting UTF strings, PCRE2
|
||||||
|
includes support for the \P, \p, and \X sequences that recognize Unicode
|
||||||
|
character properties. However, only the basic two-letter properties such as
|
||||||
|
Lu are supported.
|
||||||
|
|
||||||
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||||
|
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||||
|
end of a line. Whatever you specify at build time is the default; the caller
|
||||||
|
of PCRE2 can change the selection at run time. The default newline indicator
|
||||||
|
is a single LF character (the Unix standard). You can specify the default
|
||||||
|
newline indicator by adding --enable-newline-is-cr or --enable-newline-is-lf
|
||||||
|
or --enable-newline-is-crlf or --enable-newline-is-anycrlf or
|
||||||
|
--enable-newline-is-any to the "configure" command, respectively.
|
||||||
|
|
||||||
|
If you specify --enable-newline-is-cr or --enable-newline-is-crlf, some of
|
||||||
|
the standard tests will fail, because the lines in the test files end with
|
||||||
|
LF. Even if the files are edited to change the line endings, there are likely
|
||||||
|
to be some failures. With --enable-newline-is-anycrlf or
|
||||||
|
--enable-newline-is-any, many tests should succeed, but there may be some
|
||||||
|
failures.
|
||||||
|
|
||||||
|
. By default, the sequence \R in a pattern matches any Unicode line ending
|
||||||
|
sequence. This is independent of the option specifying what PCRE2 considers
|
||||||
|
to be the end of a line (see above). However, the caller of PCRE2 can
|
||||||
|
restrict \R to match only CR, LF, or CRLF. You can make this the default by
|
||||||
|
adding --enable-bsr-anycrlf to the "configure" command (bsr = "backslash R").
|
||||||
|
|
||||||
|
. PCRE2 has a counter that limits the depth of nesting of parentheses in a
|
||||||
|
pattern. This limits the amount of system stack that a pattern uses when it
|
||||||
|
is compiled. The default is 250, but you can change it by setting, for
|
||||||
|
example,
|
||||||
|
|
||||||
|
--with-parens-nest-limit=500
|
||||||
|
|
||||||
|
. PCRE2 has a counter that can be set to limit the amount of resources it uses
|
||||||
|
when matching a pattern. If the limit is exceeded during a match, the match
|
||||||
|
fails. The default is ten million. You can change the default by setting, for
|
||||||
|
example,
|
||||||
|
|
||||||
|
--with-match-limit=500000
|
||||||
|
|
||||||
|
on the "configure" command. This is just the default; individual calls to
|
||||||
|
pcre2_match() can supply their own value. There is more discussion on the
|
||||||
|
pcre2api man page.
|
||||||
|
|
||||||
|
. There is a separate counter that limits the depth of recursive function calls
|
||||||
|
during a matching process. This also has a default of ten million, which is
|
||||||
|
essentially "unlimited". You can change the default by setting, for example,
|
||||||
|
|
||||||
|
--with-match-limit-recursion=500000
|
||||||
|
|
||||||
|
Recursive function calls use up the runtime stack; running out of stack can
|
||||||
|
cause programs to crash in strange ways. There is a discussion about stack
|
||||||
|
sizes in the pcre2stack man page.
|
||||||
|
|
||||||
|
. In the 8-bit library, the default maximum compiled pattern size is around
|
||||||
|
64K. You can increase this by adding --with-link-size=3 to the "configure"
|
||||||
|
command. PCRE2 then uses three bytes instead of two for offsets to different
|
||||||
|
parts of the compiled pattern. In the 16-bit library, --with-link-size=3 is
|
||||||
|
the same as --with-link-size=4, which (in both libraries) uses four-byte
|
||||||
|
offsets. Increasing the internal link size reduces performance. In the 32-bit
|
||||||
|
library, the link size setting is ignored, as 4-byte offsets are always used.
|
||||||
|
|
||||||
|
. You can build PCRE2 so that its internal match() function that is called from
|
||||||
|
pcre2_match() does not call itself recursively. Instead, it uses memory
|
||||||
|
blocks obtained from the heap to save data that would otherwise be saved on
|
||||||
|
the stack. To build PCRE2 like this, use
|
||||||
|
|
||||||
|
--disable-stack-for-recursion
|
||||||
|
|
||||||
|
on the "configure" command. PCRE2 runs more slowly in this mode, but it may
|
||||||
|
be necessary in environments with limited stack sizes. This applies only to
|
||||||
|
the normal execution of the pcre2_match() function; if JIT support is being
|
||||||
|
successfully used, it is not relevant. Equally, it does not apply to
|
||||||
|
pcre2_dfa_match(), which does not use deeply nested recursion. There is a
|
||||||
|
discussion about stack sizes in the pcre2stack man page.
|
||||||
|
|
||||||
|
. For speed, PCRE2 uses four tables for manipulating and identifying characters
|
||||||
|
whose code point values are less than 256. By default, it uses a set of
|
||||||
|
tables for ASCII encoding that is part of the distribution. If you specify
|
||||||
|
|
||||||
|
--enable-rebuild-chartables
|
||||||
|
|
||||||
|
a program called dftables is compiled and run in the default C locale when
|
||||||
|
you obey "make". It builds a source file called pcre2_chartables.c. If you do
|
||||||
|
not specify this option, pcre2_chartables.c is created as a copy of
|
||||||
|
pcre2_chartables.c.dist. See "Character tables" below for further
|
||||||
|
information.
|
||||||
|
|
||||||
|
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
||||||
|
character code (as opposed to ASCII/Unicode) by specifying
|
||||||
|
|
||||||
|
--enable-ebcdic
|
||||||
|
|
||||||
|
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||||
|
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
||||||
|
both EBCDIC and UTF-8/16/32. There is a second option, --enable-ebcdic-nl25,
|
||||||
|
which specifies that the code value for the EBCDIC NL character is 0x25
|
||||||
|
instead of the default 0x15.
|
||||||
|
|
||||||
|
. In environments where valgrind is installed, if you specify
|
||||||
|
|
||||||
|
--enable-valgrind
|
||||||
|
|
||||||
|
PCRE2 will use valgrind annotations to mark certain memory regions as
|
||||||
|
unaddressable. This allows it to detect invalid memory accesses, and is
|
||||||
|
mostly useful for debugging PCRE2 itself.
|
||||||
|
|
||||||
|
. In environments where the gcc compiler is used and lcov version 1.6 or above
|
||||||
|
is installed, if you specify
|
||||||
|
|
||||||
|
--enable-coverage
|
||||||
|
|
||||||
|
the build process implements a code coverage report for the test suite. The
|
||||||
|
report is generated by running "make coverage". If ccache is installed on
|
||||||
|
your system, it must be disabled when building PCRE2 for coverage reporting.
|
||||||
|
You can do this by setting the environment variable CCACHE_DISABLE=1 before
|
||||||
|
running "make" to build PCRE2. There is more information about coverage
|
||||||
|
reporting in the "pcre2build" documentation.
|
||||||
|
|
||||||
|
. The pcre2grep program currently supports only 8-bit data files, and so
|
||||||
|
requires the 8-bit PCRE2 library. It is possible to compile pcre2grep to use
|
||||||
|
libz and/or libbz2, in order to read .gz and .bz2 files (respectively), by
|
||||||
|
specifying one or both of
|
||||||
|
|
||||||
|
--enable-pcre2grep-libz
|
||||||
|
--enable-pcre2grep-libbz2
|
||||||
|
|
||||||
|
Of course, the relevant libraries must be installed on your system.
|
||||||
|
|
||||||
|
. The default size (in bytes) of the internal buffer used by pcre2grep can be
|
||||||
|
set by, for example:
|
||||||
|
|
||||||
|
--with-pcre2grep-bufsize=51200
|
||||||
|
|
||||||
|
The value must be a plain integer. The default is 20480.
|
||||||
|
|
||||||
|
. It is possible to compile pcre2test so that it links with the libreadline
|
||||||
|
or libedit libraries, by specifying, respectively,
|
||||||
|
|
||||||
|
--enable-pcre2test-libreadline or --enable-pcre2test-libedit
|
||||||
|
|
||||||
|
If this is done, when pcre2test's input is from a terminal, it reads it using
|
||||||
|
the readline() function. This provides line-editing and history facilities.
|
||||||
|
Note that libreadline is GPL-licenced, so if you distribute a binary of
|
||||||
|
pcre2test linked in this way, there may be licensing issues. These can be
|
||||||
|
avoided by linking with libedit (which has a BSD licence) instead.
|
||||||
|
|
||||||
|
Enabling libreadline causes the -lreadline option to be added to the
|
||||||
|
pcre2test build. In many operating environments with a system-installed
|
||||||
|
readline library this is sufficient. However, in some environments (e.g. if
|
||||||
|
an unmodified distribution version of readline is in use), it may be
|
||||||
|
necessary to specify something like LIBS="-lncurses" as well. This is
|
||||||
|
because, to quote the readline INSTALL, "Readline uses the termcap functions,
|
||||||
|
but does not link with the termcap or curses library itself, allowing
|
||||||
|
applications which link with readline the to choose an appropriate library."
|
||||||
|
If you get error messages about missing functions tgetstr, tgetent, tputs,
|
||||||
|
tgetflag, or tgoto, this is the problem, and linking with the ncurses library
|
||||||
|
should fix it.
|
||||||
|
|
||||||
|
The "configure" script builds the following files for the basic C library:
|
||||||
|
|
||||||
|
. Makefile the makefile that builds the library
|
||||||
|
. src/config.h build-time configuration options for the library
|
||||||
|
. src/pcre2.h the public PCRE2 header file
|
||||||
|
. pcre2-config script that shows the building settings such as CFLAGS
|
||||||
|
that were set for "configure"
|
||||||
|
. libpcre2-8.pc )
|
||||||
|
. libpcre2-16.pc ) data for the pkg-config command
|
||||||
|
. libpcre2-32.pc )
|
||||||
|
. libpcre2-posix.pc )
|
||||||
|
. libtool script that builds shared and/or static libraries
|
||||||
|
|
||||||
|
Versions of config.h and pcre2.h are distributed in the src directory of PCRE2
|
||||||
|
tarballs under the names config.h.generic and pcre2.h.generic. These are
|
||||||
|
provided for those who have to build PCRE2 without using "configure" or CMake.
|
||||||
|
If you use "configure" or CMake, the .generic versions are not used.
|
||||||
|
|
||||||
|
The "configure" script also creates config.status, which is an executable
|
||||||
|
script that can be run to recreate the configuration, and config.log, which
|
||||||
|
contains compiler output from tests that "configure" runs.
|
||||||
|
|
||||||
|
Once "configure" has run, you can run "make". This builds whichever of the
|
||||||
|
libraries libpcre2-8, libpcre2-16 and libpcre2-32 are configured, and a test
|
||||||
|
program called pcre2test. If you enabled JIT support with --enable-jit, another
|
||||||
|
test program called pcre2_jit_test is built as well. FIXME: still to be
|
||||||
|
implemented. If the 8-bit library is built, libpcre2-posix and the pcre2grep
|
||||||
|
command are also built.
|
||||||
|
|
||||||
|
The command "make check" runs all the appropriate tests. Details of the PCRE2
|
||||||
|
tests are given below in a separate section of this document.
|
||||||
|
|
||||||
|
You can use "make install" to install PCRE2 into live directories on your
|
||||||
|
system. The following are installed (file names are all relative to the
|
||||||
|
<prefix> that is set when "configure" is run):
|
||||||
|
|
||||||
|
Commands (bin):
|
||||||
|
pcre2test
|
||||||
|
pcre2grep (if 8-bit support is enabled)
|
||||||
|
pcre2-config
|
||||||
|
|
||||||
|
Libraries (lib):
|
||||||
|
libpcre2-8 (if 8-bit support is enabled)
|
||||||
|
libpcre2-16 (if 16-bit support is enabled)
|
||||||
|
libpcre2-32 (if 32-bit support is enabled)
|
||||||
|
libpcre2-posix (if 8-bit support is enabled)
|
||||||
|
|
||||||
|
Configuration information (lib/pkgconfig):
|
||||||
|
libpcre2-8.pc
|
||||||
|
libpcre2-16.pc
|
||||||
|
libpcre2-32.pc
|
||||||
|
libpcre2-posix.pc
|
||||||
|
|
||||||
|
Header files (include):
|
||||||
|
pcre2.h
|
||||||
|
pcre2posix.h
|
||||||
|
|
||||||
|
Man pages (share/man/man{1,3}):
|
||||||
|
pcre2grep.1
|
||||||
|
pcre2test.1
|
||||||
|
pcre2-config.1
|
||||||
|
pcre2.3
|
||||||
|
pcre2*.3 (lots more pages, all starting "pcre2")
|
||||||
|
|
||||||
|
HTML documentation (share/doc/pcre2/html):
|
||||||
|
index.html
|
||||||
|
*.html (lots more pages, hyperlinked from index.html)
|
||||||
|
|
||||||
|
Text file documentation (share/doc/pcre2):
|
||||||
|
AUTHORS
|
||||||
|
COPYING
|
||||||
|
ChangeLog
|
||||||
|
LICENCE
|
||||||
|
NEWS
|
||||||
|
README
|
||||||
|
pcre2.txt (a concatenation of the man(3) pages)
|
||||||
|
pcre2test.txt the pcre2test man page
|
||||||
|
pcre2grep.txt the pcre2grep man page
|
||||||
|
pcre2-config.txt the pcre2-config man page
|
||||||
|
|
||||||
|
If you want to remove PCRE2 from your system, you can run "make uninstall".
|
||||||
|
This removes all the files that "make install" installed. However, it does not
|
||||||
|
remove any directories, because these are often shared with other programs.
|
||||||
|
|
||||||
|
|
||||||
|
Retrieving configuration information
|
||||||
|
------------------------------------
|
||||||
|
|
||||||
|
Running "make install" installs the command pcre2-config, which can be used to
|
||||||
|
recall information about the PCRE2 configuration and installation. For example:
|
||||||
|
|
||||||
|
pcre2-config --version
|
||||||
|
|
||||||
|
prints the version number, and
|
||||||
|
|
||||||
|
pcre2-config --libs8
|
||||||
|
|
||||||
|
outputs information about where the 8-bit library is installed. This command
|
||||||
|
can be included in makefiles for programs that use PCRE2, saving the programmer
|
||||||
|
from having to remember too many details. Run pcre2-config with no arguments to
|
||||||
|
obtain a list of possible arguments.
|
||||||
|
|
||||||
|
The pkg-config command is another system for saving and retrieving information
|
||||||
|
about installed libraries. Instead of separate commands for each library, a
|
||||||
|
single command is used. For example:
|
||||||
|
|
||||||
|
pkg-config --libs libpcre2-16
|
||||||
|
|
||||||
|
The data is held in *.pc files that are installed in a directory called
|
||||||
|
<prefix>/lib/pkgconfig.
|
||||||
|
|
||||||
|
|
||||||
|
Shared libraries
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The default distribution builds PCRE2 as shared libraries and static libraries,
|
||||||
|
as long as the operating system supports shared libraries. Shared library
|
||||||
|
support relies on the "libtool" script which is built as part of the
|
||||||
|
"configure" process.
|
||||||
|
|
||||||
|
The libtool script is used to compile and link both shared and static
|
||||||
|
libraries. They are placed in a subdirectory called .libs when they are newly
|
||||||
|
built. The programs pcre2test and pcre2grep are built to use these uninstalled
|
||||||
|
libraries (by means of wrapper scripts in the case of shared libraries). When
|
||||||
|
you use "make install" to install shared libraries, pcre2grep and pcre2test are
|
||||||
|
automatically re-built to use the newly installed shared libraries before being
|
||||||
|
installed themselves. However, the versions left in the build directory still
|
||||||
|
use the uninstalled libraries.
|
||||||
|
|
||||||
|
To build PCRE2 using static libraries only you must use --disable-shared when
|
||||||
|
configuring it. For example:
|
||||||
|
|
||||||
|
./configure --prefix=/usr/gnu --disable-shared
|
||||||
|
|
||||||
|
Then run "make" in the usual way. Similarly, you can use --disable-static to
|
||||||
|
build only shared libraries.
|
||||||
|
|
||||||
|
|
||||||
|
Cross-compiling using autotools
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
You can specify CC and CFLAGS in the normal way to the "configure" command, in
|
||||||
|
order to cross-compile PCRE2 for some other host. However, you should NOT
|
||||||
|
specify --enable-rebuild-chartables, because if you do, the dftables.c source
|
||||||
|
file is compiled and run on the local host, in order to generate the inbuilt
|
||||||
|
character tables (the pcre2_chartables.c file). This will probably not work,
|
||||||
|
because dftables.c needs to be compiled with the local compiler, not the cross
|
||||||
|
compiler.
|
||||||
|
|
||||||
|
When --enable-rebuild-chartables is not specified, pcre2_chartables.c is
|
||||||
|
created by making a copy of pcre2_chartables.c.dist, which is a default set of
|
||||||
|
tables that assumes ASCII code. Cross-compiling with the default tables should
|
||||||
|
not be a problem.
|
||||||
|
|
||||||
|
If you need to modify the character tables when cross-compiling, you should
|
||||||
|
move pcre2_chartables.c.dist out of the way, then compile dftables.c by hand
|
||||||
|
and run it on the local host to make a new version of pcre2_chartables.c.dist.
|
||||||
|
Then when you cross-compile PCRE2 this new version of the tables will be used.
|
||||||
|
|
||||||
|
|
||||||
|
Making new tarballs
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The command "make dist" creates three PCRE2 tarballs, in tar.gz, tar.bz2, and
|
||||||
|
zip formats. The command "make distcheck" does the same, but then does a trial
|
||||||
|
build of the new distribution to ensure that it works.
|
||||||
|
|
||||||
|
If you have modified any of the man page sources in the doc directory, you
|
||||||
|
should first run the PrepareRelease script before making a distribution. This
|
||||||
|
script creates the .txt and HTML forms of the documentation from the man pages.
|
||||||
|
|
||||||
|
|
||||||
|
Testing PCRE2
|
||||||
|
------------
|
||||||
|
|
||||||
|
To test the basic PCRE2 library on a Unix-like system, run the RunTest script.
|
||||||
|
There is another script called RunGrepTest that tests the options of the
|
||||||
|
pcre2grep command. When JIT support is enabled, another test program called
|
||||||
|
pcre2_jit_test is built. Both the scripts and all the program tests are run if
|
||||||
|
you obey "make check". For other environments, see the instructions in
|
||||||
|
NON-AUTOTOOLS-BUILD.
|
||||||
|
|
||||||
|
The RunTest script runs the pcre2test test program (which is documented in its
|
||||||
|
own man page) on each of the relevant testinput files in the testdata
|
||||||
|
directory, and compares the output with the contents of the corresponding
|
||||||
|
testoutput files. RunTest uses a file called testtry to hold the main output
|
||||||
|
from pcre2test. Other files whose names begin with "test" are used as working
|
||||||
|
files in some tests.
|
||||||
|
|
||||||
|
Some tests are relevant only when certain build-time options were selected. For
|
||||||
|
example, the tests for UTF-8/16/32 support are run only if --enable-unicode was
|
||||||
|
used. RunTest outputs a comment when it skips a test.
|
||||||
|
|
||||||
|
Many of the tests that are not skipped are run twice if JIT support is
|
||||||
|
available. On the second run, JIT compilation is forced. This testing can be
|
||||||
|
suppressed by putting "nojit" on the RunTest command line.
|
||||||
|
|
||||||
|
The entire set of tests is run once for each of the 8-bit, 16-bit and 32-bit
|
||||||
|
libraries that are enabled. If you want to run just one set of tests, call
|
||||||
|
RunTest with either the -8, -16 or -32 option.
|
||||||
|
|
||||||
|
If valgrind is installed, you can run the tests under it by putting "valgrind"
|
||||||
|
on the RunTest command line. To run pcre2test on just one or more specific test
|
||||||
|
files, give their numbers as arguments to RunTest, for example:
|
||||||
|
|
||||||
|
RunTest 2 7 11
|
||||||
|
|
||||||
|
You can also specify ranges of tests such as 3-6 or 3- (meaning 3 to the
|
||||||
|
end), or a number preceded by ~ to exclude a test. For example:
|
||||||
|
|
||||||
|
RunTest 3-15 ~10
|
||||||
|
|
||||||
|
This runs tests 3 to 15, excluding test 10, and just ~13 runs all the tests
|
||||||
|
except test 13. Whatever order the arguments are in, the tests are always run
|
||||||
|
in numerical order.
|
||||||
|
|
||||||
|
You can also call RunTest with the single argument "list" to cause it to output
|
||||||
|
a list of tests.
|
||||||
|
|
||||||
|
The first two tests can always be run, as they expect only plain text strings
|
||||||
|
(not UTF) and make no use of Unicode properties. The first test file can be fed
|
||||||
|
directly into the perltest.pl script to check that Perl gives the same results.
|
||||||
|
The only difference you should see is in the first few lines, where the Perl
|
||||||
|
version is given instead of the PCRE2 version. The second set of tests checks
|
||||||
|
auxiliary functions, error detection, and run-time flags that are specific to
|
||||||
|
PCRE2, as well as the POSIX wrapper API. It also uses the debugging flags to
|
||||||
|
check some of the internals of pcre2_compile().
|
||||||
|
|
||||||
|
If you build PCRE2 with a locale setting that is not the standard C locale, the
|
||||||
|
character tables may be different (see next paragraph). In some cases, this may
|
||||||
|
cause failures in the second set of tests. For example, in a locale where the
|
||||||
|
isprint() function yields TRUE for characters in the range 128-255, the use of
|
||||||
|
[:isascii:] inside a character class defines a different set of characters, and
|
||||||
|
this shows up in this test as a difference in the compiled code, which is being
|
||||||
|
listed for checking. Where the comparison test output contains [\x00-\x7f] the
|
||||||
|
test will contain [\x00-\xff], and similarly in some other cases. This is not a
|
||||||
|
bug in PCRE2.
|
||||||
|
|
||||||
|
The third set of tests checks pcre2_maketables(), the facility for building a
|
||||||
|
set of character tables for a specific locale and using them instead of the
|
||||||
|
default tables. The script uses the "locale" command to check for the
|
||||||
|
availability of the "fr_FR", "french", or "fr" locale, and uses the first one
|
||||||
|
that it finds. If the "locale" command fails, or if its output doesn't include
|
||||||
|
"fr_FR", "french", or "fr" in the list of available locales, the third test
|
||||||
|
cannot be run, and a comment is output to say why. If running this test
|
||||||
|
produces an error like this
|
||||||
|
|
||||||
|
** Failed to set locale "fr_FR"
|
||||||
|
|
||||||
|
it means that the given locale is not available on your system, despite being
|
||||||
|
listed by "locale". This does not mean that PCRE2 is broken. There are three
|
||||||
|
alternative output files for the third test, because three different versions
|
||||||
|
of the French locale have been encountered. The test passes if its output
|
||||||
|
matches any one of them.
|
||||||
|
|
||||||
|
The fourth and fifth tests check UTF and Unicode property support, the fourth
|
||||||
|
being compatible with the perltest.pl script, and the fifth checking
|
||||||
|
PCRE2-specific things.
|
||||||
|
|
||||||
|
The sixth and seventh tests check the pcre2_dfa_match() alternative matching
|
||||||
|
function, in non-UTF mode and UTF-mode with Unicode property support,
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
The eighth test checks some internal offsets and code size features; it is
|
||||||
|
run only when the default "link size" of 2 is set (in other cases the sizes
|
||||||
|
change) and when Unicode support is enabled.
|
||||||
|
|
||||||
|
The ninth and tenth tests are run only in 8-bit mode, and the eleventh and
|
||||||
|
twelfth tests are run only in 16-bit and 32-bit modes. These are tests that
|
||||||
|
generate different output in 8-bit mode. Each pair are for general cases and
|
||||||
|
Unicode support, respectively. The thirteenth test checks the handling of
|
||||||
|
non-UTF characters greater than 255 by pcre2_dfa_match() in 16-bit and 32-bit
|
||||||
|
modes.
|
||||||
|
|
||||||
|
The fourteenth test is run only when JIT support is not available, and the
|
||||||
|
fifteenth test is run only when JIT support is available. They test some
|
||||||
|
JIT-specific features such as information output from pcre2test about JIT
|
||||||
|
compilation.
|
||||||
|
|
||||||
|
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
||||||
|
POSIX interface to the 8-bit library, without and with Unicode support,
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
|
||||||
|
Character tables
|
||||||
|
----------------
|
||||||
|
|
||||||
|
For speed, PCRE2 uses four tables for manipulating and identifying characters
|
||||||
|
whose code point values are less than 256. By default, a set of tables that is
|
||||||
|
built into the library is used. The pcre2_maketables() function can be called
|
||||||
|
by an application to create a new set of tables in the current locale. These are
|
||||||
|
passed to PCRE2 by calling pcre2_set_character_tables() to put a pointer into a
|
||||||
|
compile context.
|
||||||
|
|
||||||
|
The source file called pcre2_chartables.c contains the default set of tables.
|
||||||
|
By default, this is created as a copy of pcre2_chartables.c.dist, which
|
||||||
|
contains tables for ASCII coding. However, if --enable-rebuild-chartables is
|
||||||
|
specified for ./configure, a different version of pcre2_chartables.c is built
|
||||||
|
by the program dftables (compiled from dftables.c), which uses the ANSI C
|
||||||
|
character handling functions such as isalnum(), isalpha(), isupper(),
|
||||||
|
islower(), etc. to build the table sources. This means that the default C
|
||||||
|
locale which is set for your system will control the contents of these default
|
||||||
|
tables. You can change the default tables by editing pcre2_chartables.c and
|
||||||
|
then re-building PCRE2. If you do this, you should take care to ensure that the
|
||||||
|
file does not get automatically re-generated. The best way to do this is to
|
||||||
|
move pcre2_chartables.c.dist out of the way and replace it with your customized
|
||||||
|
tables.
|
||||||
|
|
||||||
|
When the dftables program is run as a result of --enable-rebuild-chartables,
|
||||||
|
it uses the default C locale that is set on your system. It does not pay
|
||||||
|
attention to the LC_xxx environment variables. In other words, it uses the
|
||||||
|
system's default locale rather than whatever the compiling user happens to have
|
||||||
|
set. If you really do want to build a source set of character tables in a
|
||||||
|
locale that is specified by the LC_xxx variables, you can run the dftables
|
||||||
|
program by hand with the -L option. For example:
|
||||||
|
|
||||||
|
./dftables -L pcre2_chartables.c.special
|
||||||
|
|
||||||
|
The first two 256-byte tables provide lower casing and case flipping functions,
|
||||||
|
respectively. The next table consists of three 32-byte bit maps which identify
|
||||||
|
digits, "word" characters, and white space, respectively. These are used when
|
||||||
|
building 32-byte bit maps that represent character classes for code points less
|
||||||
|
than 256. The final 256-byte table has bits indicating various character types,
|
||||||
|
as follows:
|
||||||
|
|
||||||
|
1 white space character
|
||||||
|
2 letter
|
||||||
|
4 decimal digit
|
||||||
|
8 hexadecimal digit
|
||||||
|
16 alphanumeric or '_'
|
||||||
|
128 regular expression metacharacter or binary zero
|
||||||
|
|
||||||
|
You should not alter the set of characters that contain the 128 bit, as that
|
||||||
|
will cause PCRE2 to malfunction.
|
||||||
|
|
||||||
|
|
||||||
|
File manifest
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The distribution should contain the files listed below.
|
||||||
|
|
||||||
|
(A) Source files for the PCRE2 library functions and their headers are found in
|
||||||
|
the src directory:
|
||||||
|
|
||||||
|
src/dftables.c auxiliary program for building pcre2_chartables.c
|
||||||
|
when --enable-rebuild-chartables is specified
|
||||||
|
|
||||||
|
src/pcre2_chartables.c.dist a default set of character tables that assume
|
||||||
|
ASCII coding; unless --enable-rebuild-chartables is
|
||||||
|
specified, used by copying to pcre2_chartables.c
|
||||||
|
|
||||||
|
src/pcre2posix.c )
|
||||||
|
src/pcre2_auto_possess.c )
|
||||||
|
src/pcre2_compile.c )
|
||||||
|
src/pcre2_config.c )
|
||||||
|
src/pcre2_context.c )
|
||||||
|
src/pcre2_dfa_match.c )
|
||||||
|
src/pcre2_error.c )
|
||||||
|
src/pcre2_exec.c )
|
||||||
|
src/pcre2_jit_compile.c )
|
||||||
|
src/pcre2_jit_match.c ) sources for the functions in the library,
|
||||||
|
src/pcre2_jit_misc.c ) and some internal functions that they use
|
||||||
|
src/pcre2_maketables.c )
|
||||||
|
src/pcre2_match.c )
|
||||||
|
src/pcre2_match_data.c )
|
||||||
|
src/pcre2_newline.c )
|
||||||
|
src/pcre2_ord2utf.c )
|
||||||
|
src/pcre2_pattern_info.c )
|
||||||
|
src/pcre2_string_utils.c )
|
||||||
|
src/pcre2_study.c )
|
||||||
|
src/pcre2_substring.c )
|
||||||
|
src/pcre2_tables.c )
|
||||||
|
src/pcre2_ucd.c )
|
||||||
|
src/pcre2_valid_utf.c )
|
||||||
|
src/pcre2_xclass.c )
|
||||||
|
|
||||||
|
src/pcre2_printint.c debugging function that is used by pcre2test
|
||||||
|
|
||||||
|
src/config.h.in template for config.h, when built by "configure"
|
||||||
|
src/pcre2.h.in template for pcre2.h when built by "configure"
|
||||||
|
src/pcre2posix.h header for the external POSIX wrapper API
|
||||||
|
src/pcre2_internal.h header for internal use
|
||||||
|
src/pcre2_intmodedep.h a mode-specific internal header
|
||||||
|
src/pcre2_ucp.h header for Unicode property handling
|
||||||
|
|
||||||
|
sljit/* 16 files that make up the JIT compiler FIXME
|
||||||
|
|
||||||
|
(B) Source files for programs that use PCRE2:
|
||||||
|
|
||||||
|
src/pcre2demo.c simple demonstration of coding calls to PCRE2
|
||||||
|
src/pcre2grep.c source of a grep utility that uses PCRE2
|
||||||
|
src/pcre2test.c comprehensive test program
|
||||||
|
|
||||||
|
(C) Auxiliary files:
|
||||||
|
|
||||||
|
132html script to turn "man" pages into HTML
|
||||||
|
AUTHORS information about the author of PCRE2
|
||||||
|
ChangeLog log of changes to the code
|
||||||
|
CleanTxt script to clean nroff output for txt man pages
|
||||||
|
Detrail script to remove trailing spaces
|
||||||
|
HACKING some notes about the internals of PCRE2
|
||||||
|
INSTALL generic installation instructions
|
||||||
|
LICENCE conditions for the use of PCRE2
|
||||||
|
COPYING the same, using GNU's standard name
|
||||||
|
Makefile.in ) template for Unix Makefile, which is built by
|
||||||
|
) "configure"
|
||||||
|
Makefile.am ) the automake input that was used to create
|
||||||
|
) Makefile.in
|
||||||
|
NEWS important changes in this release
|
||||||
|
NON-AUTOTOOLS-BUILD notes on building PCRE2 without using autotools
|
||||||
|
PrepareRelease script to make preparations for "make dist"
|
||||||
|
README this file
|
||||||
|
RunTest a Unix shell script for running tests
|
||||||
|
RunGrepTest a Unix shell script for pcre2grep tests
|
||||||
|
aclocal.m4 m4 macros (generated by "aclocal")
|
||||||
|
config.guess ) files used by libtool,
|
||||||
|
config.sub ) used only when building a shared library
|
||||||
|
configure a configuring shell script (built by autoconf)
|
||||||
|
configure.ac ) the autoconf input that was used to build
|
||||||
|
) "configure" and config.h
|
||||||
|
depcomp ) script to find program dependencies, generated by
|
||||||
|
) automake
|
||||||
|
doc/*.3 man page sources for PCRE2
|
||||||
|
doc/*.1 man page sources for pcre2grep and pcre2test
|
||||||
|
doc/index.html.src the base HTML page
|
||||||
|
doc/html/* HTML documentation
|
||||||
|
doc/pcre2.txt plain text version of the man pages
|
||||||
|
doc/pcre2test.txt plain text documentation of test program
|
||||||
|
doc/perltest.txt plain text documentation of Perl test program
|
||||||
|
install-sh a shell script for installing files
|
||||||
|
libpcre2-8.pc.in template for libpcre2-8.pc for pkg-config
|
||||||
|
libpcre2-16.pc.in template for libpcre2-16.pc for pkg-config
|
||||||
|
libpcre2-32.pc.in template for libpcre2-32.pc for pkg-config
|
||||||
|
libpcre2posix.pc.in template for libpcre2posix.pc for pkg-config
|
||||||
|
ltmain.sh file used to build a libtool script
|
||||||
|
missing ) common stub for a few missing GNU programs while
|
||||||
|
) installing, generated by automake
|
||||||
|
mkinstalldirs script for making install directories
|
||||||
|
perltest.pl Perl test program
|
||||||
|
pcre2-config.in source of script which retains PCRE2 information
|
||||||
|
pcre2_jit_test.c test program for the JIT compiler
|
||||||
|
testdata/testinput* test data for main library tests
|
||||||
|
testdata/testoutput* expected test results
|
||||||
|
testdata/grep* input and output for pcre2grep tests
|
||||||
|
testdata/* other supporting test files
|
||||||
|
|
||||||
|
(D) Auxiliary files for cmake support
|
||||||
|
|
||||||
|
cmake/COPYING-CMAKE-SCRIPTS
|
||||||
|
cmake/FindPackageHandleStandardArgs.cmake
|
||||||
|
cmake/FindEditline.cmake
|
||||||
|
cmake/FindReadline.cmake
|
||||||
|
CMakeLists.txt
|
||||||
|
config-cmake.h.in
|
||||||
|
|
||||||
|
(E) Auxiliary files for VPASCAL FIXME FIXME
|
||||||
|
|
||||||
|
makevp.bat
|
||||||
|
makevp_c.txt
|
||||||
|
makevp_l.txt
|
||||||
|
pcre2gexp.pas
|
||||||
|
|
||||||
|
(F) Auxiliary files for building PCRE2 "by hand"
|
||||||
|
|
||||||
|
pcre2.h.generic ) a version of the public PCRE2 header file
|
||||||
|
) for use in non-"configure" environments
|
||||||
|
config.h.generic ) a version of config.h for use in non-"configure"
|
||||||
|
) environments
|
||||||
|
|
||||||
|
(G) Miscellaneous
|
||||||
|
|
||||||
|
RunTest.bat a script for running tests under Windows FIXME
|
||||||
|
|
||||||
|
Philip Hazel
|
||||||
|
Email local part: ph10
|
||||||
|
Email domain: cam.ac.uk
|
||||||
|
Last updated: 27 October 2014
|
||||||
|
|
|
@ -0,0 +1,102 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2-config specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2-config man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<ul>
|
||||||
|
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
|
||||||
|
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||||
|
<li><a name="TOC3" href="#SEC3">OPTIONS</a>
|
||||||
|
<li><a name="TOC4" href="#SEC4">SEE ALSO</a>
|
||||||
|
<li><a name="TOC5" href="#SEC5">AUTHOR</a>
|
||||||
|
<li><a name="TOC6" href="#SEC6">REVISION</a>
|
||||||
|
</ul>
|
||||||
|
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2-config [--prefix] [--exec-prefix] [--version]</b>
|
||||||
|
<b> [--libs8] [--libs16] [--libs32] [--libs-posix]</b>
|
||||||
|
<b> [--cflags] [--cflags-posix]</b>
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2-config</b> returns the configuration of the installed PCRE2 libraries
|
||||||
|
and the options required to compile a program to use them. Some of the options
|
||||||
|
apply only to the 8-bit, or 16-bit, or 32-bit libraries, respectively, and are
|
||||||
|
not available for libraries that have not been built. If an unavailable option
|
||||||
|
is encountered, the "usage" information is output.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
|
||||||
|
<P>
|
||||||
|
<b>--prefix</b>
|
||||||
|
Writes the directory prefix used in the PCRE2 installation for architecture
|
||||||
|
independent files (<i>/usr</i> on many systems, <i>/usr/local</i> on some
|
||||||
|
systems) to the standard output.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--exec-prefix</b>
|
||||||
|
Writes the directory prefix used in the PCRE2 installation for architecture
|
||||||
|
dependent files (normally the same as <b>--prefix</b>) to the standard output.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--version</b>
|
||||||
|
Writes the version number of the installed PCRE2 libraries to the standard
|
||||||
|
output.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--libs8</b>
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 8-bit PCRE2 library (<b>-lpcre2-8</b> on many systems).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--libs16</b>
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 16-bit PCRE2 library (<b>-lpcre2-16</b> on many systems).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--libs32</b>
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 32-bit PCRE2 library (<b>-lpcre2-32</b> on many systems).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--libs-posix</b>
|
||||||
|
Writes to the standard output the command line options required to link with
|
||||||
|
PCRE2's POSIX API wrapper library (<b>-lpcre2-posix</b> <b>-lpcre2-8</b> on many
|
||||||
|
systems).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--cflags</b>
|
||||||
|
Writes to the standard output the command line options required to compile
|
||||||
|
files that use PCRE2 (this may include some <b>-I</b> options, but is blank on
|
||||||
|
many systems).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--cflags-posix</b>
|
||||||
|
Writes to the standard output the command line options required to compile
|
||||||
|
files that use PCRE2's POSIX API wrapper library (this may include some
|
||||||
|
<b>-I</b> options, but is blank on many systems).
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC4" href="#TOC1">SEE ALSO</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2(3)</b>
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC5" href="#TOC1">AUTHOR</a><br>
|
||||||
|
<P>
|
||||||
|
This manual page was originally written by Mark Baker for the Debian GNU/Linux
|
||||||
|
system. It has been subsequently revised as a generic PCRE2 man page.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC6" href="#TOC1">REVISION</a><br>
|
||||||
|
<P>
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
<br>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -0,0 +1,182 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2 specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2 man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<ul>
|
||||||
|
<li><a name="TOC1" href="#SEC1">INTRODUCTION</a>
|
||||||
|
<li><a name="TOC2" href="#SEC2">SECURITY CONSIDERATIONS</a>
|
||||||
|
<li><a name="TOC3" href="#SEC3">USER DOCUMENTATION</a>
|
||||||
|
<li><a name="TOC4" href="#SEC4">AUTHOR</a>
|
||||||
|
<li><a name="TOC5" href="#SEC5">REVISION</a>
|
||||||
|
</ul>
|
||||||
|
<br><a name="SEC1" href="#TOC1">INTRODUCTION</a><br>
|
||||||
|
<P>
|
||||||
|
PCRE2 is the name used for a revised API for the PCRE library, which is a set
|
||||||
|
of functions, written in C, that implement regular expression pattern matching
|
||||||
|
using the same syntax and semantics as Perl, with just a few differences. Some
|
||||||
|
features that appeared in Python and the original PCRE before they appeared in
|
||||||
|
Perl are also available using the Python syntax, there is some support for one
|
||||||
|
or two .NET and Oniguruma syntax items, and there are options for requesting
|
||||||
|
some minor changes that give better ECMAScript (aka JavaScript) compatibility.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||||
|
code units, which means that up to three separate libraries may be installed.
|
||||||
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
|
Unicode, with support for Unicode general category properties. Unicode is
|
||||||
|
optional at build time, and must be enabled explicitly at run time. The version
|
||||||
|
of Unicode in use can be discovered by running
|
||||||
|
<pre>
|
||||||
|
pcre2test -C
|
||||||
|
</PRE>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The three libraries contain identical sets of functions, with names ending in
|
||||||
|
_8, _16, or _32, respectively (for example, <b>pcre2_compile_8()</b>). However,
|
||||||
|
by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just
|
||||||
|
one code unit width can be written using generic names such as
|
||||||
|
<b>pcre2_compile()</b>, and the documentation is written assuming that this is
|
||||||
|
the case.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
In addition to the Perl-compatible matching function, PCRE2 contains an
|
||||||
|
alternative function that matches the same compiled patterns in a different
|
||||||
|
way. In certain circumstances, the alternative function has some advantages.
|
||||||
|
For a discussion of the two matching algorithms, see the
|
||||||
|
<a href="pcre2matching.html"><b>pcre2matching</b></a>
|
||||||
|
page.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Details of exactly which Perl regular expression features are and are not
|
||||||
|
supported by PCRE2 are given in separate documents. See the
|
||||||
|
<a href="pcre2pattern.html"><b>pcre2pattern</b></a>
|
||||||
|
and
|
||||||
|
<a href="pcre2compat.html"><b>pcre2compat</b></a>
|
||||||
|
pages. There is a syntax summary in the
|
||||||
|
<a href="pcre2syntax.html"><b>pcre2syntax</b></a>
|
||||||
|
page.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Some features of PCRE2 can be included, excluded, or changed when the library
|
||||||
|
is built. The
|
||||||
|
<a href="pcre2_config.html"><b>pcre2_config()</b></a>
|
||||||
|
function makes it possible for a client to discover which features are
|
||||||
|
available. The features themselves are described in the
|
||||||
|
<a href="pcre2build.html"><b>pcre2build</b></a>
|
||||||
|
page. Documentation about building PCRE2 for various operating systems can be
|
||||||
|
found in the
|
||||||
|
<a href="README.txt"><b>README</b></a>
|
||||||
|
and
|
||||||
|
<a href="NON-AUTOTOOLS-BUILD.txt"><b>NON-AUTOTOOLS-BUILD</b></a>
|
||||||
|
files in the source distribution.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The libraries contain a number of undocumented internal functions and data
|
||||||
|
tables that are used by more than one of the exported external functions, but
|
||||||
|
which are not intended for use by external callers. Their names all begin with
|
||||||
|
"_pcre2", which hopefully will not provoke any name clashes. In some
|
||||||
|
environments, it is possible to control which external symbols are exported
|
||||||
|
when a shared library is built, and in these cases the undocumented symbols are
|
||||||
|
not exported.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC2" href="#TOC1">SECURITY CONSIDERATIONS</a><br>
|
||||||
|
<P>
|
||||||
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
|
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
||||||
|
was built with Unicode support. For example, an 8-bit pattern that begins with
|
||||||
|
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
||||||
|
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
||||||
|
the pattern and any data against which it is matched to be checked for UTF-8
|
||||||
|
validity. If the data string is very long, such a check might use sufficiently
|
||||||
|
many resources as to cause your application to lose performance.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
One way of guarding against this possibility is to use the
|
||||||
|
<b>pcre2_pattern_info()</b> function to check the compiled pattern's options for
|
||||||
|
UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time.
|
||||||
|
This causes a compile time error if a pattern contains a UTF-setting sequence.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If your application is one that supports UTF, be aware that validity checking
|
||||||
|
can take time. If the same data string is to be matched many times, you can use
|
||||||
|
the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid
|
||||||
|
running redundant checks.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Another way that performance can be hit is by running a pattern that has a very
|
||||||
|
large search tree against a string that will never match. Nested unlimited
|
||||||
|
repeats in a pattern are a common example. PCRE2 provides some protection
|
||||||
|
against this: see the <b>pcre2_set_match_limit()</b> function in the
|
||||||
|
<a href="pcre2api.html"><b>pcre2api</b></a>
|
||||||
|
page.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC3" href="#TOC1">USER DOCUMENTATION</a><br>
|
||||||
|
<P>
|
||||||
|
The user documentation for PCRE2 comprises a number of different sections. In
|
||||||
|
the "man" format, each of these is a separate "man page". In the HTML format,
|
||||||
|
each is a separate page, linked from the index page. In the plain text format,
|
||||||
|
the descriptions of the <b>pcre2grep</b> and <b>pcre2test</b> programs are in
|
||||||
|
files called <b>pcre2grep.txt</b> and <b>pcre2test.txt</b>, respectively. The
|
||||||
|
remaining sections, except for the <b>pcre2demo</b> section (which is a program
|
||||||
|
listing), and the short pages for individual functions, are concatenated in
|
||||||
|
<b>pcre2.txt</b>, for ease of searching. The sections are as follows:
|
||||||
|
<pre>
|
||||||
|
pcre2 this document FIXME CHECK THIS LIST
|
||||||
|
pcre2-config show PCRE2 installation configuration information
|
||||||
|
pcre2api details of PCRE2's native C API
|
||||||
|
pcre2build building PCRE2
|
||||||
|
pcre2callout details of the callout feature
|
||||||
|
pcre2compat discussion of Perl compatibility
|
||||||
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
|
pcre2grep description of the <b>pcre2grep</b> command (8-bit only)
|
||||||
|
pcre2jit discussion of the just-in-time optimization support
|
||||||
|
pcre2limits details of size and other limits
|
||||||
|
pcre2matching discussion of the two matching algorithms
|
||||||
|
pcre2partial details of the partial matching facility
|
||||||
|
pcre2pattern syntax and semantics of supported regular expressions
|
||||||
|
pcre2perform discussion of performance issues
|
||||||
|
pcre2posix the POSIX-compatible C API for the 8-bit library
|
||||||
|
pcre2sample discussion of the pcre2demo program
|
||||||
|
pcre2stack discussion of stack usage
|
||||||
|
pcre2syntax quick syntax reference
|
||||||
|
pcre2test description of the <b>pcre2test</b> testing command
|
||||||
|
pcre2unicode discussion of Unicode and UTF support
|
||||||
|
</pre>
|
||||||
|
In the "man" and HTML formats, there is also a short page for each C library
|
||||||
|
function, listing its arguments and results.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC4" href="#TOC1">AUTHOR</a><br>
|
||||||
|
<P>
|
||||||
|
Philip Hazel
|
||||||
|
<br>
|
||||||
|
University Computing Service
|
||||||
|
<br>
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
<br>
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||||
|
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||||
|
<P>
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
<br>
|
||||||
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
|
<br>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -0,0 +1,759 @@
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>pcre2grep specification</title>
|
||||||
|
</head>
|
||||||
|
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
|
||||||
|
<h1>pcre2grep man page</h1>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
This page is part of the PCRE2 HTML documentation. It was generated
|
||||||
|
automatically from the original man page. If there is any nonsense in it,
|
||||||
|
please consult the man page, in case the conversion went wrong.
|
||||||
|
<br>
|
||||||
|
<ul>
|
||||||
|
<li><a name="TOC1" href="#SEC1">SYNOPSIS</a>
|
||||||
|
<li><a name="TOC2" href="#SEC2">DESCRIPTION</a>
|
||||||
|
<li><a name="TOC3" href="#SEC3">SUPPORT FOR COMPRESSED FILES</a>
|
||||||
|
<li><a name="TOC4" href="#SEC4">BINARY FILES</a>
|
||||||
|
<li><a name="TOC5" href="#SEC5">OPTIONS</a>
|
||||||
|
<li><a name="TOC6" href="#SEC6">ENVIRONMENT VARIABLES</a>
|
||||||
|
<li><a name="TOC7" href="#SEC7">NEWLINES</a>
|
||||||
|
<li><a name="TOC8" href="#SEC8">OPTIONS COMPATIBILITY</a>
|
||||||
|
<li><a name="TOC9" href="#SEC9">OPTIONS WITH DATA</a>
|
||||||
|
<li><a name="TOC10" href="#SEC10">MATCHING ERRORS</a>
|
||||||
|
<li><a name="TOC11" href="#SEC11">DIAGNOSTICS</a>
|
||||||
|
<li><a name="TOC12" href="#SEC12">SEE ALSO</a>
|
||||||
|
<li><a name="TOC13" href="#SEC13">AUTHOR</a>
|
||||||
|
<li><a name="TOC14" href="#SEC14">REVISION</a>
|
||||||
|
</ul>
|
||||||
|
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2grep [options] [long options] [pattern] [path1 path2 ...]</b>
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2grep</b> searches files for character patterns, in the same way as other
|
||||||
|
grep commands do, but it uses the PCRE2 regular expression library to support
|
||||||
|
patterns that are compatible with the regular expressions of Perl 5. See
|
||||||
|
<a href="pcre2syntax.html"><b>pcre2syntax</b>(3)</a>
|
||||||
|
for a quick-reference summary of pattern syntax, or
|
||||||
|
<a href="pcre2pattern.html"><b>pcre2pattern</b>(3)</a>
|
||||||
|
for a full description of the syntax and semantics of the regular expressions
|
||||||
|
that PCRE2 supports.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Patterns, whether supplied on the command line or in a separate file, are given
|
||||||
|
without delimiters. For example:
|
||||||
|
<pre>
|
||||||
|
pcre2grep Thursday /etc/motd
|
||||||
|
</pre>
|
||||||
|
If you attempt to use delimiters (for example, by surrounding a pattern with
|
||||||
|
slashes, as is common in Perl scripts), they are interpreted as part of the
|
||||||
|
pattern. Quotes can of course be used to delimit patterns on the command line
|
||||||
|
because they are interpreted by the shell, and indeed quotes are required if a
|
||||||
|
pattern contains white space or shell metacharacters.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The first argument that follows any option settings is treated as the single
|
||||||
|
pattern to be matched when neither <b>-e</b> nor <b>-f</b> is present.
|
||||||
|
Conversely, when one or both of these options are used to specify patterns, all
|
||||||
|
arguments are treated as path names. At least one of <b>-e</b>, <b>-f</b>, or an
|
||||||
|
argument pattern must be provided.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If no files are specified, <b>pcre2grep</b> reads the standard input. The
|
||||||
|
standard input can also be referenced by a name consisting of a single hyphen.
|
||||||
|
For example:
|
||||||
|
<pre>
|
||||||
|
pcre2grep some-pattern /file1 - /file3
|
||||||
|
</pre>
|
||||||
|
By default, each line that matches a pattern is copied to the standard
|
||||||
|
output, and if there is more than one file, the file name is output at the
|
||||||
|
start of each line, followed by a colon. However, there are options that can
|
||||||
|
change how <b>pcre2grep</b> behaves. In particular, the <b>-M</b> option makes it
|
||||||
|
possible to search for patterns that span line boundaries. What defines a line
|
||||||
|
boundary is controlled by the <b>-N</b> (<b>--newline</b>) option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The amount of memory used for buffering files that are being scanned is
|
||||||
|
controlled by a parameter that can be set by the <b>--buffer-size</b> option.
|
||||||
|
The default value for this parameter is specified when <b>pcre2grep</b> is built,
|
||||||
|
with the default default being 20K. A block of memory three times this size is
|
||||||
|
used (to allow for buffering "before" and "after" lines). An error occurs if a
|
||||||
|
line overflows the buffer.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
||||||
|
BUFSIZ is defined in <b>&lt;stdio.h&gt;</b>. When there is more than one pattern
|
||||||
|
(specified by the use of <b>-e</b> and/or <b>-f</b>), each pattern is applied to
|
||||||
|
each line in the order in which they are defined, except that all the <b>-e</b>
|
||||||
|
patterns are tried before the <b>-f</b> patterns.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
By default, as soon as one pattern matches a line, no further patterns are
|
||||||
|
considered. However, if <b>--colour</b> (or <b>--color</b>) is used to colour the
|
||||||
|
matching substrings, or if <b>--only-matching</b>, <b>--file-offsets</b>, or
|
||||||
|
<b>--line-offsets</b> is used to output only the part of the line that matched
|
||||||
|
(either shown literally, or as an offset), scanning resumes immediately
|
||||||
|
following the match, so that further matches on the same line can be found. If
|
||||||
|
there are multiple patterns, they are all tried on the remainder of the line,
|
||||||
|
but patterns that follow the one that matched are not tried on the earlier part
|
||||||
|
of the line.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
This behaviour means that the order in which multiple patterns are specified
|
||||||
|
can affect the output when one of the above options is used. This is no longer
|
||||||
|
the same behaviour as GNU grep, which now manages to display earlier matches
|
||||||
|
for later patterns (as long as there is no overlap).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Patterns that can match an empty string are accepted, but empty string
|
||||||
|
matches are never recognized. An example is the pattern "(super)?(man)?", in
|
||||||
|
which all components are optional. This pattern finds all occurrences of both
|
||||||
|
"super" and "man"; the output differs from matching with "super|man" when only
|
||||||
|
the matching substrings are being shown.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variable is set,
|
||||||
|
<b>pcre2grep</b> uses the value to set a locale when calling the PCRE2 library.
|
||||||
|
The <b>--locale</b> option can be used to override this.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC3" href="#TOC1">SUPPORT FOR COMPRESSED FILES</a><br>
|
||||||
|
<P>
|
||||||
|
It is possible to compile <b>pcre2grep</b> so that it uses <b>libz</b> or
|
||||||
|
<b>libbz2</b> to read files whose names end in <b>.gz</b> or <b>.bz2</b>,
|
||||||
|
respectively. You can find out whether your binary has support for one or both
|
||||||
|
of these file types by running it with the <b>--help</b> option. If the
|
||||||
|
appropriate support is not present, files are treated as plain text. The
|
||||||
|
standard input is always so treated.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC4" href="#TOC1">BINARY FILES</a><br>
|
||||||
|
<P>
|
||||||
|
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||||
|
is identified as a binary file, and is processed specially. (GNU grep also
|
||||||
|
identifies binary files in this manner.) See the <b>--binary-files</b> option
|
||||||
|
for a means of changing the way binary files are handled.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC5" href="#TOC1">OPTIONS</a><br>
|
||||||
|
<P>
|
||||||
|
The order in which some of the options appear can affect the output. For
|
||||||
|
example, both the <b>-h</b> and <b>-l</b> options affect the printing of file
|
||||||
|
names. Whichever comes later in the command line will be the one that takes
|
||||||
|
effect. Similarly, except where noted below, if an option is given twice, the
|
||||||
|
later setting is used. Numerical values for options may be followed by K or M,
|
||||||
|
to signify multiplication by 1024 or 1024*1024 respectively.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--</b>
|
||||||
|
This terminates the list of options. It is useful if the next item on the
|
||||||
|
command line starts with a hyphen but is not an option. This allows for the
|
||||||
|
processing of patterns and filenames that start with hyphens.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-A</b> <i>number</i>, <b>--after-context=</b><i>number</i>
|
||||||
|
Output <i>number</i> lines of context after each matching line. If filenames
|
||||||
|
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||||
|
colon for the context lines. A line containing "--" is output between each
|
||||||
|
group of lines, unless they are in fact contiguous in the input file. The value
|
||||||
|
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
||||||
|
guarantees to have up to 8K of following text available for context output.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-a</b>, <b>--text</b>
|
||||||
|
Treat binary files as text. This is equivalent to
|
||||||
|
<b>--binary-files</b>=<i>text</i>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-B</b> <i>number</i>, <b>--before-context=</b><i>number</i>
|
||||||
|
Output <i>number</i> lines of context before each matching line. If filenames
|
||||||
|
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||||
|
colon for the context lines. A line containing "--" is output between each
|
||||||
|
group of lines, unless they are in fact contiguous in the input file. The value
|
||||||
|
of <i>number</i> is expected to be relatively small. However, <b>pcre2grep</b>
|
||||||
|
guarantees to have up to 8K of preceding text available for context output.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--binary-files=</b><i>word</i>
|
||||||
|
Specify how binary files are to be processed. If the word is "binary" (the
|
||||||
|
default), pattern matching is performed on binary files, but the only output is
|
||||||
|
"Binary file &lt;name&gt; matches" when a match succeeds. If the word is "text",
|
||||||
|
which is equivalent to the <b>-a</b> or <b>--text</b> option, binary files are
|
||||||
|
processed in the same way as any other file. In this case, when a match
|
||||||
|
succeeds, the output may be binary garbage, which can have nasty effects if
|
||||||
|
sent to a terminal. If the word is "without-match", which is equivalent to the
|
||||||
|
<b>-I</b> option, binary files are not processed at all; they are assumed not to
|
||||||
|
be of interest.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--buffer-size=</b><i>number</i>
|
||||||
|
Set the parameter that controls how much memory is used for buffering files
|
||||||
|
that are being scanned.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-C</b> <i>number</i>, <b>--context=</b><i>number</i>
|
||||||
|
Output <i>number</i> lines of context both before and after each matching line.
|
||||||
|
This is equivalent to setting both <b>-A</b> and <b>-B</b> to the same value.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-c</b>, <b>--count</b>
|
||||||
|
Do not output individual lines from the files that are being scanned; instead
|
||||||
|
output the number of lines that would otherwise have been shown. If no lines
|
||||||
|
are selected, the number zero is output. If several files are being
|
||||||
|
scanned, a count is output for each of them. However, if the
|
||||||
|
<b>--files-with-matches</b> option is also used, only those files whose counts
|
||||||
|
are greater than zero are listed. When <b>-c</b> is used, the <b>-A</b>,
|
||||||
|
<b>-B</b>, and <b>-C</b> options are ignored.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--colour</b>, <b>--color</b>
|
||||||
|
If this option is given without any data, it is equivalent to "--colour=auto".
|
||||||
|
If data is required, it must be given in the same shell item, separated by an
|
||||||
|
equals sign.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--colour=</b><i>value</i>, <b>--color=</b><i>value</i>
|
||||||
|
This option specifies under what circumstances the parts of a line that matched
|
||||||
|
a pattern should be coloured in the output. By default, the output is not
|
||||||
|
coloured. The value (which is optional, see above) may be "never", "always", or
|
||||||
|
"auto". In the last case, colouring happens only if the standard output is
|
||||||
|
connected to a terminal. More resources are used when colouring is enabled,
|
||||||
|
because <b>pcre2grep</b> has to search for all possible matches in a line, not
|
||||||
|
just one, in order to colour them all.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
The colour that is used can be specified by setting the environment variable
|
||||||
|
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
|
||||||
|
string of two numbers, separated by a semicolon. They are copied directly into
|
||||||
|
the control string for setting colour on a terminal, so it is your
|
||||||
|
responsibility to ensure that they make sense. If neither of the environment
|
||||||
|
variables is set, the default is "1;31", which gives red.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-D</b> <i>action</i>, <b>--devices=</b><i>action</i>
|
||||||
|
If an input path is not a regular file or a directory, "action" specifies how
|
||||||
|
it is to be processed. Valid values are "read" (the default) or "skip"
|
||||||
|
(silently skip the path).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-d</b> <i>action</i>, <b>--directories=</b><i>action</i>
|
||||||
|
If an input path is a directory, "action" specifies how it is to be processed.
|
||||||
|
Valid values are "read" (the default in non-Windows environments, for
|
||||||
|
compatibility with GNU grep), "recurse" (equivalent to the <b>-r</b> option), or
|
||||||
|
"skip" (silently skip the path, the default in Windows environments). In the
|
||||||
|
"read" case, directories are read as if they were ordinary files. In some
|
||||||
|
operating systems the effect of reading a directory like this is an immediate
|
||||||
|
end-of-file; in others it may provoke an error.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-e</b> <i>pattern</i>, <b>--regex=</b><i>pattern</i>, <b>--regexp=</b><i>pattern</i>
|
||||||
|
Specify a pattern to be matched. This option can be used multiple times in
|
||||||
|
order to specify several patterns. It can also be used as a way of specifying a
|
||||||
|
single pattern that starts with a hyphen. When <b>-e</b> is used, no argument
|
||||||
|
pattern is taken from the command line; all arguments are treated as file
|
||||||
|
names. There is no limit to the number of patterns. They are applied to each
|
||||||
|
line in the order in which they are defined until one matches.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
If <b>-f</b> is used with <b>-e</b>, the command line patterns are matched first,
|
||||||
|
followed by the patterns from the file(s), independent of the order in which
|
||||||
|
these options are specified. Note that multiple use of <b>-e</b> is not the same
|
||||||
|
as a single pattern with alternatives. For example, X|Y finds the first
|
||||||
|
character in a line that is X or Y, whereas if the two patterns are given
|
||||||
|
separately, with X first, <b>pcre2grep</b> finds X if it is present, even if it
|
||||||
|
follows Y in the line. It finds Y only if there is no X in the line. This
|
||||||
|
matters only if you are using <b>-o</b> or <b>--colo(u)r</b> to show the part(s)
|
||||||
|
of the line that matched.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--exclude</b>=<i>pattern</i>
|
||||||
|
Files (but not directories) whose names match the pattern are skipped without
|
||||||
|
being processed. This applies to all files, whether listed on the command line,
|
||||||
|
obtained from <b>--file-list</b>, or by scanning a directory. The pattern is a
|
||||||
|
PCRE2 regular expression, and is matched against the final component of the file
|
||||||
|
name, not the entire path. The <b>-F</b>, <b>-w</b>, and <b>-x</b> options do not
|
||||||
|
apply to this pattern. The option may be given any number of times in order to
|
||||||
|
specify multiple patterns. If a file name matches both an <b>--include</b>
|
||||||
|
and an <b>--exclude</b> pattern, it is excluded. There is no short form for this
|
||||||
|
option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--exclude-from=</b><i>filename</i>
|
||||||
|
Treat each non-empty line of the file as the data for an <b>--exclude</b>
|
||||||
|
option. What constitutes a newline when reading the file is the operating
|
||||||
|
system's default. The <b>--newline</b> option has no effect on this option. This
|
||||||
|
option may be given more than once in order to specify a number of files to
|
||||||
|
read.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--exclude-dir</b>=<i>pattern</i>
|
||||||
|
Directories whose names match the pattern are skipped without being processed,
|
||||||
|
whatever the setting of the <b>--recursive</b> option. This applies to all
|
||||||
|
directories, whether listed on the command line, obtained from
|
||||||
|
<b>--file-list</b>, or by scanning a parent directory. The pattern is a PCRE2
|
||||||
|
regular expression, and is matched against the final component of the directory
|
||||||
|
name, not the entire path. The <b>-F</b>, <b>-w</b>, and <b>-x</b> options do not
|
||||||
|
apply to this pattern. The option may be given any number of times in order to
|
||||||
|
specify more than one pattern. If a directory matches both <b>--include-dir</b>
|
||||||
|
and <b>--exclude-dir</b>, it is excluded. There is no short form for this
|
||||||
|
option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-F</b>, <b>--fixed-strings</b>
|
||||||
|
Interpret each data-matching pattern as a list of fixed strings, separated by
|
||||||
|
newlines, instead of as a regular expression. What constitutes a newline for
|
||||||
|
this purpose is controlled by the <b>--newline</b> option. The <b>-w</b> (match
|
||||||
|
as a word) and <b>-x</b> (match whole line) options can be used with <b>-F</b>.
|
||||||
|
They apply to each of the fixed strings. A line is selected if any of the fixed
|
||||||
|
strings are found in it (subject to <b>-w</b> or <b>-x</b>, if present). This
|
||||||
|
option applies only to the patterns that are matched against the contents of
|
||||||
|
files; it does not apply to patterns specified by any of the <b>--include</b> or
|
||||||
|
<b>--exclude</b> options.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-f</b> <i>filename</i>, <b>--file=</b><i>filename</i>
|
||||||
|
Read patterns from the file, one per line, and match them against
|
||||||
|
each line of input. What constitutes a newline when reading the file is the
|
||||||
|
operating system's default. The <b>--newline</b> option has no effect on this
|
||||||
|
option. Trailing white space is removed from each line, and blank lines are
|
||||||
|
ignored. An empty file contains no patterns and therefore matches nothing. See
|
||||||
|
also the comments about multiple patterns versus a single pattern with
|
||||||
|
alternatives in the description of <b>-e</b> above.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
If this option is given more than once, all the specified files are
|
||||||
|
read. A data line is output if any of the patterns match it. A filename can
|
||||||
|
be given as "-" to refer to the standard input. When <b>-f</b> is used, patterns
|
||||||
|
specified on the command line using <b>-e</b> may also be present; they are
|
||||||
|
tested before the file's patterns. However, no other pattern is taken from the
|
||||||
|
command line; all arguments are treated as the names of paths to be searched.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--file-list</b>=<i>filename</i>
|
||||||
|
Read a list of files and/or directories that are to be scanned from the given
|
||||||
|
file, one per line. Trailing white space is removed from each line, and blank
|
||||||
|
lines are ignored. These paths are processed before any that are listed on the
|
||||||
|
command line. The filename can be given as "-" to refer to the standard input.
|
||||||
|
If <b>--file</b> and <b>--file-list</b> are both specified as "-", patterns are
|
||||||
|
read first. This is useful only when the standard input is a terminal, from
|
||||||
|
which further lines (the list of files) can be read after an end-of-file
|
||||||
|
indication. If this option is given more than once, all the specified files are
|
||||||
|
read.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--file-offsets</b>
|
||||||
|
Instead of showing lines or parts of lines that match, show each match as an
|
||||||
|
offset from the start of the file and a length, separated by a comma. In this
|
||||||
|
mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b>
|
||||||
|
options are ignored. If there is more than one match in a line, each of them is
|
||||||
|
shown separately. This option is mutually exclusive with <b>--line-offsets</b>
|
||||||
|
and <b>--only-matching</b>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-H</b>, <b>--with-filename</b>
|
||||||
|
Force the inclusion of the filename at the start of output lines when searching
|
||||||
|
a single file. By default, the filename is not shown in this case. For matching
|
||||||
|
lines, the filename is followed by a colon; for context lines, a hyphen
|
||||||
|
separator is used. If a line number is also being output, it follows the file
|
||||||
|
name.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-h</b>, <b>--no-filename</b>
|
||||||
|
Suppress the output filenames when searching multiple files. By default,
|
||||||
|
filenames are shown when multiple files are searched. For matching lines, the
|
||||||
|
filename is followed by a colon; for context lines, a hyphen separator is used.
|
||||||
|
If a line number is also being output, it follows the file name.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--help</b>
|
||||||
|
Output a help message, giving brief details of the command options and file
|
||||||
|
type support, and then exit. Anything else on the command line is
|
||||||
|
ignored.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-I</b>
|
||||||
|
Treat binary files as never matching. This is equivalent to
|
||||||
|
<b>--binary-files</b>=<i>without-match</i>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-i</b>, <b>--ignore-case</b>
|
||||||
|
Ignore upper/lower case distinctions during comparisons.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--include</b>=<i>pattern</i>
|
||||||
|
If any <b>--include</b> patterns are specified, the only files that are
|
||||||
|
processed are those that match one of the patterns (and do not match an
|
||||||
|
<b>--exclude</b> pattern). This option does not affect directories, but it
|
||||||
|
applies to all files, whether listed on the command line, obtained from
|
||||||
|
<b>--file-list</b>, or by scanning a directory. The pattern is a PCRE2 regular
|
||||||
|
expression, and is matched against the final component of the file name, not
|
||||||
|
the entire path. The <b>-F</b>, <b>-w</b>, and <b>-x</b> options do not apply to
|
||||||
|
this pattern. The option may be given any number of times. If a file name
|
||||||
|
matches both an <b>--include</b> and an <b>--exclude</b> pattern, it is excluded.
|
||||||
|
There is no short form for this option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--include-from=</b><i>filename</i>
|
||||||
|
Treat each non-empty line of the file as the data for an <b>--include</b>
|
||||||
|
option. What constitutes a newline for this purpose is the operating system's
|
||||||
|
default. The <b>--newline</b> option has no effect on this option. This option
|
||||||
|
may be given any number of times; all the files are read.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--include-dir</b>=<i>pattern</i>
|
||||||
|
If any <b>--include-dir</b> patterns are specified, the only directories that
|
||||||
|
are processed are those that match one of the patterns (and do not match an
|
||||||
|
<b>--exclude-dir</b> pattern). This applies to all directories, whether listed
|
||||||
|
on the command line, obtained from <b>--file-list</b>, or by scanning a parent
|
||||||
|
directory. The pattern is a PCRE2 regular expression, and is matched against the
|
||||||
|
final component of the directory name, not the entire path. The <b>-F</b>,
|
||||||
|
<b>-w</b>, and <b>-x</b> options do not apply to this pattern. The option may be
|
||||||
|
given any number of times. If a directory matches both <b>--include-dir</b> and
|
||||||
|
<b>--exclude-dir</b>, it is excluded. There is no short form for this option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-L</b>, <b>--files-without-match</b>
|
||||||
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
|
that do not contain any lines that would have been output. Each file name is
|
||||||
|
output once, on a separate line.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-l</b>, <b>--files-with-matches</b>
|
||||||
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
|
containing lines that would have been output. Each file name is output
|
||||||
|
once, on a separate line. Searching normally stops as soon as a matching line
|
||||||
|
is found in a file. However, if the <b>-c</b> (count) option is also used,
|
||||||
|
matching continues in order to obtain the correct count, and those files that
|
||||||
|
have at least one match are listed along with their counts. Using this option
|
||||||
|
with <b>-c</b> is a way of suppressing the listing of files with no matches.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--label</b>=<i>name</i>
|
||||||
|
This option supplies a name to be used for the standard input when file names
|
||||||
|
are being output. If not supplied, "(standard input)" is used. There is no
|
||||||
|
short form for this option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--line-buffered</b>
|
||||||
|
When this option is given, input is read and processed line by line, and the
|
||||||
|
output is flushed after each write. By default, input is read in large chunks,
|
||||||
|
unless <b>pcre2grep</b> can determine that it is reading from a terminal (which
|
||||||
|
is currently possible only in Unix-like environments). Output to a terminal is
|
||||||
|
normally automatically flushed by the operating system. This option can be
|
||||||
|
useful when the input or output is attached to a pipe and you do not want
|
||||||
|
<b>pcre2grep</b> to buffer up large amounts of data. However, its use will affect
|
||||||
|
performance, and the <b>-M</b> (multiline) option ceases to work.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--line-offsets</b>
|
||||||
|
Instead of showing lines or parts of lines that match, show each match as a
|
||||||
|
line number, the offset from the start of the line, and a length. The line
|
||||||
|
number is terminated by a colon (as usual; see the <b>-n</b> option), and the
|
||||||
|
offset and length are separated by a comma. In this mode, no context is shown.
|
||||||
|
That is, the <b>-A</b>, <b>-B</b>, and <b>-C</b> options are ignored. If there is
|
||||||
|
more than one match in a line, each of them is shown separately. This option is
|
||||||
|
mutually exclusive with <b>--file-offsets</b> and <b>--only-matching</b>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--locale</b>=<i>locale-name</i>
|
||||||
|
This option specifies a locale to be used for pattern matching. It overrides
|
||||||
|
the value in the <b>LC_ALL</b> or <b>LC_CTYPE</b> environment variables. If no
|
||||||
|
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||||
|
used. There is no short form for this option.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--match-limit</b>=<i>number</i>
|
||||||
|
Processing some regular expression patterns can require a very large amount of
|
||||||
|
memory, leading in some cases to a program crash if not enough is available.
|
||||||
|
Other patterns may take a very long time to search for all possible matching
|
||||||
|
strings. The <b>pcre2_match()</b> function that is called by <b>pcre2grep</b> to do
|
||||||
|
the matching has two parameters that can limit the resources that it uses.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
The <b>--match-limit</b> option provides a means of limiting resource usage
|
||||||
|
when processing patterns that are not going to match, but which have a very
|
||||||
|
large number of possibilities in their search trees. The classic example is a
|
||||||
|
pattern that uses nested unlimited repeats. Internally, PCRE2 uses a function
|
||||||
|
called <b>match()</b> which it calls repeatedly (sometimes recursively). The
|
||||||
|
limit set by <b>--match-limit</b> is imposed on the number of times this
|
||||||
|
function is called during a match, which has the effect of limiting the amount
|
||||||
|
of backtracking that can take place.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
The <b>--recursion-limit</b> option is similar to <b>--match-limit</b>, but
|
||||||
|
instead of limiting the total number of times that <b>match()</b> is called, it
|
||||||
|
limits the depth of recursive calls, which in turn limits the amount of memory
|
||||||
|
that can be used. The recursion depth is a smaller number than the total number
|
||||||
|
of calls, because not all calls to <b>match()</b> are recursive. This limit is
|
||||||
|
of use only if it is set smaller than <b>--match-limit</b>.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
There are no short forms for these options. The default settings are specified
|
||||||
|
when the PCRE2 library is compiled, with the default default being 10 million.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-M</b>, <b>--multiline</b>
|
||||||
|
Allow patterns to match more than one line. When this option is given, patterns
|
||||||
|
may usefully contain literal newline characters and internal occurrences of ^
|
||||||
|
and $ characters. The output for a successful match may consist of more than
|
||||||
|
one line, the last of which is the one in which the match ended. If the matched
|
||||||
|
string ends with a newline sequence the output ends at the end of that line.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
When this option is set, the PCRE2 library is called in "multiline" mode.
|
||||||
|
There is a limit to the number of lines that can be matched, imposed by the way
|
||||||
|
that <b>pcre2grep</b> buffers the input file as it scans it. However,
|
||||||
|
<b>pcre2grep</b> ensures that at least 8K characters or the rest of the document
|
||||||
|
(whichever is the shorter) are available for forward matching, and similarly
|
||||||
|
the previous 8K characters (or all the previous characters, if fewer than 8K)
|
||||||
|
are guaranteed to be available for lookbehind assertions. This option does not
|
||||||
|
work when input is read line by line (see <b>--line-buffered</b>.)
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-N</b> <i>newline-type</i>, <b>--newline</b>=<i>newline-type</i>
|
||||||
|
The PCRE2 library supports five different conventions for indicating
|
||||||
|
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||||
|
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||||
|
which recognizes any of the preceding three types, and an "any" convention, in
|
||||||
|
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||||
|
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||||
|
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||||
|
PS (paragraph separator, U+2029).
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||||
|
This is normally the standard sequence for the operating system. Unless
|
||||||
|
otherwise specified by this option, <b>pcre2grep</b> uses the library's default.
|
||||||
|
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||||
|
makes it possible to use <b>pcre2grep</b> to scan files that have come from other
|
||||||
|
environments without having to modify their line endings. If the data that is
|
||||||
|
being scanned does not agree with the convention set by this option,
|
||||||
|
<b>pcre2grep</b> may behave in strange ways. Note that this option does not
|
||||||
|
apply to files specified by the <b>-f</b>, <b>--exclude-from</b>, or
|
||||||
|
<b>--include-from</b> options, which are expected to use the operating system's
|
||||||
|
standard newline sequence.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-n</b>, <b>--line-number</b>
|
||||||
|
Precede each output line by its line number in the file, followed by a colon
|
||||||
|
for matching lines or a hyphen for context lines. If the filename is also being
|
||||||
|
output, it precedes the line number. This option is forced if
|
||||||
|
<b>--line-offsets</b> is used.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--no-jit</b>
|
||||||
|
If the PCRE2 library is built with support for just-in-time compiling (which
|
||||||
|
speeds up matching), <b>pcre2grep</b> automatically makes use of this, unless it
|
||||||
|
was explicitly disabled at build time. This option can be used to disable the
|
||||||
|
use of JIT at run time. It is provided for testing and working round problems.
|
||||||
|
It should never be needed in normal use.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-o</b>, <b>--only-matching</b>
|
||||||
|
Show only the part of the line that matched a pattern instead of the whole
|
||||||
|
line. In this mode, no context is shown. That is, the <b>-A</b>, <b>-B</b>, and
|
||||||
|
<b>-C</b> options are ignored. If there is more than one match in a line, each
|
||||||
|
of them is shown separately. If <b>-o</b> is combined with <b>-v</b> (invert the
|
||||||
|
sense of the match to find non-matching lines), no output is generated, but the
|
||||||
|
return code is set appropriately. If the matched portion of the line is empty,
|
||||||
|
nothing is output unless the file name or line number are being printed, in
|
||||||
|
which case they are shown on an otherwise empty line. This option is mutually
|
||||||
|
exclusive with <b>--file-offsets</b> and <b>--line-offsets</b>.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-o</b><i>number</i>, <b>--only-matching</b>=<i>number</i>
|
||||||
|
Show only the part of the line that matched the capturing parentheses of the
|
||||||
|
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||||
|
equivalent to <b>-o</b> without a number. Because these options can be given
|
||||||
|
without an argument (see above), if an argument is present, it must be given in
|
||||||
|
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||||
|
for the non-argument case above also apply to this case. If the specified
|
||||||
|
capturing parentheses do not exist in the pattern, or were not set in the
|
||||||
|
match, nothing is output unless the file name or line number are being printed.
|
||||||
|
<br>
|
||||||
|
<br>
|
||||||
|
If this option is given multiple times, multiple substrings are output, in the
|
||||||
|
order the options are given. For example, -o3 -o1 -o3 causes the substrings
|
||||||
|
matched by capturing parentheses 3 and 1 and then 3 again to be output. By
|
||||||
|
default, there is no separator (but see the next option).
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--om-separator</b>=<i>text</i>
|
||||||
|
Specify a separating string for multiple occurrences of <b>-o</b>. The default
|
||||||
|
is an empty string. Separating strings are never coloured.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-q</b>, <b>--quiet</b>
|
||||||
|
Work quietly, that is, display nothing except error messages. The exit
|
||||||
|
status indicates whether or not any matches were found.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-r</b>, <b>--recursive</b>
|
||||||
|
If any given path is a directory, recursively scan the files it contains,
|
||||||
|
taking note of any <b>--include</b> and <b>--exclude</b> settings. By default, a
|
||||||
|
directory is read as a normal file; in some operating systems this gives an
|
||||||
|
immediate end-of-file. This option is a shorthand for setting the <b>-d</b>
|
||||||
|
option to "recurse".
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>--recursion-limit</b>=<i>number</i>
|
||||||
|
See <b>--match-limit</b> above.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-s</b>, <b>--no-messages</b>
|
||||||
|
Suppress error messages about non-existent or unreadable files. Such files are
|
||||||
|
quietly skipped. However, the return code is still 2, even if matches were
|
||||||
|
found in other files.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-u</b>, <b>--utf-8</b>
|
||||||
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
|
with UTF-8 support. All patterns (including those for any <b>--exclude</b> and
|
||||||
|
<b>--include</b> options) and all subject lines that are scanned must be valid
|
||||||
|
strings of UTF-8 characters.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-V</b>, <b>--version</b>
|
||||||
|
Write the version numbers of <b>pcre2grep</b> and the PCRE2 library to the
|
||||||
|
standard output and then exit. Anything else on the command line is
|
||||||
|
ignored.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-v</b>, <b>--invert-match</b>
|
||||||
|
Invert the sense of the match, so that lines which do <i>not</i> match any of
|
||||||
|
the patterns are the ones that are found.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-w</b>, <b>--word-regex</b>, <b>--word-regexp</b>
|
||||||
|
Force the patterns to match only whole words. This is equivalent to having \b
|
||||||
|
at the start and end of the pattern. This option applies only to the patterns
|
||||||
|
that are matched against the contents of files; it does not apply to patterns
|
||||||
|
specified by any of the <b>--include</b> or <b>--exclude</b> options.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
<b>-x</b>, <b>--line-regex</b>, <b>--line-regexp</b>
|
||||||
|
Force the patterns to be anchored (each must start matching at the beginning of
|
||||||
|
a line) and in addition, require them to match entire lines. This is equivalent
|
||||||
|
to having ^ and $ characters at the start and end of each alternative branch in
|
||||||
|
every pattern. This option applies only to the patterns that are matched
|
||||||
|
against the contents of files; it does not apply to patterns specified by any
|
||||||
|
of the <b>--include</b> or <b>--exclude</b> options.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC6" href="#TOC1">ENVIRONMENT VARIABLES</a><br>
|
||||||
|
<P>
|
||||||
|
The environment variables <b>LC_ALL</b> and <b>LC_CTYPE</b> are examined, in that
|
||||||
|
order, for a locale. The first one that is set is used. This can be overridden
|
||||||
|
by the <b>--locale</b> option. If no locale is set, the PCRE2 library's default
|
||||||
|
(usually the "C" locale) is used.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC7" href="#TOC1">NEWLINES</a><br>
|
||||||
|
<P>
|
||||||
|
The <b>-N</b> (<b>--newline</b>) option allows <b>pcre2grep</b> to scan files with
|
||||||
|
different newline conventions from the default. Any parts of the input files
|
||||||
|
that are written to the standard output are copied identically, with whatever
|
||||||
|
newline sequences they have in the input. However, the setting of this option
|
||||||
|
does not affect the interpretation of files specified by the <b>-f</b>,
|
||||||
|
<b>--exclude-from</b>, or <b>--include-from</b> options, which are assumed to use
|
||||||
|
the operating system's standard newline sequence, nor does it affect the way in
|
||||||
|
which <b>pcre2grep</b> writes informational messages to the standard error and
|
||||||
|
output streams. For these it uses the string "\n" to indicate newlines,
|
||||||
|
relying on the C I/O library to convert this to an appropriate sequence.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC8" href="#TOC1">OPTIONS COMPATIBILITY</a><br>
|
||||||
|
<P>
|
||||||
|
Many of the short and long forms of <b>pcre2grep</b>'s options are the same
|
||||||
|
as in the GNU <b>grep</b> program. Any long option of the form
|
||||||
|
<b>--xxx-regexp</b> (GNU terminology) is also available as <b>--xxx-regex</b>
|
||||||
|
(PCRE2 terminology). However, the <b>--file-list</b>, <b>--file-offsets</b>,
|
||||||
|
<b>--include-dir</b>, <b>--line-offsets</b>, <b>--locale</b>, <b>--match-limit</b>,
|
||||||
|
<b>-M</b>, <b>--multiline</b>, <b>-N</b>, <b>--newline</b>, <b>--om-separator</b>,
|
||||||
|
<b>--recursion-limit</b>, <b>-u</b>, and <b>--utf-8</b> options are specific to
|
||||||
|
<b>pcre2grep</b>, as is the use of the <b>--only-matching</b> option with a
|
||||||
|
capturing parentheses number.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
Although most of the common options work the same way, a few are different in
|
||||||
|
<b>pcre2grep</b>. For example, the <b>--include</b> option's argument is a glob
|
||||||
|
for GNU <b>grep</b>, but a regular expression for <b>pcre2grep</b>. If both the
|
||||||
|
<b>-c</b> and <b>-l</b> options are given, GNU grep lists only file names,
|
||||||
|
without counts, but <b>pcre2grep</b> gives the counts.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC9" href="#TOC1">OPTIONS WITH DATA</a><br>
|
||||||
|
<P>
|
||||||
|
There are four different ways in which an option with data can be specified.
|
||||||
|
If a short form option is used, the data may follow immediately, or (with one
|
||||||
|
exception) in the next command line item. For example:
|
||||||
|
<pre>
|
||||||
|
-f/some/file
|
||||||
|
-f /some/file
|
||||||
|
</pre>
|
||||||
|
The exception is the <b>-o</b> option, which may appear with or without data.
|
||||||
|
Because of this, if data is present, it must follow immediately in the same
|
||||||
|
item, for example -o3.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
If a long form option is used, the data may appear in the same command line
|
||||||
|
item, separated by an equals character, or (with two exceptions) it may appear
|
||||||
|
in the next command line item. For example:
|
||||||
|
<pre>
|
||||||
|
--file=/some/file
|
||||||
|
--file /some/file
|
||||||
|
</pre>
|
||||||
|
Note, however, that if you want to supply a file name beginning with ~ as data
|
||||||
|
in a shell command, and have the shell expand ~ to a home directory, you must
|
||||||
|
separate the file name from the option, because the shell does not treat ~
|
||||||
|
specially unless it is at the start of an item.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The exceptions to the above are the <b>--colour</b> (or <b>--color</b>) and
|
||||||
|
<b>--only-matching</b> options, for which the data is optional. If one of these
|
||||||
|
options does have data, it must be given in the first form, using an equals
|
||||||
|
character. Otherwise <b>pcre2grep</b> will assume that it has no data.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC10" href="#TOC1">MATCHING ERRORS</a><br>
|
||||||
|
<P>
|
||||||
|
It is possible to supply a regular expression that takes a very long time to
|
||||||
|
fail to match certain lines. Such patterns normally involve nested indefinite
|
||||||
|
repeats, for example: (a+)*\d when matched against a line of a's with no final
|
||||||
|
digit. The PCRE2 matching function has a resource limit that causes it to abort
|
||||||
|
in these circumstances. If this happens, <b>pcre2grep</b> outputs an error
|
||||||
|
message and the line that caused the problem to the standard error stream. If
|
||||||
|
there are more than 20 such errors, <b>pcre2grep</b> gives up.
|
||||||
|
</P>
|
||||||
|
<P>
|
||||||
|
The <b>--match-limit</b> option of <b>pcre2grep</b> can be used to set the overall
|
||||||
|
resource limit; there is a second option called <b>--recursion-limit</b> that
|
||||||
|
sets a limit on the amount of memory (usually stack) that is used (see the
|
||||||
|
discussion of these options above).
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC11" href="#TOC1">DIAGNOSTICS</a><br>
|
||||||
|
<P>
|
||||||
|
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
|
||||||
|
for syntax errors, overlong lines, non-existent or inaccessible files (even if
|
||||||
|
matches were found in other files) or too many matching errors. Using the
|
||||||
|
<b>-s</b> option to suppress error messages about inaccessible files does not
|
||||||
|
affect the return code.
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC12" href="#TOC1">SEE ALSO</a><br>
|
||||||
|
<P>
|
||||||
|
<b>pcre2pattern</b>(3), <b>pcre2syntax</b>(3), <b>pcre2test</b>(1).
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC13" href="#TOC1">AUTHOR</a><br>
|
||||||
|
<P>
|
||||||
|
Philip Hazel
|
||||||
|
<br>
|
||||||
|
University Computing Service
|
||||||
|
<br>
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
<br>
|
||||||
|
</P>
|
||||||
|
<br><a name="SEC14" href="#TOC1">REVISION</a><br>
|
||||||
|
<P>
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
<br>
|
||||||
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
|
<br>
|
||||||
|
<p>
|
||||||
|
Return to the <a href="index.html">PCRE2 index page</a>.
|
||||||
|
</p>
|
|
@ -0,0 +1,86 @@
|
||||||
|
.TH PCRE2-CONFIG 1 "28 September 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
pcre2-config - program to return PCRE2 configuration
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
.B pcre2-config [--prefix] [--exec-prefix] [--version]
|
||||||
|
.B " [--libs8] [--libs16] [--libs32] [--libs-posix]"
|
||||||
|
.B " [--cflags] [--cflags-posix]"
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2-config\fP returns the configuration of the installed PCRE2 libraries
|
||||||
|
and the options required to compile a program to use them. Some of the options
|
||||||
|
apply only to the 8-bit, or 16-bit, or 32-bit libraries, respectively, and are
|
||||||
|
not available for libraries that have not been built. If an unavailable option
|
||||||
|
is encountered, the "usage" information is output.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH OPTIONS
|
||||||
|
.rs
|
||||||
|
.TP 10
|
||||||
|
\fB--prefix\fP
|
||||||
|
Writes the directory prefix used in the PCRE2 installation for architecture
|
||||||
|
independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some
|
||||||
|
systems) to the standard output.
|
||||||
|
.TP 10
|
||||||
|
\fB--exec-prefix\fP
|
||||||
|
Writes the directory prefix used in the PCRE2 installation for architecture
|
||||||
|
dependent files (normally the same as \fB--prefix\fP) to the standard output.
|
||||||
|
.TP 10
|
||||||
|
\fB--version\fP
|
||||||
|
Writes the version number of the installed PCRE2 libraries to the standard
|
||||||
|
output.
|
||||||
|
.TP 10
|
||||||
|
\fB--libs8\fP
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 8-bit PCRE2 library (\fB-lpcre2-8\fP on many systems).
|
||||||
|
.TP 10
|
||||||
|
\fB--libs16\fP
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 16-bit PCRE2 library (\fB-lpcre2-16\fP on many systems).
|
||||||
|
.TP 10
|
||||||
|
\fB--libs32\fP
|
||||||
|
Writes to the standard output the command line options required to link
|
||||||
|
with the 32-bit PCRE2 library (\fB-lpcre2-32\fP on many systems).
|
||||||
|
.TP 10
|
||||||
|
\fB--libs-posix\fP
|
||||||
|
Writes to the standard output the command line options required to link with
|
||||||
|
PCRE2's POSIX API wrapper library (\fB-lpcre2-posix\fP \fB-lpcre2-8\fP on many
|
||||||
|
systems).
|
||||||
|
.TP 10
|
||||||
|
\fB--cflags\fP
|
||||||
|
Writes to the standard output the command line options required to compile
|
||||||
|
files that use PCRE2 (this may include some \fB-I\fP options, but is blank on
|
||||||
|
many systems).
|
||||||
|
.TP 10
|
||||||
|
\fB--cflags-posix\fP
|
||||||
|
Writes to the standard output the command line options required to compile
|
||||||
|
files that use PCRE2's POSIX API wrapper library (this may include some
|
||||||
|
\fB-I\fP options, but is blank on many systems).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "SEE ALSO"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2(3)\fP
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
This manual page was originally written by Mark Baker for the Debian GNU/Linux
|
||||||
|
system. It has been subsequently revised as a generic PCRE2 man page.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
.fi
|
|
@ -0,0 +1,81 @@
|
||||||
|
PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
NAME
|
||||||
|
pcre2-config - program to return PCRE2 configuration
|
||||||
|
|
||||||
|
SYNOPSIS
|
||||||
|
|
||||||
|
pcre2-config [--prefix] [--exec-prefix] [--version]
|
||||||
|
[--libs8] [--libs16] [--libs32] [--libs-posix]
|
||||||
|
[--cflags] [--cflags-posix]
|
||||||
|
|
||||||
|
|
||||||
|
DESCRIPTION
|
||||||
|
|
||||||
|
pcre2-config returns the configuration of the installed PCRE2 libraries
|
||||||
|
and the options required to compile a program to use them. Some of the
|
||||||
|
options apply only to the 8-bit, or 16-bit, or 32-bit libraries,
|
||||||
|
respectively, and are not available for libraries that have not been
|
||||||
|
built. If an unavailable option is encountered, the "usage" information
|
||||||
|
is output.
|
||||||
|
|
||||||
|
|
||||||
|
OPTIONS
|
||||||
|
|
||||||
|
--prefix Writes the directory prefix used in the PCRE2 installation
|
||||||
|
for architecture independent files (/usr on many systems,
|
||||||
|
/usr/local on some systems) to the standard output.
|
||||||
|
|
||||||
|
--exec-prefix
|
||||||
|
Writes the directory prefix used in the PCRE2 installation
|
||||||
|
for architecture dependent files (normally the same as
|
||||||
|
--prefix) to the standard output.
|
||||||
|
|
||||||
|
--version Writes the version number of the installed PCRE2 libraries to
|
||||||
|
the standard output.
|
||||||
|
|
||||||
|
--libs8 Writes to the standard output the command line options
|
||||||
|
required to link with the 8-bit PCRE2 library (-lpcre2-8 on
|
||||||
|
many systems).
|
||||||
|
|
||||||
|
--libs16 Writes to the standard output the command line options
|
||||||
|
required to link with the 16-bit PCRE2 library (-lpcre2-16 on
|
||||||
|
many systems).
|
||||||
|
|
||||||
|
--libs32 Writes to the standard output the command line options
|
||||||
|
required to link with the 32-bit PCRE2 library (-lpcre2-32 on
|
||||||
|
many systems).
|
||||||
|
|
||||||
|
--libs-posix
|
||||||
|
Writes to the standard output the command line options
|
||||||
|
required to link with PCRE2's POSIX API wrapper library
|
||||||
|
(-lpcre2-posix -lpcre2-8 on many systems).
|
||||||
|
|
||||||
|
--cflags Writes to the standard output the command line options
|
||||||
|
required to compile files that use PCRE2 (this may include
|
||||||
|
some -I options, but is blank on many systems).
|
||||||
|
|
||||||
|
--cflags-posix
|
||||||
|
Writes to the standard output the command line options
|
||||||
|
required to compile files that use PCRE2's POSIX API wrapper
|
||||||
|
library (this may include some -I options, but is blank on
|
||||||
|
many systems).
|
||||||
|
|
||||||
|
|
||||||
|
SEE ALSO
|
||||||
|
|
||||||
|
pcre2(3)
|
||||||
|
|
||||||
|
|
||||||
|
AUTHOR
|
||||||
|
|
||||||
|
This manual page was originally written by Mark Baker for the Debian
|
||||||
|
GNU/Linux system. It has been subsequently revised as a generic PCRE2
|
||||||
|
man page.
|
||||||
|
|
||||||
|
|
||||||
|
REVISION
|
||||||
|
|
||||||
|
Last updated: 28 September 2014
|
|
@ -0,0 +1,180 @@
|
||||||
|
.TH PCRE2 3 "28 September 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH INTRODUCTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
PCRE2 is the name used for a revised API for the PCRE library, which is a set
|
||||||
|
of functions, written in C, that implement regular expression pattern matching
|
||||||
|
using the same syntax and semantics as Perl, with just a few differences. Some
|
||||||
|
features that appeared in Python and the original PCRE before they appeared in
|
||||||
|
Perl are also available using the Python syntax, there is some support for one
|
||||||
|
or two .NET and Oniguruma syntax items, and there are options for requesting
|
||||||
|
some minor changes that give better ECMAScript (aka JavaScript) compatibility.
|
||||||
|
.P
|
||||||
|
The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
|
||||||
|
code units, which means that up to three separate libraries may be installed.
|
||||||
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
|
Unicode, with support for Unicode general category properties. Unicode is
|
||||||
|
optional at build time, and must be enabled explicitly at run time. The version
|
||||||
|
of Unicode in use can be discovered by running
|
||||||
|
.sp
|
||||||
|
pcre2test -C
|
||||||
|
.P
|
||||||
|
The three libraries contain identical sets of functions, with names ending in
|
||||||
|
_8, _16, or _32, respectively (for example, \fBpcre2_compile_8()\fP). However,
|
||||||
|
by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just
|
||||||
|
one code unit width can be written using generic names such as
|
||||||
|
\fBpcre2_compile()\fP, and the documentation is written assuming that this is
|
||||||
|
the case.
|
||||||
|
.P
|
||||||
|
In addition to the Perl-compatible matching function, PCRE2 contains an
|
||||||
|
alternative function that matches the same compiled patterns in a different
|
||||||
|
way. In certain circumstances, the alternative function has some advantages.
|
||||||
|
For a discussion of the two matching algorithms, see the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2matching\fP
|
||||||
|
.\"
|
||||||
|
page.
|
||||||
|
.P
|
||||||
|
Details of exactly which Perl regular expression features are and are not
|
||||||
|
supported by PCRE2 are given in separate documents. See the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2pattern\fP
|
||||||
|
.\"
|
||||||
|
and
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2compat\fP
|
||||||
|
.\"
|
||||||
|
pages. There is a syntax summary in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2syntax\fP
|
||||||
|
.\"
|
||||||
|
page.
|
||||||
|
.P
|
||||||
|
Some features of PCRE2 can be included, excluded, or changed when the library
|
||||||
|
is built. The
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2_config()\fP
|
||||||
|
.\"
|
||||||
|
function makes it possible for a client to discover which features are
|
||||||
|
available. The features themselves are described in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2build\fP
|
||||||
|
.\"
|
||||||
|
page. Documentation about building PCRE2 for various operating systems can be
|
||||||
|
found in the
|
||||||
|
.\" HTML <a href="README.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBREADME\fP
|
||||||
|
.\"
|
||||||
|
and
|
||||||
|
.\" HTML <a href="NON-AUTOTOOLS-BUILD.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBNON-AUTOTOOLS-BUILD\fP
|
||||||
|
.\"
|
||||||
|
files in the source distribution.
|
||||||
|
.P
|
||||||
|
The libraries contain a number of undocumented internal functions and data
|
||||||
|
tables that are used by more than one of the exported external functions, but
|
||||||
|
which are not intended for use by external callers. Their names all begin with
|
||||||
|
"_pcre2", which hopefully will not provoke any name clashes. In some
|
||||||
|
environments, it is possible to control which external symbols are exported
|
||||||
|
when a shared library is built, and in these cases the undocumented symbols are
|
||||||
|
not exported.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "SECURITY CONSIDERATIONS"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
|
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
||||||
|
was built with Unicode support. For example, an 8-bit pattern that begins with
|
||||||
|
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
||||||
|
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
||||||
|
the pattern and any data against which it is matched to be checked for UTF-8
|
||||||
|
validity. If the data string is very long, such a check might use sufficiently
|
||||||
|
many resources as to cause your application to lose performance.
|
||||||
|
.P
|
||||||
|
One way of guarding against this possibility is to use the
|
||||||
|
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
||||||
|
UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time.
|
||||||
|
This causes a compile time error if a pattern contains a UTF-setting sequence.
|
||||||
|
.P
|
||||||
|
If your application is one that supports UTF, be aware that validity checking
|
||||||
|
can take time. If the same data string is to be matched many times, you can use
|
||||||
|
the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid
|
||||||
|
running redundant checks.
|
||||||
|
.P
|
||||||
|
Another way that performance can be hit is by running a pattern that has a very
|
||||||
|
large search tree against a string that will never match. Nested unlimited
|
||||||
|
repeats in a pattern are a common example. PCRE2 provides some protection
|
||||||
|
against this: see the \fBpcre2_set_match_limit()\fP function in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
page.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "USER DOCUMENTATION"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The user documentation for PCRE2 comprises a number of different sections. In
|
||||||
|
the "man" format, each of these is a separate "man page". In the HTML format,
|
||||||
|
each is a separate page, linked from the index page. In the plain text format,
|
||||||
|
the descriptions of the \fBpcre2grep\fP and \fBpcre2test\fP programs are in
|
||||||
|
files called \fBpcre2grep.txt\fP and \fBpcre2test.txt\fP, respectively. The
|
||||||
|
remaining sections, except for the \fBpcre2demo\fP section (which is a program
|
||||||
|
listing), and the short pages for individual functions, are concatenated in
|
||||||
|
\fBpcre2.txt\fP, for ease of searching. The sections are as follows:
|
||||||
|
.sp
|
||||||
|
pcre2 this document FIXME CHECK THIS LIST
|
||||||
|
pcre2-config show PCRE2 installation configuration information
|
||||||
|
pcre2api details of PCRE2's native C API
|
||||||
|
pcre2build building PCRE2
|
||||||
|
pcre2callout details of the callout feature
|
||||||
|
pcre2compat discussion of Perl compatibility
|
||||||
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
|
pcre2grep description of the \fBpcre2grep\fP command (8-bit only)
|
||||||
|
pcre2jit discussion of the just-in-time optimization support
|
||||||
|
pcre2limits details of size and other limits
|
||||||
|
pcre2matching discussion of the two matching algorithms
|
||||||
|
pcre2partial details of the partial matching facility
|
||||||
|
.\" JOIN
|
||||||
|
pcre2pattern syntax and semantics of supported
|
||||||
|
regular expressions
|
||||||
|
pcre2perform discussion of performance issues
|
||||||
|
pcre2posix the POSIX-compatible C API for the 8-bit library
|
||||||
|
pcre2sample discussion of the pcre2demo program
|
||||||
|
pcre2stack discussion of stack usage
|
||||||
|
pcre2syntax quick syntax reference
|
||||||
|
pcre2test description of the \fBpcre2test\fP testing command
|
||||||
|
pcre2unicode discussion of Unicode and UTF support
|
||||||
|
.sp
|
||||||
|
In the "man" and HTML formats, there is also a short page for each C library
|
||||||
|
function, listing its arguments and results.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
.fi
|
||||||
|
.P
|
||||||
|
Putting an actual email address here is a spam magnet. If you want to email me,
|
||||||
|
use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
.fi
|
|
@ -0,0 +1,490 @@
|
||||||
|
.TH PCRE2BUILD 3 "28 September 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "BUILDING PCRE2"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
PCRE2 is distributed with a \fBconfigure\fP script that can be used to build
|
||||||
|
the library in Unix-like environments using the applications known as
|
||||||
|
Autotools. Also in the distribution are files to support building using
|
||||||
|
\fBCMake\fP instead of \fBconfigure\fP. The text file
|
||||||
|
.\" HTML <a href="README.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBREADME\fP
|
||||||
|
.\"
|
||||||
|
contains general information about building with Autotools (some of which is
|
||||||
|
repeated below), and also has some comments about building on various operating
|
||||||
|
systems. There is a lot more information about building PCRE2 without using
|
||||||
|
Autotools (including information about using \fBCMake\fP and building "by
|
||||||
|
hand") in the text file called
|
||||||
|
.\" HTML <a href="NON-AUTOTOOLS-BUILD.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBNON-AUTOTOOLS-BUILD\fP.
|
||||||
|
.\"
|
||||||
|
You should consult this file as well as the
|
||||||
|
.\" HTML <a href="README.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBREADME\fP
|
||||||
|
.\"
|
||||||
|
file if you are building in a non-Unix-like environment.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PCRE2 BUILD-TIME OPTIONS"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The rest of this document describes the optional features of PCRE2 that can be
|
||||||
|
selected when the library is compiled. It assumes use of the \fBconfigure\fP
|
||||||
|
script, where the optional features are selected or deselected by providing
|
||||||
|
options to \fBconfigure\fP before running the \fBmake\fP command. However, the
|
||||||
|
same options can be selected in both Unix-like and non-Unix-like environments
|
||||||
|
if you are using \fBCMake\fP instead of \fBconfigure\fP to build PCRE2.
|
||||||
|
.P
|
||||||
|
If you are not using Autotools or \fBCMake\fP, option selection can be done by
|
||||||
|
editing the \fBconfig.h\fP file, or by passing parameter settings to the
|
||||||
|
compiler, as described in
|
||||||
|
.\" HTML <a href="NON-AUTOTOOLS-BUILD.txt">
|
||||||
|
.\" </a>
|
||||||
|
\fBNON-AUTOTOOLS-BUILD\fP.
|
||||||
|
.\"
|
||||||
|
.P
|
||||||
|
The complete list of options for \fBconfigure\fP (which includes the standard
|
||||||
|
ones such as the selection of the installation directory) can be obtained by
|
||||||
|
running
|
||||||
|
.sp
|
||||||
|
./configure --help
|
||||||
|
.sp
|
||||||
|
The following sections include descriptions of options whose names begin with
|
||||||
|
--enable or --disable. These settings specify changes to the defaults for the
|
||||||
|
\fBconfigure\fP command. Because of the way that \fBconfigure\fP works,
|
||||||
|
--enable and --disable always come in pairs, so the complementary option always
|
||||||
|
exists as well, but as it specifies the default, it is not described.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By default, a library called \fBlibpcre2-8\fP is built, containing functions
|
||||||
|
that take string arguments contained in vectors of bytes, interpreted either as
|
||||||
|
single-byte characters, or UTF-8 strings. You can also build two other
|
||||||
|
libraries, called \fBlibpcre2-16\fP and \fBlibpcre2-32\fP, which process
|
||||||
|
strings that are contained in vectors of 16-bit and 32-bit code units,
|
||||||
|
respectively. These can be interpreted either as single-unit characters or
|
||||||
|
UTF-16/UTF-32 strings. To build these additional libraries, add one or both of
|
||||||
|
the following to the \fBconfigure\fP command:
|
||||||
|
.sp
|
||||||
|
--enable-pcre16
|
||||||
|
--enable-pcre32
|
||||||
|
.sp
|
||||||
|
If you do not want the 8-bit library, add
|
||||||
|
.sp
|
||||||
|
--disable-pcre8
|
||||||
|
.sp
|
||||||
|
as well. At least one of the three libraries must be built. Note that the POSIX
|
||||||
|
wrapper is for the 8-bit library only, and that \fBpcre2grep\fP is an 8-bit
|
||||||
|
program. Neither of these is built if you select only the 16-bit or 32-bit
|
||||||
|
libraries.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "BUILDING SHARED AND STATIC LIBRARIES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The Autotools PCRE2 building process uses \fBlibtool\fP to build both shared
|
||||||
|
and static libraries by default. You can suppress one of these by adding one of
|
||||||
|
.sp
|
||||||
|
--disable-shared
|
||||||
|
--disable-static
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command, as required.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "UNICODE AND UTF SUPPORT"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
To build PCRE2 with support for Unicode and UTF character strings, add
|
||||||
|
.sp
|
||||||
|
--enable-unicode
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. This setting applies to all three libraries,
|
||||||
|
adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit
|
||||||
|
library, and support for UTF-32 to the 32-bit library.
|
||||||
|
It is not possible to build one library with
|
||||||
|
UTF support and another without in the same configuration.
|
||||||
|
.P
|
||||||
|
Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or
|
||||||
|
UTF-32. As well as compiling PCRE2 with this option, you also have to set
|
||||||
|
the PCRE2_UTF option when you call \fBpcre2_compile()\fP to compile a pattern.
|
||||||
|
.P
|
||||||
|
If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2
|
||||||
|
expects its input to be either ASCII or UTF-8 (depending on the run-time
|
||||||
|
option). It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||||
|
version of the library. Consequently, --enable-unicode and --enable-ebcdic are
|
||||||
|
mutually exclusive.
|
||||||
|
.P
|
||||||
|
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
||||||
|
in the strings that they handle. It also provides support for accessing the
|
||||||
|
properties of such characters, using pattern escapes such as \eP, \ep, and \eX.
|
||||||
|
Only the general category properties such as \fILu\fP and \fINd\fP are
|
||||||
|
supported. Details are given in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2pattern\fP
|
||||||
|
.\"
|
||||||
|
documentation.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "JUST-IN-TIME COMPILER SUPPORT"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Just-in-time compiler support is included in the build by specifying
|
||||||
|
.sp
|
||||||
|
--enable-jit
|
||||||
|
.sp
|
||||||
|
This support is available only for certain hardware architectures. If this
|
||||||
|
option is set for an unsupported architecture, a compile time error occurs.
|
||||||
|
See the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2jit\fP
|
||||||
|
.\"
|
||||||
|
documentation for a discussion of JIT usage. When JIT support is enabled,
|
||||||
|
pcre2grep automatically makes use of it, unless you add
|
||||||
|
.sp
|
||||||
|
--disable-pcre2grep-jit
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "CODE VALUE OF NEWLINE"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By default, PCRE2 interprets the linefeed (LF) character as indicating the end
|
||||||
|
of a line. This is the normal newline character on Unix-like systems. You can
|
||||||
|
compile PCRE2 to use carriage return (CR) instead, by adding
|
||||||
|
.sp
|
||||||
|
--enable-newline-is-cr
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. There is also a --enable-newline-is-lf option,
|
||||||
|
which explicitly specifies linefeed as the newline character.
|
||||||
|
.sp
|
||||||
|
Alternatively, you can specify that line endings are to be indicated by the two
|
||||||
|
character sequence CRLF. If you want this, add
|
||||||
|
.sp
|
||||||
|
--enable-newline-is-crlf
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. There is a fourth option, specified by
|
||||||
|
.sp
|
||||||
|
--enable-newline-is-anycrlf
|
||||||
|
.sp
|
||||||
|
which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as
|
||||||
|
indicating a line ending. Finally, a fifth option, specified by
|
||||||
|
.sp
|
||||||
|
--enable-newline-is-any
|
||||||
|
.sp
|
||||||
|
causes PCRE2 to recognize any Unicode newline sequence.
|
||||||
|
.P
|
||||||
|
Whatever line ending convention is selected when PCRE2 is built can be
|
||||||
|
overridden when the library functions are called. At build time it is
|
||||||
|
conventional to use the standard for your operating system.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "WHAT \eR MATCHES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By default, the sequence \eR in a pattern matches any Unicode newline sequence,
|
||||||
|
whatever has been selected as the line ending sequence. If you specify
|
||||||
|
.sp
|
||||||
|
--enable-bsr-anycrlf
|
||||||
|
.sp
|
||||||
|
the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is
|
||||||
|
selected when PCRE2 is built can be overridden when the library functions are
|
||||||
|
called.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "HANDLING VERY LARGE PATTERNS"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Within a compiled pattern, offset values are used to point from one part to
|
||||||
|
another (for example, from an opening parenthesis to an alternation
|
||||||
|
metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values
|
||||||
|
are used for these offsets, leading to a maximum size for a compiled pattern of
|
||||||
|
around 64K. This is sufficient to handle all but the most gigantic patterns.
|
||||||
|
Nevertheless, some people do want to process truly enormous patterns, so it is
|
||||||
|
possible to compile PCRE2 to use three-byte or four-byte offsets by adding a
|
||||||
|
setting such as
|
||||||
|
.sp
|
||||||
|
--with-link-size=3
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. The value given must be 2, 3, or 4. For the
|
||||||
|
16-bit library, a value of 3 is rounded up to 4. In these libraries, using
|
||||||
|
longer offsets slows down the operation of PCRE2 because it has to load
|
||||||
|
additional data when handling them. For the 32-bit library the value is always
|
||||||
|
4 and cannot be overridden; the value of --with-link-size is ignored.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "AVOIDING EXCESSIVE STACK USAGE"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
When matching with the \fBpcre2_match()\fP function, PCRE2 implements
|
||||||
|
backtracking by making recursive calls to an internal function called
|
||||||
|
\fBmatch()\fP. In environments where the size of the stack is limited, this can
|
||||||
|
severely limit PCRE2's operation. (The Unix environment does not usually suffer
|
||||||
|
from this problem, but it may sometimes be necessary to increase the maximum
|
||||||
|
stack size. There is a discussion in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2stack\fP
|
||||||
|
.\"
|
||||||
|
documentation.) An alternative approach to recursion that uses memory from the
|
||||||
|
heap to remember data, instead of using recursive function calls, has been
|
||||||
|
implemented to work round the problem of limited stack size. If you want to
|
||||||
|
build a version of PCRE2 that works this way, add
|
||||||
|
.sp
|
||||||
|
--disable-stack-for-recursion
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. By default, the system functions \fBmalloc()\fP
|
||||||
|
and \fBfree()\fP are called to manage the heap memory that is required, but
|
||||||
|
custom memory management functions can be called instead. PCRE2 runs noticeably
|
||||||
|
more slowly when built in this way. This option affects only the
|
||||||
|
\fBpcre2_match()\fP function; it is not relevant for \fBpcre2_dfa_match()\fP.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "LIMITING PCRE2 RESOURCE USAGE"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Internally, PCRE2 has a function called \fBmatch()\fP, which it calls
|
||||||
|
repeatedly (sometimes recursively) when matching a pattern with the
|
||||||
|
\fBpcre2_match()\fP function. By controlling the maximum number of times this
|
||||||
|
function may be called during a single matching operation, a limit can be
|
||||||
|
placed on the resources used by a single call to \fBpcre2_match()\fP. The limit
|
||||||
|
can be changed at run time, as described in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2api\fP
|
||||||
|
.\"
|
||||||
|
documentation. The default is 10 million, but this can be changed by adding a
|
||||||
|
setting such as
|
||||||
|
.sp
|
||||||
|
--with-match-limit=500000
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. This setting has no effect on the
|
||||||
|
\fBpcre2_dfa_match()\fP matching function.
|
||||||
|
.P
|
||||||
|
In some environments it is desirable to limit the depth of recursive calls of
|
||||||
|
\fBmatch()\fP more strictly than the total number of calls, in order to
|
||||||
|
restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion
|
||||||
|
is specified) that is used. A second limit controls this; it defaults to the
|
||||||
|
value that is set for --with-match-limit, which imposes no additional
|
||||||
|
constraints. However, you can set a lower limit by adding, for example,
|
||||||
|
.sp
|
||||||
|
--with-match-limit-recursion=10000
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. This value can also be overridden at run time.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "CREATING CHARACTER TABLES AT BUILD TIME"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
PCRE2 uses fixed tables for processing characters whose code points are less
|
||||||
|
than 256. By default, PCRE2 is built with a set of tables that are distributed
|
||||||
|
in the file \fIsrc/pcre2_chartables.c.dist\fP. These tables are for ASCII codes
|
||||||
|
only. If you add
|
||||||
|
.sp
|
||||||
|
--enable-rebuild-chartables
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command, the distributed tables are no longer used.
|
||||||
|
Instead, a program called \fBdftables\fP is compiled and run. This outputs the
|
||||||
|
source for a new set of tables, created in the default locale of your C run-time
|
||||||
|
system. (This method of replacing the tables does not work if you are cross
|
||||||
|
compiling, because \fBdftables\fP is run on the local host. If you need to
|
||||||
|
create alternative tables when cross compiling, you will have to do so "by
|
||||||
|
hand".)
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "USING EBCDIC CODE"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
PCRE2 assumes by default that it will run in an environment where the character
|
||||||
|
code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||||
|
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
||||||
|
EBCDIC environment by adding
|
||||||
|
.sp
|
||||||
|
--enable-ebcdic
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. This setting implies
|
||||||
|
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||||
|
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||||
|
--enable-ebcdic option is incompatible with --enable-unicode.
|
||||||
|
.P
|
||||||
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
||||||
|
value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In
|
||||||
|
such an environment you should use
|
||||||
|
.sp
|
||||||
|
--enable-ebcdic-nl25
|
||||||
|
.sp
|
||||||
|
as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR has the
|
||||||
|
same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is \fInot\fP
|
||||||
|
chosen as LF is made to correspond to the Unicode NEL character (which, in
|
||||||
|
Unicode, is 0x85).
|
||||||
|
.P
|
||||||
|
The options that select newline behaviour, such as --enable-newline-is-cr,
|
||||||
|
and equivalent run-time options, refer to these character values in an EBCDIC
|
||||||
|
environment.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By default, \fBpcre2grep\fP reads all files as plain text. You can build it so
|
||||||
|
that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads
|
||||||
|
them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of
|
||||||
|
.sp
|
||||||
|
--enable-pcre2grep-libz
|
||||||
|
--enable-pcre2grep-libbz2
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. These options naturally require that the
|
||||||
|
relevant libraries are installed on your system. Configuration will fail if
|
||||||
|
they are not.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PCRE2GREP BUFFER SIZE"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is
|
||||||
|
scanning, in order to be able to output "before" and "after" lines when it
|
||||||
|
finds a match. The size of the buffer is controlled by a parameter whose
|
||||||
|
default value is 20K. The buffer itself is three times this size, but because
|
||||||
|
of the way it is used for holding "before" lines, the longest line that is
|
||||||
|
guaranteed to be processable is the parameter size. You can change the default
|
||||||
|
parameter value by adding, for example,
|
||||||
|
.sp
|
||||||
|
--with-pcre2grep-bufsize=50K
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can, however,
|
||||||
|
override this value by specifying a run-time option.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If you add one of
|
||||||
|
.sp
|
||||||
|
--enable-pcre2test-libreadline
|
||||||
|
--enable-pcre2test-libedit
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command, \fBpcre2test\fP is linked with the
|
||||||
|
\fBlibreadline\fP or \fBlibedit\fP library, respectively, and when its input is
|
||||||
|
from a terminal, it reads it using the \fBreadline()\fP function. This provides
|
||||||
|
line-editing and history facilities. Note that \fBlibreadline\fP is
|
||||||
|
GPL-licensed, so if you distribute a binary of \fBpcre2test\fP linked in this
|
||||||
|
way, there may be licensing issues. These can be avoided by linking with
|
||||||
|
\fBlibedit\fP (which has a BSD licence) instead.
|
||||||
|
.P
|
||||||
|
Setting this option causes the \fB-lreadline\fP option to be added to the
|
||||||
|
\fBpcre2test\fP build. In many operating environments with a system-installed
|
||||||
|
readline library this is sufficient. However, in some environments (e.g. if an
|
||||||
|
unmodified distribution version of readline is in use), some extra
|
||||||
|
configuration may be necessary. The INSTALL file for \fBlibreadline\fP says
|
||||||
|
this:
|
||||||
|
.sp
|
||||||
|
"Readline uses the termcap functions, but does not link with
|
||||||
|
the termcap or curses library itself, allowing applications
|
||||||
|
which link with readline the to choose an appropriate library."
|
||||||
|
.sp
|
||||||
|
If your environment has not been set up so that an appropriate library is
|
||||||
|
automatically included, you may need to add something like
|
||||||
|
.sp
|
||||||
|
LIBS="-lncurses"
|
||||||
|
.sp
|
||||||
|
immediately before the \fBconfigure\fP command.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "DEBUGGING WITH VALGRIND SUPPORT"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By adding the
|
||||||
|
.sp
|
||||||
|
--enable-valgrind
|
||||||
|
.sp
|
||||||
|
option to the \fBconfigure\fP command, PCRE2 will use valgrind annotations
|
||||||
|
to mark certain memory regions as unaddressable. This allows it to detect
|
||||||
|
invalid memory accesses, and is mostly useful for debugging PCRE2 itself.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "CODE COVERAGE REPORTING"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
If your C compiler is gcc, you can build a version of PCRE2 that can generate a
|
||||||
|
code coverage report for its test suite. To enable this, you must install
|
||||||
|
\fBlcov\fP version 1.6 or above. Then specify
|
||||||
|
.sp
|
||||||
|
--enable-coverage
|
||||||
|
.sp
|
||||||
|
to the \fBconfigure\fP command and build PCRE2 in the usual way.
|
||||||
|
.P
|
||||||
|
Note that using \fBccache\fP (a caching C compiler) is incompatible with code
|
||||||
|
coverage reporting. If you have configured \fBccache\fP to run automatically
|
||||||
|
on your system, you must set the environment variable
|
||||||
|
.sp
|
||||||
|
CCACHE_DISABLE=1
|
||||||
|
.sp
|
||||||
|
before running \fBmake\fP to build PCRE2, so that \fBccache\fP is not used.
|
||||||
|
.P
|
||||||
|
When --enable-coverage is used, the following additional targets are added to the
|
||||||
|
\fIMakefile\fP:
|
||||||
|
.sp
|
||||||
|
make coverage
|
||||||
|
.sp
|
||||||
|
This creates a fresh coverage report for the PCRE2 test suite. It is equivalent
|
||||||
|
to running "make coverage-reset", "make coverage-baseline", "make check", and
|
||||||
|
then "make coverage-report".
|
||||||
|
.sp
|
||||||
|
make coverage-reset
|
||||||
|
.sp
|
||||||
|
This zeroes the coverage counters, but does nothing else.
|
||||||
|
.sp
|
||||||
|
make coverage-baseline
|
||||||
|
.sp
|
||||||
|
This captures baseline coverage information.
|
||||||
|
.sp
|
||||||
|
make coverage-report
|
||||||
|
.sp
|
||||||
|
This creates the coverage report.
|
||||||
|
.sp
|
||||||
|
make coverage-clean-report
|
||||||
|
.sp
|
||||||
|
This removes the generated coverage report without cleaning the coverage data
|
||||||
|
itself.
|
||||||
|
.sp
|
||||||
|
make coverage-clean-data
|
||||||
|
.sp
|
||||||
|
This removes the captured coverage data without removing the coverage files
|
||||||
|
created at compile time (*.gcno).
|
||||||
|
.sp
|
||||||
|
make coverage-clean
|
||||||
|
.sp
|
||||||
|
This cleans all coverage data including the generated coverage report. For more
|
||||||
|
information about code coverage, see the \fBgcov\fP and \fBlcov\fP
|
||||||
|
documentation.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "SEE ALSO"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2api\fP(3), \fBpcre2_config\fP(3).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
.fi
|
|
@ -0,0 +1,190 @@
|
||||||
|
.TH PCRE2COMPAT 3 "28 September 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
|
.SH "DIFFERENCES BETWEEN PCRE2 AND PERL"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
This document describes the differences in the ways that PCRE2 and Perl handle
|
||||||
|
regular expressions. The differences described here are with respect to Perl
|
||||||
|
versions 5.10 and above.
|
||||||
|
.P
|
||||||
|
1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does
|
||||||
|
have are given in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2unicode\fP
|
||||||
|
.\"
|
||||||
|
page.
|
||||||
|
.P
|
||||||
|
2. PCRE2 allows repeat quantifiers only on parenthesized assertions, but they
|
||||||
|
do not mean what you might think. For example, (?!a){3} does not assert that
|
||||||
|
the next three characters are not "a". It just asserts that the next character
|
||||||
|
is not "a" three times (in principle: PCRE2 optimizes this to run the assertion
|
||||||
|
just once). Perl allows repeat quantifiers on other assertions such as \eb, but
|
||||||
|
these do not seem to have any use.
|
||||||
|
.P
|
||||||
|
3. Capturing subpatterns that occur inside negative lookahead assertions are
|
||||||
|
counted, but their entries in the offsets vector are never set. Perl sometimes
|
||||||
|
(but not always) sets its numerical variables from inside negative assertions.
|
||||||
|
.P
|
||||||
|
4. The following Perl escape sequences are not supported: \el, \eu, \eL,
|
||||||
|
\eU, and \eN when followed by a character name or Unicode value. (\eN on its
|
||||||
|
own, matching a non-newline character, is supported.) In fact these are
|
||||||
|
implemented by Perl's general string-handling and are not part of its pattern
|
||||||
|
matching engine. If any of these are encountered by PCRE2, an error is
|
||||||
|
generated by default. However, if the PCRE2_ALT_BSUX option is set,
|
||||||
|
\eU and \eu are interpreted as ECMAScript interprets them.
|
||||||
|
.P
|
||||||
|
5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is
|
||||||
|
built with Unicode support. The properties that can be tested with \ep and \eP
|
||||||
|
are limited to the general category properties such as Lu and Nd, script names
|
||||||
|
such as Greek or Han, and the derived properties Any and L&. PCRE2 does support
|
||||||
|
the Cs (surrogate) property, which Perl does not; the Perl documentation says
|
||||||
|
"Because Perl hides the need for the user to understand the internal
|
||||||
|
representation of Unicode characters, there is no need to implement the
|
||||||
|
somewhat messy concept of surrogates."
|
||||||
|
.P
|
||||||
|
6. PCRE2 does support the \eQ...\eE escape for quoting substrings. Characters
|
||||||
|
in between are treated as literals. This is slightly different from Perl in
|
||||||
|
that $ and @ are also handled as literals inside the quotes. In Perl, they
|
||||||
|
cause variable interpolation (but of course PCRE2 does not have variables).
|
||||||
|
Note the following examples:
|
||||||
|
.sp
|
||||||
|
Pattern PCRE2 matches Perl matches
|
||||||
|
.sp
|
||||||
|
.\" JOIN
|
||||||
|
\eQabc$xyz\eE abc$xyz abc followed by the
|
||||||
|
contents of $xyz
|
||||||
|
\eQabc\e$xyz\eE abc\e$xyz abc\e$xyz
|
||||||
|
\eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz
|
||||||
|
.sp
|
||||||
|
The \eQ...\eE sequence is recognized both inside and outside character classes.
|
||||||
|
.P
|
||||||
|
7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code})
|
||||||
|
constructions. However, there is support for recursive patterns. This is not
|
||||||
|
available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE2 "callout"
|
||||||
|
feature allows an external function to be called during pattern matching. See
|
||||||
|
the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2callout\fP
|
||||||
|
.\"
|
||||||
|
documentation for details.
|
||||||
|
.P
|
||||||
|
8. Subpatterns that are called as subroutines (whether or not recursively) are
|
||||||
|
always treated as atomic groups in PCRE2. This is like Python, but unlike Perl.
|
||||||
|
Captured values that are set outside a subroutine call can be referenced from
|
||||||
|
inside in PCRE2, but not in Perl. There is a discussion that explains these
|
||||||
|
differences in more detail in the
|
||||||
|
.\" HTML <a href="pcre2pattern.html#recursiondifference">
|
||||||
|
.\" </a>
|
||||||
|
section on recursion differences from Perl
|
||||||
|
.\"
|
||||||
|
in the
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2pattern\fP
|
||||||
|
.\"
|
||||||
|
page.
|
||||||
|
.P
|
||||||
|
9. If any of the backtracking control verbs are used in a subpattern that is
|
||||||
|
called as a subroutine (whether or not recursively), their effect is confined
|
||||||
|
to that subpattern; it does not extend to the surrounding pattern. This is not
|
||||||
|
always the case in Perl. In particular, if (*THEN) is present in a group that
|
||||||
|
is called as a subroutine, its action is limited to that group, even if the
|
||||||
|
group does not contain any | characters. Note that such subpatterns are
|
||||||
|
processed as anchored at the point where they are tested.
|
||||||
|
.P
|
||||||
|
10. If a pattern contains more than one backtracking control verb, the first
|
||||||
|
one that is backtracked onto acts. For example, in the pattern
|
||||||
|
A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C
|
||||||
|
triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the
|
||||||
|
same as PCRE2, but there are examples where it differs.
|
||||||
|
.P
|
||||||
|
11. Most backtracking verbs in assertions have their normal actions. They are
|
||||||
|
not confined to the assertion.
|
||||||
|
.P
|
||||||
|
12. There are some differences that are concerned with the settings of captured
|
||||||
|
strings when part of a pattern is repeated. For example, matching "aba" against
|
||||||
|
the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to
|
||||||
|
"b".
|
||||||
|
.P
|
||||||
|
13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern
|
||||||
|
names is not as general as Perl's. This is a consequence of the fact that PCRE2
|
||||||
|
works internally just with numbers, using an external table to translate
|
||||||
|
between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)),
|
||||||
|
where the two capturing parentheses have the same number but different names,
|
||||||
|
is not supported, and causes an error at compile time. If it were allowed, it
|
||||||
|
would not be possible to distinguish which parentheses matched, because both
|
||||||
|
names map to capturing subpattern number 1. To avoid this confusing situation,
|
||||||
|
an error is given at compile time.
|
||||||
|
.P
|
||||||
|
14. Perl recognizes comments in some places that PCRE2 does not, for example,
|
||||||
|
between the ( and ? at the start of a subpattern. If the /x modifier is set,
|
||||||
|
Perl allows white space between ( and ? (though current Perls warn that this is
|
||||||
|
deprecated) but PCRE2 never does, even if the PCRE2_EXTENDED option is set.
|
||||||
|
.P
|
||||||
|
15. Perl, when in warning mode, gives warnings for character classes such as
|
||||||
|
[A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no
|
||||||
|
warning features, so it gives an error in these cases because they are almost
|
||||||
|
certainly user mistakes.
|
||||||
|
.P
|
||||||
|
16. In PCRE2, the upper/lower case character properties Lu and Ll are not
|
||||||
|
affected when case-independent matching is specified. For example, \ep{Lu}
|
||||||
|
always matches an upper case letter. I think Perl has changed in this respect;
|
||||||
|
in the release at the time of writing (5.16), \ep{Lu} and \ep{Ll} match all
|
||||||
|
letters, regardless of case, when case independence is specified.
|
||||||
|
.P
|
||||||
|
17. PCRE2 provides some extensions to the Perl regular expression facilities.
|
||||||
|
Perl 5.10 includes new features that are not in earlier versions of Perl, some
|
||||||
|
of which (such as named parentheses) have been in PCRE2 for some time. This
|
||||||
|
list is with respect to Perl 5.10:
|
||||||
|
.sp
|
||||||
|
(a) Although lookbehind assertions in PCRE2 must match fixed length strings,
|
||||||
|
each alternative branch of a lookbehind assertion can match a different length
|
||||||
|
of string. Perl requires them all to have the same length.
|
||||||
|
.sp
|
||||||
|
(b) If PCRE2_DOLLAR_ENDONLY is set and PCRE2_MULTILINE is not set, the $
|
||||||
|
meta-character matches only at the very end of the string.
|
||||||
|
.sp
|
||||||
|
(c) A backslash followed by a letter with no special meaning is faulted. (Perl
|
||||||
|
can be made to issue a warning.)
|
||||||
|
.sp
|
||||||
|
(d) If PCRE2_UNGREEDY is set, the greediness of the repetition quantifiers is
|
||||||
|
inverted, that is, by default they are not greedy, but if followed by a
|
||||||
|
question mark they are.
|
||||||
|
.sp
|
||||||
|
(e) PCRE2_ANCHORED can be used at matching time to force a pattern to be tried
|
||||||
|
only at the first matching position in the subject string.
|
||||||
|
.sp
|
||||||
|
(f) The PCRE2_NOTBOL, PCRE2_NOTEOL, PCRE2_NOTEMPTY, PCRE2_NOTEMPTY_ATSTART, and
|
||||||
|
PCRE2_NO_AUTO_CAPTURE options have no Perl equivalents.
|
||||||
|
.sp
|
||||||
|
(g) The \eR escape sequence can be restricted to match only CR, LF, or CRLF
|
||||||
|
by the PCRE2_BSR_ANYCRLF option.
|
||||||
|
.sp
|
||||||
|
(h) The callout facility is PCRE2-specific.
|
||||||
|
.sp
|
||||||
|
(i) The partial matching facility is PCRE2-specific.
|
||||||
|
.sp
|
||||||
|
(j) The alternative matching function (\fBpcre2_dfa_match()\fP) matches in a
|
||||||
|
different way and is not Perl-compatible.
|
||||||
|
.sp
|
||||||
|
(k) PCRE2 recognizes some special sequences such as (*CR) at the start of
|
||||||
|
a pattern that set overall options that cannot be changed within the pattern.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
.fi
|
|
@ -0,0 +1,683 @@
|
||||||
|
.TH PCRE2GREP 1 "28 September 2014" "PCRE2 10.00"
|
||||||
|
.SH NAME
|
||||||
|
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.B pcre2grep [options] [long options] [pattern] [path1 path2 ...]
|
||||||
|
.
|
||||||
|
.SH DESCRIPTION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2grep\fP searches files for character patterns, in the same way as other
|
||||||
|
grep commands do, but it uses the PCRE2 regular expression library to support
|
||||||
|
patterns that are compatible with the regular expressions of Perl 5. See
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2syntax\fP(3)
|
||||||
|
.\"
|
||||||
|
for a quick-reference summary of pattern syntax, or
|
||||||
|
.\" HREF
|
||||||
|
\fBpcre2pattern\fP(3)
|
||||||
|
.\"
|
||||||
|
for a full description of the syntax and semantics of the regular expressions
|
||||||
|
that PCRE2 supports.
|
||||||
|
.P
|
||||||
|
Patterns, whether supplied on the command line or in a separate file, are given
|
||||||
|
without delimiters. For example:
|
||||||
|
.sp
|
||||||
|
pcre2grep Thursday /etc/motd
|
||||||
|
.sp
|
||||||
|
If you attempt to use delimiters (for example, by surrounding a pattern with
|
||||||
|
slashes, as is common in Perl scripts), they are interpreted as part of the
|
||||||
|
pattern. Quotes can of course be used to delimit patterns on the command line
|
||||||
|
because they are interpreted by the shell, and indeed quotes are required if a
|
||||||
|
pattern contains white space or shell metacharacters.
|
||||||
|
.P
|
||||||
|
The first argument that follows any option settings is treated as the single
|
||||||
|
pattern to be matched when neither \fB-e\fP nor \fB-f\fP is present.
|
||||||
|
Conversely, when one or both of these options are used to specify patterns, all
|
||||||
|
arguments are treated as path names. At least one of \fB-e\fP, \fB-f\fP, or an
|
||||||
|
argument pattern must be provided.
|
||||||
|
.P
|
||||||
|
If no files are specified, \fBpcre2grep\fP reads the standard input. The
|
||||||
|
standard input can also be referenced by a name consisting of a single hyphen.
|
||||||
|
For example:
|
||||||
|
.sp
|
||||||
|
pcre2grep some-pattern /file1 - /file3
|
||||||
|
.sp
|
||||||
|
By default, each line that matches a pattern is copied to the standard
|
||||||
|
output, and if there is more than one file, the file name is output at the
|
||||||
|
start of each line, followed by a colon. However, there are options that can
|
||||||
|
change how \fBpcre2grep\fP behaves. In particular, the \fB-M\fP option makes it
|
||||||
|
possible to search for patterns that span line boundaries. What defines a line
|
||||||
|
boundary is controlled by the \fB-N\fP (\fB--newline\fP) option.
|
||||||
|
.P
|
||||||
|
The amount of memory used for buffering files that are being scanned is
|
||||||
|
controlled by a parameter that can be set by the \fB--buffer-size\fP option.
|
||||||
|
The default value for this parameter is specified when \fBpcre2grep\fP is built,
|
||||||
|
with the default default being 20K. A block of memory three times this size is
|
||||||
|
used (to allow for buffering "before" and "after" lines). An error occurs if a
|
||||||
|
line overflows the buffer.
|
||||||
|
.P
|
||||||
|
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater.
|
||||||
|
BUFSIZ is defined in \fB<stdio.h>\fP. When there is more than one pattern
|
||||||
|
(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to
|
||||||
|
each line in the order in which they are defined, except that all the \fB-e\fP
|
||||||
|
patterns are tried before the \fB-f\fP patterns.
|
||||||
|
.P
|
||||||
|
By default, as soon as one pattern matches a line, no further patterns are
|
||||||
|
considered. However, if \fB--colour\fP (or \fB--color\fP) is used to colour the
|
||||||
|
matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or
|
||||||
|
\fB--line-offsets\fP is used to output only the part of the line that matched
|
||||||
|
(either shown literally, or as an offset), scanning resumes immediately
|
||||||
|
following the match, so that further matches on the same line can be found. If
|
||||||
|
there are multiple patterns, they are all tried on the remainder of the line,
|
||||||
|
but patterns that follow the one that matched are not tried on the earlier part
|
||||||
|
of the line.
|
||||||
|
.P
|
||||||
|
This behaviour means that the order in which multiple patterns are specified
|
||||||
|
can affect the output when one of the above options is used. This is no longer
|
||||||
|
the same behaviour as GNU grep, which now manages to display earlier matches
|
||||||
|
for later patterns (as long as there is no overlap).
|
||||||
|
.P
|
||||||
|
Patterns that can match an empty string are accepted, but empty string
|
||||||
|
matches are never recognized. An example is the pattern "(super)?(man)?", in
|
||||||
|
which all components are optional. This pattern finds all occurrences of both
|
||||||
|
"super" and "man"; the output differs from matching with "super|man" when only
|
||||||
|
the matching substrings are being shown.
|
||||||
|
.P
|
||||||
|
If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set,
|
||||||
|
\fBpcre2grep\fP uses the value to set a locale when calling the PCRE2 library.
|
||||||
|
The \fB--locale\fP option can be used to override this.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "SUPPORT FOR COMPRESSED FILES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
It is possible to compile \fBpcre2grep\fP so that it uses \fBlibz\fP or
|
||||||
|
\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP,
|
||||||
|
respectively. You can find out whether your binary has support for one or both
|
||||||
|
of these file types by running it with the \fB--help\fP option. If the
|
||||||
|
appropriate support is not present, files are treated as plain text. The
|
||||||
|
standard input is always so treated.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "BINARY FILES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
By default, a file that contains a binary zero byte within the first 1024 bytes
|
||||||
|
is identified as a binary file, and is processed specially. (GNU grep also
|
||||||
|
identifies binary files in this manner.) See the \fB--binary-files\fP option
|
||||||
|
for a means of changing the way binary files are handled.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH OPTIONS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The order in which some of the options appear can affect the output. For
|
||||||
|
example, both the \fB-h\fP and \fB-l\fP options affect the printing of file
|
||||||
|
names. Whichever comes later in the command line will be the one that takes
|
||||||
|
effect. Similarly, except where noted below, if an option is given twice, the
|
||||||
|
later setting is used. Numerical values for options may be followed by K or M,
|
||||||
|
to signify multiplication by 1024 or 1024*1024 respectively.
|
||||||
|
.TP 10
|
||||||
|
\fB--\fP
|
||||||
|
This terminates the list of options. It is useful if the next item on the
|
||||||
|
command line starts with a hyphen but is not an option. This allows for the
|
||||||
|
processing of patterns and filenames that start with hyphens.
|
||||||
|
.TP
|
||||||
|
\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP
|
||||||
|
Output \fInumber\fP lines of context after each matching line. If filenames
|
||||||
|
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||||
|
colon for the context lines. A line containing "--" is output between each
|
||||||
|
group of lines, unless they are in fact contiguous in the input file. The value
|
||||||
|
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
|
||||||
|
guarantees to have up to 8K of following text available for context output.
|
||||||
|
.TP
|
||||||
|
\fB-a\fP, \fB--text\fP
|
||||||
|
Treat binary files as text. This is equivalent to
|
||||||
|
\fB--binary-files\fP=\fItext\fP.
|
||||||
|
.TP
|
||||||
|
\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP
|
||||||
|
Output \fInumber\fP lines of context before each matching line. If filenames
|
||||||
|
and/or line numbers are being output, a hyphen separator is used instead of a
|
||||||
|
colon for the context lines. A line containing "--" is output between each
|
||||||
|
group of lines, unless they are in fact contiguous in the input file. The value
|
||||||
|
of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP
|
||||||
|
guarantees to have up to 8K of preceding text available for context output.
|
||||||
|
.TP
|
||||||
|
\fB--binary-files=\fP\fIword\fP
|
||||||
|
Specify how binary files are to be processed. If the word is "binary" (the
|
||||||
|
default), pattern matching is performed on binary files, but the only output is
|
||||||
|
"Binary file <name> matches" when a match succeeds. If the word is "text",
|
||||||
|
which is equivalent to the \fB-a\fP or \fB--text\fP option, binary files are
|
||||||
|
processed in the same way as any other file. In this case, when a match
|
||||||
|
succeeds, the output may be binary garbage, which can have nasty effects if
|
||||||
|
sent to a terminal. If the word is "without-match", which is equivalent to the
|
||||||
|
\fB-I\fP option, binary files are not processed at all; they are assumed not to
|
||||||
|
be of interest.
|
||||||
|
.TP
|
||||||
|
\fB--buffer-size=\fP\fInumber\fP
|
||||||
|
Set the parameter that controls how much memory is used for buffering files
|
||||||
|
that are being scanned.
|
||||||
|
.TP
|
||||||
|
\fB-C\fP \fInumber\fP, \fB--context=\fP\fInumber\fP
|
||||||
|
Output \fInumber\fP lines of context both before and after each matching line.
|
||||||
|
This is equivalent to setting both \fB-A\fP and \fB-B\fP to the same value.
|
||||||
|
.TP
|
||||||
|
\fB-c\fP, \fB--count\fP
|
||||||
|
Do not output individual lines from the files that are being scanned; instead
|
||||||
|
output the number of lines that would otherwise have been shown. If no lines
|
||||||
|
are selected, the number zero is output. If several files are being
|
||||||
|
scanned, a count is output for each of them. However, if the
|
||||||
|
\fB--files-with-matches\fP option is also used, only those files whose counts
|
||||||
|
are greater than zero are listed. When \fB-c\fP is used, the \fB-A\fP,
|
||||||
|
\fB-B\fP, and \fB-C\fP options are ignored.
|
||||||
|
.TP
|
||||||
|
\fB--colour\fP, \fB--color\fP
|
||||||
|
If this option is given without any data, it is equivalent to "--colour=auto".
|
||||||
|
If data is required, it must be given in the same shell item, separated by an
|
||||||
|
equals sign.
|
||||||
|
.TP
|
||||||
|
\fB--colour=\fP\fIvalue\fP, \fB--color=\fP\fIvalue\fP
|
||||||
|
This option specifies under what circumstances the parts of a line that matched
|
||||||
|
a pattern should be coloured in the output. By default, the output is not
|
||||||
|
coloured. The value (which is optional, see above) may be "never", "always", or
|
||||||
|
"auto". In the latter case, colouring happens only if the standard output is
|
||||||
|
connected to a terminal. More resources are used when colouring is enabled,
|
||||||
|
because \fBpcre2grep\fP has to search for all possible matches in a line, not
|
||||||
|
just one, in order to colour them all.
|
||||||
|
.sp
|
||||||
|
The colour that is used can be specified by setting the environment variable
|
||||||
|
PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
|
||||||
|
string of two numbers, separated by a semicolon. They are copied directly into
|
||||||
|
the control string for setting colour on a terminal, so it is your
|
||||||
|
responsibility to ensure that they make sense. If neither of the environment
|
||||||
|
variables is set, the default is "1;31", which gives red.
|
||||||
|
.TP
|
||||||
|
\fB-D\fP \fIaction\fP, \fB--devices=\fP\fIaction\fP
|
||||||
|
If an input path is not a regular file or a directory, "action" specifies how
|
||||||
|
it is to be processed. Valid values are "read" (the default) or "skip"
|
||||||
|
(silently skip the path).
|
||||||
|
.TP
|
||||||
|
\fB-d\fP \fIaction\fP, \fB--directories=\fP\fIaction\fP
|
||||||
|
If an input path is a directory, "action" specifies how it is to be processed.
|
||||||
|
Valid values are "read" (the default in non-Windows environments, for
|
||||||
|
compatibility with GNU grep), "recurse" (equivalent to the \fB-r\fP option), or
|
||||||
|
"skip" (silently skip the path, the default in Windows environments). In the
|
||||||
|
"read" case, directories are read as if they were ordinary files. In some
|
||||||
|
operating systems the effect of reading a directory like this is an immediate
|
||||||
|
end-of-file; in others it may provoke an error.
|
||||||
|
.TP
|
||||||
|
\fB-e\fP \fIpattern\fP, \fB--regex=\fP\fIpattern\fP, \fB--regexp=\fP\fIpattern\fP
|
||||||
|
Specify a pattern to be matched. This option can be used multiple times in
|
||||||
|
order to specify several patterns. It can also be used as a way of specifying a
|
||||||
|
single pattern that starts with a hyphen. When \fB-e\fP is used, no argument
|
||||||
|
pattern is taken from the command line; all arguments are treated as file
|
||||||
|
names. There is no limit to the number of patterns. They are applied to each
|
||||||
|
line in the order in which they are defined until one matches.
|
||||||
|
.sp
|
||||||
|
If \fB-f\fP is used with \fB-e\fP, the command line patterns are matched first,
|
||||||
|
followed by the patterns from the file(s), independent of the order in which
|
||||||
|
these options are specified. Note that multiple use of \fB-e\fP is not the same
|
||||||
|
as a single pattern with alternatives. For example, X|Y finds the first
|
||||||
|
character in a line that is X or Y, whereas if the two patterns are given
|
||||||
|
separately, with X first, \fBpcre2grep\fP finds X if it is present, even if it
|
||||||
|
follows Y in the line. It finds Y only if there is no X in the line. This
|
||||||
|
matters only if you are using \fB-o\fP or \fB--colo(u)r\fP to show the part(s)
|
||||||
|
of the line that matched.
|
||||||
|
.TP
|
||||||
|
\fB--exclude\fP=\fIpattern\fP
|
||||||
|
Files (but not directories) whose names match the pattern are skipped without
|
||||||
|
being processed. This applies to all files, whether listed on the command line,
|
||||||
|
obtained from \fB--file-list\fP, or by scanning a directory. The pattern is a
|
||||||
|
PCRE2 regular expression, and is matched against the final component of the file
|
||||||
|
name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not
|
||||||
|
apply to this pattern. The option may be given any number of times in order to
|
||||||
|
specify multiple patterns. If a file name matches both an \fB--include\fP
|
||||||
|
and an \fB--exclude\fP pattern, it is excluded. There is no short form for this
|
||||||
|
option.
|
||||||
|
.TP
|
||||||
|
\fB--exclude-from=\fP\fIfilename\fP
|
||||||
|
Treat each non-empty line of the file as the data for an \fB--exclude\fP
|
||||||
|
option. What constitutes a newline when reading the file is the operating
|
||||||
|
system's default. The \fB--newline\fP option has no effect on this option. This
|
||||||
|
option may be given more than once in order to specify a number of files to
|
||||||
|
read.
|
||||||
|
.TP
|
||||||
|
\fB--exclude-dir\fP=\fIpattern\fP
|
||||||
|
Directories whose names match the pattern are skipped without being processed,
|
||||||
|
whatever the setting of the \fB--recursive\fP option. This applies to all
|
||||||
|
directories, whether listed on the command line, obtained from
|
||||||
|
\fB--file-list\fP, or by scanning a parent directory. The pattern is a PCRE2
|
||||||
|
regular expression, and is matched against the final component of the directory
|
||||||
|
name, not the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not
|
||||||
|
apply to this pattern. The option may be given any number of times in order to
|
||||||
|
specify more than one pattern. If a directory matches both \fB--include-dir\fP
|
||||||
|
and \fB--exclude-dir\fP, it is excluded. There is no short form for this
|
||||||
|
option.
|
||||||
|
.TP
|
||||||
|
\fB-F\fP, \fB--fixed-strings\fP
|
||||||
|
Interpret each data-matching pattern as a list of fixed strings, separated by
|
||||||
|
newlines, instead of as a regular expression. What constitutes a newline for
|
||||||
|
this purpose is controlled by the \fB--newline\fP option. The \fB-w\fP (match
|
||||||
|
as a word) and \fB-x\fP (match whole line) options can be used with \fB-F\fP.
|
||||||
|
They apply to each of the fixed strings. A line is selected if any of the fixed
|
||||||
|
strings are found in it (subject to \fB-w\fP or \fB-x\fP, if present). This
|
||||||
|
option applies only to the patterns that are matched against the contents of
|
||||||
|
files; it does not apply to patterns specified by any of the \fB--include\fP or
|
||||||
|
\fB--exclude\fP options.
|
||||||
|
.TP
|
||||||
|
\fB-f\fP \fIfilename\fP, \fB--file=\fP\fIfilename\fP
|
||||||
|
Read patterns from the file, one per line, and match them against
|
||||||
|
each line of input. What constitutes a newline when reading the file is the
|
||||||
|
operating system's default. The \fB--newline\fP option has no effect on this
|
||||||
|
option. Trailing white space is removed from each line, and blank lines are
|
||||||
|
ignored. An empty file contains no patterns and therefore matches nothing. See
|
||||||
|
also the comments about multiple patterns versus a single pattern with
|
||||||
|
alternatives in the description of \fB-e\fP above.
|
||||||
|
.sp
|
||||||
|
If this option is given more than once, all the specified files are
|
||||||
|
read. A data line is output if any of the patterns match it. A filename can
|
||||||
|
be given as "-" to refer to the standard input. When \fB-f\fP is used, patterns
|
||||||
|
specified on the command line using \fB-e\fP may also be present; they are
|
||||||
|
tested before the file's patterns. However, no other pattern is taken from the
|
||||||
|
command line; all arguments are treated as the names of paths to be searched.
|
||||||
|
.TP
|
||||||
|
\fB--file-list\fP=\fIfilename\fP
|
||||||
|
Read a list of files and/or directories that are to be scanned from the given
|
||||||
|
file, one per line. Trailing white space is removed from each line, and blank
|
||||||
|
lines are ignored. These paths are processed before any that are listed on the
|
||||||
|
command line. The filename can be given as "-" to refer to the standard input.
|
||||||
|
If \fB--file\fP and \fB--file-list\fP are both specified as "-", patterns are
|
||||||
|
read first. This is useful only when the standard input is a terminal, from
|
||||||
|
which further lines (the list of files) can be read after an end-of-file
|
||||||
|
indication. If this option is given more than once, all the specified files are
|
||||||
|
read.
|
||||||
|
.TP
|
||||||
|
\fB--file-offsets\fP
|
||||||
|
Instead of showing lines or parts of lines that match, show each match as an
|
||||||
|
offset from the start of the file and a length, separated by a comma. In this
|
||||||
|
mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP
|
||||||
|
options are ignored. If there is more than one match in a line, each of them is
|
||||||
|
shown separately. This option is mutually exclusive with \fB--line-offsets\fP
|
||||||
|
and \fB--only-matching\fP.
|
||||||
|
.TP
|
||||||
|
\fB-H\fP, \fB--with-filename\fP
|
||||||
|
Force the inclusion of the filename at the start of output lines when searching
|
||||||
|
a single file. By default, the filename is not shown in this case. For matching
|
||||||
|
lines, the filename is followed by a colon; for context lines, a hyphen
|
||||||
|
separator is used. If a line number is also being output, it follows the file
|
||||||
|
name.
|
||||||
|
.TP
|
||||||
|
\fB-h\fP, \fB--no-filename\fP
|
||||||
|
Suppress the output filenames when searching multiple files. By default,
|
||||||
|
filenames are shown when multiple files are searched. For matching lines, the
|
||||||
|
filename is followed by a colon; for context lines, a hyphen separator is used.
|
||||||
|
If a line number is also being output, it follows the file name.
|
||||||
|
.TP
|
||||||
|
\fB--help\fP
|
||||||
|
Output a help message, giving brief details of the command options and file
|
||||||
|
type support, and then exit. Anything else on the command line is
|
||||||
|
ignored.
|
||||||
|
.TP
|
||||||
|
\fB-I\fP
|
||||||
|
Treat binary files as never matching. This is equivalent to
|
||||||
|
\fB--binary-files\fP=\fIwithout-match\fP.
|
||||||
|
.TP
|
||||||
|
\fB-i\fP, \fB--ignore-case\fP
|
||||||
|
Ignore upper/lower case distinctions during comparisons.
|
||||||
|
.TP
|
||||||
|
\fB--include\fP=\fIpattern\fP
|
||||||
|
If any \fB--include\fP patterns are specified, the only files that are
|
||||||
|
processed are those that match one of the patterns (and do not match an
|
||||||
|
\fB--exclude\fP pattern). This option does not affect directories, but it
|
||||||
|
applies to all files, whether listed on the command line, obtained from
|
||||||
|
\fB--file-list\fP, or by scanning a directory. The pattern is a PCRE2 regular
|
||||||
|
expression, and is matched against the final component of the file name, not
|
||||||
|
the entire path. The \fB-F\fP, \fB-w\fP, and \fB-x\fP options do not apply to
|
||||||
|
this pattern. The option may be given any number of times. If a file name
|
||||||
|
matches both an \fB--include\fP and an \fB--exclude\fP pattern, it is excluded.
|
||||||
|
There is no short form for this option.
|
||||||
|
.TP
|
||||||
|
\fB--include-from=\fP\fIfilename\fP
|
||||||
|
Treat each non-empty line of the file as the data for an \fB--include\fP
|
||||||
|
option. What constitutes a newline for this purpose is the operating system's
|
||||||
|
default. The \fB--newline\fP option has no effect on this option. This option
|
||||||
|
may be given any number of times; all the files are read.
|
||||||
|
.TP
|
||||||
|
\fB--include-dir\fP=\fIpattern\fP
|
||||||
|
If any \fB--include-dir\fP patterns are specified, the only directories that
|
||||||
|
are processed are those that match one of the patterns (and do not match an
|
||||||
|
\fB--exclude-dir\fP pattern). This applies to all directories, whether listed
|
||||||
|
on the command line, obtained from \fB--file-list\fP, or by scanning a parent
|
||||||
|
directory. The pattern is a PCRE2 regular expression, and is matched against the
|
||||||
|
final component of the directory name, not the entire path. The \fB-F\fP,
|
||||||
|
\fB-w\fP, and \fB-x\fP options do not apply to this pattern. The option may be
|
||||||
|
given any number of times. If a directory matches both \fB--include-dir\fP and
|
||||||
|
\fB--exclude-dir\fP, it is excluded. There is no short form for this option.
|
||||||
|
.TP
|
||||||
|
\fB-L\fP, \fB--files-without-match\fP
|
||||||
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
|
that do not contain any lines that would have been output. Each file name is
|
||||||
|
output once, on a separate line.
|
||||||
|
.TP
|
||||||
|
\fB-l\fP, \fB--files-with-matches\fP
|
||||||
|
Instead of outputting lines from the files, just output the names of the files
|
||||||
|
containing lines that would have been output. Each file name is output
|
||||||
|
once, on a separate line. Searching normally stops as soon as a matching line
|
||||||
|
is found in a file. However, if the \fB-c\fP (count) option is also used,
|
||||||
|
matching continues in order to obtain the correct count, and those files that
|
||||||
|
have at least one match are listed along with their counts. Using this option
|
||||||
|
with \fB-c\fP is a way of suppressing the listing of files with no matches.
|
||||||
|
.TP
|
||||||
|
\fB--label\fP=\fIname\fP
|
||||||
|
This option supplies a name to be used for the standard input when file names
|
||||||
|
are being output. If not supplied, "(standard input)" is used. There is no
|
||||||
|
short form for this option.
|
||||||
|
.TP
|
||||||
|
\fB--line-buffered\fP
|
||||||
|
When this option is given, input is read and processed line by line, and the
|
||||||
|
output is flushed after each write. By default, input is read in large chunks,
|
||||||
|
unless \fBpcre2grep\fP can determine that it is reading from a terminal (which
|
||||||
|
is currently possible only in Unix-like environments). Output to terminal is
|
||||||
|
normally automatically flushed by the operating system. This option can be
|
||||||
|
useful when the input or output is attached to a pipe and you do not want
|
||||||
|
\fBpcre2grep\fP to buffer up large amounts of data. However, its use will affect
|
||||||
|
performance, and the \fB-M\fP (multiline) option ceases to work.
|
||||||
|
.TP
|
||||||
|
\fB--line-offsets\fP
|
||||||
|
Instead of showing lines or parts of lines that match, show each match as a
|
||||||
|
line number, the offset from the start of the line, and a length. The line
|
||||||
|
number is terminated by a colon (as usual; see the \fB-n\fP option), and the
|
||||||
|
offset and length are separated by a comma. In this mode, no context is shown.
|
||||||
|
That is, the \fB-A\fP, \fB-B\fP, and \fB-C\fP options are ignored. If there is
|
||||||
|
more than one match in a line, each of them is shown separately. This option is
|
||||||
|
mutually exclusive with \fB--file-offsets\fP and \fB--only-matching\fP.
|
||||||
|
.TP
|
||||||
|
\fB--locale\fP=\fIlocale-name\fP
|
||||||
|
This option specifies a locale to be used for pattern matching. It overrides
|
||||||
|
the value in the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variables. If no
|
||||||
|
locale is specified, the PCRE2 library's default (usually the "C" locale) is
|
||||||
|
used. There is no short form for this option.
|
||||||
|
.TP
|
||||||
|
\fB--match-limit\fP=\fInumber\fP
|
||||||
|
Processing some regular expression patterns can require a very large amount of
|
||||||
|
memory, leading in some cases to a program crash if not enough is available.
|
||||||
|
Other patterns may take a very long time to search for all possible matching
|
||||||
|
strings. The \fBpcre2_match()\fP function that is called by \fBpcre2grep\fP to do
|
||||||
|
the matching has two parameters that can limit the resources that it uses.
|
||||||
|
.sp
|
||||||
|
The \fB--match-limit\fP option provides a means of limiting resource usage
|
||||||
|
when processing patterns that are not going to match, but which have a very
|
||||||
|
large number of possibilities in their search trees. The classic example is a
|
||||||
|
pattern that uses nested unlimited repeats. Internally, PCRE2 uses a function
|
||||||
|
called \fBmatch()\fP which it calls repeatedly (sometimes recursively). The
|
||||||
|
limit set by \fB--match-limit\fP is imposed on the number of times this
|
||||||
|
function is called during a match, which has the effect of limiting the amount
|
||||||
|
of backtracking that can take place.
|
||||||
|
.sp
|
||||||
|
The \fB--recursion-limit\fP option is similar to \fB--match-limit\fP, but
|
||||||
|
instead of limiting the total number of times that \fBmatch()\fP is called, it
|
||||||
|
limits the depth of recursive calls, which in turn limits the amount of memory
|
||||||
|
that can be used. The recursion depth is a smaller number than the total number
|
||||||
|
of calls, because not all calls to \fBmatch()\fP are recursive. This limit is
|
||||||
|
of use only if it is set smaller than \fB--match-limit\fP.
|
||||||
|
.sp
|
||||||
|
There are no short forms for these options. The default settings are specified
|
||||||
|
when the PCRE2 library is compiled, with the default default being 10 million.
|
||||||
|
.TP
|
||||||
|
\fB-M\fP, \fB--multiline\fP
|
||||||
|
Allow patterns to match more than one line. When this option is given, patterns
|
||||||
|
may usefully contain literal newline characters and internal occurrences of ^
|
||||||
|
and $ characters. The output for a successful match may consist of more than
|
||||||
|
one line, the last of which is the one in which the match ended. If the matched
|
||||||
|
string ends with a newline sequence the output ends at the end of that line.
|
||||||
|
.sp
|
||||||
|
When this option is set, the PCRE2 library is called in "multiline" mode.
|
||||||
|
There is a limit to the number of lines that can be matched, imposed by the way
|
||||||
|
that \fBpcre2grep\fP buffers the input file as it scans it. However,
|
||||||
|
\fBpcre2grep\fP ensures that at least 8K characters or the rest of the document
|
||||||
|
(whichever is the shorter) are available for forward matching, and similarly
|
||||||
|
the previous 8K characters (or all the previous characters, if fewer than 8K)
|
||||||
|
are guaranteed to be available for lookbehind assertions. This option does not
|
||||||
|
work when input is read line by line (see \fB--line-buffered\fP).
|
||||||
|
.TP
|
||||||
|
\fB-N\fP \fInewline-type\fP, \fB--newline\fP=\fInewline-type\fP
|
||||||
|
The PCRE2 library supports five different conventions for indicating
|
||||||
|
the ends of lines. They are the single-character sequences CR (carriage return)
|
||||||
|
and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
|
||||||
|
which recognizes any of the preceding three types, and an "any" convention, in
|
||||||
|
which any Unicode line ending sequence is assumed to end a line. The Unicode
|
||||||
|
sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
|
||||||
|
(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
|
||||||
|
PS (paragraph separator, U+2029).
|
||||||
|
.sp
|
||||||
|
When the PCRE2 library is built, a default line-ending sequence is specified.
|
||||||
|
This is normally the standard sequence for the operating system. Unless
|
||||||
|
otherwise specified by this option, \fBpcre2grep\fP uses the library's default.
|
||||||
|
The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
|
||||||
|
makes it possible to use \fBpcre2grep\fP to scan files that have come from other
|
||||||
|
environments without having to modify their line endings. If the data that is
|
||||||
|
being scanned does not agree with the convention set by this option,
|
||||||
|
\fBpcre2grep\fP may behave in strange ways. Note that this option does not
|
||||||
|
apply to files specified by the \fB-f\fP, \fB--exclude-from\fP, or
|
||||||
|
\fB--include-from\fP options, which are expected to use the operating system's
|
||||||
|
standard newline sequence.
|
||||||
|
.TP
|
||||||
|
\fB-n\fP, \fB--line-number\fP
|
||||||
|
Precede each output line by its line number in the file, followed by a colon
|
||||||
|
for matching lines or a hyphen for context lines. If the filename is also being
|
||||||
|
output, it precedes the line number. This option is forced if
|
||||||
|
\fB--line-offsets\fP is used.
|
||||||
|
.TP
|
||||||
|
\fB--no-jit\fP
|
||||||
|
If the PCRE2 library is built with support for just-in-time compiling (which
|
||||||
|
speeds up matching), \fBpcre2grep\fP automatically makes use of this, unless it
|
||||||
|
was explicitly disabled at build time. This option can be used to disable the
|
||||||
|
use of JIT at run time. It is provided for testing and working round problems.
|
||||||
|
It should never be needed in normal use.
|
||||||
|
.TP
|
||||||
|
\fB-o\fP, \fB--only-matching\fP
|
||||||
|
Show only the part of the line that matched a pattern instead of the whole
|
||||||
|
line. In this mode, no context is shown. That is, the \fB-A\fP, \fB-B\fP, and
|
||||||
|
\fB-C\fP options are ignored. If there is more than one match in a line, each
|
||||||
|
of them is shown separately. If \fB-o\fP is combined with \fB-v\fP (invert the
|
||||||
|
sense of the match to find non-matching lines), no output is generated, but the
|
||||||
|
return code is set appropriately. If the matched portion of the line is empty,
|
||||||
|
nothing is output unless the file name or line number are being printed, in
|
||||||
|
which case they are shown on an otherwise empty line. This option is mutually
|
||||||
|
exclusive with \fB--file-offsets\fP and \fB--line-offsets\fP.
|
||||||
|
.TP
|
||||||
|
\fB-o\fP\fInumber\fP, \fB--only-matching\fP=\fInumber\fP
|
||||||
|
Show only the part of the line that matched the capturing parentheses of the
|
||||||
|
given number. Up to 32 capturing parentheses are supported, and -o0 is
|
||||||
|
equivalent to \fB-o\fP without a number. Because these options can be given
|
||||||
|
without an argument (see above), if an argument is present, it must be given in
|
||||||
|
the same shell item, for example, -o3 or --only-matching=2. The comments given
|
||||||
|
for the non-argument case above also apply to this case. If the specified
|
||||||
|
capturing parentheses do not exist in the pattern, or were not set in the
|
||||||
|
match, nothing is output unless the file name or line number are being printed.
|
||||||
|
.sp
|
||||||
|
If this option is given multiple times, multiple substrings are output, in the
|
||||||
|
order the options are given. For example, -o3 -o1 -o3 causes the substrings
|
||||||
|
matched by capturing parentheses 3 and 1 and then 3 again to be output. By
|
||||||
|
default, there is no separator (but see the next option).
|
||||||
|
.TP
|
||||||
|
\fB--om-separator\fP=\fItext\fP
|
||||||
|
Specify a separating string for multiple occurrences of \fB-o\fP. The default
|
||||||
|
is an empty string. Separating strings are never coloured.
|
||||||
|
.TP
|
||||||
|
\fB-q\fP, \fB--quiet\fP
|
||||||
|
Work quietly, that is, display nothing except error messages. The exit
|
||||||
|
status indicates whether or not any matches were found.
|
||||||
|
.TP
|
||||||
|
\fB-r\fP, \fB--recursive\fP
|
||||||
|
If any given path is a directory, recursively scan the files it contains,
|
||||||
|
taking note of any \fB--include\fP and \fB--exclude\fP settings. By default, a
|
||||||
|
directory is read as a normal file; in some operating systems this gives an
|
||||||
|
immediate end-of-file. This option is a shorthand for setting the \fB-d\fP
|
||||||
|
option to "recurse".
|
||||||
|
.TP
|
||||||
|
\fB--recursion-limit\fP=\fInumber\fP
|
||||||
|
See \fB--match-limit\fP above.
|
||||||
|
.TP
|
||||||
|
\fB-s\fP, \fB--no-messages\fP
|
||||||
|
Suppress error messages about non-existent or unreadable files. Such files are
|
||||||
|
quietly skipped. However, the return code is still 2, even if matches were
|
||||||
|
found in other files.
|
||||||
|
.TP
|
||||||
|
\fB-u\fP, \fB--utf-8\fP
|
||||||
|
Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled
|
||||||
|
with UTF-8 support. All patterns (including those for any \fB--exclude\fP and
|
||||||
|
\fB--include\fP options) and all subject lines that are scanned must be valid
|
||||||
|
strings of UTF-8 characters.
|
||||||
|
.TP
|
||||||
|
\fB-V\fP, \fB--version\fP
|
||||||
|
Write the version numbers of \fBpcre2grep\fP and the PCRE2 library to the
|
||||||
|
standard output and then exit. Anything else on the command line is
|
||||||
|
ignored.
|
||||||
|
.TP
|
||||||
|
\fB-v\fP, \fB--invert-match\fP
|
||||||
|
Invert the sense of the match, so that lines which do \fInot\fP match any of
|
||||||
|
the patterns are the ones that are found.
|
||||||
|
.TP
|
||||||
|
\fB-w\fP, \fB--word-regex\fP, \fB--word-regexp\fP
|
||||||
|
Force the patterns to match only whole words. This is equivalent to having \eb
|
||||||
|
at the start and end of the pattern. This option applies only to the patterns
|
||||||
|
that are matched against the contents of files; it does not apply to patterns
|
||||||
|
specified by any of the \fB--include\fP or \fB--exclude\fP options.
|
||||||
|
.TP
|
||||||
|
\fB-x\fP, \fB--line-regex\fP, \fB--line-regexp\fP
|
||||||
|
Force the patterns to be anchored (each must start matching at the beginning of
|
||||||
|
a line) and in addition, require them to match entire lines. This is equivalent
|
||||||
|
to having ^ and $ characters at the start and end of each alternative branch in
|
||||||
|
every pattern. This option applies only to the patterns that are matched
|
||||||
|
against the contents of files; it does not apply to patterns specified by any
|
||||||
|
of the \fB--include\fP or \fB--exclude\fP options.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "ENVIRONMENT VARIABLES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The environment variables \fBLC_ALL\fP and \fBLC_CTYPE\fP are examined, in that
|
||||||
|
order, for a locale. The first one that is set is used. This can be overridden
|
||||||
|
by the \fB--locale\fP option. If no locale is set, the PCRE2 library's default
|
||||||
|
(usually the "C" locale) is used.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "NEWLINES"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
The \fB-N\fP (\fB--newline\fP) option allows \fBpcre2grep\fP to scan files with
|
||||||
|
different newline conventions from the default. Any parts of the input files
|
||||||
|
that are written to the standard output are copied identically, with whatever
|
||||||
|
newline sequences they have in the input. However, the setting of this option
|
||||||
|
does not affect the interpretation of files specified by the \fB-f\fP,
|
||||||
|
\fB--exclude-from\fP, or \fB--include-from\fP options, which are assumed to use
|
||||||
|
the operating system's standard newline sequence, nor does it affect the way in
|
||||||
|
which \fBpcre2grep\fP writes informational messages to the standard error and
|
||||||
|
output streams. For these it uses the string "\en" to indicate newlines,
|
||||||
|
relying on the C I/O library to convert this to an appropriate sequence.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "OPTIONS COMPATIBILITY"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Many of the short and long forms of \fBpcre2grep\fP's options are the same
|
||||||
|
as in the GNU \fBgrep\fP program. Any long option of the form
|
||||||
|
\fB--xxx-regexp\fP (GNU terminology) is also available as \fB--xxx-regex\fP
|
||||||
|
(PCRE2 terminology). However, the \fB--file-list\fP, \fB--file-offsets\fP,
|
||||||
|
\fB--include-dir\fP, \fB--line-offsets\fP, \fB--locale\fP, \fB--match-limit\fP,
|
||||||
|
\fB-M\fP, \fB--multiline\fP, \fB-N\fP, \fB--newline\fP, \fB--om-separator\fP,
|
||||||
|
\fB--recursion-limit\fP, \fB-u\fP, and \fB--utf-8\fP options are specific to
|
||||||
|
\fBpcre2grep\fP, as is the use of the \fB--only-matching\fP option with a
|
||||||
|
capturing parentheses number.
|
||||||
|
.P
|
||||||
|
Although most of the common options work the same way, a few are different in
|
||||||
|
\fBpcre2grep\fP. For example, the \fB--include\fP option's argument is a glob
|
||||||
|
for GNU \fBgrep\fP, but a regular expression for \fBpcre2grep\fP. If both the
|
||||||
|
\fB-c\fP and \fB-l\fP options are given, GNU grep lists only file names,
|
||||||
|
without counts, but \fBpcre2grep\fP gives the counts.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "OPTIONS WITH DATA"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
There are four different ways in which an option with data can be specified.
|
||||||
|
If a short form option is used, the data may follow immediately, or (with one
|
||||||
|
exception) in the next command line item. For example:
|
||||||
|
.sp
|
||||||
|
-f/some/file
|
||||||
|
-f /some/file
|
||||||
|
.sp
|
||||||
|
The exception is the \fB-o\fP option, which may appear with or without data.
|
||||||
|
Because of this, if data is present, it must follow immediately in the same
|
||||||
|
item, for example -o3.
|
||||||
|
.P
|
||||||
|
If a long form option is used, the data may appear in the same command line
|
||||||
|
item, separated by an equals character, or (with two exceptions) it may appear
|
||||||
|
in the next command line item. For example:
|
||||||
|
.sp
|
||||||
|
--file=/some/file
|
||||||
|
--file /some/file
|
||||||
|
.sp
|
||||||
|
Note, however, that if you want to supply a file name beginning with ~ as data
|
||||||
|
in a shell command, and have the shell expand ~ to a home directory, you must
|
||||||
|
separate the file name from the option, because the shell does not treat ~
|
||||||
|
specially unless it is at the start of an item.
|
||||||
|
.P
|
||||||
|
The exceptions to the above are the \fB--colour\fP (or \fB--color\fP) and
|
||||||
|
\fB--only-matching\fP options, for which the data is optional. If one of these
|
||||||
|
options does have data, it must be given in the first form, using an equals
|
||||||
|
character. Otherwise \fBpcre2grep\fP will assume that it has no data.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "MATCHING ERRORS"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
It is possible to supply a regular expression that takes a very long time to
|
||||||
|
fail to match certain lines. Such patterns normally involve nested indefinite
|
||||||
|
repeats, for example: (a+)*\ed when matched against a line of a's with no final
|
||||||
|
digit. The PCRE2 matching function has a resource limit that causes it to abort
|
||||||
|
in these circumstances. If this happens, \fBpcre2grep\fP outputs an error
|
||||||
|
message and the line that caused the problem to the standard error stream. If
|
||||||
|
there are more than 20 such errors, \fBpcre2grep\fP gives up.
|
||||||
|
.P
|
||||||
|
The \fB--match-limit\fP option of \fBpcre2grep\fP can be used to set the overall
|
||||||
|
resource limit; there is a second option called \fB--recursion-limit\fP that
|
||||||
|
sets a limit on the amount of memory (usually stack) that is used (see the
|
||||||
|
discussion of these options above).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH DIAGNOSTICS
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
|
||||||
|
for syntax errors, overlong lines, non-existent or inaccessible files (even if
|
||||||
|
matches were found in other files) or too many matching errors. Using the
|
||||||
|
\fB-s\fP option to suppress error messages about inaccessible files does not
|
||||||
|
affect the return code.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH "SEE ALSO"
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
\fBpcre2pattern\fP(3), \fBpcre2syntax\fP(3), \fBpcre2test\fP(1).
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH AUTHOR
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
.fi
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.SH REVISION
|
||||||
|
.rs
|
||||||
|
.sp
|
||||||
|
.nf
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
|
.fi
|
|
@ -0,0 +1,741 @@
|
||||||
|
PCRE2GREP(1) General Commands Manual PCRE2GREP(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
NAME
|
||||||
|
pcre2grep - a grep with Perl-compatible regular expressions.
|
||||||
|
|
||||||
|
SYNOPSIS
|
||||||
|
pcre2grep [options] [long options] [pattern] [path1 path2 ...]
|
||||||
|
|
||||||
|
|
||||||
|
DESCRIPTION
|
||||||
|
|
||||||
|
pcre2grep searches files for character patterns, in the same way as
|
||||||
|
other grep commands do, but it uses the PCRE2 regular expression
|
||||||
|
library to support patterns that are compatible with the regular
|
||||||
|
expressions of Perl 5. See pcre2syntax(3) for a quick-reference summary
|
||||||
|
of pattern syntax, or pcre2pattern(3) for a full description of the
|
||||||
|
syntax and semantics of the regular expressions that PCRE2 supports.
|
||||||
|
|
||||||
|
Patterns, whether supplied on the command line or in a separate file,
|
||||||
|
are given without delimiters. For example:
|
||||||
|
|
||||||
|
pcre2grep Thursday /etc/motd
|
||||||
|
|
||||||
|
If you attempt to use delimiters (for example, by surrounding a pattern
|
||||||
|
with slashes, as is common in Perl scripts), they are interpreted as
|
||||||
|
part of the pattern. Quotes can of course be used to delimit patterns
|
||||||
|
on the command line because they are interpreted by the shell, and
|
||||||
|
indeed quotes are required if a pattern contains white space or shell
|
||||||
|
metacharacters.
|
||||||
|
|
||||||
|
The first argument that follows any option settings is treated as the
|
||||||
|
single pattern to be matched when neither -e nor -f is present. Con-
|
||||||
|
versely, when one or both of these options are used to specify pat-
|
||||||
|
terns, all arguments are treated as path names. At least one of -e, -f,
|
||||||
|
or an argument pattern must be provided.
|
||||||
|
|
||||||
|
If no files are specified, pcre2grep reads the standard input. The
|
||||||
|
standard input can also be referenced by a name consisting of a single
|
||||||
|
hyphen. For example:
|
||||||
|
|
||||||
|
pcre2grep some-pattern /file1 - /file3
|
||||||
|
|
||||||
|
By default, each line that matches a pattern is copied to the standard
|
||||||
|
output, and if there is more than one file, the file name is output at
|
||||||
|
the start of each line, followed by a colon. However, there are options
|
||||||
|
that can change how pcre2grep behaves. In particular, the -M option
|
||||||
|
makes it possible to search for patterns that span line boundaries.
|
||||||
|
What defines a line boundary is controlled by the -N (--newline)
|
||||||
|
option.
|
||||||
|
|
||||||
|
The amount of memory used for buffering files that are being scanned is
|
||||||
|
controlled by a parameter that can be set by the --buffer-size option.
|
||||||
|
The default value for this parameter is specified when pcre2grep is
|
||||||
|
built, with the built-in default being 20K. A block of memory three
|
||||||
|
times this size is used (to allow for buffering "before" and "after"
|
||||||
|
lines). An error occurs if a line overflows the buffer.
|
||||||
|
|
||||||
|
Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the
|
||||||
|
greater. BUFSIZ is defined in <stdio.h>. When there is more than one
|
||||||
|
pattern (specified by the use of -e and/or -f), each pattern is applied
|
||||||
|
to each line in the order in which they are defined, except that all
|
||||||
|
the -e patterns are tried before the -f patterns.
|
||||||
|
|
||||||
|
By default, as soon as one pattern matches a line, no further patterns
|
||||||
|
are considered. However, if --colour (or --color) is used to colour the
|
||||||
|
matching substrings, or if --only-matching, --file-offsets, or --line-
|
||||||
|
offsets is used to output only the part of the line that matched
|
||||||
|
(either shown literally, or as an offset), scanning resumes immediately
|
||||||
|
following the match, so that further matches on the same line can be
|
||||||
|
found. If there are multiple patterns, they are all tried on the
|
||||||
|
remainder of the line, but patterns that follow the one that matched
|
||||||
|
are not tried on the earlier part of the line.
|
||||||
|
|
||||||
|
This behaviour means that the order in which multiple patterns are
|
||||||
|
specified can affect the output when one of the above options is used.
|
||||||
|
This is no longer the same behaviour as GNU grep, which now manages to
|
||||||
|
display earlier matches for later patterns (as long as there is no
|
||||||
|
overlap).
|
||||||
|
|
||||||
|
Patterns that can match an empty string are accepted, but empty string
|
||||||
|
matches are never recognized. An example is the pattern
|
||||||
|
"(super)?(man)?", in which all components are optional. This pattern
|
||||||
|
finds all occurrences of both "super" and "man"; the output differs
|
||||||
|
from matching with "super|man" when only the matching substrings are
|
||||||
|
being shown.
|
||||||
|
|
||||||
|
If the LC_ALL or LC_CTYPE environment variable is set, pcre2grep uses
|
||||||
|
the value to set a locale when calling the PCRE2 library. The --locale
|
||||||
|
option can be used to override this.
|
||||||
|
|
||||||
|
|
||||||
|
SUPPORT FOR COMPRESSED FILES
|
||||||
|
|
||||||
|
It is possible to compile pcre2grep so that it uses libz or libbz2 to
|
||||||
|
read files whose names end in .gz or .bz2, respectively. You can find
|
||||||
|
out whether your binary has support for one or both of these file types
|
||||||
|
by running it with the --help option. If the appropriate support is not
|
||||||
|
present, files are treated as plain text. The standard input is always
|
||||||
|
so treated.
|
||||||
|
|
||||||
|
|
||||||
|
BINARY FILES
|
||||||
|
|
||||||
|
By default, a file that contains a binary zero byte within the first
|
||||||
|
1024 bytes is identified as a binary file, and is processed specially.
|
||||||
|
(GNU grep also identifies binary files in this manner.) See the
|
||||||
|
--binary-files option for a means of changing the way binary files are
|
||||||
|
handled.
|
||||||
|
|
||||||
|
|
||||||
|
OPTIONS
|
||||||
|
|
||||||
|
The order in which some of the options appear can affect the output.
|
||||||
|
For example, both the -h and -l options affect the printing of file
|
||||||
|
names. Whichever comes later in the command line will be the one that
|
||||||
|
takes effect. Similarly, except where noted below, if an option is
|
||||||
|
given twice, the later setting is used. Numerical values for options
|
||||||
|
may be followed by K or M, to signify multiplication by 1024 or
|
||||||
|
1024*1024 respectively.
|
||||||
|
|
||||||
|
-- This terminates the list of options. It is useful if the next
|
||||||
|
item on the command line starts with a hyphen but is not an
|
||||||
|
option. This allows for the processing of patterns and file-
|
||||||
|
names that start with hyphens.
|
||||||
|
|
||||||
|
-A number, --after-context=number
|
||||||
|
Output number lines of context after each matching line. If
|
||||||
|
filenames and/or line numbers are being output, a hyphen sep-
|
||||||
|
arator is used instead of a colon for the context lines. A
|
||||||
|
line containing "--" is output between each group of lines,
|
||||||
|
unless they are in fact contiguous in the input file. The
|
||||||
|
value of number is expected to be relatively small. However,
|
||||||
|
pcre2grep guarantees to have up to 8K of following text
|
||||||
|
available for context output.
|
||||||
|
|
||||||
|
-a, --text
|
||||||
|
Treat binary files as text. This is equivalent to --binary-
|
||||||
|
files=text.
|
||||||
|
|
||||||
|
-B number, --before-context=number
|
||||||
|
Output number lines of context before each matching line. If
|
||||||
|
filenames and/or line numbers are being output, a hyphen sep-
|
||||||
|
arator is used instead of a colon for the context lines. A
|
||||||
|
line containing "--" is output between each group of lines,
|
||||||
|
unless they are in fact contiguous in the input file. The
|
||||||
|
value of number is expected to be relatively small. However,
|
||||||
|
pcre2grep guarantees to have up to 8K of preceding text
|
||||||
|
available for context output.
|
||||||
|
|
||||||
|
--binary-files=word
|
||||||
|
Specify how binary files are to be processed. If the word is
|
||||||
|
"binary" (the default), pattern matching is performed on
|
||||||
|
binary files, but the only output is "Binary file <name>
|
||||||
|
matches" when a match succeeds. If the word is "text", which
|
||||||
|
is equivalent to the -a or --text option, binary files are
|
||||||
|
processed in the same way as any other file. In this case,
|
||||||
|
when a match succeeds, the output may be binary garbage,
|
||||||
|
which can have nasty effects if sent to a terminal. If the
|
||||||
|
word is "without-match", which is equivalent to the -I
|
||||||
|
option, binary files are not processed at all; they are
|
||||||
|
assumed not to be of interest.
|
||||||
|
|
||||||
|
--buffer-size=number
|
||||||
|
Set the parameter that controls how much memory is used for
|
||||||
|
buffering files that are being scanned.
|
||||||
|
|
||||||
|
-C number, --context=number
|
||||||
|
Output number lines of context both before and after each
|
||||||
|
matching line. This is equivalent to setting both -A and -B
|
||||||
|
to the same value.
|
||||||
|
|
||||||
|
-c, --count
|
||||||
|
Do not output individual lines from the files that are being
|
||||||
|
scanned; instead output the number of lines that would other-
|
||||||
|
wise have been shown. If no lines are selected, the number
|
||||||
|
zero is output. If several files are being scanned, a
|
||||||
|
count is output for each of them. However, if the --files-
|
||||||
|
with-matches option is also used, only those files whose
|
||||||
|
counts are greater than zero are listed. When -c is used, the
|
||||||
|
-A, -B, and -C options are ignored.
|
||||||
|
|
||||||
|
--colour, --color
|
||||||
|
If this option is given without any data, it is equivalent to
|
||||||
|
"--colour=auto". If data is required, it must be given in
|
||||||
|
the same shell item, separated by an equals sign.
|
||||||
|
|
||||||
|
--colour=value, --color=value
|
||||||
|
This option specifies under what circumstances the parts of a
|
||||||
|
line that matched a pattern should be coloured in the output.
|
||||||
|
By default, the output is not coloured. The value (which is
|
||||||
|
optional, see above) may be "never", "always", or "auto". In
|
||||||
|
the latter case, colouring happens only if the standard out-
|
||||||
|
put is connected to a terminal. More resources are used when
|
||||||
|
colouring is enabled, because pcre2grep has to search for all
|
||||||
|
possible matches in a line, not just one, in order to colour
|
||||||
|
them all.
|
||||||
|
|
||||||
|
The colour that is used can be specified by setting the envi-
|
||||||
|
ronment variable PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The
|
||||||
|
value of this variable should be a string of two numbers,
|
||||||
|
separated by a semicolon. They are copied directly into the
|
||||||
|
control string for setting colour on a terminal, so it is
|
||||||
|
your responsibility to ensure that they make sense. If nei-
|
||||||
|
ther of the environment variables is set, the default is
|
||||||
|
"1;31", which gives red.
|
||||||
|
|
||||||
|
-D action, --devices=action
|
||||||
|
If an input path is not a regular file or a directory,
|
||||||
|
"action" specifies how it is to be processed. Valid values
|
||||||
|
are "read" (the default) or "skip" (silently skip the path).
|
||||||
|
|
||||||
|
-d action, --directories=action
|
||||||
|
If an input path is a directory, "action" specifies how it is
|
||||||
|
to be processed. Valid values are "read" (the default in
|
||||||
|
non-Windows environments, for compatibility with GNU grep),
|
||||||
|
"recurse" (equivalent to the -r option), or "skip" (silently
|
||||||
|
skip the path, the default in Windows environments). In the
|
||||||
|
"read" case, directories are read as if they were ordinary
|
||||||
|
files. In some operating systems the effect of reading a
|
||||||
|
directory like this is an immediate end-of-file; in others it
|
||||||
|
may provoke an error.
|
||||||
|
|
||||||
|
-e pattern, --regex=pattern, --regexp=pattern
|
||||||
|
Specify a pattern to be matched. This option can be used mul-
|
||||||
|
tiple times in order to specify several patterns. It can also
|
||||||
|
be used as a way of specifying a single pattern that starts
|
||||||
|
with a hyphen. When -e is used, no argument pattern is taken
|
||||||
|
from the command line; all arguments are treated as file
|
||||||
|
names. There is no limit to the number of patterns. They are
|
||||||
|
applied to each line in the order in which they are defined
|
||||||
|
until one matches.
|
||||||
|
|
||||||
|
If -f is used with -e, the command line patterns are matched
|
||||||
|
first, followed by the patterns from the file(s), independent
|
||||||
|
of the order in which these options are specified. Note that
|
||||||
|
multiple use of -e is not the same as a single pattern with
|
||||||
|
alternatives. For example, X|Y finds the first character in a
|
||||||
|
line that is X or Y, whereas if the two patterns are given
|
||||||
|
separately, with X first, pcre2grep finds X if it is present,
|
||||||
|
even if it follows Y in the line. It finds Y only if there is
|
||||||
|
no X in the line. This matters only if you are using -o or
|
||||||
|
--colo(u)r to show the part(s) of the line that matched.
|
||||||
|
|
||||||
|
--exclude=pattern
|
||||||
|
Files (but not directories) whose names match the pattern are
|
||||||
|
skipped without being processed. This applies to all files,
|
||||||
|
whether listed on the command line, obtained from --file-
|
||||||
|
list, or by scanning a directory. The pattern is a PCRE2 reg-
|
||||||
|
ular expression, and is matched against the final component
|
||||||
|
of the file name, not the entire path. The -F, -w, and -x
|
||||||
|
options do not apply to this pattern. The option may be given
|
||||||
|
any number of times in order to specify multiple patterns. If
|
||||||
|
a file name matches both an --include and an --exclude pat-
|
||||||
|
tern, it is excluded. There is no short form for this option.
|
||||||
|
|
||||||
|
--exclude-from=filename
|
||||||
|
Treat each non-empty line of the file as the data for an
|
||||||
|
--exclude option. What constitutes a newline when reading the
|
||||||
|
file is the operating system's default. The --newline option
|
||||||
|
has no effect on this option. This option may be given more
|
||||||
|
than once in order to specify a number of files to read.
|
||||||
|
|
||||||
|
--exclude-dir=pattern
|
||||||
|
Directories whose names match the pattern are skipped without
|
||||||
|
being processed, whatever the setting of the --recursive
|
||||||
|
option. This applies to all directories, whether listed on
|
||||||
|
the command line, obtained from --file-list, or by scanning a
|
||||||
|
parent directory. The pattern is a PCRE2 regular expression,
|
||||||
|
and is matched against the final component of the directory
|
||||||
|
name, not the entire path. The -F, -w, and -x options do not
|
||||||
|
apply to this pattern. The option may be given any number of
|
||||||
|
times in order to specify more than one pattern. If a direc-
|
||||||
|
tory matches both --include-dir and --exclude-dir, it is
|
||||||
|
excluded. There is no short form for this option.
|
||||||
|
|
||||||
|
-F, --fixed-strings
|
||||||
|
Interpret each data-matching pattern as a list of fixed
|
||||||
|
strings, separated by newlines, instead of as a regular
|
||||||
|
expression. What constitutes a newline for this purpose is
|
||||||
|
controlled by the --newline option. The -w (match as a word)
|
||||||
|
and -x (match whole line) options can be used with -F. They
|
||||||
|
apply to each of the fixed strings. A line is selected if any
|
||||||
|
of the fixed strings are found in it (subject to -w or -x, if
|
||||||
|
present). This option applies only to the patterns that are
|
||||||
|
matched against the contents of files; it does not apply to
|
||||||
|
patterns specified by any of the --include or --exclude
|
||||||
|
options.
|
||||||
|
|
||||||
|
-f filename, --file=filename
|
||||||
|
Read patterns from the file, one per line, and match them
|
||||||
|
against each line of input. What constitutes a newline when
|
||||||
|
reading the file is the operating system's default. The
|
||||||
|
--newline option has no effect on this option. Trailing white
|
||||||
|
space is removed from each line, and blank lines are ignored.
|
||||||
|
An empty file contains no patterns and therefore matches
|
||||||
|
nothing. See also the comments about multiple patterns versus
|
||||||
|
a single pattern with alternatives in the description of -e
|
||||||
|
above.
|
||||||
|
|
||||||
|
If this option is given more than once, all the specified
|
||||||
|
files are read. A data line is output if any of the patterns
|
||||||
|
match it. A filename can be given as "-" to refer to the
|
||||||
|
standard input. When -f is used, patterns specified on the
|
||||||
|
command line using -e may also be present; they are tested
|
||||||
|
before the file's patterns. However, no other pattern is
|
||||||
|
taken from the command line; all arguments are treated as the
|
||||||
|
names of paths to be searched.
|
||||||
|
|
||||||
|
--file-list=filename
|
||||||
|
Read a list of files and/or directories that are to be
|
||||||
|
scanned from the given file, one per line. Trailing white
|
||||||
|
space is removed from each line, and blank lines are ignored.
|
||||||
|
These paths are processed before any that are listed on the
|
||||||
|
command line. The filename can be given as "-" to refer to
|
||||||
|
the standard input. If --file and --file-list are both spec-
|
||||||
|
ified as "-", patterns are read first. This is useful only
|
||||||
|
when the standard input is a terminal, from which further
|
||||||
|
lines (the list of files) can be read after an end-of-file
|
||||||
|
indication. If this option is given more than once, all the
|
||||||
|
specified files are read.
|
||||||
|
|
||||||
|
--file-offsets
|
||||||
|
Instead of showing lines or parts of lines that match, show
|
||||||
|
each match as an offset from the start of the file and a
|
||||||
|
length, separated by a comma. In this mode, no context is
|
||||||
|
shown. That is, the -A, -B, and -C options are ignored. If
|
||||||
|
there is more than one match in a line, each of them is shown
|
||||||
|
separately. This option is mutually exclusive with --line-
|
||||||
|
offsets and --only-matching.
|
||||||
|
|
||||||
|
-H, --with-filename
|
||||||
|
Force the inclusion of the filename at the start of output
|
||||||
|
lines when searching a single file. By default, the filename
|
||||||
|
is not shown in this case. For matching lines, the filename
|
||||||
|
is followed by a colon; for context lines, a hyphen separator
|
||||||
|
is used. If a line number is also being output, it follows
|
||||||
|
the file name.
|
||||||
|
|
||||||
|
-h, --no-filename
|
||||||
|
Suppress the output of filenames when searching multiple files.
|
||||||
|
By default, filenames are shown when multiple files are
|
||||||
|
searched. For matching lines, the filename is followed by a
|
||||||
|
colon; for context lines, a hyphen separator is used. If a
|
||||||
|
line number is also being output, it follows the file name.
|
||||||
|
|
||||||
|
--help Output a help message, giving brief details of the command
|
||||||
|
options and file type support, and then exit. Anything else
|
||||||
|
on the command line is ignored.
|
||||||
|
|
||||||
|
-I Treat binary files as never matching. This is equivalent to
|
||||||
|
--binary-files=without-match.
|
||||||
|
|
||||||
|
-i, --ignore-case
|
||||||
|
Ignore upper/lower case distinctions during comparisons.
|
||||||
|
|
||||||
|
--include=pattern
|
||||||
|
If any --include patterns are specified, the only files that
|
||||||
|
are processed are those that match one of the patterns (and
|
||||||
|
do not match an --exclude pattern). This option does not
|
||||||
|
affect directories, but it applies to all files, whether
|
||||||
|
listed on the command line, obtained from --file-list, or by
|
||||||
|
scanning a directory. The pattern is a PCRE2 regular expres-
|
||||||
|
sion, and is matched against the final component of the file
|
||||||
|
name, not the entire path. The -F, -w, and -x options do not
|
||||||
|
apply to this pattern. The option may be given any number of
|
||||||
|
times. If a file name matches both an --include and an
|
||||||
|
--exclude pattern, it is excluded. There is no short form
|
||||||
|
for this option.
|
||||||
|
|
||||||
|
--include-from=filename
|
||||||
|
Treat each non-empty line of the file as the data for an
|
||||||
|
--include option. What constitutes a newline for this purpose
|
||||||
|
is the operating system's default. The --newline option has
|
||||||
|
no effect on this option. This option may be given any number
|
||||||
|
of times; all the files are read.
|
||||||
|
|
||||||
|
--include-dir=pattern
|
||||||
|
If any --include-dir patterns are specified, the only direc-
|
||||||
|
tories that are processed are those that match one of the
|
||||||
|
patterns (and do not match an --exclude-dir pattern). This
|
||||||
|
applies to all directories, whether listed on the command
|
||||||
|
line, obtained from --file-list, or by scanning a parent
|
||||||
|
directory. The pattern is a PCRE2 regular expression, and is
|
||||||
|
matched against the final component of the directory name,
|
||||||
|
not the entire path. The -F, -w, and -x options do not apply
|
||||||
|
to this pattern. The option may be given any number of times.
|
||||||
|
If a directory matches both --include-dir and --exclude-dir,
|
||||||
|
it is excluded. There is no short form for this option.
|
||||||
|
|
||||||
|
-L, --files-without-match
|
||||||
|
Instead of outputting lines from the files, just output the
|
||||||
|
names of the files that do not contain any lines that would
|
||||||
|
have been output. Each file name is output once, on a sepa-
|
||||||
|
rate line.
|
||||||
|
|
||||||
|
-l, --files-with-matches
|
||||||
|
Instead of outputting lines from the files, just output the
|
||||||
|
names of the files containing lines that would have been out-
|
||||||
|
put. Each file name is output once, on a separate line.
|
||||||
|
Searching normally stops as soon as a matching line is found
|
||||||
|
in a file. However, if the -c (count) option is also used,
|
||||||
|
matching continues in order to obtain the correct count, and
|
||||||
|
those files that have at least one match are listed along
|
||||||
|
with their counts. Using this option with -c is a way of sup-
|
||||||
|
pressing the listing of files with no matches.
|
||||||
|
|
||||||
|
--label=name
|
||||||
|
This option supplies a name to be used for the standard input
|
||||||
|
when file names are being output. If not supplied, "(standard
|
||||||
|
input)" is used. There is no short form for this option.
|
||||||
|
|
||||||
|
--line-buffered
|
||||||
|
When this option is given, input is read and processed line
|
||||||
|
by line, and the output is flushed after each write. By
|
||||||
|
default, input is read in large chunks, unless pcre2grep can
|
||||||
|
determine that it is reading from a terminal (which is cur-
|
||||||
|
rently possible only in Unix-like environments). Output to
|
||||||
|
a terminal is normally automatically flushed by the operating
|
||||||
|
system. This option can be useful when the input or output is
|
||||||
|
attached to a pipe and you do not want pcre2grep to buffer up
|
||||||
|
large amounts of data. However, its use will affect perfor-
|
||||||
|
mance, and the -M (multiline) option ceases to work.
|
||||||
|
|
||||||
|
--line-offsets
|
||||||
|
Instead of showing lines or parts of lines that match, show
|
||||||
|
each match as a line number, the offset from the start of the
|
||||||
|
line, and a length. The line number is terminated by a colon
|
||||||
|
(as usual; see the -n option), and the offset and length are
|
||||||
|
separated by a comma. In this mode, no context is shown.
|
||||||
|
That is, the -A, -B, and -C options are ignored. If there is
|
||||||
|
more than one match in a line, each of them is shown sepa-
|
||||||
|
rately. This option is mutually exclusive with --file-offsets
|
||||||
|
and --only-matching.
|
||||||
|
|
||||||
|
--locale=locale-name
|
||||||
|
This option specifies a locale to be used for pattern match-
|
||||||
|
ing. It overrides the value in the LC_ALL or LC_CTYPE envi-
|
||||||
|
ronment variables. If no locale is specified, the PCRE2
|
||||||
|
library's default (usually the "C" locale) is used. There is
|
||||||
|
no short form for this option.
|
||||||
|
|
||||||
|
--match-limit=number
|
||||||
|
Processing some regular expression patterns can require a
|
||||||
|
very large amount of memory, leading in some cases to a pro-
|
||||||
|
gram crash if not enough is available. Other patterns may
|
||||||
|
take a very long time to search for all possible matching
|
||||||
|
strings. The pcre2_match() function that is called by
|
||||||
|
pcre2grep to do the matching has two parameters that can
|
||||||
|
limit the resources that it uses.
|
||||||
|
|
||||||
|
The --match-limit option provides a means of limiting
|
||||||
|
resource usage when processing patterns that are not going to
|
||||||
|
match, but which have a very large number of possibilities in
|
||||||
|
their search trees. The classic example is a pattern that
|
||||||
|
uses nested unlimited repeats. Internally, PCRE2 uses a func-
|
||||||
|
tion called match() which it calls repeatedly (sometimes
|
||||||
|
recursively). The limit set by --match-limit is imposed on
|
||||||
|
the number of times this function is called during a match,
|
||||||
|
which has the effect of limiting the amount of backtracking
|
||||||
|
that can take place.
|
||||||
|
|
||||||
|
The --recursion-limit option is similar to --match-limit, but
|
||||||
|
instead of limiting the total number of times that match() is
|
||||||
|
called, it limits the depth of recursive calls, which in turn
|
||||||
|
limits the amount of memory that can be used. The recursion
|
||||||
|
depth is a smaller number than the total number of calls,
|
||||||
|
because not all calls to match() are recursive. This limit is
|
||||||
|
of use only if it is set smaller than --match-limit.
|
||||||
|
|
||||||
|
There are no short forms for these options. The default set-
|
||||||
|
tings are specified when the PCRE2 library is compiled, with
|
||||||
|
the built-in default being 10 million.
|
||||||
|
|
||||||
|
-M, --multiline
|
||||||
|
Allow patterns to match more than one line. When this option
|
||||||
|
is given, patterns may usefully contain literal newline char-
|
||||||
|
acters and internal occurrences of ^ and $ characters. The
|
||||||
|
output for a successful match may consist of more than one
|
||||||
|
line, the last of which is the one in which the match ended.
|
||||||
|
If the matched string ends with a newline sequence the output
|
||||||
|
ends at the end of that line.
|
||||||
|
|
||||||
|
When this option is set, the PCRE2 library is called in "mul-
|
||||||
|
tiline" mode. There is a limit to the number of lines that
|
||||||
|
can be matched, imposed by the way that pcre2grep buffers the
|
||||||
|
input file as it scans it. However, pcre2grep ensures that at
|
||||||
|
least 8K characters or the rest of the document (whichever is
|
||||||
|
the shorter) are available for forward matching, and simi-
|
||||||
|
larly the previous 8K characters (or all the previous charac-
|
||||||
|
ters, if fewer than 8K) are guaranteed to be available for
|
||||||
|
lookbehind assertions. This option does not work when input
|
||||||
|
is read line by line (see --line-buffered).
|
||||||
|
|
||||||
|
-N newline-type, --newline=newline-type
|
||||||
|
The PCRE2 library supports five different conventions for
|
||||||
|
indicating the ends of lines. They are the single-character
|
||||||
|
sequences CR (carriage return) and LF (linefeed), the two-
|
||||||
|
character sequence CRLF, an "anycrlf" convention, which rec-
|
||||||
|
ognizes any of the preceding three types, and an "any" con-
|
||||||
|
vention, in which any Unicode line ending sequence is assumed
|
||||||
|
to end a line. The Unicode sequences are the three just men-
|
||||||
|
tioned, plus VT (vertical tab, U+000B), FF (form feed,
|
||||||
|
U+000C), NEL (next line, U+0085), LS (line separator,
|
||||||
|
U+2028), and PS (paragraph separator, U+2029).
|
||||||
|
|
||||||
|
When the PCRE2 library is built, a default line-ending
|
||||||
|
sequence is specified. This is normally the standard
|
||||||
|
sequence for the operating system. Unless otherwise specified
|
||||||
|
by this option, pcre2grep uses the library's default. The
|
||||||
|
possible values for this option are CR, LF, CRLF, ANYCRLF, or
|
||||||
|
ANY. This makes it possible to use pcre2grep to scan files
|
||||||
|
that have come from other environments without having to mod-
|
||||||
|
ify their line endings. If the data that is being scanned
|
||||||
|
does not agree with the convention set by this option,
|
||||||
|
pcre2grep may behave in strange ways. Note that this option
|
||||||
|
does not apply to files specified by the -f, --exclude-from,
|
||||||
|
or --include-from options, which are expected to use the
|
||||||
|
operating system's standard newline sequence.
|
||||||
|
|
||||||
|
-n, --line-number
|
||||||
|
Precede each output line by its line number in the file, fol-
|
||||||
|
lowed by a colon for matching lines or a hyphen for context
|
||||||
|
lines. If the filename is also being output, it precedes the
|
||||||
|
line number. This option is forced if --line-offsets is used.
|
||||||
|
|
||||||
|
--no-jit If the PCRE2 library is built with support for just-in-time
|
||||||
|
compiling (which speeds up matching), pcre2grep automatically
|
||||||
|
makes use of this, unless it was explicitly disabled at build
|
||||||
|
time. This option can be used to disable the use of JIT at
|
||||||
|
run time. It is provided for testing and working round prob-
|
||||||
|
lems. It should never be needed in normal use.
|
||||||
|
|
||||||
|
-o, --only-matching
|
||||||
|
Show only the part of the line that matched a pattern instead
|
||||||
|
of the whole line. In this mode, no context is shown. That
|
||||||
|
is, the -A, -B, and -C options are ignored. If there is more
|
||||||
|
than one match in a line, each of them is shown separately.
|
||||||
|
If -o is combined with -v (invert the sense of the match to
|
||||||
|
find non-matching lines), no output is generated, but the
|
||||||
|
return code is set appropriately. If the matched portion of
|
||||||
|
the line is empty, nothing is output unless the file name or
|
||||||
|
line number are being printed, in which case they are shown
|
||||||
|
on an otherwise empty line. This option is mutually exclusive
|
||||||
|
with --file-offsets and --line-offsets.
|
||||||
|
|
||||||
|
-onumber, --only-matching=number
|
||||||
|
Show only the part of the line that matched the capturing
|
||||||
|
parentheses of the given number. Up to 32 capturing parenthe-
|
||||||
|
ses are supported, and -o0 is equivalent to -o without a num-
|
||||||
|
ber. Because these options can be given without an argument
|
||||||
|
(see above), if an argument is present, it must be given in
|
||||||
|
the same shell item, for example, -o3 or --only-matching=2.
|
||||||
|
The comments given for the non-argument case above also apply
|
||||||
|
to this case. If the specified capturing parentheses do not
|
||||||
|
exist in the pattern, or were not set in the match, nothing
|
||||||
|
is output unless the file name or line number are being
|
||||||
|
printed.
|
||||||
|
|
||||||
|
If this option is given multiple times, multiple substrings
|
||||||
|
are output, in the order the options are given. For example,
|
||||||
|
-o3 -o1 -o3 causes the substrings matched by capturing paren-
|
||||||
|
theses 3 and 1 and then 3 again to be output. By default,
|
||||||
|
there is no separator (but see the next option).
|
||||||
|
|
||||||
|
--om-separator=text
|
||||||
|
Specify a separating string for multiple occurrences of -o.
|
||||||
|
The default is an empty string. Separating strings are never
|
||||||
|
coloured.
|
||||||
|
|
||||||
|
-q, --quiet
|
||||||
|
Work quietly, that is, display nothing except error messages.
|
||||||
|
The exit status indicates whether or not any matches were
|
||||||
|
found.
|
||||||
|
|
||||||
|
-r, --recursive
|
||||||
|
If any given path is a directory, recursively scan the files
|
||||||
|
it contains, taking note of any --include and --exclude set-
|
||||||
|
tings. By default, a directory is read as a normal file; in
|
||||||
|
some operating systems this gives an immediate end-of-file.
|
||||||
|
This option is a shorthand for setting the -d option to
|
||||||
|
"recurse".
|
||||||
|
|
||||||
|
--recursion-limit=number
|
||||||
|
See --match-limit above.
|
||||||
|
|
||||||
|
-s, --no-messages
|
||||||
|
Suppress error messages about non-existent or unreadable
|
||||||
|
files. Such files are quietly skipped. However, the return
|
||||||
|
code is still 2, even if matches were found in other files.
|
||||||
|
|
||||||
|
-u, --utf-8
|
||||||
|
Operate in UTF-8 mode. This option is available only if PCRE2
|
||||||
|
has been compiled with UTF-8 support. All patterns (including
|
||||||
|
those for any --exclude and --include options) and all sub-
|
||||||
|
ject lines that are scanned must be valid strings of UTF-8
|
||||||
|
characters.
|
||||||
|
|
||||||
|
-V, --version
|
||||||
|
Write the version numbers of pcre2grep and the PCRE2 library
|
||||||
|
to the standard output and then exit. Anything else on the
|
||||||
|
command line is ignored.
|
||||||
|
|
||||||
|
-v, --invert-match
|
||||||
|
Invert the sense of the match, so that lines which do not
|
||||||
|
match any of the patterns are the ones that are found.
|
||||||
|
|
||||||
|
-w, --word-regex, --word-regexp
|
||||||
|
Force the patterns to match only whole words. This is equiva-
|
||||||
|
lent to having \b at the start and end of the pattern. This
|
||||||
|
option applies only to the patterns that are matched against
|
||||||
|
the contents of files; it does not apply to patterns speci-
|
||||||
|
fied by any of the --include or --exclude options.
|
||||||
|
|
||||||
|
-x, --line-regex, --line-regexp
|
||||||
|
Force the patterns to be anchored (each must start matching
|
||||||
|
at the beginning of a line) and in addition, require them to
|
||||||
|
match entire lines. This is equivalent to having ^ and $
|
||||||
|
characters at the start and end of each alternative branch in
|
||||||
|
every pattern. This option applies only to the patterns that
|
||||||
|
are matched against the contents of files; it does not apply
|
||||||
|
to patterns specified by any of the --include or --exclude
|
||||||
|
options.
|
||||||
|
|
||||||
|
|
||||||
|
ENVIRONMENT VARIABLES
|
||||||
|
|
||||||
|
The environment variables LC_ALL and LC_CTYPE are examined, in that
|
||||||
|
order, for a locale. The first one that is set is used. This can be
|
||||||
|
overridden by the --locale option. If no locale is set, the PCRE2
|
||||||
|
library's default (usually the "C" locale) is used.
|
||||||
|
|
||||||
|
|
||||||
|
NEWLINES
|
||||||
|
|
||||||
|
The -N (--newline) option allows pcre2grep to scan files with different
|
||||||
|
newline conventions from the default. Any parts of the input files that
|
||||||
|
are written to the standard output are copied identically, with what-
|
||||||
|
ever newline sequences they have in the input. However, the setting of
|
||||||
|
this option does not affect the interpretation of files specified by
|
||||||
|
the -f, --exclude-from, or --include-from options, which are assumed to
|
||||||
|
use the operating system's standard newline sequence, nor does it
|
||||||
|
affect the way in which pcre2grep writes informational messages to the
|
||||||
|
standard error and output streams. For these it uses the string "\n" to
|
||||||
|
indicate newlines, relying on the C I/O library to convert this to an
|
||||||
|
appropriate sequence.
|
||||||
|
|
||||||
|
|
||||||
|
OPTIONS COMPATIBILITY
|
||||||
|
|
||||||
|
Many of the short and long forms of pcre2grep's options are the same as
|
||||||
|
in the GNU grep program. Any long option of the form --xxx-regexp (GNU
|
||||||
|
terminology) is also available as --xxx-regex (PCRE2 terminology). How-
|
||||||
|
ever, the --file-list, --file-offsets, --include-dir, --line-offsets,
|
||||||
|
--locale, --match-limit, -M, --multiline, -N, --newline, --om-separa-
|
||||||
|
tor, --recursion-limit, -u, and --utf-8 options are specific to
|
||||||
|
pcre2grep, as is the use of the --only-matching option with a capturing
|
||||||
|
parentheses number.
|
||||||
|
|
||||||
|
Although most of the common options work the same way, a few are dif-
|
||||||
|
ferent in pcre2grep. For example, the --include option's argument is a
|
||||||
|
glob for GNU grep, but a regular expression for pcre2grep. If both the
|
||||||
|
-c and -l options are given, GNU grep lists only file names, without
|
||||||
|
counts, but pcre2grep gives the counts.
|
||||||
|
|
||||||
|
|
||||||
|
OPTIONS WITH DATA
|
||||||
|
|
||||||
|
There are four different ways in which an option with data can be spec-
|
||||||
|
ified. If a short form option is used, the data may follow immedi-
|
||||||
|
ately, or (with one exception) in the next command line item. For exam-
|
||||||
|
ple:
|
||||||
|
|
||||||
|
-f/some/file
|
||||||
|
-f /some/file
|
||||||
|
|
||||||
|
The exception is the -o option, which may appear with or without data.
|
||||||
|
Because of this, if data is present, it must follow immediately in the
|
||||||
|
same item, for example -o3.
|
||||||
|
|
||||||
|
If a long form option is used, the data may appear in the same command
|
||||||
|
line item, separated by an equals character, or (with two exceptions)
|
||||||
|
it may appear in the next command line item. For example:
|
||||||
|
|
||||||
|
--file=/some/file
|
||||||
|
--file /some/file
|
||||||
|
|
||||||
|
Note, however, that if you want to supply a file name beginning with ~
|
||||||
|
as data in a shell command, and have the shell expand ~ to a home
|
||||||
|
directory, you must separate the file name from the option, because the
|
||||||
|
shell does not treat ~ specially unless it is at the start of an item.
|
||||||
|
|
||||||
|
The exceptions to the above are the --colour (or --color) and --only-
|
||||||
|
matching options, for which the data is optional. If one of these
|
||||||
|
options does have data, it must be given in the first form, using an
|
||||||
|
equals character. Otherwise pcre2grep will assume that it has no data.
|
||||||
|
|
||||||
|
|
||||||
|
MATCHING ERRORS
|
||||||
|
|
||||||
|
It is possible to supply a regular expression that takes a very long
|
||||||
|
time to fail to match certain lines. Such patterns normally involve
|
||||||
|
nested indefinite repeats, for example: (a+)*\d when matched against a
|
||||||
|
line of a's with no final digit. The PCRE2 matching function has a
|
||||||
|
resource limit that causes it to abort in these circumstances. If this
|
||||||
|
happens, pcre2grep outputs an error message and the line that caused
|
||||||
|
the problem to the standard error stream. If there are more than 20
|
||||||
|
such errors, pcre2grep gives up.
|
||||||
|
|
||||||
|
The --match-limit option of pcre2grep can be used to set the overall
|
||||||
|
resource limit; there is a second option called --recursion-limit that
|
||||||
|
sets a limit on the amount of memory (usually stack) that is used (see
|
||||||
|
the discussion of these options above).
|
||||||
|
|
||||||
|
|
||||||
|
DIAGNOSTICS
|
||||||
|
|
||||||
|
Exit status is 0 if any matches were found, 1 if no matches were found,
|
||||||
|
and 2 for syntax errors, overlong lines, non-existent or inaccessible
|
||||||
|
files (even if matches were found in other files) or too many matching
|
||||||
|
errors. Using the -s option to suppress error messages about inaccessi-
|
||||||
|
ble files does not affect the return code.
|
||||||
|
|
||||||
|
|
||||||
|
SEE ALSO
|
||||||
|
|
||||||
|
pcre2pattern(3), pcre2syntax(3), pcre2test(1).
|
||||||
|
|
||||||
|
|
||||||
|
AUTHOR
|
||||||
|
|
||||||
|
Philip Hazel
|
||||||
|
University Computing Service
|
||||||
|
Cambridge CB2 3QH, England.
|
||||||
|
|
||||||
|
|
||||||
|
REVISION
|
||||||
|
|
||||||
|
Last updated: 28 September 2014
|
||||||
|
Copyright (c) 1997-2014 University of Cambridge.
|
|
@ -2997,7 +2997,7 @@ if (colour_option != NULL && strcmp(colour_option, "never") != 0)
|
||||||
}
|
}
|
||||||
if (do_colour)
|
if (do_colour)
|
||||||
{
|
{
|
||||||
char *cs = getenv("pcre2grep_COLOUR");
|
char *cs = getenv("PCRE2GREP_COLOUR");
|
||||||
if (cs == NULL) cs = getenv("PCRE2GREP_COLOR");
|
if (cs == NULL) cs = getenv("PCRE2GREP_COLOR");
|
||||||
if (cs != NULL) colour_string = cs;
|
if (cs != NULL) colour_string = cs;
|
||||||
}
|
}
|
||||||
|
|
|
@ -5094,9 +5094,9 @@ printf(" 32-bit support\n");
|
||||||
|
|
||||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &rc, sizeof(rc));
|
(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &rc, sizeof(rc));
|
||||||
if (rc != 0)
|
if (rc != 0)
|
||||||
printf(" UTF support (Unicode version %s)\n", uversion);
|
printf(" UTF and UCP support (Unicode version %s)\n", uversion);
|
||||||
else
|
else
|
||||||
printf(" No UTF support\n");
|
printf(" No UTF or UCP support\n");
|
||||||
(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc, sizeof(rc));
|
(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc, sizeof(rc));
|
||||||
if (rc != 0)
|
if (rc != 0)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue