diff --git a/Makefile.am b/Makefile.am
index 7df1e86..366b88c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -6,29 +6,31 @@ AM_CPPFLAGS = -I$(builddir)/src -I$(srcdir)/src
## Specify the documentation files that are distributed.
-# FIXME
dist_doc_DATA = \
AUTHORS \
COPYING \
ChangeLog \
LICENCE \
- README
-
-# doc/pcre.txt \
-# doc/pcre-config.txt \
-# doc/pcregrep.txt \
-# doc/pcretest.txt \
-# NEWS
+ NEWS \
+ README \
+ doc/pcre2.txt \
+ doc/pcre2-config.txt \
+ doc/pcre2grep.txt \
+ doc/pcre2test.txt
# FIXME
-#dist_html_DATA = \
-# doc/html/NON-AUTOTOOLS-BUILD.txt \
-# doc/html/README.txt \
-# doc/html/index.html \
-# doc/html/pcre-config.html \
+dist_html_DATA = \
+ doc/html/NON-AUTOTOOLS-BUILD.txt \
+ doc/html/README.txt \
+ doc/html/index.html \
+ doc/html/pcre2-config.html \
+ doc/html/pcre2api.html \
+ doc/html/pcre2callout.html \
+ doc/html/pcre2demo.html \
+ doc/html/pcre2test.html \
+ doc/html/pcre2unicode.html
+
# doc/html/pcre.html \
-# doc/html/pcre16.html \
-# doc/html/pcre32.html \
# doc/html/pcre_assign_jit_stack.html \
# doc/html/pcre_compile.html \
# doc/html/pcre_compile2.html \
@@ -56,11 +58,8 @@ dist_doc_DATA = \
# doc/html/pcre_utf16_to_host_byte_order.html \
# doc/html/pcre_utf32_to_host_byte_order.html \
# doc/html/pcre_version.html \
-# doc/html/pcreapi.html \
# doc/html/pcrebuild.html \
-# doc/html/pcrecallout.html \
# doc/html/pcrecompat.html \
-# doc/html/pcredemo.html \
# doc/html/pcregrep.html \
# doc/html/pcrejit.html \
# doc/html/pcrelimits.html \
@@ -72,18 +71,16 @@ dist_doc_DATA = \
# doc/html/pcreprecompile.html \
# doc/html/pcresample.html \
# doc/html/pcrestack.html \
-# doc/html/pcresyntax.html \
-# doc/html/pcretest.html \
-# doc/html/pcreunicode.html
+# doc/html/pcresyntax.html
# FIXME
dist_man_MANS = \
+ doc/pcre2-config.1 \
doc/pcre2api.3 \
doc/pcre2callout.3 \
doc/pcre2test.1 \
doc/pcre2unicode.3
-# doc/pcre2-config.1 \
# doc/pcre2.3 \
# doc/pcre2-16.3 \
# doc/pcre2-32.3 \
@@ -168,7 +165,6 @@ EXTRA_DIST += \
EXTRA_DIST += \
doc/perltest.txt \
- NON-UNIX-USE \
NON-AUTOTOOLS-BUILD \
HACKING
@@ -719,9 +715,9 @@ else
coverage:
@echo "Configuring with --enable-coverage is required to generate code coverage report."
-DISTCLEANFILES += src/*.gcda src/*.gcno
+DISTCLEANFILES += src/*.gcda src/*.gcno
-distclean-local:
+distclean-local:
rm -rf $(PACKAGE)-$(VERSION)-coverage*
endif # WITH_GCOV
diff --git a/PrepareRelease b/PrepareRelease
index c92d7f9..2b58a47 100755
--- a/PrepareRelease
+++ b/PrepareRelease
@@ -83,8 +83,7 @@ for file in pcre2api pcre2callout pcre2unicode ; do
done
# The three commands
-for file in pcre2test ; do
-# for file in pcre2test pcre2grep pcre-config ; do
+for file in pcre2test pcre2grep pcre2-config ; do
echo Making $file.txt
nroff -c -man $file.1 >$file.rawtxt
perl ../CleanTxt <$file.rawtxt >$file.txt
@@ -133,7 +132,7 @@ echo "Making HTML documentation"
/bin/rm html/*
cp index.html.src html/index.html
cp ../README html/README.txt
-# cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
+cp ../NON-AUTOTOOLS-BUILD html/NON-AUTOTOOLS-BUILD.txt
for file in *.1 ; do
base=`basename $file .1`
@@ -187,7 +186,6 @@ files="\
COPYING \
AUTHORS \
NEWS \
- NON-UNIX-USE \
NON-AUTOTOOLS-BUILD \
INSTALL \
132html \
@@ -240,16 +238,6 @@ files="\
pcre32_utf32_utils.c \
pcre16_valid_utf16.c \
pcre32_valid_utf32.c \
- pcre_scanner.cc \
- pcre_scanner.h \
- pcre_scanner_unittest.cc \
- pcrecpp.cc \
- pcrecpp.h \
- pcrecpparg.h.in \
- pcrecpp_unittest.cc \
- pcre_stringpiece.cc \
- pcre_stringpiece.h.in \
- pcre_stringpiece_unittest.cc \
perltest.pl \
ucp.h \
makevp.bat \
diff --git a/doc/html/NON-AUTOTOOLS-BUILD.txt b/doc/html/NON-AUTOTOOLS-BUILD.txt
new file mode 100644
index 0000000..6f36fe6
--- /dev/null
+++ b/doc/html/NON-AUTOTOOLS-BUILD.txt
@@ -0,0 +1,402 @@
+Building PCRE2 without using autotools
+--------------------------------------
+
+This document has been converted from the PCRE1 document, but is not yet
+complete. I have removed a number of quite old sections about building in
+various environments, as they applied only to PCRE1 and are probably out of
+date.
+
+
+This document contains the following sections:
+
+ General
+ Generic instructions for the PCRE2 C library
+ Building for virtual Pascal
+ Stack size in Windows environments
+ Linking programs in Windows environments
+ Calling conventions in Windows environments
+ Comments about Win32 builds
+ Building PCRE2 on Windows with CMake
+ Testing with RunTest.bat
+ Building PCRE2 on native z/OS and z/VM
+
+
+GENERAL
+
+I (Philip Hazel) have no experience of Windows or VMS systems and how their
+libraries work. The items in the PCRE2 distribution and Makefile that relate to
+anything other than Linux systems are untested by me.
+
+The basic PCRE2 library consists entirely of code written in Standard C, and so
+should compile successfully on any system that has a Standard C compiler and
+library.
+
+The PCRE2 distribution includes a "configure" file for use by the
+configure/make (autotools) build system, as found in many Unix-like
+environments. The README file contains information about the options for
+"configure".
+
+There is also support for CMake, which some users prefer, especially in Windows
+environments, though it can also be run in Unix-like environments. See the
+section entitled "Building PCRE2 on Windows with CMake" below.
+
+Versions of src/config.h and src/pcre2.h are distributed in the PCRE2 tarballs
+under the names src/config.h.generic and src/pcre2.h.generic. These are
+provided for those who build PCRE2 without using "configure" or CMake. If you
+use "configure" or CMake, the .generic versions are not used.
+
+
+GENERIC INSTRUCTIONS FOR THE PCRE2 C LIBRARY
+
+The following are generic instructions for building the PCRE2 C library "by
+hand". If you are going to use CMake, this section does not apply to you; you
+can skip ahead to the CMake section.
+
+ (1) Copy or rename the file src/config.h.generic as src/config.h, and edit the
+ macro settings that it contains to whatever is appropriate for your
+ environment. In particular, you can alter the definition of the NEWLINE
+ macro to specify what character(s) you want to be interpreted as line
+ terminators.
+
+ When you compile any of the PCRE2 modules, you must specify
+ -DHAVE_CONFIG_H to your compiler so that src/config.h is included in the
+ sources.
+
+ An alternative approach is not to edit src/config.h, but to use -D on the
+ compiler command line to make any changes that you need to the
+ configuration options. In this case -DHAVE_CONFIG_H must not be set.
+
+ NOTE: There have been occasions when the way in which certain parameters
+ in src/config.h are used has changed between releases. (In the
+ configure/make world, this is handled automatically.) When upgrading to a
+ new release, you are strongly advised to review src/config.h.generic
+ before re-using what you had previously.
+
+ (2) Copy or rename the file src/pcre2.h.generic as src/pcre2.h.
+
+ (3) EITHER:
+ Copy or rename file src/pcre2_chartables.c.dist as
+ src/pcre2_chartables.c.
+
+ OR:
+ Compile src/dftables.c as a stand-alone program (using -DHAVE_CONFIG_H
+ if you have set up src/config.h), and then run it with the single
+ argument "src/pcre2_chartables.c". This generates a set of standard
+ character tables and writes them to that file. The tables are generated
+ using the default C locale for your system. If you want to use a locale
+ that is specified by LC_xxx environment variables, add the -L option to
+ the dftables command. You must use this method if you are building on a
+ system that uses EBCDIC code.
+
+ The tables in src/pcre2_chartables.c are defaults. The caller of PCRE2 can
+ specify alternative tables at run time.
+
+ (4) For an 8-bit library, compile the following source files, setting
+ -DPCRE2_CODE_UNIT_WIDTH=8 as a compiler option. Also set -DHAVE_CONFIG_H
+ if you have set up src/config.h with your configuration, or else use other
+ -D settings to change the configuration as required.
+
+ pcre2_auto_possess.c
+ pcre2_chartables.c
+ pcre2_compile.c
+ pcre2_config.c
+ pcre2_context.c
+ pcre2_dfa_match.c
+ pcre2_error.c
+ pcre2_jit_compile.c
+ pcre2_jit_match.c
+ pcre2_jit_misc.c
+ pcre2_maketables.c
+ pcre2_match.c
+ pcre2_match_data.c
+ pcre2_newline.c
+ pcre2_ord2utf.c
+ pcre2_pattern_info.c
+ pcre2_string_utils.c
+ pcre2_study.c
+ pcre2_substring.c
+ pcre2_tables.c
+ pcre2_ucd.c
+ pcre2_valid_utf.c
+ pcre2_xclass.c
+
+ Make sure that you include -I. in the compiler command (or equivalent for
+ an unusual compiler) so that all included PCRE2 header files are first
+ sought in the src directory under the current directory. Otherwise you run
+ the risk of picking up a previously-installed file from somewhere else.
+
+ Note that you must compile pcre2_jit_xxx.c, even if you have not defined
+ SUPPORT_JIT in src/config.h, because when JIT support is not configured,
+ dummy functions are compiled. When JIT support IS configured, the JIT
+ sources #include other files from the sljit subdirectory, where there
+ should be 16 files, all of whose names begin with "sljit".
+
+ (5) Now link all the compiled code into an object library in whichever form
+ your system keeps such libraries. This is the basic PCRE2 C 8-bit library.
+ If your system has static and shared libraries, you may have to do this
+ once for each type.
+
+ (6) If you want to build a 16-bit library or 32-bit library (as well as, or
+ instead of the 8-bit library) just supply 16 or 32 as the value of
+ -DPCRE2_CODE_UNIT_WIDTH when you are compiling.
+
+ (7) If you want to build the POSIX wrapper functions (which apply only to the
+ 8-bit library), ensure that you have the pcre2posix.h file and then
+ compile pcre2posix.c. Link the result (on its own) as the pcre2posix
+ library.
+
+ (8) The pcre2test program can be linked with any combination of the 8-bit,
+ 16-bit and 32-bit libraries (depending on what you selected in
+ src/config.h). Compile pcre2test.c; don't forget -DHAVE_CONFIG_H if
+ necessary, but do NOT define PCRE2_CODE_UNIT_WIDTH. Then link with the
+ appropriate library/ies. If you compiled an 8-bit library, pcre2test also
+ needs the pcre2posix wrapper library.
+
+ (9) Run pcre2test on the testinput files in the testdata directory, and check
+ that the output matches the corresponding testoutput files. There are
+ comments about what each test does in the section entitled "Testing PCRE2"
+ in the README file. If you compiled more than one of the 8-bit, 16-bit and
+ 32-bit libraries, you need to run pcre2test with the -16 option to do
+ 16-bit tests and with the -32 option to do 32-bit tests.
+
+ Some tests are relevant only when certain build-time options are selected.
+ For example, test 4 is for Unicode support, and will not run if you have
+ built PCRE2 without it. See the comments at the start of each testinput
+ file. If you have a suitable Unix-like shell, the RunTest script will run
+ the appropriate tests for you. The command "RunTest list" will output a
+ list of all the tests.
+
+ Note that the supplied files are in Unix format, with just LF characters
+ as line terminators. You may need to edit them to change this if your
+ system uses a different convention.
+
+(10) If you have built PCRE2 with SUPPORT_JIT, the JIT features can be tested
+ by running pcre2test with the -jit option. This is done automatically by
+ the RunTest script. You might also like to build and run the freestanding
+ JIT test program, pcre2_jit_test.c.
+
+(11) If you want to use the pcre2grep command, compile and link pcre2grep.c; it
+ uses only the basic 8-bit PCRE2 library (it does not need the pcre2posix
+ library).
+
+
+BUILDING FOR VIRTUAL PASCAL
+
+FIXME FOR PCRE2
+
+A script for building PCRE2 using Borland's C++ compiler for use with VPASCAL
+was contributed by Alexander Tokarev. Stefan Weber updated the script and added
+additional files. The following files in the distribution are for building
+PCRE2 for use with VP/Borland: makevp_c.txt, makevp_l.txt, makevp.bat,
+pcre2gexp.pas.
+
+
+STACK SIZE IN WINDOWS ENVIRONMENTS
+
+The default processor stack size of 1Mb in some Windows environments is too
+small for matching patterns that need much recursion. In particular, test 2 may
+fail because of this. Normally, running out of stack causes a crash, but there
+have been cases where the test program has just died silently. See your linker
+documentation for how to increase stack size if you experience problems. The
+Linux default of 8Mb is a reasonable choice for the stack, though even that can
+be too small for some pattern/subject combinations.
+
+PCRE2 has a compile configuration option to disable the use of stack for
+recursion so that heap is used instead. However, pattern matching is
+significantly slower when this is done. There is more about stack usage in the
+"pcre2stack" documentation.
+
+
+LINKING PROGRAMS IN WINDOWS ENVIRONMENTS
+
+If you want to statically link a program against a PCRE2 library in the form of
+a non-dll .a file, you must define PCRE2_STATIC before including src/pcre2.h.
+
+
+CALLING CONVENTIONS IN WINDOWS ENVIRONMENTS
+
+It is possible to compile programs to use different calling conventions using
+MSVC. Search the web for "calling conventions" for more information. To make it
+easier to change the calling convention for the exported functions in the
+PCRE2 library, the macro PCRE2_CALL_CONVENTION is present in all the external
+definitions. It can be set externally when compiling (e.g. in CFLAGS). If it is
+not set, it defaults to empty; the default calling convention is then used
+(which is what is wanted most of the time).
+
+
+COMMENTS ABOUT WIN32 BUILDS (see also "BUILDING PCRE2 ON WINDOWS WITH CMAKE")
+
+There are two ways of building PCRE2 using the "configure, make, make install"
+paradigm on Windows systems: using MinGW or using Cygwin. These are not at all
+the same thing; they are completely different from each other. There is also
+support for building using CMake, which some users find a more straightforward
+way of building PCRE2 under Windows.
+
+The MinGW home page (http://www.mingw.org/) says this:
+
+ MinGW: A collection of freely available and freely distributable Windows
+ specific header files and import libraries combined with GNU toolsets that
+ allow one to produce native Windows programs that do not rely on any
+ 3rd-party C runtime DLLs.
+
+The Cygwin home page (http://www.cygwin.com/) says this:
+
+ Cygwin is a Linux-like environment for Windows. It consists of two parts:
+
+ . A DLL (cygwin1.dll) which acts as a Linux API emulation layer providing
+ substantial Linux API functionality
+
+ . A collection of tools which provide Linux look and feel.
+
+On both MinGW and Cygwin, PCRE2 should build correctly using:
+
+ ./configure && make && make install
+
+This should create two libraries called libpcre2-8 and libpcre2-posix. These
+are independent libraries: when you link with libpcre2-posix you must also link
+with libpcre2-8, which contains the basic functions.
+
+Using Cygwin's compiler generates libraries and executables that depend on
+cygwin1.dll. If a library that is generated this way is distributed,
+cygwin1.dll has to be distributed as well. Since cygwin1.dll is under the GPL
+licence, this forces not only PCRE2 to be under the GPL, but also the entire
+application. A distributor who wants to keep their own code proprietary must
+purchase an appropriate Cygwin licence.
+
+MinGW has no such restrictions. The MinGW compiler generates a library or
+executable that can run standalone on Windows without any third party dll or
+licensing issues.
+
+But there is more complication:
+
+If a Cygwin user uses the -mno-cygwin Cygwin gcc flag, what that really does is
+to tell Cygwin's gcc to use the MinGW gcc. Cygwin's gcc is only acting as a
+front end to MinGW's gcc (if you install Cygwin's gcc, you get both Cygwin's
+gcc and MinGW's gcc). So, a user can:
+
+. Build native binaries by using MinGW or by getting Cygwin and using
+ -mno-cygwin.
+
+. Build binaries that depend on cygwin1.dll by using Cygwin with the normal
+ compiler flags.
+
+The test files that are supplied with PCRE2 are in UNIX format, with LF
+characters as line terminators. Unless your PCRE2 library uses a default
+newline option that includes LF as a valid newline, it may be necessary to
+change the line terminators in the test files to get some of the tests to work.
+
+
+BUILDING PCRE2 ON WINDOWS WITH CMAKE
+
+CMake is an alternative configuration facility that can be used instead of
+"configure". CMake creates project files (make files, solution files, etc.)
+tailored to numerous development environments, including Visual Studio,
+Borland, Msys, MinGW, NMake, and Unix. If possible, use short paths with no
+spaces in the names for your CMake installation and your PCRE2 source and build
+directories.
+
+The following instructions were contributed by a PCRE1 user, but they should
+also work for PCRE2. If they are not followed exactly, errors may occur. In the
+event that errors do occur, it is recommended that you delete the CMake cache
+before attempting to repeat the CMake build process. In the CMake GUI, the
+cache can be deleted by selecting "File > Delete Cache".
+
+1. Install the latest CMake version available from http://www.cmake.org/, and
+ ensure that cmake\bin is on your path.
+
+2. Unzip (retaining folder structure) the PCRE2 source tree into a source
+ directory such as C:\pcre2. You should ensure your local date and time
+ is not earlier than the file dates in your source dir if the release is
+ very new.
+
+3. Create a new, empty build directory, preferably a subdirectory of the
+ source dir. For example, C:\pcre2\pcre2-xx\build.
+
+4. Run cmake-gui from the Shell environment of your build tool, for example,
+ Msys for Msys/MinGW or Visual Studio Command Prompt for VC/VC++. Do not try
+ to start Cmake from the Windows Start menu, as this can lead to errors.
+
+5. Enter C:\pcre2\pcre2-xx and C:\pcre2\pcre2-xx\build for the source and
+ build directories, respectively.
+
+6. Hit the "Configure" button.
+
+7. Select the particular IDE / build tool that you are using (Visual
+ Studio, MSYS makefiles, MinGW makefiles, etc.)
+
+8. The GUI will then list several configuration options. This is where
+ you can enable Unicode support or other PCRE2 optional features.
+
+9. Hit "Configure" again. The adjacent "Generate" button should now be
+ active.
+
+10. Hit "Generate".
+
+11. The build directory should now contain a usable build system, be it a
+ solution file for Visual Studio, makefiles for MinGW, etc. Exit from
+ cmake-gui and use the generated build system with your compiler or IDE.
+ E.g., for MinGW you can run "make", or for Visual Studio, open the PCRE2
+ solution, select the desired configuration (Debug, or Release, etc.) and
+ build the ALL_BUILD project.
+
+12. If during configuration with cmake-gui you've elected to build the test
+ programs, you can execute them by building the test project. E.g., for
+ MinGW: "make check"; for Visual Studio build the RUN_TESTS project. The
+ most recent build configuration is targeted by the tests. A summary of
+ test results is presented. Complete test output is subsequently
+ available for review in Testing\Temporary under your build dir.
+
+
+TESTING WITH RUNTEST.BAT FIXME FIXME NOT YET TESTED/UPDATED FIXME
+
+If configured with CMake, building the test project ("make check" or building
+ALL_TESTS in Visual Studio) creates (and runs) pcre2_test.bat (and depending
+on your configuration options, possibly other test programs) in the build
+directory. Pcre2_test.bat runs RunTest.Bat with correct source and exe paths.
+
+For manual testing with RunTest.bat, provided the build dir is a subdirectory
+of the source directory: Open command shell window. Chdir to the location
+of your pcre2test.exe and pcre2grep.exe programs. Call RunTest.bat with
+"..\RunTest.Bat" or "..\..\RunTest.bat" as appropriate.
+
+To run only a particular test with RunTest.Bat provide a test number argument.
+
+Otherwise:
+
+1. Copy RunTest.bat into the directory where pcre2test.exe and pcre2grep.exe
+ have been created.
+
+2. Edit RunTest.bat to identify the full or relative location of
+ the pcre2 source (in which the testdata folder resides), e.g.:
+
+ set srcdir=C:\pcre2\pcre2-10.00
+
+3. In a Windows command environment, chdir to the location of your bat and
+ exe programs.
+
+4. Run RunTest.bat. Test outputs will automatically be compared to expected
+ results, and discrepancies will be identified in the console output.
+
+To independently test the just-in-time compiler, run pcre2_jit_test.exe.
+
+
+BUILDING PCRE2 ON NATIVE Z/OS AND Z/VM
+
+z/OS and z/VM are operating systems for mainframe computers, produced by IBM.
+The character code used is EBCDIC, not ASCII or Unicode. In z/OS, UNIX APIs and
+applications can be supported through UNIX System Services, and in such an
+environment PCRE2 can be built in the same way as in other systems. However, in
+native z/OS (without UNIX System Services) and in z/VM, special ports are
+required. For details, please see this web site:
+
+ http://www.zaconsultants.net
+
+There is also a mirror here:
+
+ http://www.vsoft-software.com/downloads.html
+
+The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
+course.
+
+==========================
+Last Updated: 28 September 2014
diff --git a/doc/html/README.txt b/doc/html/README.txt
index 7ad597a..95c8747 100644
--- a/doc/html/README.txt
+++ b/doc/html/README.txt
@@ -1 +1,832 @@
-This is a placeholder README file for a work in progress.
+README file for PCRE2 (Perl-compatible regular expression library)
+------------------------------------------------------------------
+
+PCRE2 is a re-implementation of the original PCRE library with an entirely new
+API. The latest release of PCRE2 is always available in three alternative
+formats from:
+
+FIXME: THIS WILL NOT BE THE CASE UNTIL THERE IS A FORMAL RELEASE.
+
+ ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.gz
+ ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.tar.bz2
+ ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre2/pcre2-xxx.zip
+
+There is a mailing list for discussion about the development of PCRE (both the
+original and new APIs) at pcre-dev@exim.org. You can access the archives and
+subscribe or manage your subscription here:
+
+ https://lists.exim.org/mailman/listinfo/pcre-dev
+
+Please read the NEWS file if you are upgrading from a previous release.
+The contents of this README file are:
+
+ The PCRE2 APIs
+ Documentation for PCRE2
+ Contributions by users of PCRE2
+ Building PCRE2 on non-Unix-like systems
+ Building PCRE2 without using autotools
+ Building PCRE2 using autotools
+ Retrieving configuration information
+ Shared libraries
+ Cross-compiling using autotools
+ Making new tarballs
+ Testing PCRE2
+ Character tables
+ File manifest
+
+
+The PCRE2 APIs
+--------------
+
+PCRE2 is written in C, and it has its own API. There are three sets of
+functions, one for the 8-bit library, which processes strings of bytes, one for
+the 16-bit library, which processes strings of 16-bit values, and one for the
+32-bit library, which processes strings of 32-bit values. As this is a new API,
+there are as yet no C++ wrappers.
+
+The distribution does contain a set of C wrapper functions for the 8-bit
+library that are based on the POSIX regular expression API (see the pcre2posix
+man page). These end up in the library called libpcre2-posix. Note that this
+just provides a POSIX calling interface to PCRE2; the regular expressions
+themselves still follow Perl syntax and semantics. The POSIX API is restricted,
+and does not give full access to all of PCRE2's facilities.
+
+The header file for the POSIX-style functions is called pcre2posix.h. The
+official POSIX name is regex.h, but I did not want to risk possible problems
+with existing files of that name by distributing it that way. To use PCRE2 with
+an existing program that uses the POSIX API, pcre2posix.h will have to be
+renamed or pointed at by a link.
+
+If you are using the POSIX interface to PCRE2 and there is already a POSIX
+regex library installed on your system, as well as worrying about the regex.h
+header file (as mentioned above), you must also take care when linking programs
+to ensure that they link with PCRE2's libpcre2-posix library. Otherwise they may
+pick up the POSIX functions of the same name from the other library.
+
+One way of avoiding this confusion is to compile PCRE2 with the addition of
+-Dregcomp=PCRE2regcomp (and similarly for the other POSIX functions) to the
+compiler flags (CFLAGS if you are using "configure" -- see below). This has the
+effect of renaming the functions so that the names no longer clash. Of course,
+you have to do the same thing for your applications, or write them using the
+new names.
+
+
+Documentation for PCRE2
+----------------------
+
+If you install PCRE2 in the normal way on a Unix-like system, you will end up
+with a set of man pages whose names all start with "pcre2". The one that is
+just called "pcre2" lists all the others. In addition to these man pages, the
+PCRE2 documentation is supplied in two other forms:
+
+ 1. There are files called doc/pcre2.txt, doc/pcre2grep.txt, and
+ doc/pcre2test.txt in the source distribution. The first of these is a
+ concatenation of the text forms of all the section 3 man pages except the
+ listing of pcre2demo.c and those that summarize individual functions. The
+ other two are the text forms of the section 1 man pages for the pcre2grep
+ and pcre2test commands. These text forms are provided for ease of scanning
+ with text editors or similar tools. They are installed in
+
+Return to the PCRE2 index page.
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+pcre2-config [--prefix] [--exec-prefix] [--version]
+ [--libs8] [--libs16] [--libs32] [--libs-posix]
+ [--cflags] [--cflags-posix]
+
+pcre2-config returns the configuration of the installed PCRE2 libraries
+and the options required to compile a program to use them. Some of the options
+apply only to the 8-bit, or 16-bit, or 32-bit libraries, respectively, and are
+not available for libraries that have not been built. If an unavailable option
+is encountered, the "usage" information is output.
+
+--prefix
+Writes the directory prefix used in the PCRE2 installation for architecture
+independent files (/usr on many systems, /usr/local on some
+systems) to the standard output.
+
+--exec-prefix
+Writes the directory prefix used in the PCRE2 installation for architecture
+dependent files (normally the same as --prefix) to the standard output.
+
+--version
+Writes the version number of the installed PCRE2 libraries to the standard
+output.
+
+--libs8
+Writes to the standard output the command line options required to link
+with the 8-bit PCRE2 library (-lpcre2-8 on many systems).
+
+--libs16
+Writes to the standard output the command line options required to link
+with the 16-bit PCRE2 library (-lpcre2-16 on many systems).
+
+--libs32
+Writes to the standard output the command line options required to link
+with the 32-bit PCRE2 library (-lpcre2-32 on many systems).
+
+--libs-posix
+Writes to the standard output the command line options required to link with
+PCRE2's POSIX API wrapper library (-lpcre2-posix -lpcre2-8 on many
+systems).
+
+--cflags
+Writes to the standard output the command line options required to compile
+files that use PCRE2 (this may include some -I options, but is blank on
+many systems).
+
+--cflags-posix
+Writes to the standard output the command line options required to compile
+files that use PCRE2's POSIX API wrapper library (this may include some
+-I options, but is blank on many systems).
+
+pcre2(3)
+
+This manual page was originally written by Mark Baker for the Debian GNU/Linux
+system. It has been subsequently revised as a generic PCRE2 man page.
+
+Last updated: 28 September 2014
+
+Return to the PCRE2 index page.
+
+Return to the PCRE2 index page.
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+PCRE2 is the name used for a revised API for the PCRE library, which is a set
+of functions, written in C, that implement regular expression pattern matching
+using the same syntax and semantics as Perl, with just a few differences. Some
+features that appeared in Python and the original PCRE before they appeared in
+Perl are also available using the Python syntax, there is some support for one
+or two .NET and Oniguruma syntax items, and there are options for requesting
+some minor changes that give better ECMAScript (aka JavaScript) compatibility.
+
+The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit
+code units, which means that up to three separate libraries may be installed.
+The original work to extend PCRE to 16-bit and 32-bit code units was done by
+Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
+can be interpreted either as one character per code unit, or as UTF-encoded
+Unicode, with support for Unicode general category properties. Unicode is
+optional at build time, and must be enabled explicitly at run time. The version
+of Unicode in use can be discovered by running
+pcre2-config man page
+
+
SYNOPSIS
+
DESCRIPTION
+
OPTIONS
+
SEE ALSO
+
AUTHOR
+
REVISION
+
+pcre2 man page
+
+
INTRODUCTION
+
+ pcre2test -C
+
+
+The three libraries contain identical sets of functions, with names ending in +_8, _16, or _32, respectively (for example, pcre2_compile_8()). However, +by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just +one code unit width can be written using generic names such as +pcre2_compile(), and the documentation is written assuming that this is +the case. +
++In addition to the Perl-compatible matching function, PCRE2 contains an +alternative function that matches the same compiled patterns in a different +way. In certain circumstances, the alternative function has some advantages. +For a discussion of the two matching algorithms, see the +pcre2matching +page. +
++Details of exactly which Perl regular expression features are and are not +supported by PCRE2 are given in separate documents. See the +pcre2pattern +and +pcre2compat +pages. There is a syntax summary in the +pcre2syntax +page. +
++Some features of PCRE2 can be included, excluded, or changed when the library +is built. The +pcre2_config() +function makes it possible for a client to discover which features are +available. The features themselves are described in the +pcre2build +page. Documentation about building PCRE2 for various operating systems can be +found in the +README +and +NON-AUTOTOOLS-BUILD +files in the source distribution. +
++The libraries contain a number of undocumented internal functions and data +tables that are used by more than one of the exported external functions, but +which are not intended for use by external callers. Their names all begin with +"_pcre2", which hopefully will not provoke any name clashes. In some +environments, it is possible to control which external symbols are exported +when a shared library is built, and in these cases the undocumented symbols are +not exported. +
++If you are using PCRE2 in a non-UTF application that permits users to supply +arbitrary patterns for compilation, you should be aware of a feature that +allows users to turn on UTF support from within a pattern, provided that PCRE2 +was built with Unicode support. For example, an 8-bit pattern that begins with +"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings +of UTF-8 code units instead of individual 8-bit characters. This causes both +the pattern and any data against which it is matched to be checked for UTF-8 +validity. If the data string is very long, such a check might use sufficiently +many resources as to cause your application to lose performance. +
++One way of guarding against this possibility is to use the +pcre2_pattern_info() function to check the compiled pattern's options for +UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time. +This causes a compile-time error if a pattern contains a UTF-setting sequence. +
++If your application is one that supports UTF, be aware that validity checking +can take time. If the same data string is to be matched many times, you can use +the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid +running redundant checks. +
++Another way that performance can be hit is by running a pattern that has a very +large search tree against a string that will never match. Nested unlimited +repeats in a pattern are a common example. PCRE2 provides some protection +against this: see the pcre2_set_match_limit() function in the +pcre2api +page. +
++The user documentation for PCRE2 comprises a number of different sections. In +the "man" format, each of these is a separate "man page". In the HTML format, +each is a separate page, linked from the index page. In the plain text format, +the descriptions of the pcre2grep and pcre2test programs are in +files called pcre2grep.txt and pcre2test.txt, respectively. The +remaining sections, except for the pcre2demo section (which is a program +listing), and the short pages for individual functions, are concatenated in +pcre2.txt, for ease of searching. The sections are as follows: +
+ pcre2 this document FIXME CHECK THIS LIST + pcre2-config show PCRE2 installation configuration information + pcre2api details of PCRE2's native C API + pcre2build building PCRE2 + pcre2callout details of the callout feature + pcre2compat discussion of Perl compatibility + pcre2demo a demonstration C program that uses PCRE2 + pcre2grep description of the pcre2grep command (8-bit only) + pcre2jit discussion of the just-in-time optimization support + pcre2limits details of size and other limits + pcre2matching discussion of the two matching algorithms + pcre2partial details of the partial matching facility + pcre2pattern syntax and semantics of supported regular expressions + pcre2perform discussion of performance issues + pcre2posix the POSIX-compatible C API for the 8-bit library + pcre2sample discussion of the pcre2demo program + pcre2stack discussion of stack usage + pcre2syntax quick syntax reference + pcre2test description of the pcre2test testing command + pcre2unicode discussion of Unicode and UTF support ++In the "man" and HTML formats, there is also a short page for each C library +function, listing its arguments and results. + +
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+Putting an actual email address here is a spam magnet. If you want to email me, +use my two initials, followed by the two digits 10, at the domain cam.ac.uk. +
+
+Last updated: 28 September 2014
+
+Copyright © 1997-2014 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/html/pcre2grep.html b/doc/html/pcre2grep.html new file mode 100644 index 0000000..6528a38 --- /dev/null +++ b/doc/html/pcre2grep.html @@ -0,0 +1,759 @@ + + ++Return to the PCRE2 index page. +
+
+This page is part of the PCRE2 HTML documentation. It was generated
+automatically from the original man page. If there is any nonsense in it,
+please consult the man page, in case the conversion went wrong.
+
+
+pcre2grep [options] [long options] [pattern] [path1 path2 ...] +
++pcre2grep searches files for character patterns, in the same way as other +grep commands do, but it uses the PCRE2 regular expression library to support +patterns that are compatible with the regular expressions of Perl 5. See +pcre2syntax(3) +for a quick-reference summary of pattern syntax, or +pcre2pattern(3) +for a full description of the syntax and semantics of the regular expressions +that PCRE2 supports. +
++Patterns, whether supplied on the command line or in a separate file, are given +without delimiters. For example: +
+ pcre2grep Thursday /etc/motd ++If you attempt to use delimiters (for example, by surrounding a pattern with +slashes, as is common in Perl scripts), they are interpreted as part of the +pattern. Quotes can of course be used to delimit patterns on the command line +because they are interpreted by the shell, and indeed quotes are required if a +pattern contains white space or shell metacharacters. + +
+The first argument that follows any option settings is treated as the single +pattern to be matched when neither -e nor -f is present. +Conversely, when one or both of these options are used to specify patterns, all +arguments are treated as path names. At least one of -e, -f, or an +argument pattern must be provided. +
++If no files are specified, pcre2grep reads the standard input. The +standard input can also be referenced by a name consisting of a single hyphen. +For example: +
+ pcre2grep some-pattern /file1 - /file3 ++By default, each line that matches a pattern is copied to the standard +output, and if there is more than one file, the file name is output at the +start of each line, followed by a colon. However, there are options that can +change how pcre2grep behaves. In particular, the -M option makes it +possible to search for patterns that span line boundaries. What defines a line +boundary is controlled by the -N (--newline) option. + +
+The amount of memory used for buffering files that are being scanned is +controlled by a parameter that can be set by the --buffer-size option. +The default value for this parameter is specified when pcre2grep is built, +with the default default being 20K. A block of memory three times this size is +used (to allow for buffering "before" and "after" lines). An error occurs if a +line overflows the buffer. +
++Patterns can be no longer than 8K or BUFSIZ bytes, whichever is the greater. +BUFSIZ is defined in <stdio.h>. When there is more than one pattern +(specified by the use of -e and/or -f), each pattern is applied to +each line in the order in which they are defined, except that all the -e +patterns are tried before the -f patterns. +
++By default, as soon as one pattern matches a line, no further patterns are +considered. However, if --colour (or --color) is used to colour the +matching substrings, or if --only-matching, --file-offsets, or +--line-offsets is used to output only the part of the line that matched +(either shown literally, or as an offset), scanning resumes immediately +following the match, so that further matches on the same line can be found. If +there are multiple patterns, they are all tried on the remainder of the line, +but patterns that follow the one that matched are not tried on the earlier part +of the line. +
++This behaviour means that the order in which multiple patterns are specified +can affect the output when one of the above options is used. This is no longer +the same behaviour as GNU grep, which now manages to display earlier matches +for later patterns (as long as there is no overlap). +
++Patterns that can match an empty string are accepted, but empty string +matches are never recognized. An example is the pattern "(super)?(man)?", in +which all components are optional. This pattern finds all occurrences of both +"super" and "man"; the output differs from matching with "super|man" when only +the matching substrings are being shown. +
++If the LC_ALL or LC_CTYPE environment variable is set, +pcre2grep uses the value to set a locale when calling the PCRE2 library. +The --locale option can be used to override this. +
++It is possible to compile pcre2grep so that it uses libz or +libbz2 to read files whose names end in .gz or .bz2, +respectively. You can find out whether your binary has support for one or both +of these file types by running it with the --help option. If the +appropriate support is not present, files are treated as plain text. The +standard input is always so treated. +
++By default, a file that contains a binary zero byte within the first 1024 bytes +is identified as a binary file, and is processed specially. (GNU grep also +identifies binary files in this manner.) See the --binary-files option +for a means of changing the way binary files are handled. +
++The order in which some of the options appear can affect the output. For +example, both the -h and -l options affect the printing of file +names. Whichever comes later in the command line will be the one that takes +effect. Similarly, except where noted below, if an option is given twice, the +later setting is used. Numerical values for options may be followed by K or M, +to signify multiplication by 1024 or 1024*1024 respectively. +
++-- +This terminates the list of options. It is useful if the next item on the +command line starts with a hyphen but is not an option. This allows for the +processing of patterns and filenames that start with hyphens. +
++-A number, --after-context=number +Output number lines of context after each matching line. If filenames +and/or line numbers are being output, a hyphen separator is used instead of a +colon for the context lines. A line containing "--" is output between each +group of lines, unless they are in fact contiguous in the input file. The value +of number is expected to be relatively small. However, pcre2grep +guarantees to have up to 8K of following text available for context output. +
++-a, --text +Treat binary files as text. This is equivalent to +--binary-files=text. +
++-B number, --before-context=number +Output number lines of context before each matching line. If filenames +and/or line numbers are being output, a hyphen separator is used instead of a +colon for the context lines. A line containing "--" is output between each +group of lines, unless they are in fact contiguous in the input file. The value +of number is expected to be relatively small. However, pcre2grep +guarantees to have up to 8K of preceding text available for context output. +
++--binary-files=word +Specify how binary files are to be processed. If the word is "binary" (the +default), pattern matching is performed on binary files, but the only output is +"Binary file <name> matches" when a match succeeds. If the word is "text", +which is equivalent to the -a or --text option, binary files are +processed in the same way as any other file. In this case, when a match +succeeds, the output may be binary garbage, which can have nasty effects if +sent to a terminal. If the word is "without-match", which is equivalent to the +-I option, binary files are not processed at all; they are assumed not to +be of interest. +
++--buffer-size=number +Set the parameter that controls how much memory is used for buffering files +that are being scanned. +
++-C number, --context=number +Output number lines of context both before and after each matching line. +This is equivalent to setting both -A and -B to the same value. +
++-c, --count +Do not output individual lines from the files that are being scanned; instead +output the number of lines that would otherwise have been shown. If no lines +are selected, the number zero is output. If several files are being +scanned, a count is output for each of them. However, if the +--files-with-matches option is also used, only those files whose counts +are greater than zero are listed. When -c is used, the -A, +-B, and -C options are ignored. +
++--colour, --color +If this option is given without any data, it is equivalent to "--colour=auto". +If data is required, it must be given in the same shell item, separated by an +equals sign. +
+
+--colour=value, --color=value
+This option specifies under what circumstances the parts of a line that matched
+a pattern should be coloured in the output. By default, the output is not
+coloured. The value (which is optional, see above) may be "never", "always", or
+"auto". In the latter case, colouring happens only if the standard output is
+connected to a terminal. More resources are used when colouring is enabled,
+because pcre2grep has to search for all possible matches in a line, not
+just one, in order to colour them all.
+
+
+The colour that is used can be specified by setting the environment variable
+PCRE2GREP_COLOUR or PCRE2GREP_COLOR. The value of this variable should be a
+string of two numbers, separated by a semicolon. They are copied directly into
+the control string for setting colour on a terminal, so it is your
+responsibility to ensure that they make sense. If neither of the environment
+variables is set, the default is "1;31", which gives red.
+
+-D action, --devices=action +If an input path is not a regular file or a directory, "action" specifies how +it is to be processed. Valid values are "read" (the default) or "skip" +(silently skip the path). +
++-d action, --directories=action +If an input path is a directory, "action" specifies how it is to be processed. +Valid values are "read" (the default in non-Windows environments, for +compatibility with GNU grep), "recurse" (equivalent to the -r option), or +"skip" (silently skip the path, the default in Windows environments). In the +"read" case, directories are read as if they were ordinary files. In some +operating systems the effect of reading a directory like this is an immediate +end-of-file; in others it may provoke an error. +
+
+-e pattern, --regex=pattern, --regexp=pattern
+Specify a pattern to be matched. This option can be used multiple times in
+order to specify several patterns. It can also be used as a way of specifying a
+single pattern that starts with a hyphen. When -e is used, no argument
+pattern is taken from the command line; all arguments are treated as file
+names. There is no limit to the number of patterns. They are applied to each
+line in the order in which they are defined until one matches.
+
+
+If -f is used with -e, the command line patterns are matched first,
+followed by the patterns from the file(s), independent of the order in which
+these options are specified. Note that multiple use of -e is not the same
+as a single pattern with alternatives. For example, X|Y finds the first
+character in a line that is X or Y, whereas if the two patterns are given
+separately, with X first, pcre2grep finds X if it is present, even if it
+follows Y in the line. It finds Y only if there is no X in the line. This
+matters only if you are using -o or --colo(u)r to show the part(s)
+of the line that matched.
+
+--exclude=pattern +Files (but not directories) whose names match the pattern are skipped without +being processed. This applies to all files, whether listed on the command line, +obtained from --file-list, or by scanning a directory. The pattern is a +PCRE2 regular expression, and is matched against the final component of the file +name, not the entire path. The -F, -w, and -x options do not +apply to this pattern. The option may be given any number of times in order to +specify multiple patterns. If a file name matches both an --include +and an --exclude pattern, it is excluded. There is no short form for this +option. +
++--exclude-from=filename +Treat each non-empty line of the file as the data for an --exclude +option. What constitutes a newline when reading the file is the operating +system's default. The --newline option has no effect on this option. This +option may be given more than once in order to specify a number of files to +read. +
++--exclude-dir=pattern +Directories whose names match the pattern are skipped without being processed, +whatever the setting of the --recursive option. This applies to all +directories, whether listed on the command line, obtained from +--file-list, or by scanning a parent directory. The pattern is a PCRE2 +regular expression, and is matched against the final component of the directory +name, not the entire path. The -F, -w, and -x options do not +apply to this pattern. The option may be given any number of times in order to +specify more than one pattern. If a directory matches both --include-dir +and --exclude-dir, it is excluded. There is no short form for this +option. +
++-F, --fixed-strings +Interpret each data-matching pattern as a list of fixed strings, separated by +newlines, instead of as a regular expression. What constitutes a newline for +this purpose is controlled by the --newline option. The -w (match +as a word) and -x (match whole line) options can be used with -F. +They apply to each of the fixed strings. A line is selected if any of the fixed +strings are found in it (subject to -w or -x, if present). This +option applies only to the patterns that are matched against the contents of +files; it does not apply to patterns specified by any of the --include or +--exclude options. +
+
+-f filename, --file=filename
+Read patterns from the file, one per line, and match them against
+each line of input. What constitutes a newline when reading the file is the
+operating system's default. The --newline option has no effect on this
+option. Trailing white space is removed from each line, and blank lines are
+ignored. An empty file contains no patterns and therefore matches nothing. See
+also the comments about multiple patterns versus a single pattern with
+alternatives in the description of -e above.
+
+
+If this option is given more than once, all the specified files are
+read. A data line is output if any of the patterns match it. A filename can
+be given as "-" to refer to the standard input. When -f is used, patterns
+specified on the command line using -e may also be present; they are
+tested before the file's patterns. However, no other pattern is taken from the
+command line; all arguments are treated as the names of paths to be searched.
+
+--file-list=filename +Read a list of files and/or directories that are to be scanned from the given +file, one per line. Trailing white space is removed from each line, and blank +lines are ignored. These paths are processed before any that are listed on the +command line. The filename can be given as "-" to refer to the standard input. +If --file and --file-list are both specified as "-", patterns are +read first. This is useful only when the standard input is a terminal, from +which further lines (the list of files) can be read after an end-of-file +indication. If this option is given more than once, all the specified files are +read. +
++--file-offsets +Instead of showing lines or parts of lines that match, show each match as an +offset from the start of the file and a length, separated by a comma. In this +mode, no context is shown. That is, the -A, -B, and -C +options are ignored. If there is more than one match in a line, each of them is +shown separately. This option is mutually exclusive with --line-offsets +and --only-matching. +
++-H, --with-filename +Force the inclusion of the filename at the start of output lines when searching +a single file. By default, the filename is not shown in this case. For matching +lines, the filename is followed by a colon; for context lines, a hyphen +separator is used. If a line number is also being output, it follows the file +name. +
++-h, --no-filename +Suppress the output filenames when searching multiple files. By default, +filenames are shown when multiple files are searched. For matching lines, the +filename is followed by a colon; for context lines, a hyphen separator is used. +If a line number is also being output, it follows the file name. +
++--help +Output a help message, giving brief details of the command options and file +type support, and then exit. Anything else on the command line is +ignored. +
++-I +Treat binary files as never matching. This is equivalent to +--binary-files=without-match. +
++-i, --ignore-case +Ignore upper/lower case distinctions during comparisons. +
++--include=pattern +If any --include patterns are specified, the only files that are +processed are those that match one of the patterns (and do not match an +--exclude pattern). This option does not affect directories, but it +applies to all files, whether listed on the command line, obtained from +--file-list, or by scanning a directory. The pattern is a PCRE2 regular +expression, and is matched against the final component of the file name, not +the entire path. The -F, -w, and -x options do not apply to +this pattern. The option may be given any number of times. If a file name +matches both an --include and an --exclude pattern, it is excluded. +There is no short form for this option. +
++--include-from=filename +Treat each non-empty line of the file as the data for an --include +option. What constitutes a newline for this purpose is the operating system's +default. The --newline option has no effect on this option. This option +may be given any number of times; all the files are read. +
++--include-dir=pattern +If any --include-dir patterns are specified, the only directories that +are processed are those that match one of the patterns (and do not match an +--exclude-dir pattern). This applies to all directories, whether listed +on the command line, obtained from --file-list, or by scanning a parent +directory. The pattern is a PCRE2 regular expression, and is matched against the +final component of the directory name, not the entire path. The -F, +-w, and -x options do not apply to this pattern. The option may be +given any number of times. If a directory matches both --include-dir and +--exclude-dir, it is excluded. There is no short form for this option. +
++-L, --files-without-match +Instead of outputting lines from the files, just output the names of the files +that do not contain any lines that would have been output. Each file name is +output once, on a separate line. +
++-l, --files-with-matches +Instead of outputting lines from the files, just output the names of the files +containing lines that would have been output. Each file name is output +once, on a separate line. Searching normally stops as soon as a matching line +is found in a file. However, if the -c (count) option is also used, +matching continues in order to obtain the correct count, and those files that +have at least one match are listed along with their counts. Using this option +with -c is a way of suppressing the listing of files with no matches. +
++--label=name +This option supplies a name to be used for the standard input when file names +are being output. If not supplied, "(standard input)" is used. There is no +short form for this option. +
++--line-buffered +When this option is given, input is read and processed line by line, and the +output is flushed after each write. By default, input is read in large chunks, +unless pcre2grep can determine that it is reading from a terminal (which +is currently possible only in Unix-like environments). Output to terminal is +normally automatically flushed by the operating system. This option can be +useful when the input or output is attached to a pipe and you do not want +pcre2grep to buffer up large amounts of data. However, its use will affect +performance, and the -M (multiline) option ceases to work. +
++--line-offsets +Instead of showing lines or parts of lines that match, show each match as a +line number, the offset from the start of the line, and a length. The line +number is terminated by a colon (as usual; see the -n option), and the +offset and length are separated by a comma. In this mode, no context is shown. +That is, the -A, -B, and -C options are ignored. If there is +more than one match in a line, each of them is shown separately. This option is +mutually exclusive with --file-offsets and --only-matching. +
++--locale=locale-name +This option specifies a locale to be used for pattern matching. It overrides +the value in the LC_ALL or LC_CTYPE environment variables. If no +locale is specified, the PCRE2 library's default (usually the "C" locale) is +used. There is no short form for this option. +
+
+--match-limit=number
+Processing some regular expression patterns can require a very large amount of
+memory, leading in some cases to a program crash if not enough is available.
+Other patterns may take a very long time to search for all possible matching
+strings. The pcre2_match() function that is called by pcre2grep to do
+the matching has two parameters that can limit the resources that it uses.
+
+
+The --match-limit option provides a means of limiting resource usage
+when processing patterns that are not going to match, but which have a very
+large number of possibilities in their search trees. The classic example is a
+pattern that uses nested unlimited repeats. Internally, PCRE2 uses a function
+called match() which it calls repeatedly (sometimes recursively). The
+limit set by --match-limit is imposed on the number of times this
+function is called during a match, which has the effect of limiting the amount
+of backtracking that can take place.
+
+
+The --recursion-limit option is similar to --match-limit, but
+instead of limiting the total number of times that match() is called, it
+limits the depth of recursive calls, which in turn limits the amount of memory
+that can be used. The recursion depth is a smaller number than the total number
+of calls, because not all calls to match() are recursive. This limit is
+of use only if it is set smaller than --match-limit.
+
+
+There are no short forms for these options. The default settings are specified
+when the PCRE2 library is compiled, with the default default being 10 million.
+
+-M, --multiline
+Allow patterns to match more than one line. When this option is given, patterns
+may usefully contain literal newline characters and internal occurrences of ^
+and $ characters. The output for a successful match may consist of more than
+one line, the last of which is the one in which the match ended. If the matched
+string ends with a newline sequence the output ends at the end of that line.
+
+
+When this option is set, the PCRE2 library is called in "multiline" mode.
+There is a limit to the number of lines that can be matched, imposed by the way
+that pcre2grep buffers the input file as it scans it. However,
+pcre2grep ensures that at least 8K characters or the rest of the document
+(whichever is the shorter) are available for forward matching, and similarly
+the previous 8K characters (or all the previous characters, if fewer than 8K)
+are guaranteed to be available for lookbehind assertions. This option does not
+work when input is read line by line (see --line-buffered).
+
+-N newline-type, --newline=newline-type
+The PCRE2 library supports five different conventions for indicating
+the ends of lines. They are the single-character sequences CR (carriage return)
+and LF (linefeed), the two-character sequence CRLF, an "anycrlf" convention,
+which recognizes any of the preceding three types, and an "any" convention, in
+which any Unicode line ending sequence is assumed to end a line. The Unicode
+sequences are the three just mentioned, plus VT (vertical tab, U+000B), FF
+(form feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), and
+PS (paragraph separator, U+2029).
+
+
+When the PCRE2 library is built, a default line-ending sequence is specified.
+This is normally the standard sequence for the operating system. Unless
+otherwise specified by this option, pcre2grep uses the library's default.
+The possible values for this option are CR, LF, CRLF, ANYCRLF, or ANY. This
+makes it possible to use pcre2grep to scan files that have come from other
+environments without having to modify their line endings. If the data that is
+being scanned does not agree with the convention set by this option,
+pcre2grep may behave in strange ways. Note that this option does not
+apply to files specified by the -f, --exclude-from, or
+--include-from options, which are expected to use the operating system's
+standard newline sequence.
+
+-n, --line-number +Precede each output line by its line number in the file, followed by a colon +for matching lines or a hyphen for context lines. If the filename is also being +output, it precedes the line number. This option is forced if +--line-offsets is used. +
++--no-jit +If the PCRE2 library is built with support for just-in-time compiling (which +speeds up matching), pcre2grep automatically makes use of this, unless it +was explicitly disabled at build time. This option can be used to disable the +use of JIT at run time. It is provided for testing and working round problems. +It should never be needed in normal use. +
++-o, --only-matching +Show only the part of the line that matched a pattern instead of the whole +line. In this mode, no context is shown. That is, the -A, -B, and +-C options are ignored. If there is more than one match in a line, each +of them is shown separately. If -o is combined with -v (invert the +sense of the match to find non-matching lines), no output is generated, but the +return code is set appropriately. If the matched portion of the line is empty, +nothing is output unless the file name or line number are being printed, in +which case they are shown on an otherwise empty line. This option is mutually +exclusive with --file-offsets and --line-offsets. +
+
+-onumber, --only-matching=number
+Show only the part of the line that matched the capturing parentheses of the
+given number. Up to 32 capturing parentheses are supported, and -o0 is
+equivalent to -o without a number. Because these options can be given
+without an argument (see above), if an argument is present, it must be given in
+the same shell item, for example, -o3 or --only-matching=2. The comments given
+for the non-argument case above also apply to this case. If the specified
+capturing parentheses do not exist in the pattern, or were not set in the
+match, nothing is output unless the file name or line number are being printed.
+
+
+If this option is given multiple times, multiple substrings are output, in the
+order the options are given. For example, -o3 -o1 -o3 causes the substrings
+matched by capturing parentheses 3 and 1 and then 3 again to be output. By
+default, there is no separator (but see the next option).
+
+--om-separator=text +Specify a separating string for multiple occurrences of -o. The default +is an empty string. Separating strings are never coloured. +
++-q, --quiet +Work quietly, that is, display nothing except error messages. The exit +status indicates whether or not any matches were found. +
++-r, --recursive +If any given path is a directory, recursively scan the files it contains, +taking note of any --include and --exclude settings. By default, a +directory is read as a normal file; in some operating systems this gives an +immediate end-of-file. This option is a shorthand for setting the -d +option to "recurse". +
++--recursion-limit=number +See --match-limit above. +
++-s, --no-messages +Suppress error messages about non-existent or unreadable files. Such files are +quietly skipped. However, the return code is still 2, even if matches were +found in other files. +
++-u, --utf-8 +Operate in UTF-8 mode. This option is available only if PCRE2 has been compiled +with UTF-8 support. All patterns (including those for any --exclude and +--include options) and all subject lines that are scanned must be valid +strings of UTF-8 characters. +
++-V, --version +Write the version numbers of pcre2grep and the PCRE2 library to the +standard output and then exit. Anything else on the command line is +ignored. +
++-v, --invert-match +Invert the sense of the match, so that lines which do not match any of +the patterns are the ones that are found. +
++-w, --word-regex, --word-regexp +Force the patterns to match only whole words. This is equivalent to having \b +at the start and end of the pattern. This option applies only to the patterns +that are matched against the contents of files; it does not apply to patterns +specified by any of the --include or --exclude options. +
++-x, --line-regex, --line-regexp +Force the patterns to be anchored (each must start matching at the beginning of +a line) and in addition, require them to match entire lines. This is equivalent +to having ^ and $ characters at the start and end of each alternative branch in +every pattern. This option applies only to the patterns that are matched +against the contents of files; it does not apply to patterns specified by any +of the --include or --exclude options. +
++The environment variables LC_ALL and LC_CTYPE are examined, in that +order, for a locale. The first one that is set is used. This can be overridden +by the --locale option. If no locale is set, the PCRE2 library's default +(usually the "C" locale) is used. +
++The -N (--newline) option allows pcre2grep to scan files with +different newline conventions from the default. Any parts of the input files +that are written to the standard output are copied identically, with whatever +newline sequences they have in the input. However, the setting of this option +does not affect the interpretation of files specified by the -f, +--exclude-from, or --include-from options, which are assumed to use +the operating system's standard newline sequence, nor does it affect the way in +which pcre2grep writes informational messages to the standard error and +output streams. For these it uses the string "\n" to indicate newlines, +relying on the C I/O library to convert this to an appropriate sequence. +
++Many of the short and long forms of pcre2grep's options are the same +as in the GNU grep program. Any long option of the form +--xxx-regexp (GNU terminology) is also available as --xxx-regex +(PCRE2 terminology). However, the --file-list, --file-offsets, +--include-dir, --line-offsets, --locale, --match-limit, +-M, --multiline, -N, --newline, --om-separator, +--recursion-limit, -u, and --utf-8 options are specific to +pcre2grep, as is the use of the --only-matching option with a +capturing parentheses number. +
++Although most of the common options work the same way, a few are different in +pcre2grep. For example, the --include option's argument is a glob +for GNU grep, but a regular expression for pcre2grep. If both the +-c and -l options are given, GNU grep lists only file names, +without counts, but pcre2grep gives the counts. +
++There are four different ways in which an option with data can be specified. +If a short form option is used, the data may follow immediately, or (with one +exception) in the next command line item. For example: +
+ -f/some/file + -f /some/file ++The exception is the -o option, which may appear with or without data. +Because of this, if data is present, it must follow immediately in the same +item, for example -o3. + +
+If a long form option is used, the data may appear in the same command line +item, separated by an equals character, or (with two exceptions) it may appear +in the next command line item. For example: +
+ --file=/some/file + --file /some/file ++Note, however, that if you want to supply a file name beginning with ~ as data +in a shell command, and have the shell expand ~ to a home directory, you must +separate the file name from the option, because the shell does not treat ~ +specially unless it is at the start of an item. + +
+The exceptions to the above are the --colour (or --color) and +--only-matching options, for which the data is optional. If one of these +options does have data, it must be given in the first form, using an equals +character. Otherwise pcre2grep will assume that it has no data. +
++It is possible to supply a regular expression that takes a very long time to +fail to match certain lines. Such patterns normally involve nested indefinite +repeats, for example: (a+)*\d when matched against a line of a's with no final +digit. The PCRE2 matching function has a resource limit that causes it to abort +in these circumstances. If this happens, pcre2grep outputs an error +message and the line that caused the problem to the standard error stream. If +there are more than 20 such errors, pcre2grep gives up. +
++The --match-limit option of pcre2grep can be used to set the overall +resource limit; there is a second option called --recursion-limit that +sets a limit on the amount of memory (usually stack) that is used (see the +discussion of these options above). +
++Exit status is 0 if any matches were found, 1 if no matches were found, and 2 +for syntax errors, overlong lines, non-existent or inaccessible files (even if +matches were found in other files) or too many matching errors. Using the +-s option to suppress error messages about inaccessible files does not +affect the return code. +
++pcre2pattern(3), pcre2syntax(3), pcre2test(1). +
+
+Philip Hazel
+
+University Computing Service
+
+Cambridge CB2 3QH, England.
+
+
+Last updated: 28 September 2014
+
+Copyright © 1997-2014 University of Cambridge.
+
+
+Return to the PCRE2 index page. +
diff --git a/doc/pcre2-config.1 b/doc/pcre2-config.1 new file mode 100644 index 0000000..7fa0a09 --- /dev/null +++ b/doc/pcre2-config.1 @@ -0,0 +1,86 @@ +.TH PCRE2-CONFIG 1 "28 September 2014" "PCRE2 10.00" +.SH NAME +pcre2-config - program to return PCRE2 configuration +.SH SYNOPSIS +.rs +.sp +.nf +.B pcre2-config [--prefix] [--exec-prefix] [--version] +.B " [--libs8] [--libs16] [--libs32] [--libs-posix]" +.B " [--cflags] [--cflags-posix]" +.fi +. +. +.SH DESCRIPTION +.rs +.sp +\fBpcre2-config\fP returns the configuration of the installed PCRE2 libraries +and the options required to compile a program to use them. Some of the options +apply only to the 8-bit, or 16-bit, or 32-bit libraries, respectively, and are +not available for libraries that have not been built. If an unavailable option +is encountered, the "usage" information is output. +. +. +.SH OPTIONS +.rs +.TP 10 +\fB--prefix\fP +Writes the directory prefix used in the PCRE2 installation for architecture +independent files (\fI/usr\fP on many systems, \fI/usr/local\fP on some +systems) to the standard output. +.TP 10 +\fB--exec-prefix\fP +Writes the directory prefix used in the PCRE2 installation for architecture +dependent files (normally the same as \fB--prefix\fP) to the standard output. +.TP 10 +\fB--version\fP +Writes the version number of the installed PCRE2 libraries to the standard +output. +.TP 10 +\fB--libs8\fP +Writes to the standard output the command line options required to link +with the 8-bit PCRE2 library (\fB-lpcre2-8\fP on many systems). +.TP 10 +\fB--libs16\fP +Writes to the standard output the command line options required to link +with the 16-bit PCRE2 library (\fB-lpcre2-16\fP on many systems). +.TP 10 +\fB--libs32\fP +Writes to the standard output the command line options required to link +with the 32-bit PCRE2 library (\fB-lpcre2-32\fP on many systems). 
+.TP 10 +\fB--libs-posix\fP +Writes to the standard output the command line options required to link with +PCRE2's POSIX API wrapper library (\fB-lpcre2-posix\fP \fB-lpcre2-8\fP on many +systems). +.TP 10 +\fB--cflags\fP +Writes to the standard output the command line options required to compile +files that use PCRE2 (this may include some \fB-I\fP options, but is blank on +many systems). +.TP 10 +\fB--cflags-posix\fP +Writes to the standard output the command line options required to compile +files that use PCRE2's POSIX API wrapper library (this may include some +\fB-I\fP options, but is blank on many systems). +. +. +.SH "SEE ALSO" +.rs +.sp +\fBpcre2(3)\fP +. +. +.SH AUTHOR +.rs +.sp +This manual page was originally written by Mark Baker for the Debian GNU/Linux +system. It has been subsequently revised as a generic PCRE2 man page. +. +. +.SH REVISION +.rs +.sp +.nf +Last updated: 28 September 2014 +.fi diff --git a/doc/pcre2-config.txt b/doc/pcre2-config.txt new file mode 100644 index 0000000..8ddea2a --- /dev/null +++ b/doc/pcre2-config.txt @@ -0,0 +1,81 @@ +PCRE2-CONFIG(1) General Commands Manual PCRE2-CONFIG(1) + + + +NAME + pcre2-config - program to return PCRE2 configuration + +SYNOPSIS + + pcre2-config [--prefix] [--exec-prefix] [--version] + [--libs8] [--libs16] [--libs32] [--libs-posix] + [--cflags] [--cflags-posix] + + +DESCRIPTION + + pcre2-config returns the configuration of the installed PCRE2 libraries + and the options required to compile a program to use them. Some of the + options apply only to the 8-bit, or 16-bit, or 32-bit libraries, + respectively, and are not available for libraries that have not been + built. If an unavailable option is encountered, the "usage" information + is output. + + +OPTIONS + + --prefix Writes the directory prefix used in the PCRE2 installation + for architecture independent files (/usr on many systems, + /usr/local on some systems) to the standard output. 
+ + --exec-prefix + Writes the directory prefix used in the PCRE2 installation + for architecture dependent files (normally the same as --pre- + fix) to the standard output. + + --version Writes the version number of the installed PCRE2 libraries to + the standard output. + + --libs8 Writes to the standard output the command line options + required to link with the 8-bit PCRE2 library (-lpcre2-8 on + many systems). + + --libs16 Writes to the standard output the command line options + required to link with the 16-bit PCRE2 library (-lpcre2-16 on + many systems). + + --libs32 Writes to the standard output the command line options + required to link with the 32-bit PCRE2 library (-lpcre2-32 on + many systems). + + --libs-posix + Writes to the standard output the command line options + required to link with PCRE2's POSIX API wrapper library + (-lpcre2-posix -lpcre2-8 on many systems). + + --cflags Writes to the standard output the command line options + required to compile files that use PCRE2 (this may include + some -I options, but is blank on many systems). + + --cflags-posix + Writes to the standard output the command line options + required to compile files that use PCRE2's POSIX API wrapper + library (this may include some -I options, but is blank on + many systems). + + +SEE ALSO + + pcre2(3) + + +AUTHOR + + This manual page was originally written by Mark Baker for the Debian + GNU/Linux system. It has been subsequently revised as a generic PCRE2 + man page. 
+ + +REVISION + + Last updated: 28 September 2014 diff --git a/doc/pcre2.3 b/doc/pcre2.3 new file mode 100644 index 0000000..aaa71d3 --- /dev/null +++ b/doc/pcre2.3 @@ -0,0 +1,180 @@ +.TH PCRE2 3 "28 September 2014" "PCRE2 10.00" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH INTRODUCTION +.rs +.sp +PCRE2 is the name used for a revised API for the PCRE library, which is a set +of functions, written in C, that implement regular expression pattern matching +using the same syntax and semantics as Perl, with just a few differences. Some +features that appeared in Python and the original PCRE before they appeared in +Perl are also available using the Python syntax, there is some support for one +or two .NET and Oniguruma syntax items, and there are options for requesting +some minor changes that give better ECMAScript (aka JavaScript) compatibility. +.P +The source code for PCRE2 can be compiled to support 8-bit, 16-bit, or 32-bit +code units, which means that up to three separate libraries may be installed. +The original work to extend PCRE to 16-bit and 32-bit code units was done by +Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings +can be interpreted either as one character per code unit, or as UTF-encoded +Unicode, with support for Unicode general category properties. Unicode is +optional at build time, and must be enabled explicitly at run time. The version +of Unicode in use can be discovered by running +.sp + pcre2test -C +.P +The three libraries contain identical sets of functions, with names ending in +_8, _16, or _32, respectively (for example, \fBpcre2_compile_8()\fP). However, +by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or 32, a program that uses just +one code unit width can be written using generic names such as +\fBpcre2_compile()\fP, and the documentation is written assuming that this is +the case. 
+.P +In addition to the Perl-compatible matching function, PCRE2 contains an +alternative function that matches the same compiled patterns in a different +way. In certain circumstances, the alternative function has some advantages. +For a discussion of the two matching algorithms, see the +.\" HREF +\fBpcre2matching\fP +.\" +page. +.P +Details of exactly which Perl regular expression features are and are not +supported by PCRE2 are given in separate documents. See the +.\" HREF +\fBpcre2pattern\fP +.\" +and +.\" HREF +\fBpcre2compat\fP +.\" +pages. There is a syntax summary in the +.\" HREF +\fBpcre2syntax\fP +.\" +page. +.P +Some features of PCRE2 can be included, excluded, or changed when the library +is built. The +.\" HREF +\fBpcre2_config()\fP +.\" +function makes it possible for a client to discover which features are +available. The features themselves are described in the +.\" HREF +\fBpcre2build\fP +.\" +page. Documentation about building PCRE2 for various operating systems can be +found in the +.\" HTML +.\" +\fBREADME\fP +.\" +and +.\" HTML +.\" +\fBNON-AUTOTOOLS-BUILD\fP +.\" +files in the source distribution. +.P +The libraries contain a number of undocumented internal functions and data +tables that are used by more than one of the exported external functions, but +which are not intended for use by external callers. Their names all begin with +"_pcre2", which hopefully will not provoke any name clashes. In some +environments, it is possible to control which external symbols are exported +when a shared library is built, and in these cases the undocumented symbols are +not exported. +. +. +.SH "SECURITY CONSIDERATIONS" +.rs +.sp +If you are using PCRE2 in a non-UTF application that permits users to supply +arbitrary patterns for compilation, you should be aware of a feature that +allows users to turn on UTF support from within a pattern, provided that PCRE2 +was built with Unicode support. 
For example, an 8-bit pattern that begins with +"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings +of UTF-8 code units instead of individual 8-bit characters. This causes both +the pattern and any data against which it is matched to be checked for UTF-8 +validity. If the data string is very long, such a check might use sufficiently +many resources as to cause your application to lose performance. +.P +One way of guarding against this possibility is to use the +\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for +UTF. Alternatively, you can set the PCRE2_NEVER_UTF option at compile time. +This causes a compile-time error if a pattern contains a UTF-setting sequence. +.P +If your application is one that supports UTF, be aware that validity checking +can take time. If the same data string is to be matched many times, you can use +the PCRE2_NO_UTF_CHECK option for the second and subsequent matches to avoid +running redundant checks. +.P +Another way that performance can be hit is by running a pattern that has a very +large search tree against a string that will never match. Nested unlimited +repeats in a pattern are a common example. PCRE2 provides some protection +against this: see the \fBpcre2_set_match_limit()\fP function in the +.\" HREF +\fBpcre2api\fP +.\" +page. +. +. +.SH "USER DOCUMENTATION" +.rs +.sp +The user documentation for PCRE2 comprises a number of different sections. In +the "man" format, each of these is a separate "man page". In the HTML format, +each is a separate page, linked from the index page. In the plain text format, +the descriptions of the \fBpcre2grep\fP and \fBpcre2test\fP programs are in +files called \fBpcre2grep.txt\fP and \fBpcre2test.txt\fP, respectively. The +remaining sections, except for the \fBpcre2demo\fP section (which is a program +listing), and the short pages for individual functions, are concatenated in +\fBpcre2.txt\fP, for ease of searching. 
The sections are as follows: +.sp + pcre2 this document FIXME CHECK THIS LIST + pcre2-config show PCRE2 installation configuration information + pcre2api details of PCRE2's native C API + pcre2build building PCRE2 + pcre2callout details of the callout feature + pcre2compat discussion of Perl compatibility + pcre2demo a demonstration C program that uses PCRE2 + pcre2grep description of the \fBpcre2grep\fP command (8-bit only) + pcre2jit discussion of the just-in-time optimization support + pcre2limits details of size and other limits + pcre2matching discussion of the two matching algorithms + pcre2partial details of the partial matching facility +.\" JOIN + pcre2pattern syntax and semantics of supported + regular expressions + pcre2perform discussion of performance issues + pcre2posix the POSIX-compatible C API for the 8-bit library + pcre2sample discussion of the pcre2demo program + pcre2stack discussion of stack usage + pcre2syntax quick syntax reference + pcre2test description of the \fBpcre2test\fP testing command + pcre2unicode discussion of Unicode and UTF support +.sp +In the "man" and HTML formats, there is also a short page for each C library +function, listing its arguments and results. +. +. +.SH AUTHOR +.rs +.sp +.nf +Philip Hazel +University Computing Service +Cambridge CB2 3QH, England. +.fi +.P +Putting an actual email address here is a spam magnet. If you want to email me, +use my two initials, followed by the two digits 10, at the domain cam.ac.uk. +. +. +.SH REVISION +.rs +.sp +.nf +Last updated: 28 September 2014 +Copyright (c) 1997-2014 University of Cambridge. +.fi diff --git a/doc/pcre2build.3 b/doc/pcre2build.3 new file mode 100644 index 0000000..2146777 --- /dev/null +++ b/doc/pcre2build.3 @@ -0,0 +1,490 @@ +.TH PCRE2BUILD 3 "28 September 2014" "PCRE2 10.00" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +. +. 
+.SH "BUILDING PCRE2" +.rs +.sp +PCRE2 is distributed with a \fBconfigure\fP script that can be used to build +the library in Unix-like environments using the applications known as +Autotools. Also in the distribution are files to support building using +\fBCMake\fP instead of \fBconfigure\fP. The text file +.\" HTML +.\" +\fBREADME\fP +.\" +contains general information about building with Autotools (some of which is +repeated below), and also has some comments about building on various operating +systems. There is a lot more information about building PCRE2 without using +Autotools (including information about using \fBCMake\fP and building "by +hand") in the text file called +.\" HTML +.\" +\fBNON-AUTOTOOLS-BUILD\fP. +.\" +You should consult this file as well as the +.\" HTML +.\" +\fBREADME\fP +.\" +file if you are building in a non-Unix-like environment. +. +. +.SH "PCRE2 BUILD-TIME OPTIONS" +.rs +.sp +The rest of this document describes the optional features of PCRE2 that can be +selected when the library is compiled. It assumes use of the \fBconfigure\fP +script, where the optional features are selected or deselected by providing +options to \fBconfigure\fP before running the \fBmake\fP command. However, the +same options can be selected in both Unix-like and non-Unix-like environments +if you are using \fBCMake\fP instead of \fBconfigure\fP to build PCRE2. +.P +If you are not using Autotools or \fBCMake\fP, option selection can be done by +editing the \fBconfig.h\fP file, or by passing parameter settings to the +compiler, as described in +.\" HTML +.\" +\fBNON-AUTOTOOLS-BUILD\fP. +.\" +.P +The complete list of options for \fBconfigure\fP (which includes the standard +ones such as the selection of the installation directory) can be obtained by +running +.sp + ./configure --help +.sp +The following sections include descriptions of options whose names begin with +--enable or --disable. 
These settings specify changes to the defaults for the +\fBconfigure\fP command. Because of the way that \fBconfigure\fP works, +--enable and --disable always come in pairs, so the complementary option always +exists as well, but as it specifies the default, it is not described. +. +. +.SH "BUILDING 8-BIT, 16-BIT AND 32-BIT LIBRARIES" +.rs +.sp +By default, a library called \fBlibpcre2-8\fP is built, containing functions +that take string arguments contained in vectors of bytes, interpreted either as +single-byte characters, or UTF-8 strings. You can also build two other +libraries, called \fBlibpcre2-16\fP and \fBlibpcre2-32\fP, which process +strings that are contained in vectors of 16-bit and 32-bit code units, +respectively. These can be interpreted either as single-unit characters or +UTF-16/UTF-32 strings. To build these additional libraries, add one or both of +the following to the \fBconfigure\fP command: +.sp + --enable-pcre16 + --enable-pcre32 +.sp +If you do not want the 8-bit library, add +.sp + --disable-pcre8 +.sp +as well. At least one of the three libraries must be built. Note that the POSIX +wrapper is for the 8-bit library only, and that \fBpcre2grep\fP is an 8-bit +program. Neither of these is built if you select only the 16-bit or 32-bit +libraries. +. +. +.SH "BUILDING SHARED AND STATIC LIBRARIES" +.rs +.sp +The Autotools PCRE2 building process uses \fBlibtool\fP to build both shared +and static libraries by default. You can suppress one of these by adding one of +.sp + --disable-shared + --disable-static +.sp +to the \fBconfigure\fP command, as required. +. +. +.SH "UNICODE AND UTF SUPPORT" +.rs +.sp +To build PCRE2 with support for Unicode and UTF character strings, add +.sp + --enable-unicode +.sp +to the \fBconfigure\fP command. This setting applies to all three libraries, +adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit +library, and support for UTF-32 to the 32-bit library. 
+It is not possible to build one library with +UTF support and another without in the same configuration. +.P +Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or +UTF-32. As well as compiling PCRE2 with this option, you also have to set +the PCRE2_UTF option when you call \fBpcre2_compile()\fP to compile a pattern. +.P +If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2 +expects its input to be either ASCII or UTF-8 (depending on the run-time +option). It is not possible to support both EBCDIC and UTF-8 codes in the same +version of the library. Consequently, --enable-unicode and --enable-ebcdic are +mutually exclusive. +.P +UTF support allows the libraries to process character codepoints up to 0x10ffff +in the strings that they handle. It also provides support for accessing the +properties of such characters, using pattern escapes such as \eP, \ep, and \eX. +Only the general category properties such as \fILu\fP and \fINd\fP are +supported. Details are given in the +.\" HREF +\fBpcre2pattern\fP +.\" +documentation. +. +. +.SH "JUST-IN-TIME COMPILER SUPPORT" +.rs +.sp +Just-in-time compiler support is included in the build by specifying +.sp + --enable-jit +.sp +This support is available only for certain hardware architectures. If this +option is set for an unsupported architecture, a compile time error occurs. +See the +.\" HREF +\fBpcre2jit\fP +.\" +documentation for a discussion of JIT usage. When JIT support is enabled, +pcre2grep automatically makes use of it, unless you add +.sp + --disable-pcre2grep-jit +.sp +to the "configure" command. +. +. +.SH "CODE VALUE OF NEWLINE" +.rs +.sp +By default, PCRE2 interprets the linefeed (LF) character as indicating the end +of a line. This is the normal newline character on Unix-like systems. You can +compile PCRE2 to use carriage return (CR) instead, by adding +.sp + --enable-newline-is-cr +.sp +to the \fBconfigure\fP command. 
There is also a --enable-newline-is-lf option, +which explicitly specifies linefeed as the newline character. +.sp +Alternatively, you can specify that line endings are to be indicated by the two +character sequence CRLF. If you want this, add +.sp + --enable-newline-is-crlf +.sp +to the \fBconfigure\fP command. There is a fourth option, specified by +.sp + --enable-newline-is-anycrlf +.sp +which causes PCRE2 to recognize any of the three sequences CR, LF, or CRLF as +indicating a line ending. Finally, a fifth option, specified by +.sp + --enable-newline-is-any +.sp +causes PCRE2 to recognize any Unicode newline sequence. +.P +Whatever line ending convention is selected when PCRE2 is built can be +overridden when the library functions are called. At build time it is +conventional to use the standard for your operating system. +. +. +.SH "WHAT \eR MATCHES" +.rs +.sp +By default, the sequence \eR in a pattern matches any Unicode newline sequence, +whatever has been selected as the line ending sequence. If you specify +.sp + --enable-bsr-anycrlf +.sp +the default is changed so that \eR matches only CR, LF, or CRLF. Whatever is +selected when PCRE2 is built can be overridden when the library functions are +called. +. +. +.SH "HANDLING VERY LARGE PATTERNS" +.rs +.sp +Within a compiled pattern, offset values are used to point from one part to +another (for example, from an opening parenthesis to an alternation +metacharacter). By default, in the 8-bit and 16-bit libraries, two-byte values +are used for these offsets, leading to a maximum size for a compiled pattern of +around 64K. This is sufficient to handle all but the most gigantic patterns. +Nevertheless, some people do want to process truly enormous patterns, so it is +possible to compile PCRE2 to use three-byte or four-byte offsets by adding a +setting such as +.sp + --with-link-size=3 +.sp +to the \fBconfigure\fP command. The value given must be 2, 3, or 4. For the +16-bit library, a value of 3 is rounded up to 4. 
In these libraries, using +longer offsets slows down the operation of PCRE2 because it has to load +additional data when handling them. For the 32-bit library the value is always +4 and cannot be overridden; the value of --with-link-size is ignored. +. +. +.SH "AVOIDING EXCESSIVE STACK USAGE" +.rs +.sp +When matching with the \fBpcre2_match()\fP function, PCRE2 implements +backtracking by making recursive calls to an internal function called +\fBmatch()\fP. In environments where the size of the stack is limited, this can +severely limit PCRE2's operation. (The Unix environment does not usually suffer +from this problem, but it may sometimes be necessary to increase the maximum +stack size. There is a discussion in the +.\" HREF +\fBpcre2stack\fP +.\" +documentation.) An alternative approach to recursion that uses memory from the +heap to remember data, instead of using recursive function calls, has been +implemented to work round the problem of limited stack size. If you want to +build a version of PCRE2 that works this way, add +.sp + --disable-stack-for-recursion +.sp +to the \fBconfigure\fP command. By default, the system functions \fBmalloc()\fP +and \fBfree()\fP are called to manage the heap memory that is required, but +custom memory management functions can be called instead. PCRE2 runs noticeably +more slowly when built in this way. This option affects only the +\fBpcre2_match()\fP function; it is not relevant for \fBpcre2_dfa_match()\fP. +. +. +.SH "LIMITING PCRE2 RESOURCE USAGE" +.rs +.sp +Internally, PCRE2 has a function called \fBmatch()\fP, which it calls +repeatedly (sometimes recursively) when matching a pattern with the +\fBpcre2_match()\fP function. By controlling the maximum number of times this +function may be called during a single matching operation, a limit can be +placed on the resources used by a single call to \fBpcre2_match()\fP. The limit +can be changed at run time, as described in the +.\" HREF +\fBpcre2api\fP +.\" +documentation. 
The default is 10 million, but this can be changed by adding a +setting such as +.sp + --with-match-limit=500000 +.sp +to the \fBconfigure\fP command. This setting has no effect on the +\fBpcre2_dfa_match()\fP matching function. +.P +In some environments it is desirable to limit the depth of recursive calls of +\fBmatch()\fP more strictly than the total number of calls, in order to +restrict the maximum amount of stack (or heap, if --disable-stack-for-recursion +is specified) that is used. A second limit controls this; it defaults to the +value that is set for --with-match-limit, which imposes no additional +constraints. However, you can set a lower limit by adding, for example, +.sp + --with-match-limit-recursion=10000 +.sp +to the \fBconfigure\fP command. This value can also be overridden at run time. +. +. +.SH "CREATING CHARACTER TABLES AT BUILD TIME" +.rs +.sp +PCRE2 uses fixed tables for processing characters whose code points are less +than 256. By default, PCRE2 is built with a set of tables that are distributed +in the file \fIsrc/pcre2_chartables.c.dist\fP. These tables are for ASCII codes +only. If you add +.sp + --enable-rebuild-chartables +.sp +to the \fBconfigure\fP command, the distributed tables are no longer used. +Instead, a program called \fBdftables\fP is compiled and run. This outputs the +source for a new set of tables, created in the default locale of your C run-time +system. (This method of replacing the tables does not work if you are cross +compiling, because \fBdftables\fP is run on the local host. If you need to +create alternative tables when cross compiling, you will have to do so "by +hand".) +. +. +.SH "USING EBCDIC CODE" +.rs +.sp +PCRE2 assumes by default that it will run in an environment where the character +code is ASCII (or Unicode, which is a superset of ASCII). This is the case for +most computer operating systems. 
PCRE2 can, however, be compiled to run in an +EBCDIC environment by adding +.sp + --enable-ebcdic +.sp +to the \fBconfigure\fP command. This setting implies +--enable-rebuild-chartables. You should only use it if you know that you are in +an EBCDIC environment (for example, an IBM mainframe operating system). The +--enable-ebcdic option is incompatible with --enable-unicode. +.P +The EBCDIC character that corresponds to an ASCII LF is assumed to have the +value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In +such an environment you should use +.sp + --enable-ebcdic-nl25 +.sp +as well as, or instead of, --enable-ebcdic. The EBCDIC character for CR has the +same value as in ASCII, namely, 0x0d. Whichever of 0x15 and 0x25 is \fInot\fP +chosen as LF is made to correspond to the Unicode NEL character (which, in +Unicode, is 0x85). +.P +The options that select newline behaviour, such as --enable-newline-is-cr, +and equivalent run-time options, refer to these character values in an EBCDIC +environment. +. +. +.SH "PCRE2GREP OPTIONS FOR COMPRESSED FILE SUPPORT" +.rs +.sp +By default, \fBpcre2grep\fP reads all files as plain text. You can build it so +that it recognizes files whose names end in \fB.gz\fP or \fB.bz2\fP, and reads +them with \fBlibz\fP or \fBlibbz2\fP, respectively, by adding one or both of +.sp + --enable-pcre2grep-libz + --enable-pcre2grep-libbz2 +.sp +to the \fBconfigure\fP command. These options naturally require that the +relevant libraries are installed on your system. Configuration will fail if +they are not. +. +. +.SH "PCRE2GREP BUFFER SIZE" +.rs +.sp +\fBpcre2grep\fP uses an internal buffer to hold a "window" on the file it is +scanning, in order to be able to output "before" and "after" lines when it +finds a match. The size of the buffer is controlled by a parameter whose +default value is 20K. 
The buffer itself is three times this size, but because +of the way it is used for holding "before" lines, the longest line that is +guaranteed to be processable is the parameter size. You can change the default +parameter value by adding, for example, +.sp + --with-pcre2grep-bufsize=50K +.sp +to the \fBconfigure\fP command. The caller of \fBpcre2grep\fP can, however, +override this value by specifying a run-time option. +. +. +.SH "PCRE2TEST OPTION FOR LIBREADLINE SUPPORT" +.rs +.sp +If you add one of +.sp + --enable-pcre2test-libreadline + --enable-pcre2test-libedit +.sp +to the \fBconfigure\fP command, \fBpcre2test\fP is linked with the +\fBlibreadline\fP or \fBlibedit\fP library, respectively, and when its input is +from a terminal, it reads it using the \fBreadline()\fP function. This provides +line-editing and history facilities. Note that \fBlibreadline\fP is +GPL-licensed, so if you distribute a binary of \fBpcre2test\fP linked in this +way, there may be licensing issues. These can be avoided by linking with +\fBlibedit\fP (which has a BSD licence) instead. +.P +Setting this option causes the \fB-lreadline\fP option to be added to the +\fBpcre2test\fP build. In many operating environments with a system-installed +readline library this is sufficient. However, in some environments (e.g. if an +unmodified distribution version of readline is in use), some extra +configuration may be necessary. The INSTALL file for \fBlibreadline\fP says +this: +.sp + "Readline uses the termcap functions, but does not link with + the termcap or curses library itself, allowing applications + which link with readline the to choose an appropriate library." +.sp +If your environment has not been set up so that an appropriate library is +automatically included, you may need to add something like +.sp + LIBS="-lncurses" +.sp +immediately before the \fBconfigure\fP command. +. +. 
+.SH "DEBUGGING WITH VALGRIND SUPPORT" +.rs +.sp +By adding the +.sp + --enable-valgrind +.sp +option to the \fBconfigure\fP command, PCRE2 will use valgrind annotations +to mark certain memory regions as unaddressable. This allows it to detect +invalid memory accesses, and is mostly useful for debugging PCRE2 itself. +. +. +.SH "CODE COVERAGE REPORTING" +.rs +.sp +If your C compiler is gcc, you can build a version of PCRE2 that can generate a +code coverage report for its test suite. To enable this, you must install +\fBlcov\fP version 1.6 or above. Then specify +.sp + --enable-coverage +.sp +to the \fBconfigure\fP command and build PCRE2 in the usual way. +.P +Note that using \fBccache\fP (a caching C compiler) is incompatible with code +coverage reporting. If you have configured \fBccache\fP to run automatically +on your system, you must set the environment variable +.sp + CCACHE_DISABLE=1 +.sp +before running \fBmake\fP to build PCRE2, so that \fBccache\fP is not used. +.P +When --enable-coverage is used, the following additional targets are added to the +\fIMakefile\fP: +.sp + make coverage +.sp +This creates a fresh coverage report for the PCRE2 test suite. It is equivalent +to running "make coverage-reset", "make coverage-baseline", "make check", and +then "make coverage-report". +.sp + make coverage-reset +.sp +This zeroes the coverage counters, but does nothing else. +.sp + make coverage-baseline +.sp +This captures baseline coverage information. +.sp + make coverage-report +.sp +This creates the coverage report. +.sp + make coverage-clean-report +.sp +This removes the generated coverage report without cleaning the coverage data +itself. +.sp + make coverage-clean-data +.sp +This removes the captured coverage data without removing the coverage files +created at compile time (*.gcno). +.sp + make coverage-clean +.sp +This cleans all coverage data including the generated coverage report. 
For more +information about code coverage, see the \fBgcov\fP and \fBlcov\fP +documentation. +. +. +.SH "SEE ALSO" +.rs +.sp +\fBpcre2api\fP(3), \fBpcre2_config\fP(3). +. +. +.SH AUTHOR +.rs +.sp +.nf +Philip Hazel +University Computing Service +Cambridge CB2 3QH, England. +.fi +. +. +.SH REVISION +.rs +.sp +.nf +Last updated: 28 September 2014 +Copyright (c) 1997-2014 University of Cambridge. +.fi diff --git a/doc/pcre2compat.3 b/doc/pcre2compat.3 new file mode 100644 index 0000000..d40742d --- /dev/null +++ b/doc/pcre2compat.3 @@ -0,0 +1,190 @@ +.TH PCRE2COMPAT 3 "28 September 2014" "PCRE2 10.0" +.SH NAME +PCRE2 - Perl-compatible regular expressions (revised API) +.SH "DIFFERENCES BETWEEN PCRE2 AND PERL" +.rs +.sp +This document describes the differences in the ways that PCRE2 and Perl handle +regular expressions. The differences described here are with respect to Perl +versions 5.10 and above. +.P +1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does +have are given in the +.\" HREF +\fBpcre2unicode\fP +.\" +page. +.P +2. PCRE2 allows repeat quantifiers only on parenthesized assertions, but they +do not mean what you might think. For example, (?!a){3} does not assert that +the next three characters are not "a". It just asserts that the next character +is not "a" three times (in principle: PCRE2 optimizes this to run the assertion +just once). Perl allows repeat quantifiers on other assertions such as \eb, but +these do not seem to have any use. +.P +3. Capturing subpatterns that occur inside negative lookahead assertions are +counted, but their entries in the offsets vector are never set. Perl sometimes +(but not always) sets its numerical variables from inside negative assertions. +.P +4. The following Perl escape sequences are not supported: \el, \eu, \eL, +\eU, and \eN when followed by a character name or Unicode value. (\eN on its +own, matching a non-newline character, is supported.) 
In fact these are +implemented by Perl's general string-handling and are not part of its pattern +matching engine. If any of these are encountered by PCRE2, an error is +generated by default. However, if the PCRE2_ALT_BSUX option is set, +\eU and \eu are interpreted as ECMAScript interprets them. +.P +5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is +built with Unicode support. The properties that can be tested with \ep and \eP +are limited to the general category properties such as Lu and Nd, script names +such as Greek or Han, and the derived properties Any and L&. PCRE2 does support +the Cs (surrogate) property, which Perl does not; the Perl documentation says +"Because Perl hides the need for the user to understand the internal +representation of Unicode characters, there is no need to implement the +somewhat messy concept of surrogates." +.P +6. PCRE2 does support the \eQ...\eE escape for quoting substrings. Characters +in between are treated as literals. This is slightly different from Perl in +that $ and @ are also handled as literals inside the quotes. In Perl, they +cause variable interpolation (but of course PCRE2 does not have variables). +Note the following examples: +.sp + Pattern PCRE2 matches Perl matches +.sp +.\" JOIN + \eQabc$xyz\eE abc$xyz abc followed by the + contents of $xyz + \eQabc\e$xyz\eE abc\e$xyz abc\e$xyz + \eQabc\eE\e$\eQxyz\eE abc$xyz abc$xyz +.sp +The \eQ...\eE sequence is recognized both inside and outside character classes. +.P +7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) +constructions. However, there is support for recursive patterns. This is not +available in Perl 5.8, but it is in Perl 5.10. Also, the PCRE2 "callout" +feature allows an external function to be called during pattern matching. See +the +.\" HREF +\fBpcre2callout\fP +.\" +documentation for details. +.P +8. 
Subpatterns that are called as subroutines (whether or not recursively) are +always treated as atomic groups in PCRE2. This is like Python, but unlike Perl. +Captured values that are set outside a subroutine call can be referenced from +inside in PCRE2, but not in Perl. There is a discussion that explains these +differences in more detail in the +.\" HTML +.\" +section on recursion differences from Perl +.\" +in the +.\" HREF +\fBpcre2pattern\fP +.\" +page. +.P +9. If any of the backtracking control verbs are used in a subpattern that is +called as a subroutine (whether or not recursively), their effect is confined +to that subpattern; it does not extend to the surrounding pattern. This is not +always the case in Perl. In particular, if (*THEN) is present in a group that +is called as a subroutine, its action is limited to that group, even if the +group does not contain any | characters. Note that such subpatterns are +processed as anchored at the point where they are tested. +.P +10. If a pattern contains more than one backtracking control verb, the first +one that is backtracked onto acts. For example, in the pattern +A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C +triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the +same as PCRE2, but there are examples where it differs. +.P +11. Most backtracking verbs in assertions have their normal actions. They are +not confined to the assertion. +.P +12. There are some differences that are concerned with the settings of captured +strings when part of a pattern is repeated. For example, matching "aba" against +the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to +"b". +.P +13. PCRE2's handling of duplicate subpattern numbers and duplicate subpattern +names is not as general as Perl's. This is a consequence of the fact that PCRE2 +works internally just with numbers, using an external table to translate +between numbers and names. 
In particular, a pattern such as (?|(?A)|(?\fP. When there is more than one pattern +(specified by the use of \fB-e\fP and/or \fB-f\fP), each pattern is applied to +each line in the order in which they are defined, except that all the \fB-e\fP +patterns are tried before the \fB-f\fP patterns. +.P +By default, as soon as one pattern matches a line, no further patterns are +considered. However, if \fB--colour\fP (or \fB--color\fP) is used to colour the +matching substrings, or if \fB--only-matching\fP, \fB--file-offsets\fP, or +\fB--line-offsets\fP is used to output only the part of the line that matched +(either shown literally, or as an offset), scanning resumes immediately +following the match, so that further matches on the same line can be found. If +there are multiple patterns, they are all tried on the remainder of the line, +but patterns that follow the one that matched are not tried on the earlier part +of the line. +.P +This behaviour means that the order in which multiple patterns are specified +can affect the output when one of the above options is used. This is no longer +the same behaviour as GNU grep, which now manages to display earlier matches +for later patterns (as long as there is no overlap). +.P +Patterns that can match an empty string are accepted, but empty string +matches are never recognized. An example is the pattern "(super)?(man)?", in +which all components are optional. This pattern finds all occurrences of both +"super" and "man"; the output differs from matching with "super|man" when only +the matching substrings are being shown. +.P +If the \fBLC_ALL\fP or \fBLC_CTYPE\fP environment variable is set, +\fBpcre2grep\fP uses the value to set a locale when calling the PCRE2 library. +The \fB--locale\fP option can be used to override this. +. +. 
+.SH "SUPPORT FOR COMPRESSED FILES" +.rs +.sp +It is possible to compile \fBpcre2grep\fP so that it uses \fBlibz\fP or +\fBlibbz2\fP to read files whose names end in \fB.gz\fP or \fB.bz2\fP, +respectively. You can find out whether your binary has support for one or both +of these file types by running it with the \fB--help\fP option. If the +appropriate support is not present, files are treated as plain text. The +standard input is always so treated. +. +. +.SH "BINARY FILES" +.rs +.sp +By default, a file that contains a binary zero byte within the first 1024 bytes +is identified as a binary file, and is processed specially. (GNU grep also +identifies binary files in this manner.) See the \fB--binary-files\fP option +for a means of changing the way binary files are handled. +. +. +.SH OPTIONS +.rs +.sp +The order in which some of the options appear can affect the output. For +example, both the \fB-h\fP and \fB-l\fP options affect the printing of file +names. Whichever comes later in the command line will be the one that takes +effect. Similarly, except where noted below, if an option is given twice, the +later setting is used. Numerical values for options may be followed by K or M, +to signify multiplication by 1024 or 1024*1024 respectively. +.TP 10 +\fB--\fP +This terminates the list of options. It is useful if the next item on the +command line starts with a hyphen but is not an option. This allows for the +processing of patterns and filenames that start with hyphens. +.TP +\fB-A\fP \fInumber\fP, \fB--after-context=\fP\fInumber\fP +Output \fInumber\fP lines of context after each matching line. If filenames +and/or line numbers are being output, a hyphen separator is used instead of a +colon for the context lines. A line containing "--" is output between each +group of lines, unless they are in fact contiguous in the input file. The value +of \fInumber\fP is expected to be relatively small. 
However, \fBpcre2grep\fP +guarantees to have up to 8K of following text available for context output. +.TP +\fB-a\fP, \fB--text\fP +Treat binary files as text. This is equivalent to +\fB--binary-files\fP=\fItext\fP. +.TP +\fB-B\fP \fInumber\fP, \fB--before-context=\fP\fInumber\fP +Output \fInumber\fP lines of context before each matching line. If filenames +and/or line numbers are being output, a hyphen separator is used instead of a +colon for the context lines. A line containing "--" is output between each +group of lines, unless they are in fact contiguous in the input file. The value +of \fInumber\fP is expected to be relatively small. However, \fBpcre2grep\fP +guarantees to have up to 8K of preceding text available for context output. +.TP +\fB--binary-files=\fP\fIword\fP +Specify how binary files are to be processed. If the word is "binary" (the +default), pattern matching is performed on binary files, but the only output is +"Binary file