Make --enable-unicode the default.
This commit is contained in:
parent
08e3107cbe
commit
44ef2c3401
|
@ -149,7 +149,7 @@ SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
|
||||||
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
|
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
|
||||||
"Enable use of Just-in-time compiling in pcre2grep.")
|
"Enable use of Just-in-time compiling in pcre2grep.")
|
||||||
|
|
||||||
SET(PCRE2_SUPPORT_UNICODE OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
|
||||||
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
|
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
|
||||||
|
|
||||||
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||||
|
|
12
ChangeLog
12
ChangeLog
|
@ -14,19 +14,21 @@ logged. In addition to the API changes, the following changes were made. They
|
||||||
are either new functionality, or bug fixes and other noticeable changes of
|
are either new functionality, or bug fixes and other noticeable changes of
|
||||||
behaviour that were implemented after the code had been forked.
|
behaviour that were implemented after the code had been forked.
|
||||||
|
|
||||||
1. The test program, now called pcre2test, was re-specified and almost
|
1. Unicode support is now enabled by default.
|
||||||
|
|
||||||
|
2. The test program, now called pcre2test, was re-specified and almost
|
||||||
completely re-written. Its input is not compatible with input for pcretest.
|
completely re-written. Its input is not compatible with input for pcretest.
|
||||||
|
|
||||||
2. Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the
|
3. Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the
|
||||||
PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is
|
PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is
|
||||||
matched by that pattern.
|
matched by that pattern.
|
||||||
|
|
||||||
3. For the benefit of those who use PCRE2 via some other application, that is,
|
4. For the benefit of those who use PCRE2 via some other application, that is,
|
||||||
not writing the function calls themselves, it is possible to check the PCRE2
|
not writing the function calls themselves, it is possible to check the PCRE2
|
||||||
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
||||||
string such as "yesno".
|
string such as "yesno".
|
||||||
|
|
||||||
4. There are case-equivalent Unicode characters whose encodings use different
|
5. There are case-equivalent Unicode characters whose encodings use different
|
||||||
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
|
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
|
||||||
theoretically possible for this to happen in UTF-16 too.) If a backreference to
|
theoretically possible for this to happen in UTF-16 too.) If a backreference to
|
||||||
a group containing one of these characters was greedily repeated, and during
|
a group containing one of these characters was greedily repeated, and during
|
||||||
|
@ -38,7 +40,7 @@ Incorrect backtracking meant that group 2 captured only the last two bytes.
|
||||||
This bug has been fixed; the new code is slower, but it is used only when the
|
This bug has been fixed; the new code is slower, but it is used only when the
|
||||||
strings matched by the repetition are not all the same length.
|
strings matched by the repetition are not all the same length.
|
||||||
|
|
||||||
5. A pattern such as /()a/ was not setting the "first character must be 'a'"
|
6. A pattern such as /()a/ was not setting the "first character must be 'a'"
|
||||||
information. This applied to any pattern with a group that matched no
|
information. This applied to any pattern with a group that matched no
|
||||||
characters, for example: /(?:(?=.)|(?<!x))a/.
|
characters, for example: /(?:(?=.)|(?<!x))a/.
|
||||||
|
|
||||||
|
|
|
@ -325,7 +325,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
||||||
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||||
|
|
||||||
8. The GUI will then list several configuration options. This is where
|
8. The GUI will then list several configuration options. This is where
|
||||||
you can enable Unicode support or other PCRE2 optional features.
|
you can disable Unicode support or select other PCRE2 optional features.
|
||||||
|
|
||||||
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||||
active.
|
active.
|
||||||
|
@ -399,4 +399,4 @@ The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
|
||||||
course.
|
course.
|
||||||
|
|
||||||
==========================
|
==========================
|
||||||
Last Updated: 28 September 2014
|
Last Updated: 03 November 2014
|
||||||
|
|
36
README
36
README
|
@ -179,24 +179,22 @@ library. They are also documented in the pcre2build man page.
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
|
||||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
. If you do not want to make use of the support for UTF-8 Unicode character
|
||||||
the 8-bit library, UTF-16 Unicode character strings in the 16-bit library,
|
strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit
|
||||||
and UTF-32 Unicode character strings in the 32-bit library, you must add
|
library, and UTF-32 Unicode character strings in the 32-bit library, you can
|
||||||
--enable-unicode to the "configure" command. Without it, the code for
|
add --disable-unicode to the "configure" command. This reduces the size of
|
||||||
handling UTF-8, UTF-16 and UTF-8 is not included. It is not possible to
|
the libraries. It is not possible to configure one library with Unicode
|
||||||
configure one library with UTF support and the other without in the same
|
support, and another without, in the same configuration.
|
||||||
configuration.
|
|
||||||
|
|
||||||
Even when --enable-unicode is included, the use of a UTF encoding still has
|
When Unicode support is available, the use of a UTF encoding still has to be
|
||||||
to be enabled by an option at run time. When PCRE2 is compiled with this
|
enabled by an option at run time. When PCRE2 is compiled with Unicode
|
||||||
option, its input can only either be ASCII or UTF-8/16/32, even when running
|
support, its input can only be either ASCII or UTF-8/16/32, even when running
|
||||||
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
||||||
--enable-ebcdic at the same time.
|
--enable-ebcdic at the same time.
|
||||||
|
|
||||||
When --enable-unicode is specified, as well as supporting UTF strings, PCRE2
|
As well as supporting UTF strings, Unicode support includes support for the
|
||||||
includes support for the \P, \p, and \X sequences that recognize Unicode
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
character properties. However, only the basic two-letter properties such as
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
Lu are supported.
|
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||||
|
@ -285,7 +283,7 @@ library. They are also documented in the pcre2build man page.
|
||||||
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
||||||
character code (as opposed to ASCII/Unicode) by specifying
|
character code (as opposed to ASCII/Unicode) by specifying
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||||
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
||||||
|
@ -543,8 +541,8 @@ from pcre2test. Other files whose names begin with "test" are used as working
|
||||||
files in some tests.
|
files in some tests.
|
||||||
|
|
||||||
Some tests are relevant only when certain build-time options were selected. For
|
Some tests are relevant only when certain build-time options were selected. For
|
||||||
example, the tests for UTF-8/16/32 support are run only if --enable-unicode was
|
example, the tests for UTF-8/16/32 features are run only when Unicode support
|
||||||
used. RunTest outputs a comment when it skips a test.
|
is available. RunTest outputs a comment when it skips a test.
|
||||||
|
|
||||||
Many of the tests that are not skipped are run twice if JIT support is
|
Many of the tests that are not skipped are run twice if JIT support is
|
||||||
available. On the second run, JIT compilation is forced. This testing can be
|
available. On the second run, JIT compilation is forced. This testing can be
|
||||||
|
@ -633,7 +631,7 @@ JIT-specific features such as information output from pcre2test about JIT
|
||||||
compilation.
|
compilation.
|
||||||
|
|
||||||
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
||||||
POSIX interface to the 8-bit library, withouth and with Unicode support,
|
POSIX interface to the 8-bit library, without and with Unicode support,
|
||||||
respectively.
|
respectively.
|
||||||
|
|
||||||
|
|
||||||
|
@ -828,4 +826,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 25 October 2014
|
Last updated: 03 November 2014
|
||||||
|
|
10
configure.ac
10
configure.ac
|
@ -148,10 +148,10 @@ AC_ARG_ENABLE(rebuild-chartables,
|
||||||
[rebuild character tables in current locale]),
|
[rebuild character tables in current locale]),
|
||||||
, enable_rebuild_chartables=no)
|
, enable_rebuild_chartables=no)
|
||||||
|
|
||||||
# Handle --enable-unicode (disabled by default)
|
# Handle --disable-unicode (enabled by default)
|
||||||
AC_ARG_ENABLE(unicode,
|
AC_ARG_ENABLE(unicode,
|
||||||
AS_HELP_STRING([--enable-unicode],
|
AS_HELP_STRING([--disable-unicode],
|
||||||
[enable Unicode support (incompatible with --enable-ebcdic)]),
|
[disable Unicode support]),
|
||||||
, enable_unicode=unset)
|
, enable_unicode=unset)
|
||||||
|
|
||||||
# Handle newline options
|
# Handle newline options
|
||||||
|
@ -299,10 +299,10 @@ then
|
||||||
AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled])
|
AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# enable_unicode is disabled by default.
|
# Unicode is enabled by default.
|
||||||
if test "x$enable_unicode" = "xunset"
|
if test "x$enable_unicode" = "xunset"
|
||||||
then
|
then
|
||||||
enable_unicode=no
|
enable_unicode=yes
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Convert the newline identifier into the appropriate integer value. These must
|
# Convert the newline identifier into the appropriate integer value. These must
|
||||||
|
|
|
@ -325,7 +325,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
||||||
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||||
|
|
||||||
8. The GUI will then list several configuration options. This is where
|
8. The GUI will then list several configuration options. This is where
|
||||||
you can enable Unicode support or other PCRE2 optional features.
|
you can disable Unicode support or select other PCRE2 optional features.
|
||||||
|
|
||||||
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||||
active.
|
active.
|
||||||
|
@ -399,4 +399,4 @@ The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
|
||||||
course.
|
course.
|
||||||
|
|
||||||
==========================
|
==========================
|
||||||
Last Updated: 28 September 2014
|
Last Updated: 03 November 2014
|
||||||
|
|
|
@ -179,24 +179,22 @@ library. They are also documented in the pcre2build man page.
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
|
||||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
. If you do not want to make use of the support for UTF-8 Unicode character
|
||||||
the 8-bit library, UTF-16 Unicode character strings in the 16-bit library,
|
strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit
|
||||||
and UTF-32 Unicode character strings in the 32-bit library, you must add
|
library, and UTF-32 Unicode character strings in the 32-bit library, you can
|
||||||
--enable-unicode to the "configure" command. Without it, the code for
|
add --disable-unicode to the "configure" command. This reduces the size of
|
||||||
handling UTF-8, UTF-16 and UTF-8 is not included. It is not possible to
|
the libraries. It is not possible to configure one library with Unicode
|
||||||
configure one library with UTF support and the other without in the same
|
support, and another without, in the same configuration.
|
||||||
configuration.
|
|
||||||
|
|
||||||
Even when --enable-unicode is included, the use of a UTF encoding still has
|
When Unicode support is available, the use of a UTF encoding still has to be
|
||||||
to be enabled by an option at run time. When PCRE2 is compiled with this
|
enabled by an option at run time. When PCRE2 is compiled with Unicode
|
||||||
option, its input can only either be ASCII or UTF-8/16/32, even when running
|
support, its input can only be either ASCII or UTF-8/16/32, even when running
|
||||||
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
||||||
--enable-ebcdic at the same time.
|
--enable-ebcdic at the same time.
|
||||||
|
|
||||||
When --enable-unicode is specified, as well as supporting UTF strings, PCRE2
|
As well as supporting UTF strings, Unicode support includes support for the
|
||||||
includes support for the \P, \p, and \X sequences that recognize Unicode
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
character properties. However, only the basic two-letter properties such as
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
Lu are supported.
|
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||||
|
@ -285,7 +283,7 @@ library. They are also documented in the pcre2build man page.
|
||||||
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
||||||
character code (as opposed to ASCII/Unicode) by specifying
|
character code (as opposed to ASCII/Unicode) by specifying
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||||
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
||||||
|
@ -543,8 +541,8 @@ from pcre2test. Other files whose names begin with "test" are used as working
|
||||||
files in some tests.
|
files in some tests.
|
||||||
|
|
||||||
Some tests are relevant only when certain build-time options were selected. For
|
Some tests are relevant only when certain build-time options were selected. For
|
||||||
example, the tests for UTF-8/16/32 support are run only if --enable-unicode was
|
example, the tests for UTF-8/16/32 features are run only when Unicode support
|
||||||
used. RunTest outputs a comment when it skips a test.
|
is available. RunTest outputs a comment when it skips a test.
|
||||||
|
|
||||||
Many of the tests that are not skipped are run twice if JIT support is
|
Many of the tests that are not skipped are run twice if JIT support is
|
||||||
available. On the second run, JIT compilation is forced. This testing can be
|
available. On the second run, JIT compilation is forced. This testing can be
|
||||||
|
@ -633,7 +631,7 @@ JIT-specific features such as information output from pcre2test about JIT
|
||||||
compilation.
|
compilation.
|
||||||
|
|
||||||
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
||||||
POSIX interface to the 8-bit library, withouth and with Unicode support,
|
POSIX interface to the 8-bit library, without and with Unicode support,
|
||||||
respectively.
|
respectively.
|
||||||
|
|
||||||
|
|
||||||
|
@ -828,4 +826,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 25 October 2014
|
Last updated: 03 November 2014
|
||||||
|
|
|
@ -35,9 +35,10 @@ code units, which means that up to three separate libraries may be installed.
|
||||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
Unicode, with support for Unicode general category properties. Unicode is
|
Unicode, with support for Unicode general category properties. Unicode support
|
||||||
optional at build time, and must be enabled explicitly at run time. The version
|
is optional at build time (but is the default); however, processing strings as
|
||||||
of Unicode in use can be discovered by running
|
UTF code units must be enabled explicitly at run time. The version of Unicode
|
||||||
|
in use can be discovered by running
|
||||||
<pre>
|
<pre>
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
</PRE>
|
</PRE>
|
||||||
|
@ -95,13 +96,13 @@ not exported.
|
||||||
<P>
|
<P>
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to supply
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
arbitrary patterns for compilation, you should be aware of a feature that
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
allows users to turn on UTF support from within a pattern. For example, an
|
||||||
was built with Unicode support. For example, an 8-bit pattern that begins with
|
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets
|
||||||
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
patterns and subjects as strings of UTF-8 code units instead of individual
|
||||||
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
8-bit characters. This causes both the pattern and any data against which it is
|
||||||
the pattern and any data against which it is matched to be checked for UTF-8
|
matched to be checked for UTF-8 validity. If the data string is very long, such
|
||||||
validity. If the data string is very long, such a check might use sufficiently
|
a check might use sufficiently many resources as to cause your application to
|
||||||
many resources as to cause your application to lose performance.
|
lose performance.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
One way of guarding against this possibility is to use the
|
One way of guarding against this possibility is to use the
|
||||||
|
@ -173,7 +174,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -115,27 +115,24 @@ to the <b>configure</b> command, as required.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">Unicode and UTF SUPPORT</a><br>
|
<br><a name="SEC5" href="#TOC1">Unicode and UTF SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character strings.
|
||||||
|
To build it without Unicode support, add
|
||||||
<pre>
|
<pre>
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. This setting applies to all three libraries,
|
to the <b>configure</b> command. This setting applies to all three libraries. It
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit
|
is not possible to build one library with Unicode support, and another without,
|
||||||
library, and support for UTF-32 to the to the 32-bit library.
|
in the same configuration.
|
||||||
It is not possible to build one library with
|
|
||||||
UTF support and another without in the same configuration.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||||
UTF-32. As well as compiling PCRE2 with this option, you also have have to set
|
or UTF-32. To do that you have to set the PCRE2_UTF option when you call
|
||||||
the PCRE2_UTF option when you call <b>pcre2_compile()</b> to compile a pattern.
|
<b>pcre2_compile()</b> to compile a pattern.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2
|
It is not possible to support both EBCDIC and UTF-8 codes in the same version
|
||||||
expects its input to be either ASCII or UTF-8 (depending on the run-time
|
of the library. Consequently, --enable-unicode and --enable-ebcdic are mutually
|
||||||
option). It is not possible to support both EBCDIC and UTF-8 codes in the same
|
exclusive.
|
||||||
version of the library. Consequently, --enable-unicode and --enable-ebcdic are
|
|
||||||
mutually exclusive.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
||||||
|
@ -301,12 +298,12 @@ code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||||
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
||||||
EBCDIC environment by adding
|
EBCDIC environment by adding
|
||||||
<pre>
|
<pre>
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. This setting implies
|
to the <b>configure</b> command. This setting implies
|
||||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
||||||
|
@ -469,7 +466,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -89,11 +89,11 @@ In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as
|
||||||
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
||||||
specified for the 32-bit library, in which case it constrains the character
|
specified for the 32-bit library, in which case it constrains the character
|
||||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||||
built to include Unicode support. When using UTF strings you must either call
|
built to include Unicode support (which is the default). When using UTF strings
|
||||||
the compiling function with the PCRE2_UTF option, or the pattern must start
|
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||||
with the special sequence (*UTF), which is equivalent to setting the relevant
|
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||||
option. How setting a UTF mode affects pattern matching is mentioned in several
|
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||||
places below. There is also a summary of features in the
|
mentioned in several places below. There is also a summary of features in the
|
||||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||||
page.
|
page.
|
||||||
</P>
|
</P>
|
||||||
|
@ -538,9 +538,9 @@ By default, characters whose code points are greater than 127 never match \d,
|
||||||
\s, or \w, and always match \D, \S, and \W, although this may vary for
|
\s, or \w, and always match \D, \S, and \W, although this may vary for
|
||||||
characters in the range 128-255 when locale-specific matching is happening.
|
characters in the range 128-255 when locale-specific matching is happening.
|
||||||
These escape sequences retain their original meanings from before Unicode
|
These escape sequences retain their original meanings from before Unicode
|
||||||
support was available, mainly for efficiency reasons. If PCRE2 is compiled with
|
support was available, mainly for efficiency reasons. If the PCRE2_UCP option
|
||||||
Unicode support, and the PCRE2_UCP option is set, the behaviour is changed so
|
is set, the behaviour is changed so that Unicode properties are used to
|
||||||
that Unicode properties are used to determine character types, as follows:
|
determine character types, as follows:
|
||||||
<pre>
|
<pre>
|
||||||
\d any character that matches \p{Nd} (decimal digit)
|
\d any character that matches \p{Nd} (decimal digit)
|
||||||
\s any character that matches \p{Z} or \h or \v
|
\s any character that matches \p{Z} or \h or \v
|
||||||
|
@ -641,11 +641,11 @@ an error.
|
||||||
Unicode character properties
|
Unicode character properties
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is built with Unicode support, three additional escape sequences
|
When PCRE2 is built with Unicode support (the default), three additional escape
|
||||||
that match characters with specific properties are available. In 8-bit
|
sequences that match characters with specific properties are available. In
|
||||||
non-UTF-8 mode, these sequences are of course limited to testing characters
|
8-bit non-UTF-8 mode, these sequences are of course limited to testing
|
||||||
whose codepoints are less than 256, but they do work in this mode. The extra
|
characters whose codepoints are less than 256, but they do work in this mode.
|
||||||
escape sequences are:
|
The extra escape sequences are:
|
||||||
<pre>
|
<pre>
|
||||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||||
|
@ -3193,7 +3193,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 19 October 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -16,11 +16,12 @@ please consult the man page, in case the conversion went wrong.
|
||||||
UNICODE AND UTF SUPPORT
|
UNICODE AND UTF SUPPORT
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
|
knowledge of Unicode character properties and can process text strings in
|
||||||
format (depending on the code unit width). By default, PCRE2 assumes that one
|
UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
|
||||||
code unit is one character. To process a pattern as a UTF string, where a
|
default, PCRE2 assumes that one code unit is one character. To process a
|
||||||
character may require more than one code unit, you must call
|
pattern as a UTF string, where a character may require more than one code unit,
|
||||||
|
you must call
|
||||||
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
||||||
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
(*UTF). When either of these is the case, both the pattern and any subject
|
(*UTF). When either of these is the case, both the pattern and any subject
|
||||||
|
@ -28,9 +29,8 @@ strings that are matched against it are treated as UTF strings instead of
|
||||||
strings of individual one-code-unit characters.
|
strings of individual one-code-unit characters.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger, but the
|
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||||
additional run time overhead is limited to testing the PCRE2_UTF flag
|
case the library will be smaller.
|
||||||
occasionally, so should not be very much.
|
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
UNICODE PROPERTY SUPPORT
|
UNICODE PROPERTY SUPPORT
|
||||||
|
@ -261,7 +261,7 @@ Cambridge CB2 3QH, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
25
doc/pcre2.3
25
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2 3 "28 September 2014" "PCRE2 10.00"
|
.TH PCRE2 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH INTRODUCTION
|
.SH INTRODUCTION
|
||||||
|
@ -17,9 +17,10 @@ code units, which means that up to three separate libraries may be installed.
|
||||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
Unicode, with support for Unicode general category properties. Unicode is
|
Unicode, with support for Unicode general category properties. Unicode support
|
||||||
optional at build time, and must be enabled explicitly at run time. The version
|
is optional at build time (but is the default); however, processing strings as
|
||||||
of Unicode in use can be discovered by running
|
UTF code units must be enabled explicitly at run time. The version of Unicode
|
||||||
|
in use can be discovered by running
|
||||||
.sp
|
.sp
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
.P
|
.P
|
||||||
|
@ -91,13 +92,13 @@ not exported.
|
||||||
.sp
|
.sp
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to supply
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
arbitrary patterns for compilation, you should be aware of a feature that
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
allows users to turn on UTF support from within a pattern. For example, an
|
||||||
was built with Unicode support. For example, an 8-bit pattern that begins with
|
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets
|
||||||
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
patterns and subjects as strings of UTF-8 code units instead of individual
|
||||||
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
8-bit characters. This causes both the pattern and any data against which it is
|
||||||
the pattern and any data against which it is matched to be checked for UTF-8
|
matched to be checked for UTF-8 validity. If the data string is very long, such
|
||||||
validity. If the data string is very long, such a check might use sufficiently
|
a check might use sufficiently many resources as to cause your application to
|
||||||
many resources as to cause your application to lose performance.
|
lose performance.
|
||||||
.P
|
.P
|
||||||
One way of guarding against this possibility is to use the
|
One way of guarding against this possibility is to use the
|
||||||
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
||||||
|
@ -175,6 +176,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
306
doc/pcre2.txt
306
doc/pcre2.txt
|
@ -32,87 +32,88 @@ INTRODUCTION
|
||||||
code units was done by Zoltan Herczeg and Christian Persch, respec-
|
code units was done by Zoltan Herczeg and Christian Persch, respec-
|
||||||
tively. In all three cases, strings can be interpreted either as one
|
tively. In all three cases, strings can be interpreted either as one
|
||||||
character per code unit, or as UTF-encoded Unicode, with support for
|
character per code unit, or as UTF-encoded Unicode, with support for
|
||||||
Unicode general category properties. Unicode is optional at build time,
|
Unicode general category properties. Unicode support is optional at
|
||||||
and must be enabled explicitly at run time. The version of Unicode in
|
build time (but is the default); however, processing strings as UTF
|
||||||
use can be discovered by running
|
code units must be enabled explicitly at run time. The version of Uni-
|
||||||
|
code in use can be discovered by running
|
||||||
|
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
|
|
||||||
The three libraries contain identical sets of functions, with names
|
The three libraries contain identical sets of functions, with names
|
||||||
ending in _8, _16, or _32, respectively (for example, pcre2_com-
|
ending in _8, _16, or _32, respectively (for example, pcre2_com-
|
||||||
pile_8()). However, by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or
|
pile_8()). However, by defining PCRE2_CODE_UNIT_WIDTH to be 8, 16, or
|
||||||
32, a program that uses just one code unit width can be written using
|
32, a program that uses just one code unit width can be written using
|
||||||
generic names such as pcre2_compile(), and the documentation is written
|
generic names such as pcre2_compile(), and the documentation is written
|
||||||
assuming that this is the case.
|
assuming that this is the case.
|
||||||
|
|
||||||
In addition to the Perl-compatible matching function, PCRE2 contains an
|
In addition to the Perl-compatible matching function, PCRE2 contains an
|
||||||
alternative function that matches the same compiled patterns in a dif-
|
alternative function that matches the same compiled patterns in a dif-
|
||||||
ferent way. In certain circumstances, the alternative function has some
|
ferent way. In certain circumstances, the alternative function has some
|
||||||
advantages. For a discussion of the two matching algorithms, see the
|
advantages. For a discussion of the two matching algorithms, see the
|
||||||
pcre2matching page.
|
pcre2matching page.
|
||||||
|
|
||||||
Details of exactly which Perl regular expression features are and are
|
Details of exactly which Perl regular expression features are and are
|
||||||
not supported by PCRE2 are given in separate documents. See the
|
not supported by PCRE2 are given in separate documents. See the
|
||||||
pcre2pattern and pcre2compat pages. There is a syntax summary in the
|
pcre2pattern and pcre2compat pages. There is a syntax summary in the
|
||||||
pcre2syntax page.
|
pcre2syntax page.
|
||||||
|
|
||||||
Some features of PCRE2 can be included, excluded, or changed when the
|
Some features of PCRE2 can be included, excluded, or changed when the
|
||||||
library is built. The pcre2_config() function makes it possible for a
|
library is built. The pcre2_config() function makes it possible for a
|
||||||
client to discover which features are available. The features them-
|
client to discover which features are available. The features them-
|
||||||
selves are described in the pcre2build page. Documentation about build-
|
selves are described in the pcre2build page. Documentation about build-
|
||||||
ing PCRE2 for various operating systems can be found in the README and
|
ing PCRE2 for various operating systems can be found in the README and
|
||||||
NON-AUTOTOOLS_BUILD files in the source distribution.
|
NON-AUTOTOOLS_BUILD files in the source distribution.
|
||||||
|
|
||||||
The libraries contain a number of undocumented internal functions and
|
The libraries contain a number of undocumented internal functions and
|
||||||
data tables that are used by more than one of the exported external
|
data tables that are used by more than one of the exported external
|
||||||
functions, but which are not intended for use by external callers.
|
functions, but which are not intended for use by external callers.
|
||||||
Their names all begin with "_pcre2", which hopefully will not provoke
|
Their names all begin with "_pcre2", which hopefully will not provoke
|
||||||
any name clashes. In some environments, it is possible to control which
|
any name clashes. In some environments, it is possible to control which
|
||||||
external symbols are exported when a shared library is built, and in
|
external symbols are exported when a shared library is built, and in
|
||||||
these cases the undocumented symbols are not exported.
|
these cases the undocumented symbols are not exported.
|
||||||
|
|
||||||
|
|
||||||
SECURITY CONSIDERATIONS
|
SECURITY CONSIDERATIONS
|
||||||
|
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to
|
If you are using PCRE2 in a non-UTF application that permits users to
|
||||||
supply arbitrary patterns for compilation, you should be aware of a
|
supply arbitrary patterns for compilation, you should be aware of a
|
||||||
feature that allows users to turn on UTF support from within a pattern,
|
feature that allows users to turn on UTF support from within a pattern.
|
||||||
provided that PCRE2 was built with Unicode support. For example, an
|
For example, an 8-bit pattern that begins with "(*UTF)" turns on UTF-8
|
||||||
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which
|
mode, which interprets patterns and subjects as strings of UTF-8 code
|
||||||
interprets patterns and subjects as strings of UTF-8 code units instead
|
units instead of individual 8-bit characters. This causes both the pat-
|
||||||
of individual 8-bit characters. This causes both the pattern and any
|
tern and any data against which it is matched to be checked for UTF-8
|
||||||
data against which it is matched to be checked for UTF-8 validity. If
|
validity. If the data string is very long, such a check might use suf-
|
||||||
the data string is very long, such a check might use sufficiently many
|
ficiently many resources as to cause your application to lose perfor-
|
||||||
resources as to cause your application to lose performance.
|
mance.
|
||||||
|
|
||||||
One way of guarding against this possibility is to use the pcre2_pat-
|
One way of guarding against this possibility is to use the pcre2_pat-
|
||||||
tern_info() function to check the compiled pattern's options for UTF.
|
tern_info() function to check the compiled pattern's options for UTF.
|
||||||
Alternatively, you can set the PCRE2_NEVER_UTF option at compile time.
|
Alternatively, you can set the PCRE2_NEVER_UTF option at compile time.
|
||||||
This causes a compile time error if a pattern contains a UTF-setting
|
This causes a compile time error if a pattern contains a UTF-setting
|
||||||
sequence.
|
sequence.
|
||||||
|
|
||||||
If your application is one that supports UTF, be aware that validity
|
If your application is one that supports UTF, be aware that validity
|
||||||
checking can take time. If the same data string is to be matched many
|
checking can take time. If the same data string is to be matched many
|
||||||
times, you can use the PCRE2_NO_UTF_CHECK option for the second and
|
times, you can use the PCRE2_NO_UTF_CHECK option for the second and
|
||||||
subsequent matches to avoid running redundant checks.
|
subsequent matches to avoid running redundant checks.
|
||||||
|
|
||||||
Another way that performance can be hit is by running a pattern that
|
Another way that performance can be hit is by running a pattern that
|
||||||
has a very large search tree against a string that will never match.
|
has a very large search tree against a string that will never match.
|
||||||
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
Nested unlimited repeats in a pattern are a common example. PCRE2 pro-
|
||||||
vides some protection against this: see the pcre2_set_match_limit()
|
vides some protection against this: see the pcre2_set_match_limit()
|
||||||
function in the pcre2api page.
|
function in the pcre2api page.
|
||||||
|
|
||||||
|
|
||||||
USER DOCUMENTATION
|
USER DOCUMENTATION
|
||||||
|
|
||||||
The user documentation for PCRE2 comprises a number of different sec-
|
The user documentation for PCRE2 comprises a number of different sec-
|
||||||
tions. In the "man" format, each of these is a separate "man page". In
|
tions. In the "man" format, each of these is a separate "man page". In
|
||||||
the HTML format, each is a separate page, linked from the index page.
|
the HTML format, each is a separate page, linked from the index page.
|
||||||
In the plain text format, the descriptions of the pcre2grep and
|
In the plain text format, the descriptions of the pcre2grep and
|
||||||
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
|
pcre2test programs are in files called pcre2grep.txt and pcre2test.txt,
|
||||||
respectively. The remaining sections, except for the pcre2demo section
|
respectively. The remaining sections, except for the pcre2demo section
|
||||||
(which is a program listing), and the short pages for individual func-
|
(which is a program listing), and the short pages for individual func-
|
||||||
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
tions, are concatenated in pcre2.txt, for ease of searching. The sec-
|
||||||
tions are as follows:
|
tions are as follows:
|
||||||
|
|
||||||
pcre2 this document FIXME CHECK THIS LIST
|
pcre2 this document FIXME CHECK THIS LIST
|
||||||
|
@ -123,7 +124,7 @@ USER DOCUMENTATION
|
||||||
pcre2compat discussion of Perl compatibility
|
pcre2compat discussion of Perl compatibility
|
||||||
pcre2demo a demonstration C program that uses PCRE2
|
pcre2demo a demonstration C program that uses PCRE2
|
||||||
pcre2grep description of the pcre2grep command (8-bit only)
|
pcre2grep description of the pcre2grep command (8-bit only)
|
||||||
pcre2jit discussion of the just-in-time optimization sup-
|
pcre2jit discussion of the just-in-time optimization sup-
|
||||||
port
|
port
|
||||||
pcre2limits details of size and other limits
|
pcre2limits details of size and other limits
|
||||||
pcre2matching discussion of the two matching algorithms
|
pcre2matching discussion of the two matching algorithms
|
||||||
|
@ -138,7 +139,7 @@ USER DOCUMENTATION
|
||||||
pcre2test description of the pcre2test testing command
|
pcre2test description of the pcre2test testing command
|
||||||
pcre2unicode discussion of Unicode and UTF support
|
pcre2unicode discussion of Unicode and UTF support
|
||||||
|
|
||||||
In the "man" and HTML formats, there is also a short page for each C
|
In the "man" and HTML formats, there is also a short page for each C
|
||||||
library function, listing its arguments and results.
|
library function, listing its arguments and results.
|
||||||
|
|
||||||
|
|
||||||
|
@ -148,14 +149,14 @@ AUTHOR
|
||||||
University Computing Service
|
University Computing Service
|
||||||
Cambridge CB2 3QH, England.
|
Cambridge CB2 3QH, England.
|
||||||
|
|
||||||
Putting an actual email address here is a spam magnet. If you want to
|
Putting an actual email address here is a spam magnet. If you want to
|
||||||
email me, use my two initials, followed by the two digits 10, at the
|
email me, use my two initials, followed by the two digits 10, at the
|
||||||
domain cam.ac.uk.
|
domain cam.ac.uk.
|
||||||
|
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -2619,26 +2620,22 @@ BUILDING SHARED AND STATIC LIBRARIES
|
||||||
|
|
||||||
Unicode and UTF SUPPORT
|
Unicode and UTF SUPPORT
|
||||||
|
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character
|
||||||
|
strings. To build it without Unicode support, add
|
||||||
|
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting applies to all three libraries,
|
to the configure command. This setting applies to all three libraries.
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to
|
It is not possible to build one library with Unicode support, and
|
||||||
the 16-bit library, and support for UTF-32 to the to the 32-bit
|
another without, in the same configuration.
|
||||||
library. It is not possible to build one library with UTF support and
|
|
||||||
another without in the same configuration.
|
|
||||||
|
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8,
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8,
|
||||||
UTF-16 or UTF-32. As well as compiling PCRE2 with this option, you also
|
UTF-16 or UTF-32. To do that you have to set the PCRE2_UTF option
|
||||||
have have to set the PCRE2_UTF option when you call pcre2_compile() to
|
when you call pcre2_compile() to compile a pattern.
|
||||||
compile a pattern.
|
|
||||||
|
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment,
|
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||||
PCRE2 expects its input to be either ASCII or UTF-8 (depending on the
|
version of the library. Consequently, --enable-unicode and --enable-
|
||||||
run-time option). It is not possible to support both EBCDIC and UTF-8
|
ebcdic are mutually exclusive.
|
||||||
codes in the same version of the library. Consequently, --enable-uni-
|
|
||||||
code and --enable-ebcdic are mutually exclusive.
|
|
||||||
|
|
||||||
UTF support allows the libraries to process character codepoints up to
|
UTF support allows the libraries to process character codepoints up to
|
||||||
0x10ffff in the strings that they handle. It also provides support for
|
0x10ffff in the strings that they handle. It also provides support for
|
||||||
|
@ -2809,12 +2806,12 @@ USING EBCDIC CODE
|
||||||
This is the case for most computer operating systems. PCRE2 can, how-
|
This is the case for most computer operating systems. PCRE2 can, how-
|
||||||
ever, be compiled to run in an EBCDIC environment by adding
|
ever, be compiled to run in an EBCDIC environment by adding
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting implies --enable-rebuild-charta-
|
to the configure command. This setting implies --enable-rebuild-charta-
|
||||||
bles. You should only use it if you know that you are in an EBCDIC
|
bles. You should only use it if you know that you are in an EBCDIC
|
||||||
environment (for example, an IBM mainframe operating system). The
|
environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
|
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||||
|
@ -2978,7 +2975,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -4511,122 +4508,121 @@ NAME
|
||||||
|
|
||||||
UNICODE AND UTF SUPPORT
|
UNICODE AND UTF SUPPORT
|
||||||
|
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Uni-
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
code character properties and can process text strings in UTF-8,
|
knowledge of Unicode character properties and can process text strings
|
||||||
UTF-16, or UTF-32 format (depending on the code unit width). By
|
in UTF-8, UTF-16, or UTF-32 format (depending on the code unit width).
|
||||||
default, PCRE2 assumes that one code unit is one character. To process
|
However, by default, PCRE2 assumes that one code unit is one character.
|
||||||
a pattern as a UTF string, where a character may require more than one
|
To process a pattern as a UTF string, where a character may require
|
||||||
code unit, you must call pcre2_compile() with the PCRE2_UTF option
|
more than one code unit, you must call pcre2_compile() with the
|
||||||
flag, or the pattern must start with the sequence (*UTF). When either
|
PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
of these is the case, both the pattern and any subject strings that are
|
(*UTF). When either of these is the case, both the pattern and any sub-
|
||||||
matched against it are treated as UTF strings instead of strings of
|
ject strings that are matched against it are treated as UTF strings
|
||||||
individual one-code-unit characters.
|
instead of strings of individual one-code-unit characters.
|
||||||
|
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger,
|
If you do not need Unicode support you can build PCRE2 without it, in
|
||||||
but the additional run time overhead is limited to testing the
|
which case the library will be smaller.
|
||||||
PCRE2_UTF flag occasionally, so should not be very much.
|
|
||||||
|
|
||||||
|
|
||||||
UNICODE PROPERTY SUPPORT
|
UNICODE PROPERTY SUPPORT
|
||||||
|
|
||||||
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
When PCRE2 is built with Unicode support, the escape sequences \p{..},
|
||||||
\P{..}, and \X can be used. The Unicode properties that can be tested
|
\P{..}, and \X can be used. The Unicode properties that can be tested
|
||||||
are limited to the general category properties such as Lu for an upper
|
are limited to the general category properties such as Lu for an upper
|
||||||
case letter or Nd for a decimal number, the Unicode script names such
|
case letter or Nd for a decimal number, the Unicode script names such
|
||||||
as Arabic or Han, and the derived properties Any and L&. Full lists are
|
as Arabic or Han, and the derived properties Any and L&. Full lists are
|
||||||
given in the pcre2pattern and pcre2syntax documentation. Only the short
|
given in the pcre2pattern and pcre2syntax documentation. Only the short
|
||||||
names for properties are supported. For example, \p{L} matches a let-
|
names for properties are supported. For example, \p{L} matches a let-
|
||||||
ter. Its Perl synonym, \p{Letter}, is not supported. Furthermore, in
|
ter. Its Perl synonym, \p{Letter}, is not supported. Furthermore, in
|
||||||
Perl, many properties may optionally be prefixed by "Is", for compati-
|
Perl, many properties may optionally be prefixed by "Is", for compati-
|
||||||
bility with Perl 5.6. PCRE2 does not support this.
|
bility with Perl 5.6. PCRE2 does not support this.
|
||||||
|
|
||||||
|
|
||||||
WIDE CHARACTERS AND UTF MODES
|
WIDE CHARACTERS AND UTF MODES
|
||||||
|
|
||||||
Codepoints less than 256 can be specified in patterns by either braced
|
Codepoints less than 256 can be specified in patterns by either braced
|
||||||
or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3).
|
or unbraced hexadecimal escape sequences (for example, \x{b3} or \xb3).
|
||||||
Larger values have to use braced sequences. Unbraced octal code points
|
Larger values have to use braced sequences. Unbraced octal code points
|
||||||
up to \777 are also recognized; larger ones can be coded using \o{...}.
|
up to \777 are also recognized; larger ones can be coded using \o{...}.
|
||||||
|
|
||||||
In UTF modes, repeat quantifiers apply to complete UTF characters, not
|
In UTF modes, repeat quantifiers apply to complete UTF characters, not
|
||||||
to individual code units.
|
to individual code units.
|
||||||
|
|
||||||
In UTF modes, the dot metacharacter matches one UTF character instead
|
In UTF modes, the dot metacharacter matches one UTF character instead
|
||||||
of a single code unit.
|
of a single code unit.
|
||||||
|
|
||||||
The escape sequence \C can be used to match a single code unit, in a
|
The escape sequence \C can be used to match a single code unit, in a
|
||||||
UTF mode, but its use can lead to some strange effects because it
|
UTF mode, but its use can lead to some strange effects because it
|
||||||
breaks up multi-unit characters (see the description of \C in the
|
breaks up multi-unit characters (see the description of \C in the
|
||||||
pcre2pattern documentation). The use of \C is not supported in the
|
pcre2pattern documentation). The use of \C is not supported in the
|
||||||
alternative matching function pcre2_dfa_exec(), nor is it supported in
|
alternative matching function pcre2_dfa_exec(), nor is it supported in
|
||||||
UTF mode by the JIT optimization. If JIT optimization is requested for
|
UTF mode by the JIT optimization. If JIT optimization is requested for
|
||||||
a UTF pattern that contains \C, it will not succeed, and so the match-
|
a UTF pattern that contains \C, it will not succeed, and so the match-
|
||||||
ing will be carried out by the normal interpretive function.
|
ing will be carried out by the normal interpretive function.
|
||||||
|
|
||||||
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test
|
||||||
characters of any code value, but, by default, the characters that
|
characters of any code value, but, by default, the characters that
|
||||||
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
PCRE2 recognizes as digits, spaces, or word characters remain the same
|
||||||
set as in non-UTF mode, all with code points less than 256. This
|
set as in non-UTF mode, all with code points less than 256. This
|
||||||
remains true even when PCRE2 is built to include Unicode support,
|
remains true even when PCRE2 is built to include Unicode support,
|
||||||
because to do otherwise would slow down matching in many common cases.
|
because to do otherwise would slow down matching in many common cases.
|
||||||
Note that this also applies to \b and \B, because they are defined in
|
Note that this also applies to \b and \B, because they are defined in
|
||||||
terms of \w and \W. If you want to test for a wider sense of, say,
|
terms of \w and \W. If you want to test for a wider sense of, say,
|
||||||
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
"digit", you can use explicit Unicode property tests such as \p{Nd}.
|
||||||
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
Alternatively, if you set the PCRE2_UCP option, the way that the char-
|
||||||
acter escapes work is changed so that Unicode properties are used to
|
acter escapes work is changed so that Unicode properties are used to
|
||||||
determine which characters match. There are more details in the section
|
determine which characters match. There are more details in the section
|
||||||
on generic character types in the pcre2pattern documentation.
|
on generic character types in the pcre2pattern documentation.
|
||||||
|
|
||||||
Similarly, characters that match the POSIX named character classes are
|
Similarly, characters that match the POSIX named character classes are
|
||||||
all low-valued characters, unless the PCRE2_UCP option is set.
|
all low-valued characters, unless the PCRE2_UCP option is set.
|
||||||
|
|
||||||
However, the special horizontal and vertical white space matching
|
However, the special horizontal and vertical white space matching
|
||||||
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
escapes (\h, \H, \v, and \V) do match all the appropriate Unicode char-
|
||||||
acters, whether or not PCRE2_UCP is set.
|
acters, whether or not PCRE2_UCP is set.
|
||||||
|
|
||||||
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
Case-insensitive matching in UTF mode makes use of Unicode properties.
|
||||||
A few Unicode characters such as Greek sigma have more than two code-
|
A few Unicode characters such as Greek sigma have more than two code-
|
||||||
points that are case-equivalent, and these are treated as such.
|
points that are case-equivalent, and these are treated as such.
|
||||||
|
|
||||||
|
|
||||||
VALIDITY OF UTF STRINGS
|
VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
When the PCRE2_UTF option is set, the strings passed as patterns and
|
When the PCRE2_UTF option is set, the strings passed as patterns and
|
||||||
subjects are (by default) checked for validity on entry to the relevant
|
subjects are (by default) checked for validity on entry to the relevant
|
||||||
functions. If an invalid UTF string is passed, an error return is
|
functions. If an invalid UTF string is passed, an error return is
|
||||||
given.
|
given.
|
||||||
|
|
||||||
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
UTF-16 and UTF-32 strings can indicate their endianness by special code
|
||||||
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
known as a byte-order mark (BOM). The PCRE2 functions do not handle
|
||||||
this, expecting strings to be in host byte order.
|
this, expecting strings to be in host byte order.
|
||||||
|
|
||||||
The entire string is checked before any other processing takes place.
|
The entire string is checked before any other processing takes place.
|
||||||
In addition to checking the format of the string, there is a check to
|
In addition to checking the format of the string, there is a check to
|
||||||
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
ensure that all code points lie in the range U+0 to U+10FFFF, excluding
|
||||||
the surrogate area. The so-called "non-character" code points are not
|
the surrogate area. The so-called "non-character" code points are not
|
||||||
excluded because Unicode corrigendum #9 makes it clear that they should
|
excluded because Unicode corrigendum #9 makes it clear that they should
|
||||||
not be.
|
not be.
|
||||||
|
|
||||||
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
Characters in the "Surrogate Area" of Unicode are reserved for use by
|
||||||
UTF-16, where they are used in pairs to encode code points with values
|
UTF-16, where they are used in pairs to encode code points with values
|
||||||
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
|
||||||
are available independently in the UTF-8 and UTF-32 encodings. (In
|
are available independently in the UTF-8 and UTF-32 encodings. (In
|
||||||
other words, the whole surrogate thing is a fudge for UTF-16 which
|
other words, the whole surrogate thing is a fudge for UTF-16 which
|
||||||
unfortunately messes up UTF-8 and UTF-32.)
|
unfortunately messes up UTF-8 and UTF-32.)
|
||||||
|
|
||||||
In some situations, you may already know that your strings are valid,
|
In some situations, you may already know that your strings are valid,
|
||||||
and therefore want to skip these checks in order to improve perfor-
|
and therefore want to skip these checks in order to improve perfor-
|
||||||
mance, for example in the case of a long subject string that is being
|
mance, for example in the case of a long subject string that is being
|
||||||
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK flag at compile
|
scanned repeatedly. If you set the PCRE2_NO_UTF_CHECK flag at compile
|
||||||
time or at run time, PCRE2 assumes that the pattern or subject it is
|
time or at run time, PCRE2 assumes that the pattern or subject it is
|
||||||
given (respectively) contains only valid UTF code unit sequences.
|
given (respectively) contains only valid UTF code unit sequences.
|
||||||
|
|
||||||
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
Passing PCRE2_NO_UTF_CHECK to pcre2_compile() just disables the check
|
||||||
for the pattern; it does not also apply to subject strings. If you want
|
for the pattern; it does not also apply to subject strings. If you want
|
||||||
to disable the check for a subject string you must pass this option to
|
to disable the check for a subject string you must pass this option to
|
||||||
pcre2_exec() or pcre2_dfa_exec().
|
pcre2_exec() or pcre2_dfa_exec().
|
||||||
|
|
||||||
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the
|
||||||
result is undefined and your program may crash or loop indefinitely.
|
result is undefined and your program may crash or loop indefinitely.
|
||||||
|
|
||||||
Errors in UTF-8 strings
|
Errors in UTF-8 strings
|
||||||
|
@ -4639,10 +4635,10 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR4
|
PCRE2_ERROR_UTF8_ERR4
|
||||||
PCRE2_ERROR_UTF8_ERR5
|
PCRE2_ERROR_UTF8_ERR5
|
||||||
|
|
||||||
The string ends with a truncated UTF-8 character; the code specifies
|
The string ends with a truncated UTF-8 character; the code specifies
|
||||||
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
how many bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8
|
||||||
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
characters to be no longer than 4 bytes, the encoding scheme (origi-
|
||||||
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
nally defined by RFC 2279) allows for up to 6 bytes, and this is
|
||||||
checked first; hence the possibility of 4 or 5 missing bytes.
|
checked first; hence the possibility of 4 or 5 missing bytes.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR6
|
PCRE2_ERROR_UTF8_ERR6
|
||||||
|
@ -4652,24 +4648,24 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR10
|
PCRE2_ERROR_UTF8_ERR10
|
||||||
|
|
||||||
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of
|
||||||
the character do not have the binary value 0b10 (that is, either the
|
the character do not have the binary value 0b10 (that is, either the
|
||||||
most significant bit is 0, or the next bit is 1).
|
most significant bit is 0, or the next bit is 1).
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR11
|
PCRE2_ERROR_UTF8_ERR11
|
||||||
PCRE2_ERROR_UTF8_ERR12
|
PCRE2_ERROR_UTF8_ERR12
|
||||||
|
|
||||||
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes
|
||||||
long; these code points are excluded by RFC 3629.
|
long; these code points are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR13
|
PCRE2_ERROR_UTF8_ERR13
|
||||||
|
|
||||||
A 4-byte character has a value greater than 0x10ffff; these code points
|
A 4-byte character has a value greater than 0x10ffff; these code points
|
||||||
are excluded by RFC 3629.
|
are excluded by RFC 3629.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR14
|
PCRE2_ERROR_UTF8_ERR14
|
||||||
|
|
||||||
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
A 3-byte character has a value in the range 0xd800 to 0xdfff; this
|
||||||
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
range of code points is reserved by RFC 3629 for use with UTF-16, and
|
||||||
so are excluded from UTF-8.
|
so are excluded from UTF-8.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR15
|
PCRE2_ERROR_UTF8_ERR15
|
||||||
|
@ -4678,26 +4674,26 @@ VALIDITY OF UTF STRINGS
|
||||||
PCRE2_ERROR_UTF8_ERR18
|
PCRE2_ERROR_UTF8_ERR18
|
||||||
PCRE2_ERROR_UTF8_ERR19
|
PCRE2_ERROR_UTF8_ERR19
|
||||||
|
|
||||||
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes
|
||||||
for a value that can be represented by fewer bytes, which is invalid.
|
for a value that can be represented by fewer bytes, which is invalid.
|
||||||
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
For example, the two bytes 0xc0, 0xae give the value 0x2e, whose cor-
|
||||||
rect coding uses just one byte.
|
rect coding uses just one byte.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR20
|
PCRE2_ERROR_UTF8_ERR20
|
||||||
|
|
||||||
The two most significant bits of the first byte of a character have the
|
The two most significant bits of the first byte of a character have the
|
||||||
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
binary value 0b10 (that is, the most significant bit is 1 and the sec-
|
||||||
ond is 0). Such a byte can only validly occur as the second or subse-
|
ond is 0). Such a byte can only validly occur as the second or subse-
|
||||||
quent byte of a multi-byte character.
|
quent byte of a multi-byte character.
|
||||||
|
|
||||||
PCRE2_ERROR_UTF8_ERR21
|
PCRE2_ERROR_UTF8_ERR21
|
||||||
|
|
||||||
The first byte of a character has the value 0xfe or 0xff. These values
|
The first byte of a character has the value 0xfe or 0xff. These values
|
||||||
can never occur in a valid UTF-8 string.
|
can never occur in a valid UTF-8 string.
|
||||||
|
|
||||||
Errors in UTF-16 strings
|
Errors in UTF-16 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-16
|
The following negative error codes are given for invalid UTF-16
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
|
||||||
|
@ -4707,7 +4703,7 @@ VALIDITY OF UTF STRINGS
|
||||||
|
|
||||||
Errors in UTF-32 strings
|
Errors in UTF-32 strings
|
||||||
|
|
||||||
The following negative error codes are given for invalid UTF-32
|
The following negative error codes are given for invalid UTF-32
|
||||||
strings:
|
strings:
|
||||||
|
|
||||||
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
|
||||||
|
@ -4723,7 +4719,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2BUILD 3 "28 September 2014" "PCRE2 10.00"
|
.TH PCRE2BUILD 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.
|
.
|
||||||
|
@ -102,25 +102,22 @@ to the \fBconfigure\fP command, as required.
|
||||||
.SH "Unicode and UTF SUPPORT"
|
.SH "Unicode and UTF SUPPORT"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character strings.
|
||||||
|
To build it without Unicode support, add
|
||||||
.sp
|
.sp
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
.sp
|
.sp
|
||||||
to the \fBconfigure\fP command. This setting applies to all three libraries,
|
to the \fBconfigure\fP command. This setting applies to all three libraries. It
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit
|
is not possible to build one library with Unicode support, and another without,
|
||||||
library, and support for UTF-32 to the 32-bit library.
|
in the same configuration.
|
||||||
It is not possible to build one library with
|
|
||||||
UTF support and another without in the same configuration.
|
|
||||||
.P
|
.P
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||||
UTF-32. As well as compiling PCRE2 with this option, you also have to set
|
or UTF-32. To do that you have to set the PCRE2_UTF option when you call
|
||||||
the PCRE2_UTF option when you call \fBpcre2_compile()\fP to compile a pattern.
|
\fBpcre2_compile()\fP to compile a pattern.
|
||||||
.P
|
.P
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2
|
It is not possible to support both EBCDIC and UTF-8 codes in the same version
|
||||||
expects its input to be either ASCII or UTF-8 (depending on the run-time
|
of the library. Consequently, --enable-unicode and --enable-ebcdic are mutually
|
||||||
option). It is not possible to support both EBCDIC and UTF-8 codes in the same
|
exclusive.
|
||||||
version of the library. Consequently, --enable-unicode and --enable-ebcdic are
|
|
||||||
mutually exclusive.
|
|
||||||
.P
|
.P
|
||||||
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
||||||
in the strings that they handle. It also provides support for accessing the
|
in the strings that they handle. It also provides support for accessing the
|
||||||
|
@ -306,12 +303,12 @@ code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||||
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
||||||
EBCDIC environment by adding
|
EBCDIC environment by adding
|
||||||
.sp
|
.sp
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
.sp
|
.sp
|
||||||
to the \fBconfigure\fP command. This setting implies
|
to the \fBconfigure\fP command. This setting implies
|
||||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
.P
|
.P
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
||||||
value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In
|
value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In
|
||||||
|
@ -485,6 +482,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "19 October 2014" "PCRE2 10.00"
|
.TH PCRE2PATTERN 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -51,11 +51,11 @@ In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as
|
||||||
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
||||||
specified for the 32-bit library, in which case it constrains the character
|
specified for the 32-bit library, in which case it constrains the character
|
||||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||||
built to include Unicode support. When using UTF strings you must either call
|
built to include Unicode support (which is the default). When using UTF strings
|
||||||
the compiling function with the PCRE2_UTF option, or the pattern must start
|
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||||
with the special sequence (*UTF), which is equivalent to setting the relevant
|
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||||
option. How setting a UTF mode affects pattern matching is mentioned in several
|
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||||
places below. There is also a summary of features in the
|
mentioned in several places below. There is also a summary of features in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2unicode\fP
|
\fBpcre2unicode\fP
|
||||||
.\"
|
.\"
|
||||||
|
@ -540,9 +540,9 @@ By default, characters whose code points are greater than 127 never match \ed,
|
||||||
\es, or \ew, and always match \eD, \eS, and \eW, although this may vary for
|
\es, or \ew, and always match \eD, \eS, and \eW, although this may vary for
|
||||||
characters in the range 128-255 when locale-specific matching is happening.
|
characters in the range 128-255 when locale-specific matching is happening.
|
||||||
These escape sequences retain their original meanings from before Unicode
|
These escape sequences retain their original meanings from before Unicode
|
||||||
support was available, mainly for efficiency reasons. If PCRE2 is compiled with
|
support was available, mainly for efficiency reasons. If the PCRE2_UCP option
|
||||||
Unicode support, and the PCRE2_UCP option is set, the behaviour is changed so
|
is set, the behaviour is changed so that Unicode properties are used to
|
||||||
that Unicode properties are used to determine character types, as follows:
|
determine character types, as follows:
|
||||||
.sp
|
.sp
|
||||||
\ed any character that matches \ep{Nd} (decimal digit)
|
\ed any character that matches \ep{Nd} (decimal digit)
|
||||||
\es any character that matches \ep{Z} or \eh or \ev
|
\es any character that matches \ep{Z} or \eh or \ev
|
||||||
|
@ -645,11 +645,11 @@ an error.
|
||||||
.SS Unicode character properties
|
.SS Unicode character properties
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
When PCRE2 is built with Unicode support, three additional escape sequences
|
When PCRE2 is built with Unicode support (the default), three additional escape
|
||||||
that match characters with specific properties are available. In 8-bit
|
sequences that match characters with specific properties are available. In
|
||||||
non-UTF-8 mode, these sequences are of course limited to testing characters
|
8-bit non-UTF-8 mode, these sequences are of course limited to testing
|
||||||
whose codepoints are less than 256, but they do work in this mode. The extra
|
characters whose codepoints are less than 256, but they do work in this mode.
|
||||||
escape sequences are:
|
The extra escape sequences are:
|
||||||
.sp
|
.sp
|
||||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||||
|
@ -3236,6 +3236,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 19 October 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
.TH PCRE2UNICODE 3 "16 September 2014" "PCRE2 10.00"
|
.TH PCRE2UNICODE 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "UNICODE AND UTF SUPPORT"
|
.SH "UNICODE AND UTF SUPPORT"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
|
knowledge of Unicode character properties and can process text strings in
|
||||||
format (depending on the code unit width). By default, PCRE2 assumes that one
|
UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
|
||||||
code unit is one character. To process a pattern as a UTF string, where a
|
default, PCRE2 assumes that one code unit is one character. To process a
|
||||||
character may require more than one code unit, you must call
|
pattern as a UTF string, where a character may require more than one code unit,
|
||||||
|
you must call
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2_compile()\fP
|
\fBpcre2_compile()\fP
|
||||||
.\"
|
.\"
|
||||||
|
@ -17,9 +18,8 @@ with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
strings that are matched against it are treated as UTF strings instead of
|
strings that are matched against it are treated as UTF strings instead of
|
||||||
strings of individual one-code-unit characters.
|
strings of individual one-code-unit characters.
|
||||||
.P
|
.P
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger, but the
|
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||||
additional run time overhead is limited to testing the PCRE2_UTF flag
|
case the library will be smaller.
|
||||||
occasionally, so should not be very much.
|
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "UNICODE PROPERTY SUPPORT"
|
.SH "UNICODE PROPERTY SUPPORT"
|
||||||
|
@ -249,6 +249,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -215,7 +215,7 @@ if [ $ISGCC -ne 0 -a $usemain -ne 0 ]; then
|
||||||
echo "---------- Maximally configured test with -O2 ----------"
|
echo "---------- Maximally configured test with -O2 ----------"
|
||||||
SAVECLFAGS="$CFLAGS"
|
SAVECLFAGS="$CFLAGS"
|
||||||
CFLAGS="$CFLAGS -O2"
|
CFLAGS="$CFLAGS -O2"
|
||||||
opts="--disable-shared --enable-unicode $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
||||||
runtest
|
runtest
|
||||||
CFLAGS="$SAVECFLAGS"
|
CFLAGS="$SAVECFLAGS"
|
||||||
fi
|
fi
|
||||||
|
@ -224,25 +224,25 @@ if [ $usemain -ne 0 ]; then
|
||||||
echo "---------- Non-JIT tests in the current directory ----------"
|
echo "---------- Non-JIT tests in the current directory ----------"
|
||||||
for opts in \
|
for opts in \
|
||||||
"" \
|
"" \
|
||||||
"--enable-unicode --disable-static" \
|
"--disable-static" \
|
||||||
|
"--disable-shared" \
|
||||||
|
"--disable-unicode --disable-stack-for-recursion --disable-shared" \
|
||||||
"--disable-stack-for-recursion --disable-shared" \
|
"--disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-unicode --disable-shared" \
|
"--with-link-size=3 --disable-shared" \
|
||||||
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-rebuild-chartables --disable-shared" \
|
||||||
"--enable-unicode --with-link-size=3 --disable-shared" \
|
"--disable-unicode --enable-newline-is-any --disable-shared" \
|
||||||
"--enable-rebuild-chartables --disable-shared" \
|
"--disable-unicode --enable-newline-is-cr --disable-shared" \
|
||||||
"--enable-newline-is-any --disable-shared" \
|
"--disable-unicode --enable-newline-is-crlf --disable-shared" \
|
||||||
"--enable-newline-is-cr --disable-shared" \
|
"--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
|
||||||
"--enable-newline-is-crlf --disable-shared" \
|
"--enable-newline-is-any --disable-stack-for-recursion --disable-static" \
|
||||||
"--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
|
"--disable-unicode --enable-pcre2-16" \
|
||||||
"--enable-unicode --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
|
"--disable-unicode --enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-16" \
|
|
||||||
"--enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
"--enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-16 --enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-pcre2-32" \
|
||||||
"--enable-pcre2-32" \
|
"--disable-unicode --enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
"--enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-32 --enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
|
||||||
"--enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
|
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
|
||||||
"--enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
|
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
@ -253,19 +253,19 @@ fi
|
||||||
if [ $usejit -ne 0 ]; then
|
if [ $usejit -ne 0 ]; then
|
||||||
echo "---------- JIT tests in the current directory ----------"
|
echo "---------- JIT tests in the current directory ----------"
|
||||||
for opts in \
|
for opts in \
|
||||||
|
"--disable-unicode --enable-jit --disable-shared" \
|
||||||
"--enable-jit --disable-shared" \
|
"--enable-jit --disable-shared" \
|
||||||
"--enable-jit --enable-unicode --disable-shared" \
|
"--enable-jit --with-link-size=3 --disable-shared" \
|
||||||
"--enable-jit --enable-unicode --with-link-size=3 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --disable-shared" \
|
"--disable-unicode --enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --enable-unicode --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --with-link-size=3 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --with-link-size=3 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --with-link-size=4 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --with-link-size=4 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --enable-unicode --disable-shared" \
|
"--disable-unicode --enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --enable-unicode --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --with-link-size=4 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --enable-unicode --with-link-size=4 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
|
||||||
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
|
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
@ -280,9 +280,9 @@ if [ $usevalgrind -ne 0 ]; then
|
||||||
withvalgrind="with valgrind"
|
withvalgrind="with valgrind"
|
||||||
|
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-unicode --with-link-size=3 --disable-shared" \
|
"--with-link-size=3 --disable-shared" \
|
||||||
"--disable-shared"
|
"--disable-unicode --disable-shared"
|
||||||
do
|
do
|
||||||
opts="--enable-valgrind $opts"
|
opts="--enable-valgrind $opts"
|
||||||
runtest
|
runtest
|
||||||
|
@ -290,8 +290,8 @@ if [ $usevalgrind -ne 0 ]; then
|
||||||
|
|
||||||
if [ $usejit -ne 0 ]; then
|
if [ $usejit -ne 0 ]; then
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-jit --enable-unicode --disable-shared" \
|
"--enable-jit --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-unicode"
|
"--enable-jit --enable-pcre2-16 --enable-pcre2-32"
|
||||||
do
|
do
|
||||||
opts="--enable-valgrind $opts"
|
opts="--enable-valgrind $opts"
|
||||||
runtest
|
runtest
|
||||||
|
@ -337,7 +337,7 @@ fi
|
||||||
|
|
||||||
if [ $usetmp -ne 0 ]; then
|
if [ $usetmp -ne 0 ]; then
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-unicode --disable-shared"
|
"--disable-shared"
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
|
|
@ -1444,7 +1444,7 @@ the three different cases. */
|
||||||
#define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_8(G(b,8))
|
#define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_8(G(b,8))
|
||||||
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b)
|
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b)
|
||||||
#define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_8(G(a,8))
|
#define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_8(G(a,8))
|
||||||
#define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h,(pcre2_jit_stack_8 *)i) \
|
#define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h,i) \
|
||||||
a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||||
(pcre2_jit_stack_8 *)i)
|
(pcre2_jit_stack_8 *)i)
|
||||||
#define PCRE2_JIT_STACK_CREATE(a,b,c,d) \
|
#define PCRE2_JIT_STACK_CREATE(a,b,c,d) \
|
||||||
|
|
Loading…
Reference in New Issue