Make --enable-unicode the default.
This commit is contained in:
parent
08e3107cbe
commit
44ef2c3401
|
@ -149,7 +149,7 @@ SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
|
||||||
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
|
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
|
||||||
"Enable use of Just-in-time compiling in pcre2grep.")
|
"Enable use of Just-in-time compiling in pcre2grep.")
|
||||||
|
|
||||||
SET(PCRE2_SUPPORT_UNICODE OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_UNICODE ON CACHE BOOL
|
||||||
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
|
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
|
||||||
|
|
||||||
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
|
||||||
|
|
12
ChangeLog
12
ChangeLog
|
@ -14,19 +14,21 @@ logged. In addition to the API changes, the following changes were made. They
|
||||||
are either new functionality, or bug fixes and other noticeable changes of
|
are either new functionality, or bug fixes and other noticeable changes of
|
||||||
behaviour that were implemented after the code had been forked.
|
behaviour that were implemented after the code had been forked.
|
||||||
|
|
||||||
1. The test program, now called pcre2test, was re-specified and almost
|
1. Unicode support is now enabled by default.
|
||||||
|
|
||||||
|
2. The test program, now called pcre2test, was re-specified and almost
|
||||||
completely re-written. Its input is not compatible with input for pcretest.
|
completely re-written. Its input is not compatible with input for pcretest.
|
||||||
|
|
||||||
2. Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the
|
3. Patterns may start with (*NOTEMPTY) or (*NOTEMPTY_ATSTART) to set the
|
||||||
PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is
|
PCRE2_NOTEMPTY or PCRE2_NOTEMPTY_ATSTART options for every subject line that is
|
||||||
matched by that pattern.
|
matched by that pattern.
|
||||||
|
|
||||||
3. For the benefit of those who use PCRE2 via some other application, that is,
|
4. For the benefit of those who use PCRE2 via some other application, that is,
|
||||||
not writing the function calls themselves, it is possible to check the PCRE2
|
not writing the function calls themselves, it is possible to check the PCRE2
|
||||||
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
version by matching a pattern such as /(?(VERSION>=10.0)yes|no)/ against a
|
||||||
string such as "yesno".
|
string such as "yesno".
|
||||||
|
|
||||||
4. There are case-equivalent Unicode characters whose encodings use different
|
5. There are case-equivalent Unicode characters whose encodings use different
|
||||||
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
|
numbers of code units in UTF-8. U+023A and U+2C65 are one example. (It is
|
||||||
theoretically possible for this to happen in UTF-16 too.) If a backreference to
|
theoretically possible for this to happen in UTF-16 too.) If a backreference to
|
||||||
a group containing one of these characters was greedily repeated, and during
|
a group containing one of these characters was greedily repeated, and during
|
||||||
|
@ -38,7 +40,7 @@ Incorrect backtracking meant that group 2 captured only the last two bytes.
|
||||||
This bug has been fixed; the new code is slower, but it is used only when the
|
This bug has been fixed; the new code is slower, but it is used only when the
|
||||||
strings matched by the repetition are not all the same length.
|
strings matched by the repetition are not all the same length.
|
||||||
|
|
||||||
5. A pattern such as /()a/ was not setting the "first character must be 'a'"
|
6. A pattern such as /()a/ was not setting the "first character must be 'a'"
|
||||||
information. This applied to any pattern with a group that matched no
|
information. This applied to any pattern with a group that matched no
|
||||||
characters, for example: /(?:(?=.)|(?<!x))a/.
|
characters, for example: /(?:(?=.)|(?<!x))a/.
|
||||||
|
|
||||||
|
|
|
@ -325,7 +325,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
||||||
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||||
|
|
||||||
8. The GUI will then list several configuration options. This is where
|
8. The GUI will then list several configuration options. This is where
|
||||||
you can enable Unicode support or other PCRE2 optional features.
|
you can disable Unicode support or select other PCRE2 optional features.
|
||||||
|
|
||||||
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||||
active.
|
active.
|
||||||
|
@ -399,4 +399,4 @@ The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
|
||||||
course.
|
course.
|
||||||
|
|
||||||
==========================
|
==========================
|
||||||
Last Updated: 28 September 2014
|
Last Updated: 03 November 2014
|
||||||
|
|
36
README
36
README
|
@ -179,24 +179,22 @@ library. They are also documented in the pcre2build man page.
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
|
||||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
. If you do not want to make use of the support for UTF-8 Unicode character
|
||||||
the 8-bit library, UTF-16 Unicode character strings in the 16-bit library,
|
strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit
|
||||||
and UTF-32 Unicode character strings in the 32-bit library, you must add
|
library, and UTF-32 Unicode character strings in the 32-bit library, you can
|
||||||
--enable-unicode to the "configure" command. Without it, the code for
|
add --disable-unicode to the "configure" command. This reduces the size of
|
||||||
handling UTF-8, UTF-16 and UTF-8 is not included. It is not possible to
|
the libraries. It is not possible to configure one library with Unicode
|
||||||
configure one library with UTF support and the other without in the same
|
support, and another without, in the same configuration.
|
||||||
configuration.
|
|
||||||
|
|
||||||
Even when --enable-unicode is included, the use of a UTF encoding still has
|
When Unicode support is available, the use of a UTF encoding still has to be
|
||||||
to be enabled by an option at run time. When PCRE2 is compiled with this
|
enabled by an option at run time. When PCRE2 is compiled with Unicode
|
||||||
option, its input can only either be ASCII or UTF-8/16/32, even when running
|
support, its input can be only either ASCII or UTF-8/16/32, even when running
|
||||||
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
||||||
--enable-ebcdic at the same time.
|
--enable-ebcdic at the same time.
|
||||||
|
|
||||||
When --enable-unicode is specified, as well as supporting UTF strings, PCRE2
|
As well as supporting UTF strings, Unicode support includes support for the
|
||||||
includes support for the \P, \p, and \X sequences that recognize Unicode
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
character properties. However, only the basic two-letter properties such as
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
Lu are supported.
|
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||||
|
@ -285,7 +283,7 @@ library. They are also documented in the pcre2build man page.
|
||||||
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
||||||
character code (as opposed to ASCII/Unicode) by specifying
|
character code (as opposed to ASCII/Unicode) by specifying
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||||
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
||||||
|
@ -543,8 +541,8 @@ from pcre2test. Other files whose names begin with "test" are used as working
|
||||||
files in some tests.
|
files in some tests.
|
||||||
|
|
||||||
Some tests are relevant only when certain build-time options were selected. For
|
Some tests are relevant only when certain build-time options were selected. For
|
||||||
example, the tests for UTF-8/16/32 support are run only if --enable-unicode was
|
example, the tests for UTF-8/16/32 features are run only when Unicode support
|
||||||
used. RunTest outputs a comment when it skips a test.
|
is available. RunTest outputs a comment when it skips a test.
|
||||||
|
|
||||||
Many of the tests that are not skipped are run twice if JIT support is
|
Many of the tests that are not skipped are run twice if JIT support is
|
||||||
available. On the second run, JIT compilation is forced. This testing can be
|
available. On the second run, JIT compilation is forced. This testing can be
|
||||||
|
@ -633,7 +631,7 @@ JIT-specific features such as information output from pcre2test about JIT
|
||||||
compilation.
|
compilation.
|
||||||
|
|
||||||
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
||||||
POSIX interface to the 8-bit library, withouth and with Unicode support,
|
POSIX interface to the 8-bit library, without and with Unicode support,
|
||||||
respectively.
|
respectively.
|
||||||
|
|
||||||
|
|
||||||
|
@ -828,4 +826,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 25 October 2014
|
Last updated: 03 November 2014
|
||||||
|
|
10
configure.ac
10
configure.ac
|
@ -148,10 +148,10 @@ AC_ARG_ENABLE(rebuild-chartables,
|
||||||
[rebuild character tables in current locale]),
|
[rebuild character tables in current locale]),
|
||||||
, enable_rebuild_chartables=no)
|
, enable_rebuild_chartables=no)
|
||||||
|
|
||||||
# Handle --enable-unicode (disabled by default)
|
# Handle --disable-unicode (enabled by default)
|
||||||
AC_ARG_ENABLE(unicode,
|
AC_ARG_ENABLE(unicode,
|
||||||
AS_HELP_STRING([--enable-unicode],
|
AS_HELP_STRING([--disable-unicode],
|
||||||
[enable Unicode support (incompatible with --enable-ebcdic)]),
|
[disable Unicode support]),
|
||||||
, enable_unicode=unset)
|
, enable_unicode=unset)
|
||||||
|
|
||||||
# Handle newline options
|
# Handle newline options
|
||||||
|
@ -299,10 +299,10 @@ then
|
||||||
AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled])
|
AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# enable_unicode is disabled by default.
|
# Unicode is enabled by default.
|
||||||
if test "x$enable_unicode" = "xunset"
|
if test "x$enable_unicode" = "xunset"
|
||||||
then
|
then
|
||||||
enable_unicode=no
|
enable_unicode=yes
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Convert the newline identifier into the appropriate integer value. These must
|
# Convert the newline identifier into the appropriate integer value. These must
|
||||||
|
|
|
@ -325,7 +325,7 @@ cache can be deleted by selecting "File > Delete Cache".
|
||||||
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
Studio, MSYS makefiles, MinGW makefiles, etc.)
|
||||||
|
|
||||||
8. The GUI will then list several configuration options. This is where
|
8. The GUI will then list several configuration options. This is where
|
||||||
you can enable Unicode support or other PCRE2 optional features.
|
you can disable Unicode support or select other PCRE2 optional features.
|
||||||
|
|
||||||
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
9. Hit "Configure" again. The adjacent "Generate" button should now be
|
||||||
active.
|
active.
|
||||||
|
@ -399,4 +399,4 @@ The site currently has ports for PCRE1 releases, but PCRE2 should follow in due
|
||||||
course.
|
course.
|
||||||
|
|
||||||
==========================
|
==========================
|
||||||
Last Updated: 28 September 2014
|
Last Updated: 03 November 2014
|
||||||
|
|
|
@ -179,24 +179,22 @@ library. They are also documented in the pcre2build man page.
|
||||||
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
. When JIT support is enabled, pcre2grep automatically makes use of it, unless
|
||||||
you add --disable-pcre2grep-jit to the "configure" command.
|
you add --disable-pcre2grep-jit to the "configure" command.
|
||||||
|
|
||||||
. If you want to make use of the support for UTF-8 Unicode character strings in
|
. If you do not want to make use of the support for UTF-8 Unicode character
|
||||||
the 8-bit library, UTF-16 Unicode character strings in the 16-bit library,
|
strings in the 8-bit library, UTF-16 Unicode character strings in the 16-bit
|
||||||
and UTF-32 Unicode character strings in the 32-bit library, you must add
|
library, and UTF-32 Unicode character strings in the 32-bit library, you can
|
||||||
--enable-unicode to the "configure" command. Without it, the code for
|
add --disable-unicode to the "configure" command. This reduces the size of
|
||||||
handling UTF-8, UTF-16 and UTF-8 is not included. It is not possible to
|
the libraries. It is not possible to configure one library with Unicode
|
||||||
configure one library with UTF support and the other without in the same
|
support, and another without, in the same configuration.
|
||||||
configuration.
|
|
||||||
|
|
||||||
Even when --enable-unicode is included, the use of a UTF encoding still has
|
When Unicode support is available, the use of a UTF encoding still has to be
|
||||||
to be enabled by an option at run time. When PCRE2 is compiled with this
|
enabled by an option at run time. When PCRE2 is compiled with Unicode
|
||||||
option, its input can only either be ASCII or UTF-8/16/32, even when running
|
support, its input can be only either ASCII or UTF-8/16/32, even when running
|
||||||
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
on EBCDIC platforms. It is not possible to use both --enable-unicode and
|
||||||
--enable-ebcdic at the same time.
|
--enable-ebcdic at the same time.
|
||||||
|
|
||||||
When --enable-unicode is specified, as well as supporting UTF strings, PCRE2
|
As well as supporting UTF strings, Unicode support includes support for the
|
||||||
includes support for the \P, \p, and \X sequences that recognize Unicode
|
\P, \p, and \X sequences that recognize Unicode character properties.
|
||||||
character properties. However, only the basic two-letter properties such as
|
However, only the basic two-letter properties such as Lu are supported.
|
||||||
Lu are supported.
|
|
||||||
|
|
||||||
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
. You can build PCRE2 to recognize either CR or LF or the sequence CRLF or any
|
||||||
of the preceding, or any of the Unicode newline sequences as indicating the
|
of the preceding, or any of the Unicode newline sequences as indicating the
|
||||||
|
@ -285,7 +283,7 @@ library. They are also documented in the pcre2build man page.
|
||||||
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
. It is possible to compile PCRE2 for use on systems that use EBCDIC as their
|
||||||
character code (as opposed to ASCII/Unicode) by specifying
|
character code (as opposed to ASCII/Unicode) by specifying
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
This automatically implies --enable-rebuild-chartables (see above). However,
|
This automatically implies --enable-rebuild-chartables (see above). However,
|
||||||
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
when PCRE2 is built this way, it always operates in EBCDIC. It cannot support
|
||||||
|
@ -543,8 +541,8 @@ from pcre2test. Other files whose names begin with "test" are used as working
|
||||||
files in some tests.
|
files in some tests.
|
||||||
|
|
||||||
Some tests are relevant only when certain build-time options were selected. For
|
Some tests are relevant only when certain build-time options were selected. For
|
||||||
example, the tests for UTF-8/16/32 support are run only if --enable-unicode was
|
example, the tests for UTF-8/16/32 features are run only when Unicode support
|
||||||
used. RunTest outputs a comment when it skips a test.
|
is available. RunTest outputs a comment when it skips a test.
|
||||||
|
|
||||||
Many of the tests that are not skipped are run twice if JIT support is
|
Many of the tests that are not skipped are run twice if JIT support is
|
||||||
available. On the second run, JIT compilation is forced. This testing can be
|
available. On the second run, JIT compilation is forced. This testing can be
|
||||||
|
@ -633,7 +631,7 @@ JIT-specific features such as information output from pcre2test about JIT
|
||||||
compilation.
|
compilation.
|
||||||
|
|
||||||
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
The sixteenth and seventeenth tests are run only in 8-bit mode. They check the
|
||||||
POSIX interface to the 8-bit library, withouth and with Unicode support,
|
POSIX interface to the 8-bit library, without and with Unicode support,
|
||||||
respectively.
|
respectively.
|
||||||
|
|
||||||
|
|
||||||
|
@ -828,4 +826,4 @@ The distribution should contain the files listed below.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: ph10
|
Email local part: ph10
|
||||||
Email domain: cam.ac.uk
|
Email domain: cam.ac.uk
|
||||||
Last updated: 25 October 2014
|
Last updated: 03 November 2014
|
||||||
|
|
|
@ -35,9 +35,10 @@ code units, which means that up to three separate libraries may be installed.
|
||||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
Unicode, with support for Unicode general category properties. Unicode is
|
Unicode, with support for Unicode general category properties. Unicode support
|
||||||
optional at build time, and must be enabled explicitly at run time. The version
|
is optional at build time (but is the default); however, processing strings as
|
||||||
of Unicode in use can be discovered by running
|
UTF code units must be enabled explicitly at run time. The version of Unicode
|
||||||
|
in use can be discovered by running
|
||||||
<pre>
|
<pre>
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
</PRE>
|
</PRE>
|
||||||
|
@ -95,13 +96,13 @@ not exported.
|
||||||
<P>
|
<P>
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to supply
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
arbitrary patterns for compilation, you should be aware of a feature that
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
allows users to turn on UTF support from within a pattern. For example, an
|
||||||
was built with Unicode support. For example, an 8-bit pattern that begins with
|
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets
|
||||||
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
patterns and subjects as strings of UTF-8 code units instead of individual
|
||||||
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
8-bit characters. This causes both the pattern and any data against which it is
|
||||||
the pattern and any data against which it is matched to be checked for UTF-8
|
matched to be checked for UTF-8 validity. If the data string is very long, such
|
||||||
validity. If the data string is very long, such a check might use sufficiently
|
a check might use sufficiently many resources as to cause your application to
|
||||||
many resources as to cause your application to lose performance.
|
lose performance.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
One way of guarding against this possibility is to use the
|
One way of guarding against this possibility is to use the
|
||||||
|
@ -173,7 +174,7 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC5" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -115,27 +115,24 @@ to the <b>configure</b> command, as required.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC5" href="#TOC1">Unicode and UTF SUPPORT</a><br>
|
<br><a name="SEC5" href="#TOC1">Unicode and UTF SUPPORT</a><br>
|
||||||
<P>
|
<P>
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character strings.
|
||||||
|
To build it without Unicode support, add
|
||||||
<pre>
|
<pre>
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. This setting applies to all three libraries,
|
to the <b>configure</b> command. This setting applies to all three libraries. It
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit
|
is not possible to build one library with Unicode support, and another without,
|
||||||
library, and support for UTF-32 to the to the 32-bit library.
|
in the same configuration.
|
||||||
It is not possible to build one library with
|
|
||||||
UTF support and another without in the same configuration.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||||
UTF-32. As well as compiling PCRE2 with this option, you also have have to set
|
or UTF-32. To do that you have to set the PCRE2_UTF option when you call
|
||||||
the PCRE2_UTF option when you call <b>pcre2_compile()</b> to compile a pattern.
|
<b>pcre2_compile()</b> to compile a pattern.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2
|
It is not possible to support both EBCDIC and UTF-8 codes in the same version
|
||||||
expects its input to be either ASCII or UTF-8 (depending on the run-time
|
of the library. Consequently, --enable-unicode and --enable-ebcdic are mutually
|
||||||
option). It is not possible to support both EBCDIC and UTF-8 codes in the same
|
exclusive.
|
||||||
version of the library. Consequently, --enable-unicode and --enable-ebcdic are
|
|
||||||
mutually exclusive.
|
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
||||||
|
@ -301,12 +298,12 @@ code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||||
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
||||||
EBCDIC environment by adding
|
EBCDIC environment by adding
|
||||||
<pre>
|
<pre>
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
</pre>
|
</pre>
|
||||||
to the <b>configure</b> command. This setting implies
|
to the <b>configure</b> command. This setting implies
|
||||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
||||||
|
@ -469,7 +466,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -89,11 +89,11 @@ In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as
|
||||||
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
||||||
specified for the 32-bit library, in which case it constrains the character
|
specified for the 32-bit library, in which case it constrains the character
|
||||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||||
built to include Unicode support. When using UTF strings you must either call
|
built to include Unicode support (which is the default). When using UTF strings
|
||||||
the compiling function with the PCRE2_UTF option, or the pattern must start
|
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||||
with the special sequence (*UTF), which is equivalent to setting the relevant
|
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||||
option. How setting a UTF mode affects pattern matching is mentioned in several
|
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||||
places below. There is also a summary of features in the
|
mentioned in several places below. There is also a summary of features in the
|
||||||
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
<a href="pcre2unicode.html"><b>pcre2unicode</b></a>
|
||||||
page.
|
page.
|
||||||
</P>
|
</P>
|
||||||
|
@ -538,9 +538,9 @@ By default, characters whose code points are greater than 127 never match \d,
|
||||||
\s, or \w, and always match \D, \S, and \W, although this may vary for
|
\s, or \w, and always match \D, \S, and \W, although this may vary for
|
||||||
characters in the range 128-255 when locale-specific matching is happening.
|
characters in the range 128-255 when locale-specific matching is happening.
|
||||||
These escape sequences retain their original meanings from before Unicode
|
These escape sequences retain their original meanings from before Unicode
|
||||||
support was available, mainly for efficiency reasons. If PCRE2 is compiled with
|
support was available, mainly for efficiency reasons. If the PCRE2_UCP option
|
||||||
Unicode support, and the PCRE2_UCP option is set, the behaviour is changed so
|
is set, the behaviour is changed so that Unicode properties are used to
|
||||||
that Unicode properties are used to determine character types, as follows:
|
determine character types, as follows:
|
||||||
<pre>
|
<pre>
|
||||||
\d any character that matches \p{Nd} (decimal digit)
|
\d any character that matches \p{Nd} (decimal digit)
|
||||||
\s any character that matches \p{Z} or \h or \v
|
\s any character that matches \p{Z} or \h or \v
|
||||||
|
@ -641,11 +641,11 @@ an error.
|
||||||
Unicode character properties
|
Unicode character properties
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is built with Unicode support, three additional escape sequences
|
When PCRE2 is built with Unicode support (the default), three additional escape
|
||||||
that match characters with specific properties are available. In 8-bit
|
sequences that match characters with specific properties are available. In
|
||||||
non-UTF-8 mode, these sequences are of course limited to testing characters
|
8-bit non-UTF-8 mode, these sequences are of course limited to testing
|
||||||
whose codepoints are less than 256, but they do work in this mode. The extra
|
characters whose codepoints are less than 256, but they do work in this mode.
|
||||||
escape sequences are:
|
The extra escape sequences are:
|
||||||
<pre>
|
<pre>
|
||||||
\p{<i>xx</i>} a character with the <i>xx</i> property
|
\p{<i>xx</i>} a character with the <i>xx</i> property
|
||||||
\P{<i>xx</i>} a character without the <i>xx</i> property
|
\P{<i>xx</i>} a character without the <i>xx</i> property
|
||||||
|
@ -3193,7 +3193,7 @@ Cambridge CB2 3QH, England.
|
||||||
</P>
|
</P>
|
||||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 19 October 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
|
@ -16,11 +16,12 @@ please consult the man page, in case the conversion went wrong.
|
||||||
UNICODE AND UTF SUPPORT
|
UNICODE AND UTF SUPPORT
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
|
knowledge of Unicode character properties and can process text strings in
|
||||||
format (depending on the code unit width). By default, PCRE2 assumes that one
|
UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
|
||||||
code unit is one character. To process a pattern as a UTF string, where a
|
default, PCRE2 assumes that one code unit is one character. To process a
|
||||||
character may require more than one code unit, you must call
|
pattern as a UTF string, where a character may require more than one code unit,
|
||||||
|
you must call
|
||||||
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
<a href="pcre2_compile.html"><b>pcre2_compile()</b></a>
|
||||||
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
(*UTF). When either of these is the case, both the pattern and any subject
|
(*UTF). When either of these is the case, both the pattern and any subject
|
||||||
|
@ -28,9 +29,8 @@ strings that are matched against it are treated as UTF strings instead of
|
||||||
strings of individual one-code-unit characters.
|
strings of individual one-code-unit characters.
|
||||||
</P>
|
</P>
|
||||||
<P>
|
<P>
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger, but the
|
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||||
additional run time overhead is limited to testing the PCRE2_UTF flag
|
case the library will be smaller.
|
||||||
occasionally, so should not be very much.
|
|
||||||
</P>
|
</P>
|
||||||
<br><b>
|
<br><b>
|
||||||
UNICODE PROPERTY SUPPORT
|
UNICODE PROPERTY SUPPORT
|
||||||
|
@ -261,7 +261,7 @@ Cambridge CB2 3QH, England.
|
||||||
REVISION
|
REVISION
|
||||||
</b><br>
|
</b><br>
|
||||||
<P>
|
<P>
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
<br>
|
<br>
|
||||||
Copyright © 1997-2014 University of Cambridge.
|
Copyright © 1997-2014 University of Cambridge.
|
||||||
<br>
|
<br>
|
||||||
|
|
25
doc/pcre2.3
25
doc/pcre2.3
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2 3 "28 September 2014" "PCRE2 10.00"
|
.TH PCRE2 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH INTRODUCTION
|
.SH INTRODUCTION
|
||||||
|
@ -17,9 +17,10 @@ code units, which means that up to three separate libraries may be installed.
|
||||||
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
The original work to extend PCRE to 16-bit and 32-bit code units was done by
|
||||||
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
Zoltan Herczeg and Christian Persch, respectively. In all three cases, strings
|
||||||
can be interpreted either as one character per code unit, or as UTF-encoded
|
can be interpreted either as one character per code unit, or as UTF-encoded
|
||||||
Unicode, with support for Unicode general category properties. Unicode is
|
Unicode, with support for Unicode general category properties. Unicode support
|
||||||
optional at build time, and must be enabled explicitly at run time. The version
|
is optional at build time (but is the default); however, processing strings as
|
||||||
of Unicode in use can be discovered by running
|
UTF code units must be enabled explicitly at run time. The version of Unicode
|
||||||
|
in use can be discovered by running
|
||||||
.sp
|
.sp
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
.P
|
.P
|
||||||
|
@ -91,13 +92,13 @@ not exported.
|
||||||
.sp
|
.sp
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to supply
|
If you are using PCRE2 in a non-UTF application that permits users to supply
|
||||||
arbitrary patterns for compilation, you should be aware of a feature that
|
arbitrary patterns for compilation, you should be aware of a feature that
|
||||||
allows users to turn on UTF support from within a pattern, provided that PCRE2
|
allows users to turn on UTF support from within a pattern. For example, an
|
||||||
was built with Unicode support. For example, an 8-bit pattern that begins with
|
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which interprets
|
||||||
"(*UTF)" turns on UTF-8 mode, which interprets patterns and subjects as strings
|
patterns and subjects as strings of UTF-8 code units instead of individual
|
||||||
of UTF-8 code units instead of individual 8-bit characters. This causes both
|
8-bit characters. This causes both the pattern and any data against which it is
|
||||||
the pattern and any data against which it is matched to be checked for UTF-8
|
matched to be checked for UTF-8 validity. If the data string is very long, such
|
||||||
validity. If the data string is very long, such a check might use sufficiently
|
a check might use sufficiently many resources as to cause your application to
|
||||||
many resources as to cause your application to lose performance.
|
lose performance.
|
||||||
.P
|
.P
|
||||||
One way of guarding against this possibility is to use the
|
One way of guarding against this possibility is to use the
|
||||||
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
\fBpcre2_pattern_info()\fP function to check the compiled pattern's options for
|
||||||
|
@ -175,6 +176,6 @@ use my two initials, followed by the two digits 10, at the domain cam.ac.uk.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -32,9 +32,10 @@ INTRODUCTION
|
||||||
code units was done by Zoltan Herczeg and Christian Persch, respec-
|
code units was done by Zoltan Herczeg and Christian Persch, respec-
|
||||||
tively. In all three cases, strings can be interpreted either as one
|
tively. In all three cases, strings can be interpreted either as one
|
||||||
character per code unit, or as UTF-encoded Unicode, with support for
|
character per code unit, or as UTF-encoded Unicode, with support for
|
||||||
Unicode general category properties. Unicode is optional at build time,
|
Unicode general category properties. Unicode support is optional at
|
||||||
and must be enabled explicitly at run time. The version of Unicode in
|
build time (but is the default); however, processing strings as UTF
|
||||||
use can be discovered by running
|
code units must be enabled explicitly at run time. The version of Uni-
|
||||||
|
code in use can be discovered by running
|
||||||
|
|
||||||
pcre2test -C
|
pcre2test -C
|
||||||
|
|
||||||
|
@ -76,14 +77,14 @@ SECURITY CONSIDERATIONS
|
||||||
|
|
||||||
If you are using PCRE2 in a non-UTF application that permits users to
|
If you are using PCRE2 in a non-UTF application that permits users to
|
||||||
supply arbitrary patterns for compilation, you should be aware of a
|
supply arbitrary patterns for compilation, you should be aware of a
|
||||||
feature that allows users to turn on UTF support from within a pattern,
|
feature that allows users to turn on UTF support from within a pattern.
|
||||||
provided that PCRE2 was built with Unicode support. For example, an
|
For example, an 8-bit pattern that begins with "(*UTF)" turns on UTF-8
|
||||||
8-bit pattern that begins with "(*UTF)" turns on UTF-8 mode, which
|
mode, which interprets patterns and subjects as strings of UTF-8 code
|
||||||
interprets patterns and subjects as strings of UTF-8 code units instead
|
units instead of individual 8-bit characters. This causes both the pat-
|
||||||
of individual 8-bit characters. This causes both the pattern and any
|
tern and any data against which it is matched to be checked for UTF-8
|
||||||
data against which it is matched to be checked for UTF-8 validity. If
|
validity. If the data string is very long, such a check might use suf-
|
||||||
the data string is very long, such a check might use sufficiently many
|
ficiently many resources as to cause your application to lose perfor-
|
||||||
resources as to cause your application to lose performance.
|
mance.
|
||||||
|
|
||||||
One way of guarding against this possibility is to use the pcre2_pat-
|
One way of guarding against this possibility is to use the pcre2_pat-
|
||||||
tern_info() function to check the compiled pattern's options for UTF.
|
tern_info() function to check the compiled pattern's options for UTF.
|
||||||
|
@ -155,7 +156,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -2619,26 +2620,22 @@ BUILDING SHARED AND STATIC LIBRARIES
|
||||||
|
|
||||||
Unicode and UTF SUPPORT
|
Unicode and UTF SUPPORT
|
||||||
|
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character
|
||||||
|
strings. To build it without Unicode support, add
|
||||||
|
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting applies to all three libraries,
|
to the configure command. This setting applies to all three libraries.
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to
|
It is not possible to build one library with Unicode support, and
|
||||||
the 16-bit library, and support for UTF-32 to the to the 32-bit
|
another without, in the same configuration.
|
||||||
library. It is not possible to build one library with UTF support and
|
|
||||||
another without in the same configuration.
|
|
||||||
|
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8,
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8,
|
||||||
UTF-16 or UTF-32. As well as compiling PCRE2 with this option, you also
|
UTF-16 or UTF-32. To do that you have to set the PCRE2_UTF option
|
||||||
have have to set the PCRE2_UTF option when you call pcre2_compile() to
|
when you call pcre2_compile() to compile a pattern.
|
||||||
compile a pattern.
|
|
||||||
|
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment,
|
It is not possible to support both EBCDIC and UTF-8 codes in the same
|
||||||
PCRE2 expects its input to be either ASCII or UTF-8 (depending on the
|
version of the library. Consequently, --enable-unicode and --enable-
|
||||||
run-time option). It is not possible to support both EBCDIC and UTF-8
|
ebcdic are mutually exclusive.
|
||||||
codes in the same version of the library. Consequently, --enable-uni-
|
|
||||||
code and --enable-ebcdic are mutually exclusive.
|
|
||||||
|
|
||||||
UTF support allows the libraries to process character codepoints up to
|
UTF support allows the libraries to process character codepoints up to
|
||||||
0x10ffff in the strings that they handle. It also provides support for
|
0x10ffff in the strings that they handle. It also provides support for
|
||||||
|
@ -2809,12 +2806,12 @@ USING EBCDIC CODE
|
||||||
This is the case for most computer operating systems. PCRE2 can, how-
|
This is the case for most computer operating systems. PCRE2 can, how-
|
||||||
ever, be compiled to run in an EBCDIC environment by adding
|
ever, be compiled to run in an EBCDIC environment by adding
|
||||||
|
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
|
|
||||||
to the configure command. This setting implies --enable-rebuild-charta-
|
to the configure command. This setting implies --enable-rebuild-charta-
|
||||||
bles. You should only use it if you know that you are in an EBCDIC
|
bles. You should only use it if you know that you are in an EBCDIC
|
||||||
environment (for example, an IBM mainframe operating system). The
|
environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
|
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have
|
||||||
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
the value 0x15 by default. However, in some EBCDIC environments, 0x25
|
||||||
|
@ -2978,7 +2975,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -4511,20 +4508,19 @@ NAME
|
||||||
|
|
||||||
UNICODE AND UTF SUPPORT
|
UNICODE AND UTF SUPPORT
|
||||||
|
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Uni-
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
code character properties and can process text strings in UTF-8,
|
knowledge of Unicode character properties and can process text strings
|
||||||
UTF-16, or UTF-32 format (depending on the code unit width). By
|
in UTF-8, UTF-16, or UTF-32 format (depending on the code unit width).
|
||||||
default, PCRE2 assumes that one code unit is one character. To process
|
However, by default, PCRE2 assumes that one code unit is one character.
|
||||||
a pattern as a UTF string, where a character may require more than one
|
To process a pattern as a UTF string, where a character may require
|
||||||
code unit, you must call pcre2_compile() with the PCRE2_UTF option
|
more than one code unit, you must call pcre2_compile() with the
|
||||||
flag, or the pattern must start with the sequence (*UTF). When either
|
PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
of these is the case, both the pattern and any subject strings that are
|
(*UTF). When either of these is the case, both the pattern and any sub-
|
||||||
matched against it are treated as UTF strings instead of strings of
|
ject strings that are matched against it are treated as UTF strings
|
||||||
individual one-code-unit characters.
|
instead of strings of individual one-code-unit characters.
|
||||||
|
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger,
|
If you do not need Unicode support you can build PCRE2 without it, in
|
||||||
but the additional run time overhead is limited to testing the
|
which case the library will be smaller.
|
||||||
PCRE2_UTF flag occasionally, so should not be very much.
|
|
||||||
|
|
||||||
|
|
||||||
UNICODE PROPERTY SUPPORT
|
UNICODE PROPERTY SUPPORT
|
||||||
|
@ -4723,7 +4719,7 @@ AUTHOR
|
||||||
|
|
||||||
REVISION
|
REVISION
|
||||||
|
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
------------------------------------------------------------------------------
|
------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2BUILD 3 "28 Sepember 2014" "PCRE2 10.00"
|
.TH PCRE2BUILD 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.
|
.
|
||||||
|
@ -102,25 +102,22 @@ to the \fBconfigure\fP command, as required.
|
||||||
.SH "Unicode and UTF SUPPORT"
|
.SH "Unicode and UTF SUPPORT"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
To build PCRE2 with support for Unicode and UTF character strings, add
|
By default, PCRE2 is built with support for Unicode and UTF character strings.
|
||||||
|
To build it without Unicode support, add
|
||||||
.sp
|
.sp
|
||||||
--enable-unicode
|
--disable-unicode
|
||||||
.sp
|
.sp
|
||||||
to the \fBconfigure\fP command. This setting applies to all three libraries,
|
to the \fBconfigure\fP command. This setting applies to all three libraries. It
|
||||||
adding support for UTF-8 to the 8-bit library, support for UTF-16 to the 16-bit
|
is not possible to build one library with Unicode support, and another without,
|
||||||
library, and support for UTF-32 to the to the 32-bit library.
|
in the same configuration.
|
||||||
It is not possible to build one library with
|
|
||||||
UTF support and another without in the same configuration.
|
|
||||||
.P
|
.P
|
||||||
Of itself, this setting does not make PCRE2 treat strings as UTF-8, UTF-16 or
|
Of itself, Unicode support does not make PCRE2 treat strings as UTF-8, UTF-16
|
||||||
UTF-32. As well as compiling PCRE2 with this option, you also have have to set
|
or UTF-32. To do that you have to set the PCRE2_UTF option when you call
|
||||||
the PCRE2_UTF option when you call \fBpcre2_compile()\fP to compile a pattern.
|
\fBpcre2_compile()\fP to compile a pattern.
|
||||||
.P
|
.P
|
||||||
If you set --enable-unicode when compiling in an EBCDIC environment, PCRE2
|
It is not possible to support both EBCDIC and UTF-8 codes in the same version
|
||||||
expects its input to be either ASCII or UTF-8 (depending on the run-time
|
of the library. Consequently, --enable-unicode and --enable-ebcdic are mutually
|
||||||
option). It is not possible to support both EBCDIC and UTF-8 codes in the same
|
exclusive.
|
||||||
version of the library. Consequently, --enable-unicode and --enable-ebcdic are
|
|
||||||
mutually exclusive.
|
|
||||||
.P
|
.P
|
||||||
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
UTF support allows the libraries to process character codepoints up to 0x10ffff
|
||||||
in the strings that they handle. It also provides support for accessing the
|
in the strings that they handle. It also provides support for accessing the
|
||||||
|
@ -306,12 +303,12 @@ code is ASCII (or Unicode, which is a superset of ASCII). This is the case for
|
||||||
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
most computer operating systems. PCRE2 can, however, be compiled to run in an
|
||||||
EBCDIC environment by adding
|
EBCDIC environment by adding
|
||||||
.sp
|
.sp
|
||||||
--enable-ebcdic
|
--enable-ebcdic --disable-unicode
|
||||||
.sp
|
.sp
|
||||||
to the \fBconfigure\fP command. This setting implies
|
to the \fBconfigure\fP command. This setting implies
|
||||||
--enable-rebuild-chartables. You should only use it if you know that you are in
|
--enable-rebuild-chartables. You should only use it if you know that you are in
|
||||||
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
an EBCDIC environment (for example, an IBM mainframe operating system). The
|
||||||
--enable-ebcdic option is incompatible with --enable-unicode.
|
--enable-ebcdic option is incompatible with Unicode support.
|
||||||
.P
|
.P
|
||||||
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
The EBCDIC character that corresponds to an ASCII LF is assumed to have the
|
||||||
value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In
|
value 0x15 by default. However, in some EBCDIC environments, 0x25 is used. In
|
||||||
|
@ -485,6 +482,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 28 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2PATTERN 3 "19 October 2014" "PCRE2 10.00"
|
.TH PCRE2PATTERN 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||||
|
@ -51,11 +51,11 @@ In the 8-bit and 16-bit PCRE2 libraries, characters may be coded either as
|
||||||
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
single code units, or as multiple UTF-8 or UTF-16 code units. UTF-32 can be
|
||||||
specified for the 32-bit library, in which case it constrains the character
|
specified for the 32-bit library, in which case it constrains the character
|
||||||
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
values to valid Unicode code points. To process UTF strings, PCRE2 must be
|
||||||
built to include Unicode support. When using UTF strings you must either call
|
built to include Unicode support (which is the default). When using UTF strings
|
||||||
the compiling function with the PCRE2_UTF option, or the pattern must start
|
you must either call the compiling function with the PCRE2_UTF option, or the
|
||||||
with the special sequence (*UTF), which is equivalent to setting the relevant
|
pattern must start with the special sequence (*UTF), which is equivalent to
|
||||||
option. How setting a UTF mode affects pattern matching is mentioned in several
|
setting the relevant option. How setting a UTF mode affects pattern matching is
|
||||||
places below. There is also a summary of features in the
|
mentioned in several places below. There is also a summary of features in the
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2unicode\fP
|
\fBpcre2unicode\fP
|
||||||
.\"
|
.\"
|
||||||
|
@ -540,9 +540,9 @@ By default, characters whose code points are greater than 127 never match \ed,
|
||||||
\es, or \ew, and always match \eD, \eS, and \eW, although this may vary for
|
\es, or \ew, and always match \eD, \eS, and \eW, although this may vary for
|
||||||
characters in the range 128-255 when locale-specific matching is happening.
|
characters in the range 128-255 when locale-specific matching is happening.
|
||||||
These escape sequences retain their original meanings from before Unicode
|
These escape sequences retain their original meanings from before Unicode
|
||||||
support was available, mainly for efficiency reasons. If PCRE2 is compiled with
|
support was available, mainly for efficiency reasons. If the PCRE2_UCP option
|
||||||
Unicode support, and the PCRE2_UCP option is set, the behaviour is changed so
|
is set, the behaviour is changed so that Unicode properties are used to
|
||||||
that Unicode properties are used to determine character types, as follows:
|
determine character types, as follows:
|
||||||
.sp
|
.sp
|
||||||
\ed any character that matches \ep{Nd} (decimal digit)
|
\ed any character that matches \ep{Nd} (decimal digit)
|
||||||
\es any character that matches \ep{Z} or \eh or \ev
|
\es any character that matches \ep{Z} or \eh or \ev
|
||||||
|
@ -645,11 +645,11 @@ an error.
|
||||||
.SS Unicode character properties
|
.SS Unicode character properties
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
When PCRE2 is built with Unicode support, three additional escape sequences
|
When PCRE2 is built with Unicode support (the default), three additional escape
|
||||||
that match characters with specific properties are available. In 8-bit
|
sequences that match characters with specific properties are available. In
|
||||||
non-UTF-8 mode, these sequences are of course limited to testing characters
|
8-bit non-UTF-8 mode, these sequences are of course limited to testing
|
||||||
whose codepoints are less than 256, but they do work in this mode. The extra
|
characters whose codepoints are less than 256, but they do work in this mode.
|
||||||
escape sequences are:
|
The extra escape sequences are:
|
||||||
.sp
|
.sp
|
||||||
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
\ep{\fIxx\fP} a character with the \fIxx\fP property
|
||||||
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
\eP{\fIxx\fP} a character without the \fIxx\fP property
|
||||||
|
@ -3236,6 +3236,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 19 October 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
.TH PCRE2UNICODE 3 "16 September 2014" "PCRE2 10.00"
|
.TH PCRE2UNICODE 3 "03 November 2014" "PCRE2 10.00"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE - Perl-compatible regular expressions (revised API)
|
PCRE - Perl-compatible regular expressions (revised API)
|
||||||
.SH "UNICODE AND UTF SUPPORT"
|
.SH "UNICODE AND UTF SUPPORT"
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
|
When PCRE2 is built with Unicode support (which is the default), it has
|
||||||
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
|
knowledge of Unicode character properties and can process text strings in
|
||||||
format (depending on the code unit width). By default, PCRE2 assumes that one
|
UTF-8, UTF-16, or UTF-32 format (depending on the code unit width). However, by
|
||||||
code unit is one character. To process a pattern as a UTF string, where a
|
default, PCRE2 assumes that one code unit is one character. To process a
|
||||||
character may require more than one code unit, you must call
|
pattern as a UTF string, where a character may require more than one code unit,
|
||||||
|
you must call
|
||||||
.\" HREF
|
.\" HREF
|
||||||
\fBpcre2_compile()\fP
|
\fBpcre2_compile()\fP
|
||||||
.\"
|
.\"
|
||||||
|
@ -17,9 +18,8 @@ with the PCRE2_UTF option flag, or the pattern must start with the sequence
|
||||||
strings that are matched against it are treated as UTF strings instead of
|
strings that are matched against it are treated as UTF strings instead of
|
||||||
strings of individual one-code-unit characters.
|
strings of individual one-code-unit characters.
|
||||||
.P
|
.P
|
||||||
If you build PCRE2 with Unicode support, the library will be bigger, but the
|
If you do not need Unicode support you can build PCRE2 without it, in which
|
||||||
additional run time overhead is limited to testing the PCRE2_UTF flag
|
case the library will be smaller.
|
||||||
occasionally, so should not be very much.
|
|
||||||
.
|
.
|
||||||
.
|
.
|
||||||
.SH "UNICODE PROPERTY SUPPORT"
|
.SH "UNICODE PROPERTY SUPPORT"
|
||||||
|
@ -249,6 +249,6 @@ Cambridge CB2 3QH, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 16 September 2014
|
Last updated: 03 November 2014
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2014 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -215,7 +215,7 @@ if [ $ISGCC -ne 0 -a $usemain -ne 0 ]; then
|
||||||
echo "---------- Maximally configured test with -O2 ----------"
|
echo "---------- Maximally configured test with -O2 ----------"
|
||||||
SAVECLFAGS="$CFLAGS"
|
SAVECLFAGS="$CFLAGS"
|
||||||
CFLAGS="$CFLAGS -O2"
|
CFLAGS="$CFLAGS -O2"
|
||||||
opts="--disable-shared --enable-unicode $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
opts="--disable-shared $enable_jit --enable-pcre2-16 --enable-pcre2-32"
|
||||||
runtest
|
runtest
|
||||||
CFLAGS="$SAVECFLAGS"
|
CFLAGS="$SAVECFLAGS"
|
||||||
fi
|
fi
|
||||||
|
@ -224,25 +224,25 @@ if [ $usemain -ne 0 ]; then
|
||||||
echo "---------- Non-JIT tests in the current directory ----------"
|
echo "---------- Non-JIT tests in the current directory ----------"
|
||||||
for opts in \
|
for opts in \
|
||||||
"" \
|
"" \
|
||||||
"--enable-unicode --disable-static" \
|
"--disable-static" \
|
||||||
|
"--disable-shared" \
|
||||||
|
"--disable-unicode --disable-stack-for-recursion --disable-shared" \
|
||||||
"--disable-stack-for-recursion --disable-shared" \
|
"--disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-unicode --disable-shared" \
|
"--with-link-size=3 --disable-shared" \
|
||||||
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-rebuild-chartables --disable-shared" \
|
||||||
"--enable-unicode --with-link-size=3 --disable-shared" \
|
"--disable-unicode --enable-newline-is-any --disable-shared" \
|
||||||
"--enable-rebuild-chartables --disable-shared" \
|
"--disable-unicode --enable-newline-is-cr --disable-shared" \
|
||||||
"--enable-newline-is-any --disable-shared" \
|
"--disable-unicode --enable-newline-is-crlf --disable-shared" \
|
||||||
"--enable-newline-is-cr --disable-shared" \
|
"--disable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
|
||||||
"--enable-newline-is-crlf --disable-shared" \
|
"--enable-newline-is-any --disable-stack-for-recursion --disable-static" \
|
||||||
"--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
|
"--disable-unicode --enable-pcre2-16" \
|
||||||
"--enable-unicode --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
|
"--disable-unicode --enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-16" \
|
|
||||||
"--enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
"--enable-pcre2-16 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-16 --enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-pcre2-32" \
|
||||||
"--enable-pcre2-32" \
|
"--disable-unicode --enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
"--enable-pcre2-32 --disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-pcre2-32 --enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
|
||||||
"--enable-pcre2-32 --enable-pcre2-16 --disable-shared" \
|
"--disable-unicode --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
|
||||||
"--enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --disable-shared"
|
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
@ -253,19 +253,19 @@ fi
|
||||||
if [ $usejit -ne 0 ]; then
|
if [ $usejit -ne 0 ]; then
|
||||||
echo "---------- JIT tests in the current directory ----------"
|
echo "---------- JIT tests in the current directory ----------"
|
||||||
for opts in \
|
for opts in \
|
||||||
|
"--disable-unicode --enable-jit --disable-shared" \
|
||||||
"--enable-jit --disable-shared" \
|
"--enable-jit --disable-shared" \
|
||||||
"--enable-jit --enable-unicode --disable-shared" \
|
"--enable-jit --with-link-size=3 --disable-shared" \
|
||||||
"--enable-jit --enable-unicode --with-link-size=3 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --disable-shared" \
|
"--disable-unicode --enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --disable-pcre2-8 --enable-unicode --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --with-link-size=3 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --with-link-size=3 --disable-shared" \
|
"--enable-jit --enable-pcre2-16 --with-link-size=4 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-unicode --with-link-size=4 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --enable-unicode --disable-shared" \
|
"--disable-unicode --enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --disable-pcre2-8 --enable-unicode --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --with-link-size=4 --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-32 --enable-unicode --with-link-size=4 --disable-shared" \
|
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
|
||||||
"--enable-jit --enable-pcre2-32 --enable-pcre2-16 --disable-pcre2-8 --enable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
|
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
@ -280,9 +280,9 @@ if [ $usevalgrind -ne 0 ]; then
|
||||||
withvalgrind="with valgrind"
|
withvalgrind="with valgrind"
|
||||||
|
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
|
"--disable-stack-for-recursion --disable-shared" \
|
||||||
"--enable-unicode --with-link-size=3 --disable-shared" \
|
"--with-link-size=3 --disable-shared" \
|
||||||
"--disable-shared"
|
"--disable-unicode --disable-shared"
|
||||||
do
|
do
|
||||||
opts="--enable-valgrind $opts"
|
opts="--enable-valgrind $opts"
|
||||||
runtest
|
runtest
|
||||||
|
@ -290,8 +290,8 @@ if [ $usevalgrind -ne 0 ]; then
|
||||||
|
|
||||||
if [ $usejit -ne 0 ]; then
|
if [ $usejit -ne 0 ]; then
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-jit --enable-unicode --disable-shared" \
|
"--enable-jit --disable-shared" \
|
||||||
"--enable-jit --enable-pcre2-16 --enable-pcre2-32 --enable-unicode"
|
"--enable-jit --enable-pcre2-16 --enable-pcre2-32"
|
||||||
do
|
do
|
||||||
opts="--enable-valgrind $opts"
|
opts="--enable-valgrind $opts"
|
||||||
runtest
|
runtest
|
||||||
|
@ -337,7 +337,7 @@ fi
|
||||||
|
|
||||||
if [ $usetmp -ne 0 ]; then
|
if [ $usetmp -ne 0 ]; then
|
||||||
for opts in \
|
for opts in \
|
||||||
"--enable-unicode --disable-shared"
|
"--disable-shared"
|
||||||
do
|
do
|
||||||
runtest
|
runtest
|
||||||
done
|
done
|
||||||
|
|
|
@ -1444,7 +1444,7 @@ the three different cases. */
|
||||||
#define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_8(G(b,8))
|
#define PCRE2_GET_STARTCHAR(a,b) a = pcre2_get_startchar_8(G(b,8))
|
||||||
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b)
|
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b)
|
||||||
#define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_8(G(a,8))
|
#define PCRE2_JIT_FREE_UNUSED_MEMORY(a) pcre2_jit_free_unused_memory_8(G(a,8))
|
||||||
#define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h,(pcre2_jit_stack_8 *)i) \
|
#define PCRE2_JIT_MATCH(a,b,c,d,e,f,g,h,i) \
|
||||||
a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
a = pcre2_jit_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8), \
|
||||||
(pcre2_jit_stack_8 *)i)
|
(pcre2_jit_stack_8 *)i)
|
||||||
#define PCRE2_JIT_STACK_CREATE(a,b,c,d) \
|
#define PCRE2_JIT_STACK_CREATE(a,b,c,d) \
|
||||||
|
|
Loading…
Reference in New Issue