API documentation and a lot of little related changes to the code.

This commit is contained in:
Philip.Hazel 2014-09-19 07:43:39 +00:00
parent de4f203346
commit eee8530add
40 changed files with 3484 additions and 459 deletions

View File

@ -149,8 +149,8 @@ SET(PCRE2_SUPPORT_JIT OFF CACHE BOOL
SET(PCRE2_SUPPORT_PCRE2GREP_JIT ON CACHE BOOL
"Enable use of Just-in-time compiling in pcre2grep.")
SET(PCRE2_SUPPORT_UTF OFF CACHE BOOL
"Enable support for Unicode Transformation Format (UTF-8/UTF-16/UTF-32) encoding.")
SET(PCRE2_SUPPORT_UNICODE OFF CACHE BOOL
"Enable support for Unicode and UTF-8/UTF-16/UTF-32 encoding.")
SET(PCRE2_SUPPORT_BSR_ANYCRLF OFF CACHE BOOL
"ON=Backslash-R matches only LF CR and CRLF, OFF=Backslash-R matches all Unicode Linebreaks")
@ -245,9 +245,9 @@ IF(PCRE2_SUPPORT_BSR_ANYCRLF)
SET(BSR_ANYCRLF 1)
ENDIF(PCRE2_SUPPORT_BSR_ANYCRLF)
IF(PCRE2_SUPPORT_UTF)
SET(SUPPORT_UTF 1)
ENDIF(PCRE2_SUPPORT_UTF)
IF(PCRE2_SUPPORT_UNICODE)
SET(SUPPORT_UNICODE 1)
ENDIF(PCRE2_SUPPORT_UNICODE)
IF(PCRE2_SUPPORT_JIT)
SET(SUPPORT_JIT 1)
@ -709,7 +709,7 @@ IF(PCRE2_SHOW_REPORT)
MESSAGE(STATUS " Build 16 bit PCRE2 library ...... : ${PCRE2_BUILD_PCRE16}")
MESSAGE(STATUS " Build 32 bit PCRE2 library ...... : ${PCRE2_BUILD_PCRE32}")
MESSAGE(STATUS " Enable JIT compiling support .... : ${PCRE2_SUPPORT_JIT}")
MESSAGE(STATUS " Enable UTF support .............. : ${PCRE2_SUPPORT_UTF}")
MESSAGE(STATUS " Enable Unicode support .......... : ${PCRE2_SUPPORT_UNICODE}")
MESSAGE(STATUS " Newline char/sequence ........... : ${PCRE2_NEWLINE}")
MESSAGE(STATUS " \\R matches only ANYCRLF ......... : ${PCRE2_SUPPORT_BSR_ANYCRLF}")
MESSAGE(STATUS " EBCDIC coding ................... : ${PCRE2_EBCDIC}")

View File

@ -76,7 +76,10 @@ AM_CPPFLAGS = -I$(builddir)/src -I$(srcdir)/src
# doc/html/pcreunicode.html
# FIXME
#dist_man_MANS = \
dist_man_MANS = \
doc/pcre2api.3
# doc/pcre2-config.1 \
# doc/pcre2.3 \
# doc/pcre2-16.3 \
@ -108,7 +111,6 @@ AM_CPPFLAGS = -I$(builddir)/src -I$(srcdir)/src
# doc/pcre2_utf16_to_host_byte_order.3 \
# doc/pcre2_utf32_to_host_byte_order.3 \
# doc/pcre2_version.3 \
# doc/pcre2api.3 \
# doc/pcre2build.3 \
# doc/pcre2callout.3 \
# doc/pcre2compat.3 \

View File

@ -314,10 +314,11 @@ else
fi
fi
# UTF support always applies to all bit sizes if both are supported; we can't
# have UTF-8 support without UTF-16 or UTF-32 support.
# UTF support is implied by Unicode support, and it always applies to all bit
# sizes if both are supported; we can't have UTF-8 support without UTF-16 or
# UTF-32 support.
$sim ./pcre2test -C utf >/dev/null
$sim ./pcre2test -C unicode >/dev/null
utf=$?
jitopt=

View File

@ -25,7 +25,7 @@
#cmakedefine SUPPORT_JIT 1
#cmakedefine SUPPORT_PCRE2GREP_JIT 1
#cmakedefine SUPPORT_UTF 1
#cmakedefine SUPPORT_UNICODE 1
#cmakedefine SUPPORT_VALGRIND 1
#cmakedefine BSR_ANYCRLF 1

View File

@ -137,11 +137,11 @@ AC_ARG_ENABLE(rebuild-chartables,
[rebuild character tables in current locale]),
, enable_rebuild_chartables=no)
# Handle --enable-utf (disabled by default)
AC_ARG_ENABLE(utf,
AS_HELP_STRING([--enable-utf],
[enable UTF-8/16/32 support (incompatible with --enable-ebcdic)]),
, enable_utf=unset)
# Handle --enable-unicode (disabled by default)
AC_ARG_ENABLE(unicode,
AS_HELP_STRING([--enable-unicode],
[enable Unicode support (incompatible with --enable-ebcdic)]),
, enable_unicode=unset)
# Handle newline options
ac_pcre2_newline=lf
@ -288,10 +288,10 @@ then
AC_MSG_ERROR([At least one of the 8, 16 or 32 bit libraries must be enabled])
fi
# enable_utf is disabled by default.
if test "x$enable_utf" = "xunset"
# enable_unicode is disabled by default.
if test "x$enable_unicode" = "xunset"
then
enable_utf=no
enable_unicode=no
fi
# Convert the newline identifier into the appropriate integer value. These must
@ -320,8 +320,8 @@ fi
#
if test "x$enable_ebcdic" = "xyes"; then
enable_rebuild_chartables=yes
if test "x$enable_utf" = "xyes"; then
AC_MSG_ERROR([support for EBCDIC and UTF-8/16/32 cannot be enabled at the same time])
if test "x$enable_unicode" = "xyes"; then
AC_MSG_ERROR([support for EBCDIC and Unicode cannot be enabled at the same time])
fi
fi
@ -372,7 +372,7 @@ AM_CONDITIONAL(WITH_PCRE16, test "x$enable_pcre16" = "xyes")
AM_CONDITIONAL(WITH_PCRE32, test "x$enable_pcre32" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes")
AM_CONDITIONAL(WITH_UTF, test "x$enable_utf" = "xyes")
AM_CONDITIONAL(WITH_UNICODE, test "x$enable_unicode" = "xyes")
AM_CONDITIONAL(WITH_VALGRIND, test "x$enable_valgrind" = "xyes")
# Checks for typedefs, structures, and compiler characteristics.
@ -513,12 +513,12 @@ if test "$enable_pcre2grep_jit" = "yes"; then
Define to any value to enable JIT support in pcre2grep.])
fi
if test "$enable_utf" = "yes"; then
AC_DEFINE([SUPPORT_UTF], [], [
Define to any value to enable support for the UTF-8/16/32 Unicode encoding.
if test "$enable_unicode" = "yes"; then
AC_DEFINE([SUPPORT_UNICODE], [], [
Define to any value to enable support for Unicode and UTF encoding.
This will work even in an EBCDIC environment, but it is incompatible
with the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC
code *or* ASCII/UTF-8/16/32, but not both at once.])
code *or* ASCII/Unicode, but not both at once.])
fi
if test "$enable_stack_for_recursion" = "no"; then
@ -854,7 +854,7 @@ $PACKAGE-$VERSION configuration summary:
Build 16-bit pcre2 library ...... : ${enable_pcre16}
Build 32-bit pcre2 library ...... : ${enable_pcre32}
Enable JIT compiling support .... : ${enable_jit}
Enable UTF-8/16/32 support ...... : ${enable_utf}
Enable Unicode support .......... : ${enable_unicode}
Newline char/sequence ........... : ${enable_newline}
\R matches only ANYCRLF ......... : ${enable_bsr_anycrlf}
EBCDIC coding ................... : ${enable_ebcdic}

2704
doc/pcre2api.3 Normal file

File diff suppressed because it is too large Load Diff

View File

@ -120,7 +120,7 @@ to the same value:
pcre16 the 16-bit library was built
pcre32 the 32-bit library was built
pcre8 the 8-bit library was built
utf UTF and Unicode property support is available
unicode Unicode support is available
.sp
If an unknown option is given, an error message is output; the exit code is 0.
.TP 10

254
doc/pcre2unicode.3 Normal file
View File

@ -0,0 +1,254 @@
.TH PCRE2UNICODE 3 "16 September 2014" "PCRE2 10.00"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "UNICODE AND UTF SUPPORT"
.rs
.sp
When PCRE2 is built with Unicode support, it acquires knowledge of Unicode
character properties and can process text strings in UTF-8, UTF-16, or UTF-32
format (depending on the code unit width). By default, PCRE2 assumes that one
code unit is one character. To process a pattern as a UTF string, where a
character may require more than one code unit, you must call
.\" HREF
\fBpcre2_compile()\fP
.\"
with the PCRE2_UTF option flag, or the pattern must start with the sequence
(*UTF). When either of these is the case, both the pattern and any subject
strings that are matched against it are treated as UTF strings instead of
strings of individual one-code-unit characters.
.P
If you build PCRE2 with Unicode support, the library will be bigger, but the
additional run time overhead is limited to testing the PCRE2_UTF flag
occasionally, so should not be very much.
.
.
.SH "UNICODE PROPERTY SUPPORT"
.rs
.sp
When PCRE2 is built with Unicode support, the escape sequences \ep{..},
\eP{..}, and \eX can be used. The Unicode properties that can be tested are
limited to the general category properties such as Lu for an upper case letter
or Nd for a decimal number, the Unicode script names such as Arabic or Han, and
the derived properties Any and L&. Full lists are given in the
.\" HREF
\fBpcre2pattern\fP
.\"
and
.\" HREF
\fBpcre2syntax\fP
.\"
documentation. Only the short names for properties are supported. For example,
\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported.
Furthermore, in Perl, many properties may optionally be prefixed by "Is", for
compatibility with Perl 5.6. PCRE2 does not support this.
.
.
.SH "WIDE CHARACTERS AND UTF MODES"
.rs
.sp
Codepoints less than 256 can be specified in patterns by either braced or
unbraced hexadecimal escape sequences (for example, \ex{b3} or \exb3). Larger
values have to use braced sequences. Unbraced octal code points up to \e777 are
also recognized; larger ones can be coded using \eo{...}.
.P
In UTF modes, repeat quantifiers apply to complete UTF characters, not to
individual code units.
.P
In UTF modes, the dot metacharacter matches one UTF character instead of a
single code unit.
.P
The escape sequence \eC can be used to match a single code unit, in a UTF mode,
but its use can lead to some strange effects because it breaks up multi-unit
characters (see the description of \eC in the
.\" HREF
\fBpcre2pattern\fP
.\"
documentation). The use of \eC is not supported in the alternative matching
function \fBpcre2_dfa_match()\fP, nor is it supported in UTF mode by the JIT
optimization. If JIT optimization is requested for a UTF pattern that contains
\eC, it will not succeed, and so the matching will be carried out by the normal
interpretive function.
.P
The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test
characters of any code value, but, by default, the characters that PCRE2
recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with code points less than 256. This remains true even when
PCRE2 is built to include Unicode support, because to do otherwise would slow
down matching in many common cases. Note that this also applies to \eb
and \eB, because they are defined in terms of \ew and \eW. If you want
to test for a wider sense of, say, "digit", you can use explicit Unicode
property tests such as \ep{Nd}. Alternatively, if you set the PCRE2_UCP option,
the way that the character escapes work is changed so that Unicode properties
are used to determine which characters match. There are more details in the
section on
.\" HTML <a href="pcre2pattern.html#genericchartypes">
.\" </a>
generic character types
.\"
in the
.\" HREF
\fBpcre2pattern\fP
.\"
documentation.
.P
Similarly, characters that match the POSIX named character classes are all
low-valued characters, unless the PCRE2_UCP option is set.
.P
However, the special horizontal and vertical white space matching escapes (\eh,
\eH, \ev, and \eV) do match all the appropriate Unicode characters, whether or
not PCRE2_UCP is set.
.P
Case-insensitive matching in UTF mode makes use of Unicode properties. A few
Unicode characters such as Greek sigma have more than two codepoints that are
case-equivalent, and these are treated as such.
.
.
.SH "VALIDITY OF UTF STRINGS"
.rs
.sp
When the PCRE2_UTF option is set, the strings passed as patterns and subjects
are (by default) checked for validity on entry to the relevant functions.
If an invalid UTF string is passed, an error return is given.
.P
UTF-16 and UTF-32 strings can indicate their endianness by a special code known
as a byte-order mark (BOM). The PCRE2 functions do not handle this, expecting
strings to be in host byte order.
.P
The entire string is checked before any other processing takes place. In
addition to checking the format of the string, there is a check to ensure that
all code points lie in the range U+0 to U+10FFFF, excluding the surrogate area.
The so-called "non-character" code points are not excluded because Unicode
corrigendum #9 makes it clear that they should not be.
.P
Characters in the "Surrogate Area" of Unicode are reserved for use by UTF-16,
where they are used in pairs to encode code points with values greater than
0xFFFF. The code points that are encoded by UTF-16 pairs are available
independently in the UTF-8 and UTF-32 encodings. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8 and
UTF-32.)
.P
In some situations, you may already know that your strings are valid, and
therefore want to skip these checks in order to improve performance, for
example in the case of a long subject string that is being scanned repeatedly.
If you set the PCRE2_NO_UTF_CHECK flag at compile time or at run time, PCRE2
assumes that the pattern or subject it is given (respectively) contains only
valid UTF code unit sequences.
.P
Passing PCRE2_NO_UTF_CHECK to \fBpcre2_compile()\fP just disables the check for
the pattern; it does not also apply to subject strings. If you want to disable
the check for a subject string you must pass this option to \fBpcre2_match()\fP
or \fBpcre2_dfa_match()\fP.
.P
If you pass an invalid UTF string when PCRE2_NO_UTF_CHECK is set, the result
is undefined and your program may crash or loop indefinitely.
.
.
.\" HTML <a name="utf8strings"></a>
.SS "Errors in UTF-8 strings"
.rs
.sp
The following negative error codes are given for invalid UTF-8 strings:
.sp
PCRE2_ERROR_UTF8_ERR1
PCRE2_ERROR_UTF8_ERR2
PCRE2_ERROR_UTF8_ERR3
PCRE2_ERROR_UTF8_ERR4
PCRE2_ERROR_UTF8_ERR5
.sp
The string ends with a truncated UTF-8 character; the code specifies how many
bytes are missing (1 to 5). Although RFC 3629 restricts UTF-8 characters to be
no longer than 4 bytes, the encoding scheme (originally defined by RFC 2279)
allows for up to 6 bytes, and this is checked first; hence the possibility of
4 or 5 missing bytes.
.sp
PCRE2_ERROR_UTF8_ERR6
PCRE2_ERROR_UTF8_ERR7
PCRE2_ERROR_UTF8_ERR8
PCRE2_ERROR_UTF8_ERR9
PCRE2_ERROR_UTF8_ERR10
.sp
The two most significant bits of the 2nd, 3rd, 4th, 5th, or 6th byte of the
character do not have the binary value 0b10 (that is, either the most
significant bit is 0, or the next bit is 1).
.sp
PCRE2_ERROR_UTF8_ERR11
PCRE2_ERROR_UTF8_ERR12
.sp
A character that is valid by the RFC 2279 rules is either 5 or 6 bytes long;
these code points are excluded by RFC 3629.
.sp
PCRE2_ERROR_UTF8_ERR13
.sp
A 4-byte character has a value greater than 0x10ffff; these code points are
excluded by RFC 3629.
.sp
PCRE2_ERROR_UTF8_ERR14
.sp
A 3-byte character has a value in the range 0xd800 to 0xdfff; this range of
code points is reserved by RFC 3629 for use with UTF-16, and so is excluded
from UTF-8.
.sp
PCRE2_ERROR_UTF8_ERR15
PCRE2_ERROR_UTF8_ERR16
PCRE2_ERROR_UTF8_ERR17
PCRE2_ERROR_UTF8_ERR18
PCRE2_ERROR_UTF8_ERR19
.sp
A 2-, 3-, 4-, 5-, or 6-byte character is "overlong", that is, it codes for a
value that can be represented by fewer bytes, which is invalid. For example,
the two bytes 0xc0, 0xae give the value 0x2e, whose correct coding uses just
one byte.
.sp
PCRE2_ERROR_UTF8_ERR20
.sp
The two most significant bits of the first byte of a character have the binary
value 0b10 (that is, the most significant bit is 1 and the second is 0). Such a
byte can only validly occur as the second or subsequent byte of a multi-byte
character.
.sp
PCRE2_ERROR_UTF8_ERR21
.sp
The first byte of a character has the value 0xfe or 0xff. These values can
never occur in a valid UTF-8 string.
.
.
.\" HTML <a name="utf16strings"></a>
.SS "Errors in UTF-16 strings"
.rs
.sp
The following negative error codes are given for invalid UTF-16 strings:
.sp
PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at end of string
PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate follows high surrogate
PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate
.sp
.
.
.\" HTML <a name="utf32strings"></a>
.SS "Errors in UTF-32 strings"
.rs
.sp
The following negative error codes are given for invalid UTF-32 strings:
.sp
PCRE2_ERROR_UTF32_ERR1 Surrogate character (range from 0xd800 to 0xdfff)
PCRE2_ERROR_UTF32_ERR2 Code point is greater than 0x10ffff
.sp
.
.
.SH AUTHOR
.rs
.sp
.nf
Philip Hazel
University Computing Service
Cambridge CB2 3QH, England.
.fi
.
.
.SH REVISION
.rs
.sp
.nf
Last updated: 16 September 2014
Copyright (c) 1997-2014 University of Cambridge.
.fi

View File

@ -202,7 +202,7 @@ if [ $ISGCC -ne 0 -a $usemain -ne 0 ]; then
echo "---------- Maximally configured test with -O2 ----------"
# Remember the current CFLAGS so they can be restored from $SAVECFLAGS once
# the -O2 test run has finished (the name must match the one read back below).
SAVECFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -O2"
opts="--disable-shared --enable-utf $enable_jit --enable-pcre16 --enable-pcre32"
opts="--disable-shared --enable-unicode $enable_jit --enable-pcre16 --enable-pcre32"
runtest
CFLAGS="$SAVECFLAGS"
fi
@ -211,23 +211,23 @@ if [ $usemain -ne 0 ]; then
echo "---------- Non-JIT tests in the current directory ----------"
for opts in \
"" \
"--enable-utf --disable-static" \
"--enable-unicode --disable-static" \
"--disable-stack-for-recursion --disable-shared" \
"--enable-utf --disable-shared" \
"--enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-utf --with-link-size=3 --disable-shared" \
"--enable-unicode --disable-shared" \
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
"--enable-unicode --with-link-size=3 --disable-shared" \
"--enable-rebuild-chartables --disable-shared" \
"--enable-newline-is-any --disable-shared" \
"--enable-newline-is-cr --disable-shared" \
"--enable-newline-is-crlf --disable-shared" \
"--enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared" \
"--enable-utf --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
"--enable-unicode --enable-newline-is-any --disable-stack-for-recursion --disable-static" \
"--enable-pcre16" \
"--enable-pcre16 --disable-stack-for-recursion --disable-shared" \
"--enable-pcre16 --enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-pcre16 --enable-unicode --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32" \
"--enable-pcre32 --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32 --enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32 --enable-unicode --disable-stack-for-recursion --disable-shared" \
"--enable-pcre32 --enable-pcre16 --disable-shared" \
"--enable-pcre32 --enable-pcre16 --disable-pcre8 --disable-shared"
do
@ -241,18 +241,18 @@ if [ $usejit -ne 0 ]; then
echo "---------- JIT tests in the current directory ----------"
for opts in \
"--enable-jit --disable-shared" \
"--enable-jit --enable-utf --disable-shared" \
"--enable-jit --enable-utf --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre16 --enable-utf --disable-shared" \
"--enable-jit --enable-unicode --disable-shared" \
"--enable-jit --enable-unicode --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre16 --enable-unicode --disable-shared" \
"--enable-jit --enable-pcre16 --disable-pcre8 --disable-shared" \
"--enable-jit --enable-pcre16 --disable-pcre8 --enable-utf --disable-shared" \
"--enable-jit --enable-pcre16 --enable-utf --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre16 --enable-utf --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre32 --enable-utf --disable-shared" \
"--enable-jit --enable-pcre16 --disable-pcre8 --enable-unicode --disable-shared" \
"--enable-jit --enable-pcre16 --enable-unicode --with-link-size=3 --disable-shared" \
"--enable-jit --enable-pcre16 --enable-unicode --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre32 --enable-unicode --disable-shared" \
"--enable-jit --enable-pcre32 --disable-pcre8 --disable-shared" \
"--enable-jit --enable-pcre32 --disable-pcre8 --enable-utf --disable-shared" \
"--enable-jit --enable-pcre32 --enable-utf --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre32 --enable-pcre16 --disable-pcre8 --enable-utf --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
"--enable-jit --enable-pcre32 --disable-pcre8 --enable-unicode --disable-shared" \
"--enable-jit --enable-pcre32 --enable-unicode --with-link-size=4 --disable-shared" \
"--enable-jit --enable-pcre32 --enable-pcre16 --disable-pcre8 --enable-unicode --enable-newline-is-anycrlf --enable-bsr-anycrlf --disable-shared"
do
runtest
done
@ -267,8 +267,8 @@ if [ $usevalgrind -ne 0 ]; then
withvalgrind="with valgrind"
for opts in \
"--enable-utf --disable-stack-for-recursion --disable-shared" \
"--enable-utf --with-link-size=3 --disable-shared" \
"--enable-unicode --disable-stack-for-recursion --disable-shared" \
"--enable-unicode --with-link-size=3 --disable-shared" \
"--disable-shared"
do
opts="--enable-valgrind $opts"
@ -277,8 +277,8 @@ if [ $usevalgrind -ne 0 ]; then
if [ $usejit -ne 0 ]; then
for opts in \
"--enable-jit --enable-utf --disable-shared" \
"--enable-jit --enable-pcre16 --enable-pcre32 --enable-utf"
"--enable-jit --enable-unicode --disable-shared" \
"--enable-jit --enable-pcre16 --enable-pcre32 --enable-unicode"
do
opts="--enable-valgrind $opts"
runtest
@ -324,7 +324,7 @@ fi
if [ $usetmp -ne 0 ]; then
for opts in \
"--enable-utf --disable-shared"
"--enable-unicode --disable-shared"
do
runtest
done

View File

@ -472,7 +472,7 @@ print("condition to cut out the tables when not needed. But don't leave")
print("a totally empty module because some compilers barf at that.")
print("Instead, just supply small dummy tables. */")
print()
print("#ifndef SUPPORT_UTF")
print("#ifndef SUPPORT_UNICODE")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
print("const uint8_t PRIV(ucd_stage1)[] = {0};")
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
@ -507,7 +507,7 @@ print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
print("#endif")
print("#endif /* SUPPORT_UTF */")
print("#endif /* SUPPORT_UNICODE */")
print()
print("#endif /* PCRE2_PCRE2TEST */")

View File

@ -19,8 +19,8 @@ one. */
#include "../src/config.h"
#endif
#ifndef SUPPORT_UTF
#define SUPPORT_UTF
#ifndef SUPPORT_UNICODE
#define SUPPORT_UNICODE
#endif
#include <ctype.h>

View File

@ -278,11 +278,11 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value to enable the 8 bit PCRE2 library. */
/* #undef SUPPORT_PCRE8 */
/* Define to any value to enable support for the UTF-8/16/32 Unicode encoding.
This will work even in an EBCDIC environment, but it is incompatible with
the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or*
ASCII/UTF-8/16/32, but not both at once. */
/* #undef SUPPORT_UTF */
/* Define to any value to enable support for Unicode and UTF encoding. This
will work even in an EBCDIC environment, but it is incompatible with the
EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or*
ASCII/Unicode, but not both at once. */
/* #undef SUPPORT_UNICODE */
/* Define to any value for valgrind support to find invalid memory reads. */
/* #undef SUPPORT_VALGRIND */

View File

@ -193,32 +193,32 @@ must all be greater than zero. */
#define PCRE2_ERROR_UTF32_ERR1 (-27)
#define PCRE2_ERROR_UTF32_ERR2 (-28)
/* Error codes for pcre2[_dfa]_match() */
/* Error codes for pcre2[_dfa]_match(), substring extraction functions, and
context functions. */
#define PCRE2_ERROR_BADCOUNT (-29)
#define PCRE2_ERROR_BADENDIANNESS (-30)
#define PCRE2_ERROR_BADLENGTH (-31)
#define PCRE2_ERROR_BADMAGIC (-32)
#define PCRE2_ERROR_BADMODE (-33)
#define PCRE2_ERROR_BADOFFSET (-34)
#define PCRE2_ERROR_BADOPTION (-35)
#define PCRE2_ERROR_BADUTFOFFSET (-36)
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_DFA_BADRESTART (-38)
#define PCRE2_ERROR_DFA_RECURSE (-39)
#define PCRE2_ERROR_DFA_UCOND (-40)
#define PCRE2_ERROR_DFA_UITEM (-41)
#define PCRE2_ERROR_DFA_UMLIMIT (-42)
#define PCRE2_ERROR_DFA_WSSIZE (-43)
#define PCRE2_ERROR_INTERNAL (-44)
#define PCRE2_ERROR_JIT_BADOPTION (-45)
#define PCRE2_ERROR_JIT_STACKLIMIT (-46)
#define PCRE2_ERROR_MATCHLIMIT (-47)
#define PCRE2_ERROR_NOMEMORY (-48)
#define PCRE2_ERROR_NOSUBSTRING (-49)
#define PCRE2_ERROR_NULL (-50)
#define PCRE2_ERROR_RECURSELOOP (-51)
#define PCRE2_ERROR_RECURSIONLIMIT (-52)
#define PCRE2_ERROR_BADDATA (-29)
#define PCRE2_ERROR_BADLENGTH (-30)
#define PCRE2_ERROR_BADMAGIC (-31)
#define PCRE2_ERROR_BADMODE (-32)
#define PCRE2_ERROR_BADOFFSET (-33)
#define PCRE2_ERROR_BADOPTION (-34)
#define PCRE2_ERROR_BADUTFOFFSET (-35)
#define PCRE2_ERROR_CALLOUT (-36) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_DFA_BADRESTART (-37)
#define PCRE2_ERROR_DFA_RECURSE (-38)
#define PCRE2_ERROR_DFA_UCOND (-39)
#define PCRE2_ERROR_DFA_UITEM (-40)
#define PCRE2_ERROR_DFA_WSSIZE (-41)
#define PCRE2_ERROR_INTERNAL (-42)
#define PCRE2_ERROR_JIT_BADOPTION (-43)
#define PCRE2_ERROR_JIT_STACKLIMIT (-44)
#define PCRE2_ERROR_MATCHLIMIT (-45)
#define PCRE2_ERROR_NOMEMORY (-46)
#define PCRE2_ERROR_NOSUBSTRING (-47)
#define PCRE2_ERROR_NULL (-48)
#define PCRE2_ERROR_RECURSELOOP (-49)
#define PCRE2_ERROR_RECURSIONLIMIT (-50)
#define PCRE2_ERROR_UNSET (-51)
/* Request types for pcre2_pattern_info() */
@ -257,8 +257,8 @@ must all be greater than zero. */
#define PCRE2_CONFIG_PARENSLIMIT 7
#define PCRE2_CONFIG_RECURSIONLIMIT 5
#define PCRE2_CONFIG_STACKRECURSE 8
#define PCRE2_CONFIG_UNICODE_VERSION 9
#define PCRE2_CONFIG_UTF 10
#define PCRE2_CONFIG_UNICODE 9
#define PCRE2_CONFIG_UNICODE_VERSION 10
#define PCRE2_CONFIG_VERSION 11
/* Types for code units in patterns and subject strings. */
@ -338,7 +338,7 @@ expanded for each width below. Start with functions that give general
information. */
#define PCRE2_GENERAL_INFO_FUNCTIONS \
PCRE2_EXP_DECL int pcre2_config(int, void *, PCRE2_SIZE);
PCRE2_EXP_DECL int pcre2_config(uint32_t, void *, PCRE2_SIZE);
/* Functions for manipulating contexts. */
@ -437,16 +437,16 @@ PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *);
PCRE2_EXP_DECL int pcre2_substring_copy_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_UCHAR *, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_copy_bynumber(pcre2_match_data *, \
int, PCRE2_UCHAR *, PCRE2_SIZE *); \
unsigned int, PCRE2_UCHAR *, PCRE2_SIZE *); \
PCRE2_EXP_DECL void pcre2_substring_free(PCRE2_UCHAR *); \
PCRE2_EXP_DECL int pcre2_substring_get_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_UCHAR **, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_get_bynumber(pcre2_match_data *, \
int, PCRE2_UCHAR **, PCRE2_SIZE *); \
unsigned int, PCRE2_UCHAR **, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_length_byname(pcre2_match_data *, \
PCRE2_SPTR, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_length_bynumber(pcre2_match_data *, \
int, PCRE2_SIZE *); \
unsigned int, PCRE2_SIZE *); \
PCRE2_EXP_DECL int pcre2_substring_nametable_scan(const pcre2_code *, \
PCRE2_SPTR, PCRE2_SPTR *, PCRE2_SPTR *); \
PCRE2_EXP_DECL int pcre2_substring_number_from_name(\
@ -622,24 +622,27 @@ PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
#undef PCRE2_OTHER_FUNCTIONS
#undef PCRE2_TYPES_STRUCTURES_AND_FUNCTIONS
/* Re-define PCRE2_SUFFIX to use the external width value, if defined.
Otherwise, undefine the other macros and make PCRE2_SUFFIX a no-op, to reduce
confusion. */
/* PCRE2_CODE_UNIT_WIDTH must be defined. If it is 8, 16, or 32, redefine
PCRE2_SUFFIX to use it. If it is 0, undefine the other macros and make
PCRE2_SUFFIX a no-op. Otherwise, generate an error. */
#undef PCRE2_SUFFIX
#ifdef PCRE2_CODE_UNIT_WIDTH
#if PCRE2_CODE_UNIT_WIDTH != 8 && \
PCRE2_CODE_UNIT_WIDTH != 16 && \
PCRE2_CODE_UNIT_WIDTH != 32
#error PCRE2_CODE_UNIT_WIDTH must be 8, 16, or 32
#endif
#ifndef PCRE2_CODE_UNIT_WIDTH
#error PCRE2_CODE_UNIT_WIDTH must be defined before including pcre2.h.
#error Use 8, 16, or 32; or 0 for a multi-width application.
#else /* PCRE2_CODE_UNIT_WIDTH is defined */
#if PCRE2_CODE_UNIT_WIDTH == 8 || \
PCRE2_CODE_UNIT_WIDTH == 16 || \
PCRE2_CODE_UNIT_WIDTH == 32
#define PCRE2_SUFFIX(a) PCRE2_GLUE(a, PCRE2_CODE_UNIT_WIDTH)
#else
#elif PCRE2_CODE_UNIT_WIDTH == 0
#undef PCRE2_JOIN
#undef PCRE2_GLUE
#define PCRE2_SUFFIX(a) a
#else
#error PCRE2_CODE_UNIT_WIDTH must be 0, 8, 16, or 32.
#endif
#endif /* PCRE2_CODE_UNIT_WIDTH is defined */
#ifdef __cplusplus
} /* extern "C" */

View File

@ -231,7 +231,7 @@ static const uint8_t opcode_possessify[] = {
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/*************************************************
* Check a character and a property *
*************************************************/
@ -311,7 +311,7 @@ switch(ptype)
return FALSE;
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
@ -368,7 +368,7 @@ PCRE2_UCHAR base;
PCRE2_SPTR end;
uint32_t chr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
@ -451,7 +451,7 @@ switch(c)
GETCHARINCTEST(chr, code);
list[2] = chr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (chr < 128 || (chr < 256 && !utf))
list[3] = fcc[chr];
else
@ -470,7 +470,7 @@ switch(c)
list[4] = NOTACHAR;
return code;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP:
case OP_NOTPROP:
if (code[0] != PT_CLIST)
@ -812,7 +812,7 @@ for(;;)
leftop = base_list[0];
rightop = list[0];
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
accepted = FALSE; /* Always set in non-unicode case. */
if (leftop == OP_PROP || leftop == OP_NOTPROP)
{
@ -915,7 +915,7 @@ for(;;)
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
@ -1039,7 +1039,7 @@ for(;;)
case OP_EOD: /* Can always possessify before \z */
break;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP:
case OP_NOTPROP:
if (!check_char_prop(chr, list_ptr[2], list_ptr[3],

View File

@ -433,7 +433,7 @@ static const int posix_class_maps[] = {
/* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
Unicode property escapes. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
static const PCRE2_UCHAR string_PNd[] = {
CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
@ -541,7 +541,7 @@ static PCRE2_SPTR posix_substitutes[] = {
NULL /* ^xdigit */
};
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(PCRE2_UCHAR *))
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Masks for checking option settings. */
@ -887,7 +887,7 @@ for (;;)
case OP_NOTI:
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@ -901,7 +901,7 @@ for (;;)
case OP_NOTEXACTI:
branchlength += (int)GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@ -1315,7 +1315,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
actual length is stored in the compiled code, so we must update "code"
here. */
#if defined SUPPORT_UTF || PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
ccode = code += GET(code, 1);
goto CHECK_CLASS_REPEAT;
@ -1325,7 +1325,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
case OP_NCLASS:
ccode = code + PRIV(OP_lengths)[OP_CLASS];
#if defined SUPPORT_UTF || PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
CHECK_CLASS_REPEAT:
#endif
@ -2062,7 +2062,7 @@ return escape;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/*************************************************
* Handle \P and \p *
*************************************************/
@ -2678,7 +2678,7 @@ return -1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/*************************************************
* Get othercase range *
*************************************************/
@ -2740,7 +2740,7 @@ for (++c; c <= d; c++)
*cptr = c; /* Rest of input range */
return 0;
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
@ -2780,7 +2780,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
{
int rc;
@ -2810,7 +2810,7 @@ if ((options & PCRE2_CASELESS) != 0)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
@ -2844,7 +2844,7 @@ if (end >= start)
{
PCRE2_UCHAR *uchardata = *uchardptr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UTF) != 0)
{
if (start < end)
@ -2860,7 +2860,7 @@ if (end >= start)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Without UTF support, character values are constrained by the bit length,
and can only be > 256 for 16-bit and 32-bit libraries. */
@ -3042,7 +3042,7 @@ uint8_t classbits[32];
not do this for other options (e.g. PCRE2_EXTENDED) because they may change
dynamically as we process the pattern. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
#if PCRE2_CODE_UNIT_WIDTH != 32
PCRE2_UCHAR utf_units[6]; /* For setting up multi-cu chars */
@ -3235,7 +3235,7 @@ for (;; ptr++)
break;
}
ptr++;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) FORWARDCHAR(ptr);
#endif
}
@ -3474,7 +3474,7 @@ for (;; ptr++)
goto FAILED;
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(c))
{ /* Braces are required because the */
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
@ -3556,7 +3556,7 @@ for (;; ptr++)
that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
directly. UCP support is not available unless UTF support is.*/
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((options & PCRE2_UCP) != 0)
{
unsigned int ptype = 0;
@ -3599,7 +3599,7 @@ for (;; ptr++)
break;
}
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* In the non-UCP case, or when UCP makes no difference, we build the
bit map for the POSIX class in a chunk of local store because we may be
@ -3689,7 +3689,7 @@ for (;; ptr++)
switch (escape)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case ESC_du: /* These are the values given for \d etc */
case ESC_DU: /* when PCRE2_UCP is set. We replace the */
case ESC_wu: /* escape sequence with an appropriate \p */
@ -3757,7 +3757,7 @@ for (;; ptr++)
cb, PRIV(vspace_list));
break;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case ESC_p:
case ESC_P:
{
@ -3840,7 +3840,7 @@ for (;; ptr++)
/* Otherwise, we have a potential range; pick up the next character */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{ /* Braces are required because the */
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
@ -3940,7 +3940,7 @@ for (;; ptr++)
if (negate_class)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
int d;
#endif
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
@ -3951,7 +3951,7 @@ for (;; ptr++)
one other case. If so, generate a special OP_NOTPROP item instead of
OP_NOTI. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0 &&
(d = UCD_CASESET(c)) != 0)
{
@ -4032,7 +4032,7 @@ for (;; ptr++)
be listed) there are no characters < 256, we can omit the bitmap in the
actual compiled code. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (xclass && (!should_flip_negation || (options & PCRE2_UCP) != 0))
#elif PCRE2_CODE_UNIT_WIDTH != 8
if (xclass && !should_flip_negation)
@ -4157,7 +4157,7 @@ for (;; ptr++)
break;
}
p++;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) FORWARDCHAR(p);
#endif
} /* Loop for comment characters */
@ -4265,7 +4265,7 @@ for (;; ptr++)
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by setting op_type to add a suitable offset into repeat_type. Note
the the Unicode property types will be present only when SUPPORT_UTF is
that the Unicode property types will be present only when SUPPORT_UNICODE is
defined, but we don't wrap the little bits of code here because it just
makes it horribly messy. */
@ -4880,7 +4880,7 @@ for (;; ptr++)
case OP_NOTEXACT:
case OP_NOTEXACTI:
tempcode += PRIV(OP_lengths)[*tempcode];
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(tempcode[-1]))
tempcode += GET_EXTRALEN(tempcode[-1]);
#endif
@ -6407,7 +6407,7 @@ for (;; ptr++)
/* So are Unicode property matches, if supported. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
else if (escape == ESC_P || escape == ESC_p)
{
BOOL negated;
@ -6442,7 +6442,7 @@ for (;; ptr++)
if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
cb->max_lookbehind == 0)
cb->max_lookbehind = 1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (escape >= ESC_DU && escape <= ESC_wu)
{
nestptr = ptr + 1; /* Where to resume */
@ -6479,7 +6479,7 @@ for (;; ptr++)
mclength = 1;
mcbuffer[0] = c;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(c))
ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
#endif
@ -6493,7 +6493,7 @@ for (;; ptr++)
/* For caseless UTF mode, check whether this character has more than one
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_CASELESS) != 0)
{
GETCHAR(c, mcbuffer);
@ -7527,7 +7527,7 @@ ptr += skipatstart;
/* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{
errorcode = ERR32;
@ -7911,7 +7911,7 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0)
points and cannot have another case. In 16-bit and 32-bit modes, we can
check wide characters when UTF (and therefore UCP) is supported. */
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
@ -7945,7 +7945,7 @@ if (reqcuflags >= 0 &&
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
re->flags |= PCRE2_LASTCASELESS;
#endif

View File

@ -75,7 +75,7 @@ Returns: 0 if data returned
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_config(int what, void *where, size_t length)
pcre2_config(uint32_t what, void *where, size_t length)
{
if (length < sizeof(int)) return PCRE2_ERROR_BADLENGTH;
@ -145,7 +145,7 @@ switch (what)
case PCRE2_CONFIG_UNICODE_VERSION:
{
#if defined SUPPORT_UTF
#if defined SUPPORT_UNICODE
const char *v = PRIV(unicode_version);
#else
const char *v = "Unicode not supported";
@ -158,8 +158,8 @@ switch (what)
}
break;
case PCRE2_CONFIG_UTF:
#if defined SUPPORT_UTF
case PCRE2_CONFIG_UNICODE:
#if defined SUPPORT_UNICODE
*((int *)where) = 1;
#else
*((int *)where) = 0;

View File

@ -263,8 +263,9 @@ if (mcontext != NULL)
* Set values in contexts *
*************************************************/
/* All these functions return 1 for success or 0 if invalid data is given. Only
some of the functions are able to test the validity of the data. */
/* All these functions return 0 for success or PCRE2_ERROR_BADDATA if invalid
data is given. Only some of the functions are able to test the validity of the
data. */
/* ------------ Compile contexts ------------ */
@ -274,7 +275,7 @@ pcre2_set_character_tables(pcre2_compile_context *ccontext,
const unsigned char *tables)
{
ccontext->tables = tables;
return 1;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
@ -285,10 +286,10 @@ switch(value)
case PCRE2_BSR_ANYCRLF:
case PCRE2_BSR_UNICODE:
ccontext->bsr_convention = value;
return 1;
return 0;
default:
return 0;
return PCRE2_ERROR_BADDATA;
}
}
@ -303,10 +304,10 @@ switch(newline)
case PCRE2_NEWLINE_ANY:
case PCRE2_NEWLINE_ANYCRLF:
ccontext->newline_convention = newline;
return 1;
return 0;
default:
return 0;
return PCRE2_ERROR_BADDATA;
}
}
@ -314,7 +315,7 @@ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit)
{
ccontext->parens_nest_limit = limit;
return 1;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
@ -322,7 +323,7 @@ pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext,
int (*guard)(uint32_t))
{
ccontext->stack_guard = guard;
return 1;
return 0;
}
@ -336,10 +337,10 @@ switch(value)
case PCRE2_BSR_ANYCRLF:
case PCRE2_BSR_UNICODE:
mcontext->bsr_convention = value;
return 1;
return 0;
default:
return 0;
return PCRE2_ERROR_BADDATA;
}
}
@ -354,10 +355,10 @@ switch(newline)
case PCRE2_NEWLINE_ANY:
case PCRE2_NEWLINE_ANYCRLF:
mcontext->newline_convention = newline;
return 1;
return 0;
default:
return 0;
return PCRE2_ERROR_BADDATA;
}
}
@ -367,21 +368,21 @@ pcre2_set_callout(pcre2_match_context *mcontext,
{
mcontext->callout = callout;
mcontext->callout_data = callout_data;
return 1;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_match_limit(pcre2_match_context *mcontext, uint32_t limit)
{
mcontext->match_limit = limit;
return 1;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_recursion_limit(pcre2_match_context *mcontext, uint32_t limit)
{
mcontext->recursion_limit = limit;
return 1;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
@ -399,7 +400,7 @@ mcontext->stack_memctl.memory_data = mydata;
(void)myfree;
(void)mydata;
#endif
return 1;
return 0;
}
/* End of pcre2_context.c */

View File

@ -391,7 +391,7 @@ PCRE2_SPTR start_subject = mb->start_subject;
PCRE2_SPTR end_subject = mb->end_subject;
PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
#else
BOOL utf = FALSE;
@ -447,7 +447,7 @@ if (*first_op == OP_REVERSE)
/* If we can't go back the amount required for the longest lookbehind
pattern, go back as far as we can; some alternatives may still be viable. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/* In character mode we have to step back character by character */
if (utf)
@ -570,11 +570,11 @@ for (;;)
if (ptr < end_subject)
{
clen = 1; /* Number of data items in the character */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
GETCHARLENTEST(c, ptr, clen);
#else
c = *ptr;
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
}
else
{
@ -652,9 +652,9 @@ for (;;)
if (coptable[codevalue] > 0)
{
dlen = 1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
d = code[coptable[codevalue]];
if (codevalue >= OP_TYPESTAR)
{
@ -948,11 +948,11 @@ for (;;)
{
PCRE2_SPTR temp = ptr - 1;
if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 32
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf) { BACKCHAR(temp); }
#endif
GETCHARTEST(d, temp);
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
{
if (d == '_') left_word = TRUE; else
@ -972,12 +972,12 @@ for (;;)
if (ptr >= mb->last_used_ptr)
{
PCRE2_SPTR temp = ptr + 1;
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 32
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf) { FORWARDCHAR(temp); }
#endif
mb->last_used_ptr = temp;
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
{
if (c == '_') right_word = TRUE; else
@ -1003,7 +1003,7 @@ for (;;)
if the support is in the binary; otherwise a compile-time error occurs.
*/
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP:
case OP_NOTPROP:
if (clen > 0)
@ -1258,7 +1258,7 @@ for (;;)
argument. It keeps the code above fast for the other cases. The argument
is in the d variable. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP_EXTRA + OP_TYPEPLUS:
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
@ -1501,7 +1501,7 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP_EXTRA + OP_TYPEQUERY:
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
@ -1785,7 +1785,7 @@ for (;;)
break;
/*-----------------------------------------------------------------*/
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
case OP_PROP_EXTRA + OP_TYPEEXACT:
case OP_PROP_EXTRA + OP_TYPEUPTO:
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
@ -2063,7 +2063,7 @@ for (;;)
case OP_CHARI:
if (clen == 0) break;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
@ -2077,7 +2077,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
@ -2086,7 +2086,7 @@ for (;;)
break;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/*-----------------------------------------------------------------*/
/* This is a tricky one because it can match more than one character.
Find out how many characters to skip, and then set up a negative state
@ -2222,11 +2222,11 @@ for (;;)
if (clen > 0)
{
unsigned int otherd;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
if (c != d && c != otherd)
{ ADD_NEW(state_offset + dlen + 1, 0); }
@ -2257,11 +2257,11 @@ for (;;)
uint32_t otherd = NOTACHAR;
if (caseless)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
@ -2300,11 +2300,11 @@ for (;;)
uint32_t otherd = NOTACHAR;
if (caseless)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
@ -2341,11 +2341,11 @@ for (;;)
uint32_t otherd = NOTACHAR;
if (caseless)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
@ -2374,11 +2374,11 @@ for (;;)
uint32_t otherd = NOTACHAR;
if (caseless)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
@ -2414,11 +2414,11 @@ for (;;)
uint32_t otherd = NOTACHAR;
if (caseless)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
otherd = TABLE_GET(d, fcc, d);
}
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
@ -2747,7 +2747,7 @@ for (;;)
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
{
int charcount = local_offsets[rc+1] - local_offsets[rc];
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 32
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf)
{
PCRE2_SPTR p = start_subject + local_offsets[rc];
@ -2851,7 +2851,7 @@ for (;;)
PCRE2_SPTR p = ptr;
PCRE2_SPTR pp = local_ptr;
charcount = (int)(pp - p);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 32
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
#endif
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
@ -2933,7 +2933,7 @@ for (;;)
}
else
{
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 32
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
if (utf)
{
PCRE2_SPTR p = start_subject + local_offsets[0];
@ -3106,14 +3106,24 @@ if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
/* Check that the first field in the block is the magic number. If it is not,
return with PCRE2_ERROR_BADMAGIC. However, if the magic number is equal to
REVERSED_MAGIC_NUMBER we return with PCRE2_ERROR_BADENDIANNESS, which
means that the pattern is likely compiled with different endianness. */
/* FIXME: Remove BADENDIANNESS if saving/restoring is not to be implemented. */
/* Check that the first field in the block is the magic number. If it is not,
return with PCRE2_ERROR_BADMAGIC. */
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
#ifdef FIXME
If saving/restoring gets implemented, define PCRE2_ERROR_BADENDIANNESS, and add
this comment and code:
/* However, if the magic number is equal to REVERSED_MAGIC_NUMBER we return
with PCRE2_ERROR_BADENDIANNESS, which means that the pattern is likely compiled
with different endianness. */
if (re->magic_number != MAGIC_NUMBER)
return re->magic_number == REVERSED_MAGIC_NUMBER?
PCRE2_ERROR_BADENDIANNESS:PCRE2_ERROR_BADMAGIC;
#endif
/* Check the code unit width. */
@ -3238,7 +3248,7 @@ switch(newline)
we must also check that a starting offset does not point into the middle of a
multiunit character. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->rightchar));
@ -3253,7 +3263,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
return PCRE2_ERROR_BADUTFOFFSET;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Set up the first code unit to match, if available. The first_codeunit value
is never set for an anchored regular expression, but the anchoring may be
@ -3270,7 +3280,7 @@ if (!anchored)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
}
@ -3290,7 +3300,7 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
}
@ -3327,7 +3337,7 @@ for (;;)
if (firstline)
{
PCRE2_SPTR t = start_match;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
while (t < mb->end_subject && !IS_NEWLINE(t))
@ -3362,7 +3372,7 @@ for (;;)
{
if (start_match > mb->start_subject + start_offset)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
@ -3516,7 +3526,7 @@ for (;;)
if (firstline && IS_NEWLINE(start_match)) break;
start_match++;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
ACROSSCHAR(start_match < end_subject, *start_match,

View File

@ -198,35 +198,34 @@ static const char match_error_texts[] =
"UTF-16 error: isolated low surrogate\0"
"UTF-32 error: code points 0xd800-0xdfff are not defined\0"
"UTF-32 error: code points greater than 0x10ffff are not defined\0"
"bad count value\0"
"bad data value\0"
/* 30 */
"pattern compiled with other endianness\0"
"bad length\0"
"magic number missing\0"
"pattern compiled in wrong mode: 8/16/32-bit error\0"
"bad offset value\0"
/* 35 */
"bad option value\0"
/* 35 */
"bad offset into UTF string\0"
"callout error code\0" /* Never returned by PCRE2 itself */
"invalid data in workspace for DFA restart\0"
"too much recursion for DFA matching\0"
/* 40 */
"backreference condition or recursion test not supported for DFA matching\0"
/* 40 */
"item unsupported for DFA matching\0"
"match limit not supported for DFA matching\0"
"workspace size exceeded in DFA matching\0"
"internal error - pattern overwritten?\0"
/* 45 */
"bad JIT option\0"
"JIT stack limit reached\0"
/* 45 */
"match limit exceeded\0"
"no more memory\0"
"unknown or unset substring\0"
/* 50 */
"NULL argument passed\0"
"nested recursion at the same subject position\0"
/* 50 */
"recursion limit exceeded\0"
"requested value is not set\0"
;

View File

@ -38,11 +38,11 @@ POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* We do not support both EBCDIC and UTF at the same time. The "configure"
/* We do not support both EBCDIC and Unicode at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */
#if defined EBCDIC && defined SUPPORT_UTF
#error The use of both EBCDIC and SUPPORT_UTF is not supported.
#if defined EBCDIC && defined SUPPORT_UNICODE
#error The use of both EBCDIC and SUPPORT_UNICODE is not supported.
#endif
/* Standard C headers */
@ -597,14 +597,14 @@ there are some longer strings as well.
This means that, on EBCDIC platforms, the PCRE library can handle either
EBCDIC, or UTF-8, but not both. To support both in the same compiled library
would need different lookups depending on whether PCRE_UTF8 was set or not.
would need different lookups depending on whether PCRE2_UTF was set or not.
This would make it impossible to use characters in switch/case statements,
which would reduce performance. For a theoretical use (which nobody has asked
for) in a minority area (EBCDIC platforms), this is not sensible. Any
application that did need both could compile two versions of the library, using
macros to give the functions distinct names. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
/* UTF-8 support is not enabled; use the platform-dependent character literals
so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
@ -920,7 +920,7 @@ a positive value. */
#define STRING_LIMIT_MATCH_EQ "LIMIT_MATCH="
#define STRING_LIMIT_RECURSION_EQ "LIMIT_RECURSION="
#else /* SUPPORT_UTF */
#else /* SUPPORT_UNICODE */
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
@ -1189,7 +1189,7 @@ only. */
#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN
#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* -------------------- End of character and string names -------------------*/
@ -1775,10 +1775,10 @@ typedef struct {
/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */
/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is not
defined, so the following items are omitted. */
/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is defined as
0, so the following items are omitted. */
#ifdef PCRE2_CODE_UNIT_WIDTH
#if defined PCRE2_CODE_UNIT_WIDTH && PCRE2_CODE_UNIT_WIDTH != 0
/* This is the largest non-UTF code point. */

View File

@ -208,9 +208,9 @@ tables. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAX_255(c) TRUE
#define MAX_MARK ((1u << 8) - 1)
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
#define TABLE_GET(c, table, default) ((table)[c])
#else /* Code units are 16 or 32 bits */
@ -246,7 +246,7 @@ complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
@ -263,7 +263,7 @@ UTF support is omitted, we don't even define them. */
/* #define FORWARDCHAR(eptr) */
/* #define ACROSSCHAR(condition, eptr, action) */
#else /* SUPPORT_UTF */
#else /* SUPPORT_UNICODE */
/* ------------------- 8-bit support ------------------ */
@ -527,7 +527,7 @@ These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
#define PUTCHAR(c, p) (*p = c, 1)
#endif /* UTF-32 character handling */
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Mode-dependent macros that have the same definition in all modes. */

View File

@ -145,7 +145,7 @@ static int
match_ref(PCRE2_SIZE offset, PCRE2_SIZE offset_top, register PCRE2_SPTR eptr,
match_block *mb, BOOL caseless, PCRE2_SIZE *lengthptr)
{
#if defined SUPPORT_UTF
#if defined SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
#endif
@ -173,7 +173,7 @@ length = mb->ovector[offset+1] - mb->ovector[offset];
if (caseless)
{
#if defined SUPPORT_UTF
#if defined SUPPORT_UNICODE
if (utf)
{
/* Match characters up to the end of the reference. NOTE: the number of
@ -352,7 +352,7 @@ typedef struct heapframe {
struct heapframe *Xprevframe;
struct heapframe *Xnextframe;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
PCRE2_SPTR Xcharptr;
#endif
PCRE2_SPTR Xeptr;
@ -378,7 +378,7 @@ typedef struct heapframe {
uint32_t Xop;
uint32_t Xsave_capture_last;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
uint32_t Xprop_value;
int Xprop_type;
int Xprop_fail_result;
@ -399,7 +399,7 @@ typedef struct heapframe {
eptrblock Xnewptrb;
recursion_info Xnew_recursive;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
PCRE2_UCHAR Xocchars[6];
#endif
} heapframe;
@ -610,7 +610,7 @@ HEAP_RECURSE:
/* Ditto for the local variables */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
#define charptr frame->Xcharptr
#define prop_value frame->Xprop_value
#define prop_type frame->Xprop_type
@ -666,7 +666,7 @@ declarations can be cut out in a block. The only declarations within blocks
below are for variables that do not have to be preserved over a recursive call
to RMATCH(). */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
PCRE2_SPTR charptr;
#endif
PCRE2_SPTR callpat;
@ -684,7 +684,7 @@ uint32_t number;
uint32_t op;
uint32_t save_capture_last;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
uint32_t prop_value;
int prop_type;
int prop_fail_result;
@ -721,7 +721,7 @@ the alternative names that are used. */
/* These statements are here to stop the compiler complaining about uninitialized
variables. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
prop_value = 0;
prop_fail_result = 0;
#endif
@ -742,7 +742,7 @@ call because it's quite a complicated macro. It has to be used in one
particular way. This shouldn't, however, impact performance when true recursion
is being used. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
utf = (mb->poptions & PCRE2_UTF) != 0;
#else
utf = FALSE;
@ -1662,7 +1662,7 @@ for (;;)
back a number of characters, not bytes. */
case OP_REVERSE:
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
i = GET(ecode, 1);
@ -2197,7 +2197,7 @@ for (;;)
be "non-word" characters. Remember the earliest consulted character for
partial matching. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
/* Get status of previous character */
@ -2257,7 +2257,7 @@ for (;;)
if (eptr == mb->start_subject) prev_is_word = FALSE; else
{
if (eptr <= mb->start_used_ptr) mb->start_used_ptr = eptr - 1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
{
c = eptr[-1];
@ -2283,7 +2283,7 @@ for (;;)
else
{
if (eptr >= mb->last_used_ptr) mb->last_used_ptr = eptr + 1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
{
c = *eptr;
@ -2334,7 +2334,7 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
eptr++;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) ACROSSCHAR(eptr < mb->end_subject, *eptr, eptr++);
#endif
ecode++;
@ -2550,7 +2550,7 @@ for (;;)
ecode++;
break;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/* Check the next character by Unicode property. We will get here only
if the support is in the binary; otherwise a compile-time error occurs. */
@ -2684,7 +2684,7 @@ for (;;)
CHECK_PARTIAL();
ecode++;
break;
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Match a back reference, possibly repeatedly. Look past the end of the
@ -2955,7 +2955,7 @@ for (;;)
/* First, ensure the minimum number of matches are present. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
for (i = 1; i <= min; i++)
@ -3007,7 +3007,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
for (fi = min;; fi++)
@ -3063,7 +3063,7 @@ for (;;)
{
pp = eptr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
for (i = min; i < max; i++)
@ -3232,7 +3232,7 @@ for (;;)
SCHECK_PARTIAL();
break;
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
GETCHARLENTEST(c, eptr, len);
#else
c = *eptr;
@ -3248,7 +3248,7 @@ for (;;)
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM21);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr-- == pp) break; /* Stop if tried at original pos */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) BACKCHAR(eptr);
#endif
}
@ -3262,7 +3262,7 @@ for (;;)
/* Match a single character, casefully */
case OP_CHAR:
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
length = 1;
@ -3299,7 +3299,7 @@ for (;;)
RRETURN(MATCH_NOMATCH);
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
length = 1;
@ -3334,7 +3334,7 @@ for (;;)
if (fc != dc)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (dc != UCD_OTHERCASE(fc))
#endif
RRETURN(MATCH_NOMATCH);
@ -3342,7 +3342,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
{
@ -3436,7 +3436,7 @@ for (;;)
for speed. */
REPEATCHAR:
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
length = 1;
@ -3527,7 +3527,7 @@ for (;;)
value of fc will always be < 128. */
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* When not in UTF-8 mode, load a single-byte character. */
fc = *ecode++;
@ -3547,11 +3547,11 @@ for (;;)
/* fc must be < 128 if UTF is enabled. */
foc = mb->fcc[fc];
#else
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && fc > 127)
foc = UCD_OTHERCASE(fc);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
foc = TABLE_GET(fc, mb->fcc, fc);
#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
@ -3682,7 +3682,7 @@ for (;;)
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t ch, och;
@ -3705,7 +3705,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
{
register uint32_t ch = ecode[1];
c = *eptr++;
@ -3803,14 +3803,14 @@ for (;;)
if (op >= OP_NOTSTARI) /* Caseless */
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && fc > 127)
foc = UCD_OTHERCASE(fc);
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
foc = TABLE_GET(fc, mb->fcc, fc);
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -3826,7 +3826,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
{
for (i = 1; i <= min; i++)
@ -3845,7 +3845,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -3864,7 +3864,7 @@ for (;;)
}
}
else
#endif /*SUPPORT_UTF */
#endif /*SUPPORT_UNICODE */
/* Not UTF mode */
{
for (fi = min;; fi++)
@ -3890,7 +3890,7 @@ for (;;)
{
pp = eptr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -3917,7 +3917,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
{
for (i = min; i < max; i++)
@ -3947,7 +3947,7 @@ for (;;)
else
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -3981,7 +3981,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -4025,7 +4025,7 @@ for (;;)
{
pp = eptr;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
register uint32_t d;
@ -4144,7 +4144,7 @@ for (;;)
REPEATTYPE:
ctype = *ecode++; /* Code for the character type */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (ctype == OP_PROP || ctype == OP_NOTPROP)
{
prop_fail_result = ctype == OP_NOTPROP;
@ -4162,7 +4162,7 @@ for (;;)
if (min > 0)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (prop_type >= 0)
{
switch(prop_type)
@ -4378,11 +4378,11 @@ for (;;)
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Handle all other cases when the coding is UTF-8 */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) switch(ctype)
{
case OP_ANY:
@ -4631,7 +4631,7 @@ for (;;)
} /* End switch(ctype) */
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Code for the non-UTF-8 case for minimum matching of operators other
than OP_PROP and OP_NOTPROP. */
@ -4889,7 +4889,7 @@ for (;;)
if (minimize)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (prop_type >= 0)
{
switch(prop_type)
@ -5138,9 +5138,9 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
for (fi = min;; fi++)
@ -5410,7 +5410,7 @@ for (;;)
{
pp = eptr; /* Remember where we started */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (prop_type >= 0)
{
switch(prop_type)
@ -5696,9 +5696,9 @@ for (;;)
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
switch(ctype)
@ -5940,7 +5940,7 @@ for (;;)
}
}
else
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Not UTF mode */
{
switch(ctype)
@ -6219,13 +6219,13 @@ switch (frame->Xwhere)
#ifdef SUPPORT_WIDE_CHARS
LBL(20) LBL(21)
#endif
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
LBL(16) LBL(18)
LBL(22) LBL(23) LBL(28) LBL(30)
LBL(32) LBL(34) LBL(42) LBL(46)
LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
default:
return PCRE2_ERROR_INTERNAL;
}
@ -6398,14 +6398,21 @@ if (code == NULL || subject == NULL || match_data == NULL)
return PCRE2_ERROR_NULL;
if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
/* Check that the first field in the block is the magic number. If it is not,
return with PCRE2_ERROR_BADMAGIC. However, if the magic number is equal to
REVERSED_MAGIC_NUMBER we return with PCRE2_ERROR_BADENDIANNESS, which
means that the pattern is likely compiled with different endianness. */
/* Check that the first field in the block is the magic number. */
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
#ifdef FIXME
If saving/restoring gets implemented, define PCRE2_ERROR_BADENDIANNESS, and add
this comment and code:
/* However, if the magic number is equal to REVERSED_MAGIC_NUMBER we return
with PCRE2_ERROR_BADENDIANNESS, which means that the pattern is likely compiled
with different endianness. */
if (re->magic_number != MAGIC_NUMBER)
return re->magic_number == REVERSED_MAGIC_NUMBER?
PCRE2_ERROR_BADENDIANNESS:PCRE2_ERROR_BADMAGIC;
#endif
/* Check the code unit width. */
@ -6451,7 +6458,7 @@ mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
we must also check that a starting offset does not point into the middle of a
multiunit character. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
{
match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->rightchar));
@ -6466,7 +6473,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
return PCRE2_ERROR_BADUTFOFFSET;
#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* If the pattern was successfully studied with JIT support, run the JIT
executable instead of the rest of this function. Most options must be set at
@ -6640,7 +6647,7 @@ if (!anchored)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
}
@ -6660,7 +6667,7 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH != 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
}
@ -6696,7 +6703,7 @@ for(;;)
if (firstline)
{
PCRE2_SPTR t = start_match;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
while (t < mb->end_subject && !IS_NEWLINE(t))
@ -6731,7 +6738,7 @@ for(;;)
{
if (start_match > mb->start_subject + start_offset)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
while (start_match < end_subject && !WAS_NEWLINE(start_match))
@ -6905,7 +6912,7 @@ for(;;)
case MATCH_THEN:
mb->ignore_skip_arg = 0;
new_start_match = start_match + 1;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
ACROSSCHAR(new_start_match < end_subject, *new_start_match,
new_start_match++);

View File

@ -81,12 +81,12 @@ PRIV(is_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR endptr,
{
uint32_t c;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) { GETCHAR(c, ptr); } else c = *ptr;
#else
(void)utf;
c = *ptr;
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
if (type == NLTYPE_ANYCRLF) switch(c)
{
@ -172,7 +172,7 @@ PRIV(was_newline)(PCRE2_SPTR ptr, uint32_t type, PCRE2_SPTR startptr,
uint32_t c;
ptr--;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
BACKCHAR(ptr);
@ -182,7 +182,7 @@ else c = *ptr;
#else
(void)utf;
c = *ptr;
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
if (type == NLTYPE_ANYCRLF) switch(c)
{

View File

@ -50,10 +50,11 @@ into a UTF string. The behaviour is different for each code unit width. */
#include "pcre2_internal.h"
/* If SUPPORT_UTF is not defined, this function will never be called. Supply a
dummy function because some compilers do not like empty source modules. */
/* If SUPPORT_UNICODE is not defined, this function will never be called.
Supply a dummy function because some compilers do not like empty source
modules. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
unsigned int
PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
{
@ -61,7 +62,7 @@ PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
(void)(buffer);
return 0;
}
#else /* SUPPORT_UTF */
#else /* SUPPORT_UNICODE */
/*************************************************
@ -114,6 +115,6 @@ return 2;
return 1;
#endif
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* End of pcre2_ord2utf.c */

View File

@ -56,11 +56,9 @@ Arguments:
what what information is required
where where to put the information
Returns: 0 if data returned, negative on error
Returns: 0 if data returned, negative on error or unset value
*/
/* FIXME: Remove BADENDIANNESS if saving/restoring is not to be implemented. */
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where)
{
@ -69,13 +67,21 @@ const pcre2_real_code *re = (pcre2_real_code *)code;
if (re == NULL || where == NULL) return PCRE2_ERROR_NULL;
/* Check that the first field in the block is the magic number. If it is not,
return with PCRE2_ERROR_BADMAGIC. However, if the magic number is equal to
REVERSED_MAGIC_NUMBER we return with PCRE2_ERROR_BADENDIANNESS, which
means that the pattern is likely compiled with different endianness. */
return with PCRE2_ERROR_BADMAGIC. */
if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
#ifdef FIXME
If saving/restoring gets implemented, define PCRE2_ERROR_BADENDIANNESS, and add
this comment and code:
/* However, if the magic number is equal to REVERSED_MAGIC_NUMBER we return
with PCRE2_ERROR_BADENDIANNESS, which means that the pattern is likely compiled
with different endianness. */
if (re->magic_number != MAGIC_NUMBER)
return re->magic_number == REVERSED_MAGIC_NUMBER?
PCRE2_ERROR_BADENDIANNESS:PCRE2_ERROR_BADMAGIC;
#endif
/* Check that this pattern was compiled in the correct bit mode */
@ -151,6 +157,7 @@ switch(what)
case PCRE2_INFO_MATCHLIMIT:
*((uint32_t *)where) = re->limit_match;
if (re->limit_match == UINT32_MAX) return PCRE2_ERROR_UNSET;
break;
case PCRE2_INFO_MAXLOOKBEHIND:
@ -179,6 +186,7 @@ switch(what)
case PCRE2_INFO_RECURSIONLIMIT:
*((uint32_t *)where) = re->limit_recursion;
if (re->limit_recursion == UINT32_MAX) return PCRE2_ERROR_UNSET;
break;
case PCRE2_INFO_SIZE:

View File

@ -94,7 +94,7 @@ BOOL one_code_unit = !utf;
/* If UTF is supported and requested, check for a valid single code unit. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
@ -105,7 +105,7 @@ if (utf)
one_code_unit = (c & 0xfffff800u) != 0xd800u;
#endif /* CODE_UNIT_WIDTH */
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* Handle a valid one-code-unit character at any width. */
@ -121,7 +121,7 @@ if (one_code_unit)
for each width. If UTF is not supported, control should never get here, but we
need a return statement to keep the compiler happy. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
return 0;
#else
@ -178,7 +178,7 @@ as an indication. */
fprintf(f, "\\X{%x}", c);
return 0;
#endif /* PCRE2_CODE_UNIT_WIDTH == 32 */
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
}
@ -221,7 +221,7 @@ into the main code, however, we just put one into this function. */
static const char *
get_ucpname(unsigned int ptype, unsigned int pvalue)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
int i;
for (i = utt_size - 1; i >= 0; i--)
{
@ -233,7 +233,7 @@ return (i >= 0)? utt_names + utt[i].name_offset : "??";
(void)ptype;
(void)pvalue;
return "??";
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
}

View File

@ -228,7 +228,7 @@ for (;;)
case OP_NOTPOSPLUSI:
branchlength++;
cc += 2;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@ -249,7 +249,7 @@ for (;;)
case OP_NOTEXACTI:
branchlength += GET2(cc,1);
cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@ -297,7 +297,7 @@ for (;;)
appear, but leave the code, just in case.) */
case OP_ANYBYTE:
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf) return -1;
#endif
branchlength++;
@ -536,7 +536,7 @@ for (;;)
case OP_NOTPOSQUERYI:
cc += PRIV(OP_lengths)[op];
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
break;
@ -608,7 +608,7 @@ SET_BIT(c);
/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
the end of the character, even when caseless. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
@ -617,7 +617,7 @@ if (utf)
if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
#endif
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* If caseless, handle the other case of the character. */
@ -671,7 +671,7 @@ set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
register uint32_t c;
for (c = 0; c < table_limit; c++)
re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
@ -712,7 +712,7 @@ set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
register uint32_t c;
for (c = 0; c < table_limit; c++)
re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
#endif
}
@ -752,7 +752,7 @@ set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
register uint32_t c;
int yield = SSB_DONE;
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
int table_limit = utf? 16:32;
#else
int table_limit = 32;
@ -866,7 +866,7 @@ do
const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
while ((c = *p++) < NOTACHAR)
{
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (utf)
{
PCRE2_UCHAR buff[6];
@ -1042,7 +1042,7 @@ do
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
units of horizontal space characters. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
SET_BIT(0xC2); /* For U+00A0 */
@ -1081,7 +1081,7 @@ do
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
units of vertical space characters. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
SET_BIT(0xC2); /* For U+0085 (NEL) */
@ -1181,7 +1181,7 @@ do
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
units of horizontal space characters. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
SET_BIT(0xC2); /* For U+00A0 */
@ -1218,7 +1218,7 @@ do
/* For the 8-bit library in UTF-8 mode, set the bits for the first code
units of vertical space characters. */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
SET_BIT(0xC2); /* For U+0085 (NEL) */
@ -1287,7 +1287,7 @@ do
character modes, set the 0xFF bit to indicate code units >= 255. */
case OP_NCLASS:
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (utf)
{
re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
@ -1318,7 +1318,7 @@ do
if (classmap != NULL)
{
#if defined SUPPORT_UTF && PCRE2_CODE_UNIT_WIDTH == 8
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
if (utf)
{
for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];

View File

@ -108,8 +108,8 @@ Returns: if successful: 0
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_copy_bynumber(pcre2_match_data *match_data, int stringnumber,
PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
pcre2_substring_copy_bynumber(pcre2_match_data *match_data,
unsigned int stringnumber, PCRE2_UCHAR *buffer, PCRE2_SIZE *sizeptr)
{
PCRE2_SIZE left, right;
PCRE2_SIZE p = 0;
@ -189,8 +189,8 @@ Returns: if successful: zero
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_get_bynumber(pcre2_match_data *match_data, int stringnumber,
PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
pcre2_substring_get_bynumber(pcre2_match_data *match_data,
unsigned int stringnumber, PCRE2_UCHAR **stringptr, PCRE2_SIZE *sizeptr)
{
PCRE2_SIZE left, right;
PCRE2_SIZE p = 0;
@ -288,7 +288,7 @@ Returns: 0 if successful, else a negative error number
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_substring_length_bynumber(pcre2_match_data *match_data,
int stringnumber, PCRE2_SIZE *sizeptr)
unsigned int stringnumber, PCRE2_SIZE *sizeptr)
{
if (stringnumber >= match_data->oveccount ||
stringnumber > match_data->code->top_bracket ||

View File

@ -76,7 +76,7 @@ as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for
handling wide characters. */
#if defined PCRE2_PCRE2TEST || \
(defined SUPPORT_UTF && \
(defined SUPPORT_UNICODE && \
defined PCRE2_CODE_UNIT_WIDTH && \
PCRE2_CODE_UNIT_WIDTH == 8)
@ -106,7 +106,7 @@ const uint8_t PRIV(utf8_table4)[] = {
#endif /* UTF-8 support needed */
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
/* Table to translate from particular type value to the general value. */
@ -728,6 +728,6 @@ const ucp_type_table PRIV(utt)[] = {
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* End of pcre2_tables.c */

View File

@ -32,7 +32,7 @@ condition to cut out the tables when not needed. But don't leave
a totally empty module because some compilers barf at that.
Instead, just supply small dummy tables. */
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};
const uint8_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
@ -3628,6 +3628,6 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 58112 bytes, block = 128 */
#if UCD_BLOCK_SIZE != 128
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
#endif /* PCRE2_PCRE2TEST */

View File

@ -50,12 +50,12 @@ strings. */
#include "pcre2_internal.h"
#ifndef SUPPORT_UTF
#ifndef SUPPORT_UNICODE
/*************************************************
* Dummy function when UTF not supported *
* Dummy function when Unicode is not supported *
*************************************************/
/* This function should never be called when UTF is not supported. */
/* This function should never be called when Unicode is not supported. */
int
PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
@ -388,6 +388,6 @@ for (p = string; length-- > 0; p++)
return 0;
#endif /* CODE_UNIT_WIDTH */
}
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
/* End of pcre2_valid_utf.c */

View File

@ -103,7 +103,7 @@ while ((t = *data++) != XCL_END)
uint32_t x, y;
if (t == XCL_SINGLE)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
GETCHARINC(x, data); /* macro generates multiple statements */
@ -115,7 +115,7 @@ while ((t = *data++) != XCL_END)
}
else if (t == XCL_RANGE)
{
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
if (utf)
{
GETCHARINC(x, data); /* macro generates multiple statements */
@ -130,7 +130,7 @@ while ((t = *data++) != XCL_END)
if (c >= x && c <= y) return !negated;
}
#ifdef SUPPORT_UTF
#ifdef SUPPORT_UNICODE
else /* XCL_PROP & XCL_NOTPROP */
{
const ucd_record *prop = GET_UCD(c);
@ -262,7 +262,7 @@ while ((t = *data++) != XCL_END)
}
#else
(void)utf; /* Avoid compiler warning */
#endif /* SUPPORT_UTF */
#endif /* SUPPORT_UNICODE */
}
return negated; /* char did not match */

View File

@ -196,6 +196,7 @@ so that the PCRE2_EXP_xxx macros get set appropriately for an application, not
for building the library. */
#define PRIV(name) name
#define PCRE2_CODE_UNIT_WIDTH 0
#include "pcre2.h"
#include "pcre2posix.h"
#include "pcre2_internal.h"
@ -208,16 +209,17 @@ of PRIV avoids name clashes. */
#include "pcre2_tables.c"
#include "pcre2_ucd.c"
/* When PCRE2_CODE_UNIT_WIDTH is unset, pcre2_internal.h does not include
/* When PCRE2_CODE_UNIT_WIDTH is zero, pcre2_internal.h does not include
pcre2_intmodedep.h, which is where mode-dependent macros and structures are
defined. We can now include it for each supported code unit width. Because
PCRE2_CODE_UNIT_WIDTH was not defined before including pcre2.h, it will have
left PCRE2_SUFFIX defined as a no-op. We must re-define it appropriately while
including these files, and then restore it to a no-op. Because LINK_SIZE may be
changed in 16-bit mode and forced to 1 in 32-bit mode, the order of these
inclusions should not be changed. */
PCRE2_CODE_UNIT_WIDTH was defined as zero before including pcre2.h, it will
have left PCRE2_SUFFIX defined as a no-op. We must re-define it appropriately
while including these files, and then restore it to a no-op. Because LINK_SIZE
may be changed in 16-bit mode and forced to 1 in 32-bit mode, the order of
these inclusions should not be changed. */
#undef PCRE2_SUFFIX
#undef PCRE2_CODE_UNIT_WIDTH
#ifdef SUPPORT_PCRE8
#define PCRE2_CODE_UNIT_WIDTH 8
@ -576,7 +578,7 @@ static coptstruct coptlist[] = {
{ "pcre16", CONF_FIX, SUPPORT_16 },
{ "pcre32", CONF_FIX, SUPPORT_32 },
{ "pcre8", CONF_FIX, SUPPORT_8 },
{ "utf", CONF_INT, PCRE2_CONFIG_UTF }
{ "unicode", CONF_INT, PCRE2_CONFIG_UNICODE }
};
#define COPTLISTCOUNT sizeof(coptlist)/sizeof(coptstruct)
@ -2815,22 +2817,26 @@ pattern.
Arguments:
what code for the required information
where where to put the answer
unsetok PCRE2_ERROR_UNSET is an "expected" result
Returns: the return from pcre2_pattern_info()
*/
static int
pattern_info(int what, void *where)
pattern_info(int what, void *where, BOOL unsetok)
{
int rc;
PCRE2_PATTERN_INFO(rc, compiled_code, what, where);
if (rc >= 0) return 0;
fprintf(outfile, "Error %d from pcre2_pattern_info_%d(%d)\n", rc, test_mode,
if (rc != PCRE2_ERROR_UNSET || !unsetok)
{
fprintf(outfile, "Error %d from pcre2_pattern_info_%d(%d)\n", rc, test_mode,
what);
if (rc == PCRE2_ERROR_BADMODE)
if (rc == PCRE2_ERROR_BADMODE)
fprintf(outfile, "Running in %d-bit mode but pattern was compiled in "
"%d-bit mode\n", test_mode,
8 * (FLD(compiled_code, flags) & PCRE2_MODE_MASK));
}
return rc;
}
@ -3026,32 +3032,61 @@ if ((pat_patctl.control & CTL_INFO) != 0)
{
const void *nametable;
const uint8_t *start_bits;
BOOL match_limit_set, recursion_limit_set;
uint32_t backrefmax, bsr_convention, capture_count, first_ctype, first_cunit,
hascrorlf, jchanged, last_ctype, last_cunit, match_empty, match_limit,
maxlookbehind, minlength, nameentrysize, namecount, newline_convention,
recursion_limit;
/* These info requests may return PCRE2_ERROR_UNSET. */
switch(pattern_info(PCRE2_INFO_MATCHLIMIT, &match_limit, TRUE))
{
case 0:
match_limit_set = TRUE;
break;
case PCRE2_ERROR_UNSET:
match_limit_set = FALSE;
break;
default:
return PR_ABEND;
}
switch(pattern_info(PCRE2_INFO_RECURSIONLIMIT, &recursion_limit, TRUE))
{
case 0:
recursion_limit_set = TRUE;
break;
case PCRE2_ERROR_UNSET:
recursion_limit_set = FALSE;
break;
default:
return PR_ABEND;
}
/* These info requests should always succeed. */
if (pattern_info(PCRE2_INFO_BACKREFMAX, &backrefmax) +
pattern_info(PCRE2_INFO_BSR, &bsr_convention) +
pattern_info(PCRE2_INFO_CAPTURECOUNT, &capture_count) +
pattern_info(PCRE2_INFO_FIRSTBITMAP, &start_bits) +
pattern_info(PCRE2_INFO_FIRSTCODEUNIT, &first_cunit) +
pattern_info(PCRE2_INFO_FIRSTCODETYPE, &first_ctype) +
pattern_info(PCRE2_INFO_HASCRORLF, &hascrorlf) +
pattern_info(PCRE2_INFO_JCHANGED, &jchanged) +
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit) +
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype) +
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty) +
pattern_info(PCRE2_INFO_MATCHLIMIT, &match_limit) +
pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind) +
pattern_info(PCRE2_INFO_MINLENGTH, &minlength) +
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount) +
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize) +
pattern_info(PCRE2_INFO_NAMETABLE, &nametable) +
pattern_info(PCRE2_INFO_NEWLINE, &newline_convention) +
pattern_info(PCRE2_INFO_RECURSIONLIMIT, &recursion_limit)
if (pattern_info(PCRE2_INFO_BACKREFMAX, &backrefmax, FALSE) +
pattern_info(PCRE2_INFO_BSR, &bsr_convention, FALSE) +
pattern_info(PCRE2_INFO_CAPTURECOUNT, &capture_count, FALSE) +
pattern_info(PCRE2_INFO_FIRSTBITMAP, &start_bits, FALSE) +
pattern_info(PCRE2_INFO_FIRSTCODEUNIT, &first_cunit, FALSE) +
pattern_info(PCRE2_INFO_FIRSTCODETYPE, &first_ctype, FALSE) +
pattern_info(PCRE2_INFO_HASCRORLF, &hascrorlf, FALSE) +
pattern_info(PCRE2_INFO_JCHANGED, &jchanged, FALSE) +
pattern_info(PCRE2_INFO_LASTCODEUNIT, &last_cunit, FALSE) +
pattern_info(PCRE2_INFO_LASTCODETYPE, &last_ctype, FALSE) +
pattern_info(PCRE2_INFO_MATCHEMPTY, &match_empty, FALSE) +
pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) +
pattern_info(PCRE2_INFO_MINLENGTH, &minlength, FALSE) +
pattern_info(PCRE2_INFO_NAMECOUNT, &namecount, FALSE) +
pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &nameentrysize, FALSE) +
pattern_info(PCRE2_INFO_NAMETABLE, &nametable, FALSE) +
pattern_info(PCRE2_INFO_NEWLINE, &newline_convention, FALSE)
!= 0)
return PR_ABEND;
@ -3063,10 +3098,10 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (maxlookbehind > 0)
fprintf(outfile, "Max lookbehind = %d\n", maxlookbehind);
if (match_limit != UINT32_MAX)
if (match_limit_set)
fprintf(outfile, "Match limit = %u\n", match_limit);
if (recursion_limit != UINT32_MAX)
if (recursion_limit_set)
fprintf(outfile, "Recursion limit = %u\n", recursion_limit);
if (namecount > 0)
@ -3099,8 +3134,8 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (hascrorlf) fprintf(outfile, "Contains explicit CR or LF match\n");
if (match_empty) fprintf(outfile, "May match empty string\n");
pattern_info(PCRE2_INFO_ARGOPTIONS, &compile_options);
pattern_info(PCRE2_INFO_ALLOPTIONS, &overall_options);
pattern_info(PCRE2_INFO_ARGOPTIONS, &compile_options, FALSE);
pattern_info(PCRE2_INFO_ALLOPTIONS, &overall_options, FALSE);
/* Remove UTF/UCP if they were there only because of forbid_utf. This saves
cluttering up the verification output of non-UTF test files. */
@ -3234,7 +3269,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
if (pat_patctl.jit != 0 && (pat_patctl.control & CTL_JITVERIFY) != 0)
{
size_t jitsize;
if (pattern_info(PCRE2_INFO_JITSIZE, &jitsize) == 0)
if (pattern_info(PCRE2_INFO_JITSIZE, &jitsize, FALSE) == 0)
{
if (jitsize > 0)
fprintf(outfile, "JIT compilation was successful\n");
@ -3625,14 +3660,14 @@ if ((pat_patctl.control & CTL_MEMORY) != 0)
if (test_mode == 32) cblock_size = sizeof(pcre2_real_code_32);
#endif
(void)pattern_info(PCRE2_INFO_SIZE, &size);
(void)pattern_info(PCRE2_INFO_NAMECOUNT, &name_count);
(void)pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size);
(void)pattern_info(PCRE2_INFO_SIZE, &size, FALSE);
(void)pattern_info(PCRE2_INFO_NAMECOUNT, &name_count, FALSE);
(void)pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size, FALSE);
fprintf(outfile, "Memory allocation (code space): %d\n",
(int)(size - name_count*name_entry_size*code_unit_size - cblock_size));
if (pat_patctl.jit != 0)
{
(void)pattern_info(PCRE2_INFO_JITSIZE, &size);
(void)pattern_info(PCRE2_INFO_JITSIZE, &size, FALSE);
fprintf(outfile, "Memory allocation (JIT code): %d\n", (int)size);
}
}
@ -4452,7 +4487,7 @@ for (gmatched = 0;; gmatched++)
if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
{
uint32_t maxcapcount;
if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount) < 0)
if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0)
return PR_SKIP;
capcount = maxcapcount + 1; /* Allow for full match */
if (capcount > (int)dat_datctl.oveccount) capcount = dat_datctl.oveccount;
@ -4943,7 +4978,7 @@ printf(" newline newline type [CR, LF, CRLF, ANYCRLF, ANY]\n");
printf(" pcre8 8 bit library support enabled [0, 1]\n");
printf(" pcre16 16 bit library support enabled [0, 1]\n");
printf(" pcre32 32 bit library support enabled [0, 1]\n");
printf(" utf Unicode Transformation Format supported [0, 1]\n");
printf(" unicode Unicode and UTF support enabled [0, 1]\n");
printf(" -d set default pattern control 'debug'\n");
printf(" -dfa set default subject control 'dfa'\n");
printf(" -help show usage information\n");
@ -5057,7 +5092,7 @@ printf(" 16-bit support\n");
printf(" 32-bit support\n");
#endif
(void)PCRE2_CONFIG(PCRE2_CONFIG_UTF, &rc, sizeof(rc));
(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &rc, sizeof(rc));
if (rc != 0)
printf(" UTF support (Unicode version %s)\n", uversion);
else

14
testdata/grepoutput vendored
View File

@ -384,15 +384,15 @@ aaaaa2
010203040506
RC=0
======== STDERR ========
pcre2grep: pcre2_match() gave error -47 while matching this text:
pcre2grep: pcre2_match() gave error -45 while matching this text:
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pcre2grep: pcre2_match() gave error -47 while matching this text:
pcre2grep: pcre2_match() gave error -45 while matching this text:
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pcre2grep: Error -46, -47 or -52 means that a resource limit was exceeded.
pcre2grep: Error -44, -45 or -50 means that a resource limit was exceeded.
pcre2grep: Check your regex for nested unlimited loops.
---------------------------- Test 38 ------------------------------
This line contains a binary zero here >< for testing.
@ -510,23 +510,23 @@ In the middle of a line, PATTERN appears.
Check up on PATTERN near the end.
RC=0
---------------------------- Test 62 -----------------------------
pcre2grep: pcre2_match() gave error -47 while matching text that starts:
pcre2grep: pcre2_match() gave error -45 while matching text that starts:
This is a file of miscellaneous text that is used as test data for checking
that the pcregrep command is working correctly. The file must be more than 24K
long so that it needs more than a single read
pcre2grep: Error -46, -47 or -52 means that a resource limit was exceeded.
pcre2grep: Error -44, -45 or -50 means that a resource limit was exceeded.
pcre2grep: Check your regex for nested unlimited loops.
RC=1
---------------------------- Test 63 -----------------------------
pcre2grep: pcre2_match() gave error -52 while matching text that starts:
pcre2grep: pcre2_match() gave error -50 while matching text that starts:
This is a file of miscellaneous text that is used as test data for checking
that the pcregrep command is working correctly. The file must be more than 24K
long so that it needs more than a single read
pcre2grep: Error -46, -47 or -52 means that a resource limit was exceeded.
pcre2grep: Error -44, -45 or -50 means that a resource limit was exceeded.
pcre2grep: Check your regex for nested unlimited loops.
RC=1
---------------------------- Test 64 ------------------------------

View File

@ -888,7 +888,7 @@ Subject length lower bound = 3
a\x{123}aa\=offset=1
0: aa
a\x{123}aa\=offset=2
Error -36 (bad UTF-8 offset)
Error -35 (bad UTF-8 offset)
a\x{123}aa\=offset=3
0: aa
a\x{123}aa\=offset=4
@ -896,7 +896,7 @@ Error -36 (bad UTF-8 offset)
a\x{123}aa\=offset=5
No match
a\x{123}aa\=offset=6
Failed: error -34: bad offset value
Failed: error -33: bad offset value
/\x{1234}+/Ii,utf
Capturing subpattern count = 0

View File

@ -787,9 +787,9 @@ Subject length lower bound = 3
a\x{123}aa\=offset=4
No match
a\x{123}aa\=offset=5
Failed: error -34: bad offset value
Failed: error -33: bad offset value
a\x{123}aa\=offset=6
Failed: error -34: bad offset value
Failed: error -33: bad offset value
/\x{1234}+/Ii,utf
Capturing subpattern count = 0
@ -851,9 +851,9 @@ Subject length lower bound = 1
/a/utf
\x{10000}\=offset=1
Error -36 (bad UTF-16 offset)
Error -35 (bad UTF-16 offset)
\x{10000}ab\=offset=1
Error -36 (bad UTF-16 offset)
Error -35 (bad UTF-16 offset)
\x{10000}ab\=offset=2
0: a
\x{10000}ab\=offset=3
@ -861,7 +861,7 @@ No match
\x{10000}ab\=offset=4
No match
\x{10000}ab\=offset=5
Failed: error -34: bad offset value
Failed: error -33: bad offset value
/<2F><><EFBFBD>/utf
Failed: error -26 at offset 0: UTF-16 error: isolated low surrogate

View File

@ -779,9 +779,9 @@ Subject length lower bound = 3
a\x{123}aa\=offset=4
No match
a\x{123}aa\=offset=5
Failed: error -34: bad offset value
Failed: error -33: bad offset value
a\x{123}aa\=offset=6
Failed: error -34: bad offset value
Failed: error -33: bad offset value
/\x{1234}+/Ii,utf
Capturing subpattern count = 0
@ -851,9 +851,9 @@ No match
\x{10000}ab\=offset=3
No match
\x{10000}ab\=offset=4
Failed: error -34: bad offset value
Failed: error -33: bad offset value
\x{10000}ab\=offset=5
Failed: error -34: bad offset value
Failed: error -33: bad offset value
/<2F><><EFBFBD>/utf
Failed: error -27 at offset 0: UTF-32 error: code points 0xd800-0xdfff are not defined

44
testdata/testoutput2 vendored
View File

@ -986,7 +986,7 @@ Subject length lower bound = 4
0: abcd
1: a
2: d
copy substring 5 failed (-49): unknown or unset substring
copy substring 5 failed (-47): unknown or unset substring
/(.{20})/I
Capturing subpattern count = 1
@ -1040,9 +1040,9 @@ Subject length lower bound = 4
2: <unset>
3: f
1G a (1)
get substring 2 failed (-49): unknown or unset substring
get substring 2 failed (-47): unknown or unset substring
3G f (1)
get substring 4 failed (-49): unknown or unset substring
get substring 4 failed (-47): unknown or unset substring
0L adef
1L a
2L
@ -1055,7 +1055,7 @@ get substring 4 failed (-49): unknown or unset substring
1G bc (2)
2G bc (2)
3G f (1)
get substring 4 failed (-49): unknown or unset substring
get substring 4 failed (-47): unknown or unset substring
0L bcdef
1L bc
2L bc
@ -4370,7 +4370,7 @@ Subject length lower bound = 8
0: abcdefgh
1: cd
2: gh
copy substring 'three' failed (-49): unknown or unset substring
copy substring 'three' failed (-47): unknown or unset substring
/(?P<Tes>)(?P<Test>)/IB
------------------------------------------------------------------
@ -5737,7 +5737,7 @@ No match
0: a1
1: a1
2: a1
copy substring 'Z' failed (-49): unknown or unset substring
copy substring 'Z' failed (-47): unknown or unset substring
C a1 (2) A
/(?|(?<a>)(?<b>)(?<a>)|(?<a>)(?<b>)(?<a>))/I,dupnames
@ -5778,7 +5778,7 @@ Subject length lower bound = 2
C a (1) A
cd\=copy=A
0: cd
copy substring 'A' failed (-49): unknown or unset substring
copy substring 'A' failed (-47): unknown or unset substring
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/I,dupnames
Capturing subpattern count = 4
@ -5822,7 +5822,7 @@ No match
0: a1
1: a1
2: a1
get substring 'Z' failed (-49): unknown or unset substring
get substring 'Z' failed (-47): unknown or unset substring
G a1 (2) A
/^(?P<A>a)(?P<A>b)/I,dupnames
@ -5853,7 +5853,7 @@ Subject length lower bound = 2
G a (1) A
cd\=get=A
0: cd
get substring 'A' failed (-49): unknown or unset substring
get substring 'A' failed (-47): unknown or unset substring
/^(?P<A>a)(?P<A>b)|cd(?P<A>ef)(?P<A>gh)/I,dupnames
Capturing subpattern count = 4
@ -10446,7 +10446,7 @@ Partial match: abc
abc\=offset=3
No match
abc\=offset=4
Failed: error -34: bad offset value
Failed: error -33: bad offset value
abc\=offset=-4
** Invalid value in 'offset=-4'
@ -11129,15 +11129,15 @@ Matched, but too many substrings
/((?2))((?1))/
abc
Failed: error -51: nested recursion at the same subject position
Failed: error -49: nested recursion at the same subject position
/((?(R2)a+|(?1)b))/
aaaabcde
Failed: error -51: nested recursion at the same subject position
Failed: error -49: nested recursion at the same subject position
/(?(R)a*(?1)|((?R))b)/
aaaabcde
Failed: error -51: nested recursion at the same subject position
Failed: error -49: nested recursion at the same subject position
/(a+|(?R)b)/
Failed: error 140 at offset 7: recursion could loop indefinitely
@ -12129,11 +12129,11 @@ Subject length lower bound = 3
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
Failed: error -45: match limit exceeded
/(a+)*zz/
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -52: recursion limit exceeded
Failed: error -50: recursion limit exceeded
/(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
@ -12142,9 +12142,9 @@ Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
Failed: error -45: match limit exceeded
aaaaaaaaaaaaaz\=match_limit=60000
Failed: error -47: match limit exceeded
Failed: error -45: match limit exceeded
/(*LIMIT_MATCH=60000)(*LIMIT_MATCH=3000)(a+)*zz/I
Capturing subpattern count = 1
@ -12153,7 +12153,7 @@ Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -47: match limit exceeded
Failed: error -45: match limit exceeded
/(*LIMIT_MATCH=60000)(a+)*zz/I
Capturing subpattern count = 1
@ -12164,7 +12164,7 @@ Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=match_limit=3000
Failed: error -47: match limit exceeded
Failed: error -45: match limit exceeded
/(*LIMIT_RECURSION=10)(a+)*zz/I
Capturing subpattern count = 1
@ -12173,9 +12173,9 @@ Starting code units: a z
Last code unit = 'z'
Subject length lower bound = 2
aaaaaaaaaaaaaz
Failed: error -52: recursion limit exceeded
Failed: error -50: recursion limit exceeded
aaaaaaaaaaaaaz\=recursion_limit=1000
Failed: error -52: recursion limit exceeded
Failed: error -50: recursion limit exceeded
/(*LIMIT_RECURSION=10)(*LIMIT_RECURSION=1000)(a+)*zz/I
Capturing subpattern count = 1
@ -12195,7 +12195,7 @@ Subject length lower bound = 2
aaaaaaaaaaaaaz
No match
aaaaaaaaaaaaaz\=recursion_limit=10
Failed: error -52: recursion limit exceeded
Failed: error -50: recursion limit exceeded
# This test causes a segfault with Perl 5.18.0

26
testdata/testoutput6 vendored
View File

@ -6132,7 +6132,7 @@ No match
/^(?(2)a|(1)(2))+$/
123a
Failed: error -40: backreference condition or recursion test not supported for DFA matching
Failed: error -39: backreference condition or recursion test not supported for DFA matching
/(?<=a|bbbb)c/
ac
@ -7059,7 +7059,7 @@ Partial match: dogs
/abc\K123/
xyzabc123pqr
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/(?<=abc)123/
xyzabc123pqr
@ -7185,29 +7185,29 @@ No match
/^(?!a(*SKIP)b)/
ac
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/^(?=a(*SKIP)b|ac)/
** Failers
No match
ac
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/^(?=a(*THEN)b|ac)/
ac
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/^(?=a(*PRUNE)b)/
ab
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
** Failers
No match
ac
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/^(?(?!a(*SKIP)b))/
ac
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/(?<=abc)def/
abc\=ph
@ -7277,7 +7277,7 @@ Partial match: abc
abc\=offset=3
No match
abc\=offset=4
Failed: error -34: bad offset value
Failed: error -33: bad offset value
abc\=offset=-4
** Invalid value in 'offset=-4'
@ -7403,7 +7403,7 @@ No match
/((?2))((?1))/
abc
Failed: error -51: nested recursion at the same subject position
Failed: error -49: nested recursion at the same subject position
/(?(R)a+|(?R)b)/
aaaabcde
@ -7419,11 +7419,11 @@ Failed: error -51: nested recursion at the same subject position
/((?(R2)a+|(?1)b))/
aaaabcde
Failed: error -40: backreference condition or recursion test not supported for DFA matching
Failed: error -39: backreference condition or recursion test not supported for DFA matching
/(?(R)a*(?1)|((?R))b)/
aaaabcde
Failed: error -51: nested recursion at the same subject position
Failed: error -49: nested recursion at the same subject position
/(a+)/no_auto_possess
aaaa\=ovector=3
@ -7572,7 +7572,7 @@ Partial match: \x0d\x0d\x0d
/abcdef/
abc\=dfa_restart
Failed: error -38: invalid data in workspace for DFA restart
Failed: error -37: invalid data in workspace for DFA restart
/<H((?(?!<H|F>)(.)|(?R))++)*F>/
text <H more text <H texting more hexA0-"\xA0" hex above 7F-"\xBC" F> text xxxxx <H text F> text F> text2 <H text sample F> more text.

View File

@ -1230,7 +1230,7 @@ Partial match: the cat
/ab\Cde/utf
abXde
Failed: error -41: item unsupported for DFA matching
Failed: error -40: item unsupported for DFA matching
/(?<=ab\Cde)X/utf
Failed: error 136 at offset 10: \C is not allowed in a lookbehind assertion