Basic pcre2_compile() is working (no study, no auto-possess yet).

This commit is contained in:
Philip.Hazel 2014-06-14 18:29:51 +00:00
parent 2801d5d132
commit 1abd5a7f8d
23 changed files with 10009 additions and 624 deletions

View File

@ -275,19 +275,19 @@ COMMON_SOURCES = \
src/pcre2_jit_misc.c \
src/pcre2_maketables.c \
src/pcre2_match_data.c \
src/pcre2_newline.c \
src/pcre2_ord2utf.c \
src/pcre2_pattern_info.c \
src/pcre2_string_utils.c \
src/pcre2_substring.c \
src/pcre2_tables.c \
src/pcre2_ucd.c \
src/pcre2_ucp.h \
src/pcre2_valid_utf.c \
src/pcre2_version.c
# src/pcre2_newline.c \
# src/pcre2_ord2utf8.c \
# src/pcre2_refcount.c \
# src/pcre2_study.c \
# src/pcre2_valid_utf8.c \
# src/pcre2_xclass.c

View File

@ -2,13 +2,13 @@ dnl Process this file with autoconf to produce a configure script.
dnl NOTE FOR MAINTAINERS: Do not use minor version numbers 08 or 09 because
dnl the leading zeros may cause them to be treated as invalid octal constants
dnl if a PCRE user writes code that uses PCRE_MINOR as a number. There is now
dnl if a PCRE2 user writes code that uses PCRE2_MINOR as a number. There is now
dnl a check further down that throws an error if 08 or 09 are used.
dnl The PCRE_PRERELEASE feature is for identifying release candidates. It might
dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre2_major, [9])
m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [00])
m4_define(pcre2_prerelease, [-DEV])
m4_define(pcre2_date, [2014-99-99])
@ -125,11 +125,11 @@ AC_ARG_ENABLE(jit,
[enable Just-In-Time compiling support]),
, enable_jit=no)
# Handle --disable-pcregrep-jit (enabled by default)
AC_ARG_ENABLE(pcregrep-jit,
AS_HELP_STRING([--disable-pcregrep-jit],
[disable JIT support in pcregrep]),
, enable_pcregrep_jit=yes)
# Handle --disable-pcre2grep-jit (enabled by default)
AC_ARG_ENABLE(pcre2grep-jit,
AS_HELP_STRING([--disable-pcre2grep-jit],
[disable JIT support in pcre2grep]),
, enable_pcre2grep_jit=yes)
# Handle --enable-rebuild-chartables
AC_ARG_ENABLE(rebuild-chartables,
@ -144,28 +144,28 @@ AC_ARG_ENABLE(utf,
, enable_utf=unset)
# Handle newline options
ac_pcre_newline=lf
ac_pcre2_newline=lf
AC_ARG_ENABLE(newline-is-cr,
AS_HELP_STRING([--enable-newline-is-cr],
[use CR as newline character]),
ac_pcre_newline=cr)
ac_pcre2_newline=cr)
AC_ARG_ENABLE(newline-is-lf,
AS_HELP_STRING([--enable-newline-is-lf],
[use LF as newline character (default)]),
ac_pcre_newline=lf)
ac_pcre2_newline=lf)
AC_ARG_ENABLE(newline-is-crlf,
AS_HELP_STRING([--enable-newline-is-crlf],
[use CRLF as newline sequence]),
ac_pcre_newline=crlf)
ac_pcre2_newline=crlf)
AC_ARG_ENABLE(newline-is-anycrlf,
AS_HELP_STRING([--enable-newline-is-anycrlf],
[use CR, LF, or CRLF as newline sequence]),
ac_pcre_newline=anycrlf)
ac_pcre2_newline=anycrlf)
AC_ARG_ENABLE(newline-is-any,
AS_HELP_STRING([--enable-newline-is-any],
[use any valid Unicode newline sequence]),
ac_pcre_newline=any)
enable_newline="$ac_pcre_newline"
ac_pcre2_newline=any)
enable_newline="$ac_pcre2_newline"
# Handle --enable-bsr-anycrlf
AC_ARG_ENABLE(bsr-anycrlf,
@ -191,35 +191,35 @@ AC_ARG_ENABLE(stack-for-recursion,
[don't use stack recursion when matching]),
, enable_stack_for_recursion=yes)
# Handle --enable-pcregrep-libz
AC_ARG_ENABLE(pcregrep-libz,
AS_HELP_STRING([--enable-pcregrep-libz],
[link pcregrep with libz to handle .gz files]),
, enable_pcregrep_libz=no)
# Handle --enable-pcre2grep-libz
AC_ARG_ENABLE(pcre2grep-libz,
AS_HELP_STRING([--enable-pcre2grep-libz],
[link pcre2grep with libz to handle .gz files]),
, enable_pcre2grep_libz=no)
# Handle --enable-pcregrep-libbz2
AC_ARG_ENABLE(pcregrep-libbz2,
AS_HELP_STRING([--enable-pcregrep-libbz2],
[link pcregrep with libbz2 to handle .bz2 files]),
, enable_pcregrep_libbz2=no)
# Handle --enable-pcre2grep-libbz2
AC_ARG_ENABLE(pcre2grep-libbz2,
AS_HELP_STRING([--enable-pcre2grep-libbz2],
[link pcre2grep with libbz2 to handle .bz2 files]),
, enable_pcre2grep_libbz2=no)
# Handle --with-pcregrep-bufsize=N
AC_ARG_WITH(pcregrep-bufsize,
AS_HELP_STRING([--with-pcregrep-bufsize=N],
[pcregrep buffer size (default=20480, minimum=8192)]),
, with_pcregrep_bufsize=20480)
# Handle --with-pcre2grep-bufsize=N
AC_ARG_WITH(pcre2grep-bufsize,
AS_HELP_STRING([--with-pcre2grep-bufsize=N],
[pcre2grep buffer size (default=20480, minimum=8192)]),
, with_pcre2grep_bufsize=20480)
# Handle --enable-pcretest-libedit
AC_ARG_ENABLE(pcretest-libedit,
AS_HELP_STRING([--enable-pcretest-libedit],
[link pcretest with libedit]),
, enable_pcretest_libedit=no)
# Handle --enable-pcre2test-libedit
AC_ARG_ENABLE(pcre2test-libedit,
AS_HELP_STRING([--enable-pcre2test-libedit],
[link pcre2test with libedit]),
, enable_pcre2test_libedit=no)
# Handle --enable-pcretest-libreadline
AC_ARG_ENABLE(pcretest-libreadline,
AS_HELP_STRING([--enable-pcretest-libreadline],
[link pcretest with libreadline]),
, enable_pcretest_libreadline=no)
# Handle --enable-pcre2test-libreadline
AC_ARG_ENABLE(pcre2test-libreadline,
AS_HELP_STRING([--enable-pcre2test-libreadline],
[link pcre2test with libreadline]),
, enable_pcre2test_libreadline=no)
# Handle --with-link-size=N
AC_ARG_WITH(link-size,
@ -298,11 +298,11 @@ fi
# agree with the PCRE2_NEWLINE_xxx values in pcre2.h.
case "$enable_newline" in
cr) ac_pcre_newline_value=0 ;;
lf) ac_pcre_newline_value=1 ;;
crlf) ac_pcre_newline_value=2 ;;
any) ac_pcre_newline_value=3 ;;
anycrlf) ac_pcre_newline_value=4 ;;
cr) ac_pcre2_newline_value=1 ;;
lf) ac_pcre2_newline_value=2 ;;
crlf) ac_pcre2_newline_value=3 ;;
any) ac_pcre2_newline_value=4 ;;
anycrlf) ac_pcre2_newline_value=5 ;;
*)
AC_MSG_ERROR([invalid argument \"$enable_newline\" to --enable-newline option])
;;
@ -314,7 +314,7 @@ if test "x$enable_ebcdic_nl25" = "xyes"; then
fi
# Make sure that if enable_ebcdic is set, rebuild_chartables is also enabled.
# Also check that UTF support is not requested, because PCRE cannot handle
# Also check that UTF support is not requested, because PCRE2 cannot handle
# EBCDIC and UTF in the same build. To do so it would need to use different
# character constants depending on the mode.
#
@ -334,13 +334,13 @@ case "$with_link_size" in
esac
AH_TOP([
/* PCRE is written in Standard C, but there are a few non-standard things it
/* PCRE2 is written in Standard C, but there are a few non-standard things it
can cope with, allowing it to run on SunOS4 and other "close to standard"
systems.
In environments that support the GNU autotools, config.h.in is converted into
config.h by the "configure" script. In environments that use CMake,
config-cmake.in is converted into config.h. If you are going to build PCRE "by
config-cmake.in is converted into config.h. If you are going to build PCRE2 "by
hand" without using "configure" or CMake, you should copy the distributed
config.h.generic to config.h, and edit the macro definitions to be the way you
need them. You must then add -DHAVE_CONFIG_H to all of your compile commands,
@ -357,7 +357,7 @@ macros are listed as a commented #undef in config.h.generic. Macros such as
MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
PCRE uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
sure both macros are undefined; an emulation function will then be used. */])
@ -370,7 +370,7 @@ AC_CHECK_HEADERS([windows.h], [HAVE_WINDOWS_H=1])
AM_CONDITIONAL(WITH_PCRE8, test "x$enable_pcre8" = "xyes")
AM_CONDITIONAL(WITH_PCRE16, test "x$enable_pcre16" = "xyes")
AM_CONDITIONAL(WITH_PCRE32, test "x$enable_pcre32" = "xyes")
AM_CONDITIONAL(WITH_PCRE_CPP, test "x$enable_cpp" = "xyes")
#AM_CONDITIONAL(WITH_PCRE2_CPP, test "x$enable_cpp" = "xyes")
AM_CONDITIONAL(WITH_REBUILD_CHARTABLES, test "x$enable_rebuild_chartables" = "xyes")
AM_CONDITIONAL(WITH_JIT, test "x$enable_jit" = "xyes")
AM_CONDITIONAL(WITH_UTF, test "x$enable_utf" = "xyes")
@ -400,7 +400,7 @@ AC_CHECK_LIB([z], [gzopen], [HAVE_LIBZ=1])
# therefore missing the function definition.
# - The compiler thus generates a "C" signature for the test function.
# - The linker fails to find the "C" function.
# - PCRE fails to configure if asked to do so against libbz2.
# - PCRE2 fails to configure if asked to do so against libbz2.
#
# Solution:
#
@ -426,7 +426,7 @@ LIBS="$OLD_LIBS"
# Check for the availability of libreadline
if test "$enable_pcretest_libreadline" = "yes"; then
if test "$enable_pcre2test_libreadline" = "yes"; then
AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_H=1])
AC_CHECK_HEADERS([readline/history.h], [HAVE_HISTORY_H=1])
AC_CHECK_LIB([readline], [readline], [LIBREADLINE="-lreadline"],
@ -459,7 +459,7 @@ fi
# Check for the availability of libedit. Different distributions put its
# headers in different places. Try to cover the most common ones.
if test "$enable_pcretest_libedit" = "yes"; then
if test "$enable_pcre2test_libedit" = "yes"; then
AC_CHECK_HEADERS([editline/readline.h], [HAVE_EDITLINE_READLINE_H=1],
[AC_CHECK_HEADERS([edit/readline/readline.h], [HAVE_READLINE_READLINE_H=1],
[AC_CHECK_HEADERS([readline/readline.h], [HAVE_READLINE_READLINE_H=1])])])
@ -477,21 +477,21 @@ if test "x$enable_shared" = "xno" ; then
fi
AC_SUBST(PCRE2_STATIC_CFLAG)
# Here is where pcre specific defines are handled
# Here is where PCRE2-specific defines are handled
if test "$enable_pcre8" = "yes"; then
AC_DEFINE([SUPPORT_PCRE8], [], [
Define to any value to enable the 8 bit PCRE library.])
Define to any value to enable the 8 bit PCRE2 library.])
fi
if test "$enable_pcre16" = "yes"; then
AC_DEFINE([SUPPORT_PCRE16], [], [
Define to any value to enable the 16 bit PCRE library.])
Define to any value to enable the 16 bit PCRE2 library.])
fi
if test "$enable_pcre32" = "yes"; then
AC_DEFINE([SUPPORT_PCRE32], [], [
Define to any value to enable the 32 bit PCRE library.])
Define to any value to enable the 32 bit PCRE2 library.])
fi
# Unless running under Windows, JIT support requires pthreads.
@ -506,87 +506,87 @@ if test "$enable_jit" = "yes"; then
AC_DEFINE([SUPPORT_JIT], [], [
Define to any value to enable support for Just-In-Time compiling.])
else
enable_pcregrep_jit="no"
enable_pcre2grep_jit="no"
fi
if test "$enable_pcregrep_jit" = "yes"; then
AC_DEFINE([SUPPORT_PCREGREP_JIT], [], [
Define to any value to enable JIT support in pcregrep.])
if test "$enable_pcre2grep_jit" = "yes"; then
AC_DEFINE([SUPPORT_PCRE2GREP_JIT], [], [
Define to any value to enable JIT support in pcre2grep.])
fi
if test "$enable_utf" = "yes"; then
AC_DEFINE([SUPPORT_UTF], [], [
Define to any value to enable support for the UTF-8/16/32 Unicode encoding.
This will work even in an EBCDIC environment, but it is incompatible
with the EBCDIC macro. That is, PCRE can support *either* EBCDIC
with the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC
code *or* ASCII/UTF-8/16/32, but not both at once.])
fi
if test "$enable_stack_for_recursion" = "no"; then
AC_DEFINE([NO_RECURSE], [], [
PCRE uses recursive function calls to handle backtracking while
PCRE2 uses recursive function calls to handle backtracking while
matching. This can sometimes be a problem on systems that have
stacks of limited size. Define NO_RECURSE to any value to get a
version that doesn't use recursion in the match() function; instead
it creates its own stack by steam using pcre_recurse_malloc() to obtain
memory from the heap. For more detail, see the comments and other stuff
just above the match() function.])
it creates its own stack by steam using memory from the heap. For more
detail, see the comments and other stuff just above the match() function.])
fi
if test "$enable_pcregrep_libz" = "yes"; then
if test "$enable_pcre2grep_libz" = "yes"; then
AC_DEFINE([SUPPORT_LIBZ], [], [
Define to any value to allow pcregrep to be linked with libz, so that it is
Define to any value to allow pcre2grep to be linked with libz, so that it is
able to handle .gz files.])
fi
if test "$enable_pcregrep_libbz2" = "yes"; then
if test "$enable_pcre2grep_libbz2" = "yes"; then
AC_DEFINE([SUPPORT_LIBBZ2], [], [
Define to any value to allow pcregrep to be linked with libbz2, so that it
Define to any value to allow pcre2grep to be linked with libbz2, so that it
is able to handle .bz2 files.])
fi
if test $with_pcregrep_bufsize -lt 8192 ; then
AC_MSG_WARN([$with_pcregrep_bufsize is too small for --with-pcregrep-bufsize; using 8192])
with_pcregrep_bufsize="8192"
if test $with_pcre2grep_bufsize -lt 8192 ; then
AC_MSG_WARN([$with_pcre2grep_bufsize is too small for --with-pcre2grep-bufsize; using 8192])
with_pcre2grep_bufsize="8192"
else
if test $? -gt 1 ; then
AC_MSG_ERROR([Bad value for --with-pcregrep-bufsize])
AC_MSG_ERROR([Bad value for --with-pcre2grep-bufsize])
fi
fi
AC_DEFINE_UNQUOTED([PCREGREP_BUFSIZE], [$with_pcregrep_bufsize], [
The value of PCREGREP_BUFSIZE determines the size of buffer used by pcregrep
AC_DEFINE_UNQUOTED([PCRE2GREP_BUFSIZE], [$with_pcre2grep_bufsize], [
The value of PCRE2GREP_BUFSIZE determines the size of buffer used by pcre2grep
to hold parts of the file it is searching. This is also the minimum value.
The actual amount of memory used by pcregrep is three times this number,
The actual amount of memory used by pcre2grep is three times this number,
because it allows for the buffering of "before" and "after" lines.])
if test "$enable_pcretest_libedit" = "yes"; then
if test "$enable_pcre2test_libedit" = "yes"; then
AC_DEFINE([SUPPORT_LIBEDIT], [], [
Define to any value to allow pcretest to be linked with libedit.])
Define to any value to allow pcre2test to be linked with libedit.])
LIBREADLINE="$LIBEDIT"
elif test "$enable_pcretest_libreadline" = "yes"; then
elif test "$enable_pcre2test_libreadline" = "yes"; then
AC_DEFINE([SUPPORT_LIBREADLINE], [], [
Define to any value to allow pcretest to be linked with libreadline.])
Define to any value to allow pcre2test to be linked with libreadline.])
fi
AC_DEFINE_UNQUOTED([NEWLINE], [$ac_pcre_newline_value], [
The value of NEWLINE determines the default newline character sequence. PCRE
client programs can override this by selecting other values at run time. The
valid values are 0 (CR), 1 (LF), 2 (CRLF), 3 (ANY), and 4 (ANYCRLF).])
AC_DEFINE_UNQUOTED([NEWLINE_DEFAULT], [$ac_pcre2_newline_value], [
The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY),
and 5 (ANYCRLF).])
if test "$enable_bsr_anycrlf" = "yes"; then
AC_DEFINE([BSR_ANYCRLF], [], [
By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined (to any
value), this is changed so that backslash-R matches only CR, LF, or CRLF.
The build-time default can be overridden by the user of PCRE at runtime.])
The build-time default can be overridden by the user of PCRE2 at runtime.])
fi
AC_DEFINE_UNQUOTED([LINK_SIZE], [$with_link_size], [
The value of LINK_SIZE determines the number of bytes used to store
links as offsets within the compiled regex. The default is 2, which
allows for compiled patterns up to 64K long. This covers the vast
majority of cases. However, PCRE can also be compiled to use 3 or 4
majority of cases. However, PCRE2 can also be compiled to use 3 or 4
bytes instead. This allows for longer patterns in extreme cases.])
AC_DEFINE_UNQUOTED([PARENS_NEST_LIMIT], [$with_parens_nest_limit], [
@ -597,7 +597,7 @@ AC_DEFINE_UNQUOTED([PARENS_NEST_LIMIT], [$with_parens_nest_limit], [
AC_DEFINE_UNQUOTED([MATCH_LIMIT], [$with_match_limit], [
The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different
pcre2_match(). There is a runtime interface for setting a different
limit. The limit exists in order to catch runaway regular
expressions that take for ever to determine that they do not match.
The default is set very large so that it does not accidentally catch
@ -639,10 +639,10 @@ AH_VERBATIM([PCRE2_EXP_DEFN], [
if test "$enable_ebcdic" = "yes"; then
AC_DEFINE_UNQUOTED([EBCDIC], [], [
If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro to any value. When EBCDIC is set, PCRE
character codes, define this macro to any value. When EBCDIC is set, PCRE2
assumes that all input strings are in EBCDIC. If you do not define this
macro, PCRE will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE that supports both EBCDIC and
macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE2 that supports both EBCDIC and
UTF-8/16/32.])
fi
@ -695,65 +695,65 @@ AC_SUBST(EXTRA_LIBPCRE2_POSIX_LDFLAGS)
DISTCHECK_CONFIGURE_FLAGS="CFLAGS='' CXXFLAGS='' --enable-pcre16 --enable-pcre32 --enable-jit --enable-utf"
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
# Check that, if --enable-pcre2grep-libz or --enable-pcre2grep-libbz2 is
# specified, the relevant library is available.
if test "$enable_pcregrep_libz" = "yes"; then
if test "$enable_pcre2grep_libz" = "yes"; then
if test "$HAVE_ZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libz because zlib.h was not found"
echo "** Cannot --enable-pcre2grep-libz because zlib.h was not found"
exit 1
fi
if test "$HAVE_LIBZ" != "1"; then
echo "** Cannot --enable-pcregrep-libz because libz was not found"
echo "** Cannot --enable-pcre2grep-libz because libz was not found"
exit 1
fi
LIBZ="-lz"
fi
AC_SUBST(LIBZ)
if test "$enable_pcregrep_libbz2" = "yes"; then
if test "$enable_pcre2grep_libbz2" = "yes"; then
if test "$HAVE_BZLIB_H" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because bzlib.h was not found"
echo "** Cannot --enable-pcre2grep-libbz2 because bzlib.h was not found"
exit 1
fi
if test "$HAVE_LIBBZ2" != "1"; then
echo "** Cannot --enable-pcregrep-libbz2 because libbz2 was not found"
echo "** Cannot --enable-pcre2grep-libbz2 because libbz2 was not found"
exit 1
fi
LIBBZ2="-lbz2"
fi
AC_SUBST(LIBBZ2)
# Similarly for --enable-pcretest-readline
# Similarly for --enable-pcre2test-libedit and --enable-pcre2test-libreadline
if test "$enable_pcretest_libedit" = "yes"; then
if test "$enable_pcretest_libreadline" = "yes"; then
echo "** Cannot use both --enable-pcretest-libedit and --enable-pcretest-readline"
if test "$enable_pcre2test_libedit" = "yes"; then
if test "$enable_pcre2test_libreadline" = "yes"; then
echo "** Cannot use both --enable-pcre2test-libedit and --enable-pcre2test-readline"
exit 1
fi
if test "$HAVE_EDITLINE_READLINE_H" != "1" -a \
"$HAVE_READLINE_READLINE_H" != "1"; then
echo "** Cannot --enable-pcretest-libedit because neither editline/readline.h"
echo "** Cannot --enable-pcre2test-libedit because neither editline/readline.h"
echo "** nor readline/readline.h was found."
exit 1
fi
if test -z "$LIBEDIT"; then
echo "** Cannot --enable-pcretest-libedit because libedit library was not found."
echo "** Cannot --enable-pcre2test-libedit because libedit library was not found."
exit 1
fi
fi
if test "$enable_pcretest_libreadline" = "yes"; then
if test "$enable_pcre2test_libreadline" = "yes"; then
if test "$HAVE_READLINE_H" != "1"; then
echo "** Cannot --enable-pcretest-readline because readline/readline.h was not found."
echo "** Cannot --enable-pcre2test-readline because readline/readline.h was not found."
exit 1
fi
if test "$HAVE_HISTORY_H" != "1"; then
echo "** Cannot --enable-pcretest-readline because readline/history.h was not found."
echo "** Cannot --enable-pcre2test-readline because readline/history.h was not found."
exit 1
fi
if test -z "$LIBREADLINE"; then
echo "** Cannot --enable-pcretest-readline because readline library was not found."
echo "** Cannot --enable-pcre2test-readline because readline library was not found."
exit 1
fi
fi
@ -868,12 +868,12 @@ $PACKAGE-$VERSION configuration summary:
Match limit recursion ........... : ${with_match_limit_recursion}
Build shared libs ............... : ${enable_shared}
Build static libs ............... : ${enable_static}
Use JIT in pcregrep ............. : ${enable_pcregrep_jit}
Buffer size for pcregrep ........ : ${with_pcregrep_bufsize}
Link pcregrep with libz ......... : ${enable_pcregrep_libz}
Link pcregrep with libbz2 ....... : ${enable_pcregrep_libbz2}
Link pcretest with libedit ...... : ${enable_pcretest_libedit}
Link pcretest with libreadline .. : ${enable_pcretest_libreadline}
Use JIT in pcre2grep ............ : ${enable_pcre2grep_jit}
Buffer size for pcre2grep ....... : ${with_pcre2grep_bufsize}
Link pcre2grep with libz ........ : ${enable_pcre2grep_libz}
Link pcre2grep with libbz2 ...... : ${enable_pcre2grep_libbz2}
Link pcre2test with libedit ..... : ${enable_pcre2test_libedit}
Link pcre2test with libreadline . : ${enable_pcre2test_libreadline}
Valgrind support ................ : ${enable_valgrind}
Code coverage ................... : ${enable_coverage}

View File

@ -420,6 +420,7 @@ about the pattern:
flipbytes flip endianness
/BB fullbincode show binary code with lengths
/I info show info about compiled pattern
hex pattern is coded in hexadecimal
jit[=<number>] use JIT
locale=<name> use this locale
memory show memory used
@ -430,6 +431,7 @@ about the pattern:
save=<file name> save compiled pattern
stackguard=<number> test the stackguard feature
tables=[0|1|2] select internal tables
use_length use the pattern's length
.sp
The effects of these modifiers are described in the following sections.
FIXME: Give more examples.
@ -481,6 +483,27 @@ specified. See also the section about saving and reloading compiled patterns
below.
.
.
.SS "Specifying a pattern in hex"
.rs
.sp
The \fBhex\fP modifier specifies that the characters of the pattern are to be
interpreted as pairs of hexadecimal digits. White space is permitted between
pairs. For example:
.sp
/ab 32 59/hex
.sp
This feature is provided as a way of creating patterns that contain binary zero
characters. When \fBhex\fP is set, it implies \fBuse_length\fP.
.
.
.SS "Using the pattern's length"
.rs
.sp
By default, \fBpcre2test\fP passes patterns as zero-terminated strings to
\fBpcre2_compile()\fP, giving the length as -1. If \fBuse_length\fP is set, the
length of the pattern is passed. This is implied if \fBhex\fP is set.
.
.
.SS "JIT compilation"
.rs
.sp
@ -595,38 +618,6 @@ letters, digits, spaces, etc. Setting alternate character tables and a locale
are mutually exclusive.
.
.
.SS "Locking out certain modifiers"
.rs
.sp
FIXME FIXME
PCRE can be compiled with or without support for certain features such as
UTF-8/16/32 or Unicode properties. Accordingly, the standard tests are split up
into a number of different files that are selected for running depending on
which features are available. When updating the tests, it is all too easy to
put a new test into the wrong file by mistake; for example, to put a test that
requires UTF support into a file that is used when it is not available. To help
detect such mistakes as early as possible, there is a facility for locking out
specific modifiers. If an input line for \fBpcre2test\fP starts with the string
"< forbid " the following sequence of characters is taken as a list of
forbidden modifiers. For example, in the test files that must not use UTF or
Unicode property support, this line appears:
.sp
< forbid 8W
.sp
This locks out the /8 and /W modifiers. An immediate error is given if they are
subsequently encountered. If the character string contains < but not >, all the
multi-character modifiers that begin with < are locked out. Otherwise, such
modifiers must be explicitly listed, for example:
.sp
< forbid <JS><cr>
.sp
There must be a single space between < and "forbid" for this feature to be
recognised. If there is not, the line is interpreted either as a request to
re-load a pre-compiled pattern (see "SAVING AND RELOADING COMPILED PATTERNS"
below) or, if there is another < character, as a pattern that uses < as its
delimiter.
.
.
.SS "Setting certain match controls"
.rs
.sp
@ -653,6 +644,7 @@ defaults, set them in a \fB#subject\fP command.
The modifiers that can appear in subject lines and the \fB#subject\fP
command are of two types.
.
.
.SS "Setting match options"
.rs
.sp
@ -1199,6 +1191,6 @@ Cambridge CB2 3QH, England.
.rs
.sp
.nf
Last updated: 13 May 2014
Last updated: 08 June 2014
Copyright (c) 1997-2014 University of Cambridge.
.fi

View File

@ -7,7 +7,7 @@ includedir=${prefix}/include
Name: libpcre2-posix
Description: Posix compatible interface to libpcre2-8
Version: 9.00-DEV
Version: 10.00-DEV
Libs: -L${libdir} -lpcre2-posix
Cflags: -I${includedir} @PCRE_STATIC_CFLAG@
Requires.private: libpcre2-8

View File

@ -2,13 +2,13 @@
/* src/config.h.in. Generated from configure.ac by autoheader. */
/* PCRE is written in Standard C, but there are a few non-standard things it
/* PCRE2 is written in Standard C, but there are a few non-standard things it
can cope with, allowing it to run on SunOS4 and other "close to standard"
systems.
In environments that support the GNU autotools, config.h.in is converted into
config.h by the "configure" script. In environments that use CMake,
config-cmake.in is converted into config.h. If you are going to build PCRE "by
config-cmake.in is converted into config.h. If you are going to build PCRE2 "by
hand" without using "configure" or CMake, you should copy the distributed
config.h.generic to config.h, and edit the macro definitions to be the way you
need them. You must then add -DHAVE_CONFIG_H to all of your compile commands,
@ -25,21 +25,22 @@ macros are listed as a commented #undef in config.h.generic. Macros such as
MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
PCRE uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
sure both macros are undefined; an emulation function will then be used. */
/* By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined (to any
value), this is changed so that backslash-R matches only CR, LF, or CRLF.
The build-time default can be overridden by the user of PCRE at runtime. */
The build-time default can be overridden by the user of PCRE2 at runtime.
*/
/* #undef BSR_ANYCRLF */
/* If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro to any value. When EBCDIC is set, PCRE
character codes, define this macro to any value. When EBCDIC is set, PCRE2
assumes that all input strings are in EBCDIC. If you do not define this
macro, PCRE will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE that supports both EBCDIC and
macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE2 that supports both EBCDIC and
UTF-8/16/32. */
/* #undef EBCDIC */
@ -126,8 +127,8 @@ sure both macros are undefined; an emulation function will then be used. */
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
for longer patterns in extreme cases. */
However, PCRE2 can also be compiled to use 3 or 4 bytes instead. This
allows for longer patterns in extreme cases. */
#define LINK_SIZE 2
/* Define to the sub-directory in which libtool stores uninstalled libraries.
@ -136,7 +137,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different limit.
pcre2_match(). There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take
for ever to determine that they do not match. The default is set very large
so that it does not accidentally catch legitimate cases. */
@ -162,19 +163,18 @@ sure both macros are undefined; an emulation function will then be used. */
overflow caused by enormously large patterns. */
#define MAX_NAME_SIZE 32
/* The value of NEWLINE determines the default newline character sequence.
PCRE client programs can override this by selecting other values at run
time. The valid values are 0 (CR), 1 (LF), 2 (CRLF), 3 (ANY), and 4
/* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
(ANYCRLF). */
#define NEWLINE 1
#define NEWLINE_DEFAULT 2
/* PCRE uses recursive function calls to handle backtracking while matching.
/* PCRE2 uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited
size. Define NO_RECURSE to any value to get a version that doesn't use
recursion in the match() function; instead it creates its own stack by
steam using pcre_recurse_malloc() to obtain memory from the heap. For more
detail, see the comments and other stuff just above the match() function.
*/
steam using memory from the heap. For more detail, see the comments and
other stuff just above the match() function. */
/* #undef NO_RECURSE */
/* Name of package */
@ -187,7 +187,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE2 9.00-DEV"
#define PACKAGE_STRING "PCRE2 10.00-DEV"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2"
@ -196,13 +196,20 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "9.00-DEV"
#define PACKAGE_VERSION "10.00-DEV"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system
stack that is used while compiling a pattern. */
#define PARENS_NEST_LIMIT 250
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
pcre2grep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcre2grep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
#define PCRE2GREP_BUFSIZE 20480
/* to make a symbol visible */
#define PCRE2POSIX_EXP_DECL extern __attribute__ ((visibility ("default")))
@ -227,13 +234,6 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value if linking statically (TODO: make nice with Libtool) */
#define PCRE2_STATIC 1
/* The value of PCREGREP_BUFSIZE determines the size of buffer used by
pcregrep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcregrep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
#define PCREGREP_BUFSIZE 20480
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
/* #undef PTHREAD_CREATE_JOINABLE */
@ -244,35 +244,35 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value to enable support for Just-In-Time compiling. */
/* #undef SUPPORT_JIT */
/* Define to any value to allow pcregrep to be linked with libbz2, so that it
/* Define to any value to allow pcre2grep to be linked with libbz2, so that it
is able to handle .bz2 files. */
#define SUPPORT_LIBBZ2 /**/
/* Define to any value to allow pcretest to be linked with libedit. */
/* Define to any value to allow pcre2test to be linked with libedit. */
/* #undef SUPPORT_LIBEDIT */
/* Define to any value to allow pcretest to be linked with libreadline. */
/* Define to any value to allow pcre2test to be linked with libreadline. */
#define SUPPORT_LIBREADLINE /**/
/* Define to any value to allow pcregrep to be linked with libz, so that it is
able to handle .gz files. */
/* Define to any value to allow pcre2grep to be linked with libz, so that it
is able to handle .gz files. */
#define SUPPORT_LIBZ /**/
/* Define to any value to enable the 16 bit PCRE library. */
/* Define to any value to enable the 16 bit PCRE2 library. */
#define SUPPORT_PCRE16 /**/
/* Define to any value to enable the 32 bit PCRE library. */
/* Define to any value to enable JIT support in pcre2grep. */
/* #undef SUPPORT_PCRE2GREP_JIT */
/* Define to any value to enable the 32 bit PCRE2 library. */
#define SUPPORT_PCRE32 /**/
/* Define to any value to enable the 8 bit PCRE library. */
/* Define to any value to enable the 8 bit PCRE2 library. */
#define SUPPORT_PCRE8 /**/
/* Define to any value to enable JIT support in pcregrep. */
/* #undef SUPPORT_PCREGREP_JIT */
/* Define to any value to enable support for the UTF-8/16/32 Unicode encoding.
This will work even in an EBCDIC environment, but it is incompatible with
the EBCDIC macro. That is, PCRE can support *either* EBCDIC code *or*
the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or*
ASCII/UTF-8/16/32, but not both at once. */
#define SUPPORT_UTF /**/
@ -280,7 +280,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define SUPPORT_VALGRIND /**/
/* Version number of package */
#define VERSION "9.00-DEV"
#define VERSION "10.00-DEV"
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */

View File

@ -1,13 +1,13 @@
/* src/config.h.in. Generated from configure.ac by autoheader. */
/* PCRE is written in Standard C, but there are a few non-standard things it
/* PCRE2 is written in Standard C, but there are a few non-standard things it
can cope with, allowing it to run on SunOS4 and other "close to standard"
systems.
In environments that support the GNU autotools, config.h.in is converted into
config.h by the "configure" script. In environments that use CMake,
config-cmake.in is converted into config.h. If you are going to build PCRE "by
config-cmake.in is converted into config.h. If you are going to build PCRE2 "by
hand" without using "configure" or CMake, you should copy the distributed
config.h.generic to config.h, and edit the macro definitions to be the way you
need them. You must then add -DHAVE_CONFIG_H to all of your compile commands,
@ -24,21 +24,22 @@ macros are listed as a commented #undef in config.h.generic. Macros such as
MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are
surrounded by #ifndef/#endif lines so that the value can be overridden by -D.
PCRE uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if
HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make
sure both macros are undefined; an emulation function will then be used. */
/* By default, the \R escape sequence matches any Unicode line ending
character or sequence of characters. If BSR_ANYCRLF is defined (to any
value), this is changed so that backslash-R matches only CR, LF, or CRLF.
The build-time default can be overridden by the user of PCRE at runtime. */
The build-time default can be overridden by the user of PCRE2 at runtime.
*/
#undef BSR_ANYCRLF
/* If you are compiling for a system that uses EBCDIC instead of ASCII
character codes, define this macro to any value. When EBCDIC is set, PCRE
character codes, define this macro to any value. When EBCDIC is set, PCRE2
assumes that all input strings are in EBCDIC. If you do not define this
macro, PCRE will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE that supports both EBCDIC and
macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It
is not possible to build a version of PCRE2 that supports both EBCDIC and
UTF-8/16/32. */
#undef EBCDIC
@ -125,8 +126,8 @@ sure both macros are undefined; an emulation function will then be used. */
/* The value of LINK_SIZE determines the number of bytes used to store links
as offsets within the compiled regex. The default is 2, which allows for
compiled patterns up to 64K long. This covers the vast majority of cases.
However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows
for longer patterns in extreme cases. */
However, PCRE2 can also be compiled to use 3 or 4 bytes instead. This
allows for longer patterns in extreme cases. */
#undef LINK_SIZE
/* Define to the sub-directory in which libtool stores uninstalled libraries.
@ -135,7 +136,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* The value of MATCH_LIMIT determines the default number of times the
internal match() function can be called during a single execution of
pcre_exec(). There is a runtime interface for setting a different limit.
pcre2_match(). There is a runtime interface for setting a different limit.
The limit exists in order to catch runaway regular expressions that take
for ever to determine that they do not match. The default is set very large
so that it does not accidentally catch legitimate cases. */
@ -161,19 +162,18 @@ sure both macros are undefined; an emulation function will then be used. */
overflow caused by enormously large patterns. */
#undef MAX_NAME_SIZE
/* The value of NEWLINE determines the default newline character sequence.
PCRE client programs can override this by selecting other values at run
time. The valid values are 0 (CR), 1 (LF), 2 (CRLF), 3 (ANY), and 4
/* The value of NEWLINE_DEFAULT determines the default newline character
sequence. PCRE2 client programs can override this by selecting other values
at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5
(ANYCRLF). */
#undef NEWLINE
#undef NEWLINE_DEFAULT
/* PCRE uses recursive function calls to handle backtracking while matching.
/* PCRE2 uses recursive function calls to handle backtracking while matching.
This can sometimes be a problem on systems that have stacks of limited
size. Define NO_RECURSE to any value to get a version that doesn't use
recursion in the match() function; instead it creates its own stack by
steam using pcre_recurse_malloc() to obtain memory from the heap. For more
detail, see the comments and other stuff just above the match() function.
*/
steam using memory from the heap. For more detail, see the comments and
other stuff just above the match() function. */
#undef NO_RECURSE
/* Name of package */
@ -202,6 +202,13 @@ sure both macros are undefined; an emulation function will then be used. */
stack that is used while compiling a pattern. */
#undef PARENS_NEST_LIMIT
/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by
pcre2grep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcre2grep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
#undef PCRE2GREP_BUFSIZE
/* to make a symbol visible */
#undef PCRE2POSIX_EXP_DECL
@ -226,13 +233,6 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value if linking statically (TODO: make nice with Libtool) */
#undef PCRE2_STATIC
/* The value of PCREGREP_BUFSIZE determines the size of buffer used by
pcregrep to hold parts of the file it is searching. This is also the
minimum value. The actual amount of memory used by pcregrep is three times
this number, because it allows for the buffering of "before" and "after"
lines. */
#undef PCREGREP_BUFSIZE
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
#undef PTHREAD_CREATE_JOINABLE
@ -243,35 +243,35 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value to enable support for Just-In-Time compiling. */
#undef SUPPORT_JIT
/* Define to any value to allow pcregrep to be linked with libbz2, so that it
/* Define to any value to allow pcre2grep to be linked with libbz2, so that it
is able to handle .bz2 files. */
#undef SUPPORT_LIBBZ2
/* Define to any value to allow pcretest to be linked with libedit. */
/* Define to any value to allow pcre2test to be linked with libedit. */
#undef SUPPORT_LIBEDIT
/* Define to any value to allow pcretest to be linked with libreadline. */
/* Define to any value to allow pcre2test to be linked with libreadline. */
#undef SUPPORT_LIBREADLINE
/* Define to any value to allow pcregrep to be linked with libz, so that it is
able to handle .gz files. */
/* Define to any value to allow pcre2grep to be linked with libz, so that it
is able to handle .gz files. */
#undef SUPPORT_LIBZ
/* Define to any value to enable the 16 bit PCRE library. */
/* Define to any value to enable the 16 bit PCRE2 library. */
#undef SUPPORT_PCRE16
/* Define to any value to enable the 32 bit PCRE library. */
/* Define to any value to enable JIT support in pcre2grep. */
#undef SUPPORT_PCRE2GREP_JIT
/* Define to any value to enable the 32 bit PCRE2 library. */
#undef SUPPORT_PCRE32
/* Define to any value to enable the 8 bit PCRE library. */
/* Define to any value to enable the 8 bit PCRE2 library. */
#undef SUPPORT_PCRE8
/* Define to any value to enable JIT support in pcregrep. */
#undef SUPPORT_PCREGREP_JIT
/* Define to any value to enable support for the UTF-8/16/32 Unicode encoding.
This will work even in an EBCDIC environment, but it is incompatible with
the EBCDIC macro. That is, PCRE can support *either* EBCDIC code *or*
the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or*
ASCII/UTF-8/16/32, but not both at once. */
#undef SUPPORT_UTF

View File

@ -41,7 +41,7 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE2 version information. */
#define PCRE2_MAJOR 9
#define PCRE2_MAJOR 10
#define PCRE2_MINOR 00
#define PCRE2_PRERELEASE -DEV
#define PCRE2_DATE 2014-99-99
@ -138,86 +138,83 @@ D is inspected during pcre2_dfa_exec() execution
/* Newline and \R settings, for use in the compile context. */
#define PCRE2_NEWLINE_DEFAULT 0
#define PCRE2_NEWLINE_CR 1
#define PCRE2_NEWLINE_LF 2
#define PCRE2_NEWLINE_CRLF 3
#define PCRE2_NEWLINE_ANY 4
#define PCRE2_NEWLINE_ANYCRLF 5
#define PCRE2_BSR_DEFAULT 0
#define PCRE2_BSR_UNICODE 1
#define PCRE2_BSR_ANYCRLF 2
/* Match-time and get/set-time error codes */
/* Error codes: no match and partial match are "expected" errors. */
#define PCRE2_ERROR_NOMATCH (-1)
#define PCRE2_ERROR_PARTIAL (-2)
#define PCRE2_ERROR_BADCOUNT (-2)
#define PCRE2_ERROR_BADENDIANNESS (-3)
#define PCRE2_ERROR_BADLENGTH (-4)
#define PCRE2_ERROR_BADMAGIC (-5)
#define PCRE2_ERROR_BADMODE (-6)
#define PCRE2_ERROR_BADOFFSET (-7)
#define PCRE2_ERROR_BADOPTION (-8)
#define PCRE2_ERROR_BADUTF (-9)
#define PCRE2_ERROR_BADUTF_OFFSET (-10)
#define PCRE2_ERROR_CALLOUT (-11) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_INTERNAL (-12)
#define PCRE2_ERROR_JIT_BADOPTION (-13)
#define PCRE2_ERROR_JIT_STACKLIMIT (-14)
#define PCRE2_ERROR_MATCHLIMIT (-15)
#define PCRE2_ERROR_NOMEMORY (-16)
#define PCRE2_ERROR_NOSUBSTRING (-17)
#define PCRE2_ERROR_NULL (-18)
#define PCRE2_ERROR_PARTIAL (-19)
#define PCRE2_ERROR_RECURSELOOP (-20)
#define PCRE2_ERROR_RECURSIONLIMIT (-21)
#define PCRE2_ERROR_UNKNOWN_OPCODE (-22)
#define PCRE2_ERROR_UNSET (-23)
/* Error codes for UTF-8 validity checks */
#define PCRE2_ERROR_DFA_BADRESTART (-30)
#define PCRE2_ERROR_DFA_RECURSE (-31)
#define PCRE2_ERROR_DFA_UCOND (-32)
#define PCRE2_ERROR_DFA_UITEM (-33)
#define PCRE2_ERROR_DFA_UMLIMIT (-34)
#define PCRE2_ERROR_DFA_WSSIZE (-35)
#define PCRE2_ERROR_UTF8_ERR1 (-3)
#define PCRE2_ERROR_UTF8_ERR2 (-4)
#define PCRE2_ERROR_UTF8_ERR3 (-5)
#define PCRE2_ERROR_UTF8_ERR4 (-6)
#define PCRE2_ERROR_UTF8_ERR5 (-7)
#define PCRE2_ERROR_UTF8_ERR6 (-8)
#define PCRE2_ERROR_UTF8_ERR7 (-9)
#define PCRE2_ERROR_UTF8_ERR8 (-10)
#define PCRE2_ERROR_UTF8_ERR9 (-11)
#define PCRE2_ERROR_UTF8_ERR10 (-12)
#define PCRE2_ERROR_UTF8_ERR11 (-13)
#define PCRE2_ERROR_UTF8_ERR12 (-14)
#define PCRE2_ERROR_UTF8_ERR13 (-15)
#define PCRE2_ERROR_UTF8_ERR14 (-16)
#define PCRE2_ERROR_UTF8_ERR15 (-17)
#define PCRE2_ERROR_UTF8_ERR16 (-18)
#define PCRE2_ERROR_UTF8_ERR17 (-19)
#define PCRE2_ERROR_UTF8_ERR18 (-20)
#define PCRE2_ERROR_UTF8_ERR19 (-21)
#define PCRE2_ERROR_UTF8_ERR20 (-22)
#define PCRE2_ERROR_UTF8_ERR21 (-23)
/* Error codes for UTF-16 validity checks */
/* Specific error codes for UTF-8 validity checks */
#define PCRE2_ERROR_UTF16_ERR1 (-24)
#define PCRE2_ERROR_UTF16_ERR2 (-25)
#define PCRE2_ERROR_UTF16_ERR3 (-26)
#define PCRE2_ERROR_UTF8_ERR1 (-41)
#define PCRE2_ERROR_UTF8_ERR2 (-42)
#define PCRE2_ERROR_UTF8_ERR3 (-43)
#define PCRE2_ERROR_UTF8_ERR4 (-44)
#define PCRE2_ERROR_UTF8_ERR5 (-45)
#define PCRE2_ERROR_UTF8_ERR6 (-46)
#define PCRE2_ERROR_UTF8_ERR7 (-47)
#define PCRE2_ERROR_UTF8_ERR8 (-48)
#define PCRE2_ERROR_UTF8_ERR9 (-49)
#define PCRE2_ERROR_UTF8_ERR10 (-50)
#define PCRE2_ERROR_UTF8_ERR11 (-51)
#define PCRE2_ERROR_UTF8_ERR12 (-52)
#define PCRE2_ERROR_UTF8_ERR13 (-53)
#define PCRE2_ERROR_UTF8_ERR14 (-54)
#define PCRE2_ERROR_UTF8_ERR15 (-55)
#define PCRE2_ERROR_UTF8_ERR16 (-56)
#define PCRE2_ERROR_UTF8_ERR17 (-57)
#define PCRE2_ERROR_UTF8_ERR18 (-58)
#define PCRE2_ERROR_UTF8_ERR19 (-59)
#define PCRE2_ERROR_UTF8_ERR20 (-60)
#define PCRE2_ERROR_UTF8_ERR21 (-61)
/* Error codes for UTF-32 validity checks */
/* Specific error codes for UTF-16 validity checks */
#define PCRE2_ERROR_UTF32_ERR1 (-27)
#define PCRE2_ERROR_UTF32_ERR2 (-28)
#define PCRE2_ERROR_UTF16_ERR1 (-62)
#define PCRE2_ERROR_UTF16_ERR2 (-63)
#define PCRE2_ERROR_UTF16_ERR3 (-64)
/* Error codes for pcre2[_dfa]_match() */
/* Specific error codes for UTF-32 validity checks */
#define PCRE2_ERROR_UTF32_ERR1 (-65)
#define PCRE2_ERROR_UTF32_ERR3 (-66)
#define PCRE2_ERROR_BADCOUNT (-29)
#define PCRE2_ERROR_BADENDIANNESS (-30)
#define PCRE2_ERROR_BADLENGTH (-31)
#define PCRE2_ERROR_BADMAGIC (-32)
#define PCRE2_ERROR_BADMODE (-33)
#define PCRE2_ERROR_BADOFFSET (-34)
#define PCRE2_ERROR_BADOPTION (-35)
#define PCRE2_ERROR_BADUTF_OFFSET (-36)
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_DFA_BADRESTART (-38)
#define PCRE2_ERROR_DFA_RECURSE (-39)
#define PCRE2_ERROR_DFA_UCOND (-40)
#define PCRE2_ERROR_DFA_UITEM (-41)
#define PCRE2_ERROR_DFA_UMLIMIT (-42)
#define PCRE2_ERROR_DFA_WSSIZE (-43)
#define PCRE2_ERROR_INTERNAL (-44)
#define PCRE2_ERROR_JIT_BADOPTION (-45)
#define PCRE2_ERROR_JIT_STACKLIMIT (-46)
#define PCRE2_ERROR_MATCHLIMIT (-47)
#define PCRE2_ERROR_NOMEMORY (-48)
#define PCRE2_ERROR_NOSUBSTRING (-49)
#define PCRE2_ERROR_NULL (-50)
#define PCRE2_ERROR_RECURSELOOP (-51)
#define PCRE2_ERROR_RECURSIONLIMIT (-52)
#define PCRE2_ERROR_UNKNOWN_OPCODE (-53)
#define PCRE2_ERROR_UNSET (-54)
/* Request types for pcre2_pattern_info() */

View File

@ -138,86 +138,83 @@ D is inspected during pcre2_dfa_exec() execution
/* Newline and \R settings, for use in the compile context. */
#define PCRE2_NEWLINE_DEFAULT 0
#define PCRE2_NEWLINE_CR 1
#define PCRE2_NEWLINE_LF 2
#define PCRE2_NEWLINE_CRLF 3
#define PCRE2_NEWLINE_ANY 4
#define PCRE2_NEWLINE_ANYCRLF 5
#define PCRE2_BSR_DEFAULT 0
#define PCRE2_BSR_UNICODE 1
#define PCRE2_BSR_ANYCRLF 2
/* Match-time and get/set-time error codes */
/* Error codes: no match and partial match are "expected" errors. */
#define PCRE2_ERROR_NOMATCH (-1)
#define PCRE2_ERROR_PARTIAL (-2)
#define PCRE2_ERROR_BADCOUNT (-2)
#define PCRE2_ERROR_BADENDIANNESS (-3)
#define PCRE2_ERROR_BADLENGTH (-4)
#define PCRE2_ERROR_BADMAGIC (-5)
#define PCRE2_ERROR_BADMODE (-6)
#define PCRE2_ERROR_BADOFFSET (-7)
#define PCRE2_ERROR_BADOPTION (-8)
#define PCRE2_ERROR_BADUTF (-9)
#define PCRE2_ERROR_BADUTF_OFFSET (-10)
#define PCRE2_ERROR_CALLOUT (-11) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_INTERNAL (-12)
#define PCRE2_ERROR_JIT_BADOPTION (-13)
#define PCRE2_ERROR_JIT_STACKLIMIT (-14)
#define PCRE2_ERROR_MATCHLIMIT (-15)
#define PCRE2_ERROR_NOMEMORY (-16)
#define PCRE2_ERROR_NOSUBSTRING (-17)
#define PCRE2_ERROR_NULL (-18)
#define PCRE2_ERROR_PARTIAL (-19)
#define PCRE2_ERROR_RECURSELOOP (-20)
#define PCRE2_ERROR_RECURSIONLIMIT (-21)
#define PCRE2_ERROR_UNKNOWN_OPCODE (-22)
#define PCRE2_ERROR_UNSET (-23)
/* Error codes for UTF-8 validity checks */
#define PCRE2_ERROR_DFA_BADRESTART (-30)
#define PCRE2_ERROR_DFA_RECURSE (-31)
#define PCRE2_ERROR_DFA_UCOND (-32)
#define PCRE2_ERROR_DFA_UITEM (-33)
#define PCRE2_ERROR_DFA_UMLIMIT (-34)
#define PCRE2_ERROR_DFA_WSSIZE (-35)
#define PCRE2_ERROR_UTF8_ERR1 (-3)
#define PCRE2_ERROR_UTF8_ERR2 (-4)
#define PCRE2_ERROR_UTF8_ERR3 (-5)
#define PCRE2_ERROR_UTF8_ERR4 (-6)
#define PCRE2_ERROR_UTF8_ERR5 (-7)
#define PCRE2_ERROR_UTF8_ERR6 (-8)
#define PCRE2_ERROR_UTF8_ERR7 (-9)
#define PCRE2_ERROR_UTF8_ERR8 (-10)
#define PCRE2_ERROR_UTF8_ERR9 (-11)
#define PCRE2_ERROR_UTF8_ERR10 (-12)
#define PCRE2_ERROR_UTF8_ERR11 (-13)
#define PCRE2_ERROR_UTF8_ERR12 (-14)
#define PCRE2_ERROR_UTF8_ERR13 (-15)
#define PCRE2_ERROR_UTF8_ERR14 (-16)
#define PCRE2_ERROR_UTF8_ERR15 (-17)
#define PCRE2_ERROR_UTF8_ERR16 (-18)
#define PCRE2_ERROR_UTF8_ERR17 (-19)
#define PCRE2_ERROR_UTF8_ERR18 (-20)
#define PCRE2_ERROR_UTF8_ERR19 (-21)
#define PCRE2_ERROR_UTF8_ERR20 (-22)
#define PCRE2_ERROR_UTF8_ERR21 (-23)
/* Error codes for UTF-16 validity checks */
/* Specific error codes for UTF-8 validity checks */
#define PCRE2_ERROR_UTF16_ERR1 (-24)
#define PCRE2_ERROR_UTF16_ERR2 (-25)
#define PCRE2_ERROR_UTF16_ERR3 (-26)
#define PCRE2_ERROR_UTF8_ERR1 (-41)
#define PCRE2_ERROR_UTF8_ERR2 (-42)
#define PCRE2_ERROR_UTF8_ERR3 (-43)
#define PCRE2_ERROR_UTF8_ERR4 (-44)
#define PCRE2_ERROR_UTF8_ERR5 (-45)
#define PCRE2_ERROR_UTF8_ERR6 (-46)
#define PCRE2_ERROR_UTF8_ERR7 (-47)
#define PCRE2_ERROR_UTF8_ERR8 (-48)
#define PCRE2_ERROR_UTF8_ERR9 (-49)
#define PCRE2_ERROR_UTF8_ERR10 (-50)
#define PCRE2_ERROR_UTF8_ERR11 (-51)
#define PCRE2_ERROR_UTF8_ERR12 (-52)
#define PCRE2_ERROR_UTF8_ERR13 (-53)
#define PCRE2_ERROR_UTF8_ERR14 (-54)
#define PCRE2_ERROR_UTF8_ERR15 (-55)
#define PCRE2_ERROR_UTF8_ERR16 (-56)
#define PCRE2_ERROR_UTF8_ERR17 (-57)
#define PCRE2_ERROR_UTF8_ERR18 (-58)
#define PCRE2_ERROR_UTF8_ERR19 (-59)
#define PCRE2_ERROR_UTF8_ERR20 (-60)
#define PCRE2_ERROR_UTF8_ERR21 (-61)
/* Error codes for UTF-32 validity checks */
/* Specific error codes for UTF-16 validity checks */
#define PCRE2_ERROR_UTF32_ERR1 (-27)
#define PCRE2_ERROR_UTF32_ERR2 (-28)
#define PCRE2_ERROR_UTF16_ERR1 (-62)
#define PCRE2_ERROR_UTF16_ERR2 (-63)
#define PCRE2_ERROR_UTF16_ERR3 (-64)
/* Error codes for pcre2[_dfa]_match() */
/* Specific error codes for UTF-32 validity checks */
#define PCRE2_ERROR_UTF32_ERR1 (-65)
#define PCRE2_ERROR_UTF32_ERR3 (-66)
#define PCRE2_ERROR_BADCOUNT (-29)
#define PCRE2_ERROR_BADENDIANNESS (-30)
#define PCRE2_ERROR_BADLENGTH (-31)
#define PCRE2_ERROR_BADMAGIC (-32)
#define PCRE2_ERROR_BADMODE (-33)
#define PCRE2_ERROR_BADOFFSET (-34)
#define PCRE2_ERROR_BADOPTION (-35)
#define PCRE2_ERROR_BADUTF_OFFSET (-36)
#define PCRE2_ERROR_CALLOUT (-37) /* Never used by PCRE2 itself */
#define PCRE2_ERROR_DFA_BADRESTART (-38)
#define PCRE2_ERROR_DFA_RECURSE (-39)
#define PCRE2_ERROR_DFA_UCOND (-40)
#define PCRE2_ERROR_DFA_UITEM (-41)
#define PCRE2_ERROR_DFA_UMLIMIT (-42)
#define PCRE2_ERROR_DFA_WSSIZE (-43)
#define PCRE2_ERROR_INTERNAL (-44)
#define PCRE2_ERROR_JIT_BADOPTION (-45)
#define PCRE2_ERROR_JIT_STACKLIMIT (-46)
#define PCRE2_ERROR_MATCHLIMIT (-47)
#define PCRE2_ERROR_NOMEMORY (-48)
#define PCRE2_ERROR_NOSUBSTRING (-49)
#define PCRE2_ERROR_NULL (-50)
#define PCRE2_ERROR_RECURSELOOP (-51)
#define PCRE2_ERROR_RECURSIONLIMIT (-52)
#define PCRE2_ERROR_UNKNOWN_OPCODE (-53)
#define PCRE2_ERROR_UNSET (-54)
/* Request types for pcre2_pattern_info() */

File diff suppressed because it is too large Load Diff

View File

@ -102,7 +102,7 @@ switch (what)
break;
case PCRE2_CONFIG_NEWLINE:
*((int *)where) = NEWLINE;
*((int *)where) = NEWLINE_DEFAULT;
break;
case PCRE2_CONFIG_PARENS_LIMIT:

View File

@ -139,9 +139,9 @@ if (defmemctl)
}
ccontext->stack_guard = NULL;
ccontext->tables = PRIV(default_tables);
ccontext->bsr_convention = PCRE2_BSR_DEFAULT;
ccontext->newline_convention = PCRE2_NEWLINE_DEFAULT;
ccontext->parens_nest_limit = PARENS_NEST_LIMIT;
ccontext->newline_convention = NEWLINE_DEFAULT;
ccontext->bsr_convention = BSR_DEFAULT;
}

View File

@ -51,7 +51,10 @@ POSSIBILITY OF SUCH DAMAGE.
#define STRING(a) # a
#define XSTRING(s) STRING(s)
/* The texts of compile-time error messages. Do not ever re-use any error
/* The texts of compile-time error messages. Compile-time error numbers start
at COMPILE_ERROR_BASE (100).
Do not ever re-use any error
number, because they are documented. Always add a new error instead. Messages
marked DEAD below are no longer used. This used to be a table of strings, but
in order to reduce the number of relocations needed when a shared library is
@ -85,7 +88,7 @@ static const char compile_error_texts[] =
"missing )\0"
/* 15 */
"reference to non-existent subpattern\0"
"erroffset passed as NULL\0"
"pattern or erroffset passed as NULL\0"
"unknown option bit(s) set\0"
"missing ) after comment\0"
"parentheses nested too deeply\0" /** DEAD **/
@ -104,7 +107,7 @@ static const char compile_error_texts[] =
/* 30 */
"unknown POSIX class name\0"
"POSIX collating elements are not supported\0"
"this version of PCRE is compiled without UTF support\0"
"this version of PCRE does not have UTF or Unicode property support\0"
"spare error\0" /** DEAD **/
"character value in \\x{} or \\o{} is too large\0"
/* 35 */
@ -133,7 +136,7 @@ static const char compile_error_texts[] =
"DEFINE group contains more than one branch\0"
/* 55 */
"repeating a DEFINE group is not allowed\0" /** DEAD **/
"inconsistent NEWLINE options\0"
"internal error: unknown newline setting\0"
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
"a numbered reference must not be zero\0"
"an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
@ -171,58 +174,74 @@ static const char compile_error_texts[] =
"parentheses are too deeply nested (stack check)\0"
;
/* Match-time error texts are in the same format. */
/* Match-time and UTF error texts are in the same format. */
/* Texts for match-time and UTF validity errors. The whole table is a single
string literal in which every message is terminated by a binary zero, so a
message is found by counting zero terminators from the start of the table.
The order of the entries therefore matters: never insert or remove a message
without keeping the table in step with the error numbers. The UTF-16 surrogate
range excluded from UTF-8 is 0xd800-0xdfff. */

static const char match_error_texts[] =
  "no error\0"
  "no match\0"
  "partial match\0"
  "UTF-8 error: 1 byte missing at end\0"
  "UTF-8 error: 2 bytes missing at end\0"
  /* 5 */
  "UTF-8 error: 3 bytes missing at end\0"
  "UTF-8 error: 4 bytes missing at end\0"
  "UTF-8 error: 5 bytes missing at end\0"
  "UTF-8 error: byte 2 top bits not 0x80\0"
  "UTF-8 error: byte 3 top bits not 0x80\0"
  /* 10 */
  "UTF-8 error: byte 4 top bits not 0x80\0"
  "UTF-8 error: byte 5 top bits not 0x80\0"
  "UTF-8 error: byte 6 top bits not 0x80\0"
  "UTF-8 error: 5-byte character is not allowed (RFC 3629)\0"
  "UTF-8 error: 6-byte character is not allowed (RFC 3629)\0"
  /* 15 */
  "UTF-8 error: code point > 0x10ffff is not defined\0"
  "UTF-8 error: code points 0xd800-0xdfff are not defined\0"
  "UTF-8 error: overlong 2-byte sequence\0"
  "UTF-8 error: overlong 3-byte sequence\0"
  "UTF-8 error: overlong 4-byte sequence\0"
  /* 20 */
  "UTF-8 error: overlong 5-byte sequence\0"
  "UTF-8 error: overlong 6-byte sequence\0"
  "UTF-8 error: isolated 0x80 byte\0"
  "UTF-8 error: illegal byte (0xfe or 0xff)\0"
  "UTF-16 error: missing low surrogate at end\0"
  /* 25 */
  "UTF-16 error: invalid low surrogate\0"
  "UTF-16 error: isolated low surrogate\0"
  "UTF-32 error: surrogate character not allowed\0"
  "UTF-32 error: code point > 0x10ffff is not defined\0"
  "bad count value\0"
  /* 30 */
  "pattern compiled with other endianness\0"
  "bad length\0"
  /* -5 */
  "magic number missing\0"
  "pattern compiled in wrong mode: 8/16/32-bit error\0"
  "bad offset value\0"
  /* 35 */
  "bad option value\0"
  "bad UTF string\0"
  /* -10 */
  "bad offset into UTF string\0"
  "callout error code\0" /* Never returned by PCRE2 itself */
  "invalid data in workspace for DFA restart\0"
  "too much recursion for DFA matching\0"
  /* 40 */
  "backreference condition or recursion test not supported for DFA matching\0"
  "item unsupported for DFA matching\0"
  "match limit not supported for DFA matching\0"
  "workspace size exceeded in DFA matching\0"
  "internal error - pattern overwritten?\0"
  /* 45 */
  "bad JIT option\0"
  "JIT stack limit reached\0"
  /* -15 */
  "match limit exceeded\0"
  "no more memory\0"
  "unknown substring\0"
  /* 50 */
  "NULL argument passed\0"
  "partial match\0"
  /* -20 */
  "nested recursion at the same subject position\0"
  "recursion limit exceeded\0"
  "unknown opcode - pattern overwritten?\0"
  "value unset\0" /* Used by pcre2_pattern_info() */
  "spare -24\0"
  /* -25 */
  "spare -25\0"
  "spare -26\0"
  "spare -27\0"
  "spare -28\0"
  "spare -29\0"
  /* -30 */
  "invalid data in workspace for DFA restart\0"
  "too much recursion for DFA matching\0"
  "backreference condition or recursion test not supported for DFA matching\0"
  "item unsupported for DFA matching\0"
  "match limit not supported for DFA matching\0"
  /* -35 */
  "workspace size exceeded in DFA matching\0"
  "spare -36\0"
  "spare -37\0"
  "spare -38\0"
  "spare -39\0"
  /* -40 */
  "spare -39\0"
  ;
@ -232,7 +251,8 @@ static const char match_error_texts[] =
/* This function copies an error message into a buffer whose units are of an
appropriate width. Error numbers are positive for compile-time errors, and
negative for exec-time errors.
negative for match-time errors (except for UTF errors), but the numbers are all
distinct.
Arguments:
enumber error number
@ -253,16 +273,15 @@ uint32_t n;
if (size == 0) return PCRE2_ERROR_NOMEMORY;
if (enumber > 0) /* Compile-time error */
if (enumber > COMPILE_ERROR_BASE) /* Compile error */
{
message = compile_error_texts;
n = enumber;
n = enumber - COMPILE_ERROR_BASE;
}
else /* Match-time error */
else /* Match or UTF error */
{
message = match_error_texts;
n = -enumber;
n = -enumber;
}
for (; n > 0; n--)

View File

@ -40,78 +40,235 @@ POSSIBILITY OF SUCH DAMAGE.
/* FIXME: this file is incomplete, being gradually built. */
/* We do not support both EBCDIC and UTF at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */
#if defined EBCDIC && defined SUPPORT_UTF
#error The use of both EBCDIC and SUPPORT_UTF is not supported.
#endif
/* Standard C headers */
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pcre2.h"
#include "pcre2_ucp.h"
#define PUBL(name) pcre2_##name
#ifndef PRIV
#define PRIV(name) _pcre2_##name
#endif
#define PCRE2_CALL_CONVENTION
extern const uint8_t PRIV(default_tables)[];
/* Macros to make boolean values more obvious. The #ifndef is to pacify
compiler warnings in environments where these macros are defined elsewhere.
Unfortunately, there is no way to do the same for the typedef. */
typedef int BOOL;
#ifndef FALSE
#define FALSE 0
#define TRUE 1
#endif
/* Valgrind (memcheck) support */
#ifdef SUPPORT_VALGRIND
#include <valgrind/memcheck.h>
#endif
/* This is an unsigned int value that no character can ever have, as
/* When compiling a DLL for Windows, the exported symbols have to be declared
using some MS magic. I found some useful information on this web page:
http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
information there, using __declspec(dllexport) without "extern" we have a
definition; with "extern" we have a declaration. The settings here override the
setting in pcre2.h (which is included below); it defines only PCRE2_EXP_DECL,
which is all that is needed for applications (they just import the symbols). We
use:
PCRE2_EXP_DECL for declarations
PCRE2_EXP_DEFN for definitions of exported functions
PCRE2_EXP_DATA_DEFN for definitions of exported variables
The reason for the two DEFN macros is that in non-Windows environments, one
does not want to have "extern" before variable definitions because it leads to
compiler warnings. So we distinguish between functions and variables. In
Windows, the two should always be the same.
The reason for wrapping this in #ifndef PCRE2_EXP_DECL is so that pcre2test,
which is an application, but needs to import this file in order to "peek" at
internals, can #include pcre2.h first to get an application's-eye view.
In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
special-purpose environments) might want to stick other stuff in front of
exported symbols. That's why, in the non-Windows case, we set PCRE2_EXP_DEFN and
PCRE2_EXP_DATA_DEFN only if they are not already set. */
/* Select the export decorations, unless PCRE2_EXP_DECL has already been
defined before this point. */
#ifndef PCRE2_EXP_DECL
# ifdef _WIN32
/* Windows: a DLL build exports with __declspec; a static build needs none. */
# ifndef PCRE2_STATIC
# define PCRE2_EXP_DECL extern __declspec(dllexport)
# define PCRE2_EXP_DEFN __declspec(dllexport)
# define PCRE2_EXP_DATA_DEFN __declspec(dllexport)
# else /* PCRE2_STATIC */
# define PCRE2_EXP_DECL extern
# define PCRE2_EXP_DEFN
# define PCRE2_EXP_DATA_DEFN
# endif /* PCRE2_STATIC */
# else /* not _WIN32 */
# ifdef __cplusplus
# define PCRE2_EXP_DECL extern "C"
# else /* not __cplusplus */
# define PCRE2_EXP_DECL extern
# endif /* __cplusplus */
/* The two DEFN macros are set only if not already defined, so that they can
be overridden from outside. */
# ifndef PCRE2_EXP_DEFN
# define PCRE2_EXP_DEFN PCRE2_EXP_DECL
# endif /* PCRE2_EXP_DEFN */
# ifndef PCRE2_EXP_DATA_DEFN
# define PCRE2_EXP_DATA_DEFN
# endif /* PCRE2_EXP_DATA_DEFN */
# endif /* _WIN32 */
#endif /* PCRE2_EXP_DECL */
/* Include the public PCRE2 header and the definitions of UCP character
property values. This must follow the setting of PCRE2_EXP_DECL above. */
#include "pcre2.h"
#include "pcre2_ucp.h"
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
with a custom type. This makes it possible, for example, to allow pcre2_match()
to process subject strings that are discontinuous by using a smart pointer
class. It must always be possible to inspect all of the subject string in
pcre2_match() because of the way it backtracks. */
/* WARNING: This is as yet untested for PCRE2. */
#ifdef CUSTOM_SUBJECT_PTR
#undef PCRE2_SPTR
#define PCRE2_SPTR CUSTOM_SUBJECT_PTR
#endif
/* When compiling with the MSVC compiler, it is sometimes necessary to include
a "calling convention" before exported function names. (This is secondhand
information; I know nothing about MSVC myself). For example, something like
void __cdecl function(....)
might be needed. In order to make this easy, all the exported functions have
PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
set, we ensure here that it has no effect. */
#ifndef PCRE2_CALL_CONVENTION
#define PCRE2_CALL_CONVENTION
#endif
/* When checking for integer overflow in pcre2_compile(), we need to handle
large integers. If a 64-bit integer type is available, we can use that.
Otherwise we have to cast to double, which of course requires floating point
arithmetic. Handle this by defining a macro for the appropriate type. If
stdint.h is available, include it; it may define INT64_MAX. Systems that do not
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
by "configure". */
#if defined HAVE_STDINT_H
#include <stdint.h>
#elif defined HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#if defined INT64_MAX || defined int64_t
#define INT64_OR_DOUBLE int64_t
#else
#define INT64_OR_DOUBLE double
#endif
/* When compiling for use with the Virtual Pascal compiler, these functions
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
option on the command line. */
#ifdef VPCOMPAT
#define strlen(s) _strlen(s)
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcmp(s,c,n) _memcmp(s,c,n)
#define memcpy(d,s,n) _memcpy(d,s,n)
#define memmove(d,s,n) _memmove(d,s,n)
#define memset(s,c,n) _memset(s,c,n)
#else /* VPCOMPAT */
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there are some non-Unix environments where this is the case). */
#ifndef HAVE_MEMMOVE
#undef memmove /* some systems may have a macro */
#ifdef HAVE_BCOPY
#define memmove(a, b, c) bcopy(b, a, c)
#else /* HAVE_BCOPY */
/* Emulation of memmove() for systems that have neither memmove() nor bcopy().
Copies n bytes from s to d, coping with overlapping regions by choosing the
copy direction from the relative position of the two pointers. Returns d. */

static void *
pcre_memmove(void *d, const void *s, size_t n)
{
unsigned char *to = (unsigned char *)d;
const unsigned char *from = (const unsigned char *)s;
size_t k;

if (to > from)
  {
  /* Destination is above the source: copy from the top down so that bytes
  still to be read are not overwritten. */
  for (k = n; k > 0; k--) to[k - 1] = from[k - 1];
  }
else
  {
  /* Destination is at or below the source: a plain ascending copy is safe. */
  for (k = 0; k < n; k++) to[k] = from[k];
  }

return d;
}
#define memmove(a, b, c) pcre_memmove(a, b, c)
#endif /* not HAVE_BCOPY */
#endif /* not HAVE_MEMMOVE */
#endif /* not VPCOMPAT */
/* External (in the C sense) functions and macros that are private to the
libraries are always referenced using the PRIV macro. This makes it possible
for pcre2test.c to include some of the source files from the libraries using a
different PRIV definition to avoid name clashes. */
#ifndef PRIV
#define PRIV(name) _pcre2_##name
#endif
/* This is an unsigned int value that no UTF character can ever have, as
Unicode doesn't go beyond 0x0010ffff. */
#define NOTACHAR 0xffffffff
/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them. */
/* Compile-time errors are added to this value. As they are documented, it
should probably never be changed. */
#ifndef SUPPORT_UTF
#define COMPILE_ERROR_BASE 100
/* #define MAX_VALUE_FOR_SINGLE_CHAR */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCHAR(c) */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define ACROSSCHAR(condition, eptr, action) */
/* Define the default BSR convention. */
#else /* SUPPORT_UTF */
#ifdef BSR_ANYCRLF
#define BSR_DEFAULT PCRE2_BSR_ANYCRLF
#else
#define BSR_DEFAULT PCRE2_BSR_UNICODE
#endif
/* ---------------- Basic UTF-8 macros ---------------- */
/* These UTF-8 macros are always defined because they are used in pcre2test for
handling wide characters in 16-bit and 32-bit modes, even if an 8-bit library
is not supported. */
/* Tests whether a UTF-8 code point needs extra bytes to decode. */
#define HASUTF8EXTRALEN(c) ((c) >= 0xc0)
/* The following macros were originally written in the form of loops that used
data from the tables whose names start with PRIV(utf8_table). They were
rewritten by a user so as not to use loops, because in some environments this
gives a significant performance advantage, and it seems never to do any harm.
*/
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */
@ -168,8 +325,44 @@ the pointer. */
} \
}
#endif /* SUPPORT_UTF */
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. On entry, c must already hold
the first byte of the character (the byte at eptr[0]); GETCHARLEN and
GETCHARLENTEST invoke this macro only when that byte is >= 0xc0. The bits of the
first byte select a 2-, 3-, 4-, 5- or 6-byte form. On exit, c holds the decoded
value and len has been increased by the number of additional bytes (1 to 5).
eptr itself is not modified. The additional bytes are simply masked with 0x3f;
no validity checking is done here. */
#define GETUTF8LEN(c, eptr, len) \
{ \
if ((c & 0x20) == 0) \
{ \
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
len++; \
} \
else if ((c & 0x10) == 0) \
{ \
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
len += 2; \
} \
else if ((c & 0x08) == 0) \
{\
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
len += 3; \
} \
else if ((c & 0x04) == 0) \
{ \
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
(eptr[4] & 0x3f); \
len += 4; \
} \
else \
{\
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
len += 5; \
} \
}
/* --------------- Whitespace macros ---------------- */
/* Tests for Unicode horizontal and vertical whitespace characters must check a
number of different values. Using a switch statement for this generates the
@ -187,7 +380,7 @@ NOTACHAR (which is 0xffffffff).
Any changes should ensure that the various macros are kept in step with each
other. NOTE: The values also appear in pcre2_jit_compile.c. */
/* ------ ASCII/Unicode environments ------ */
/* -------------- ASCII/Unicode environments -------------- */
#ifndef EBCDIC
@ -242,7 +435,7 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */
VSPACE_BYTE_CASES: \
VSPACE_MULTIBYTE_CASES
/* ------ EBCDIC environments ------ */
/* -------------- EBCDIC environments -------------- */
#else
#define HSPACE_LIST CHAR_HT, CHAR_SPACE
@ -271,9 +464,47 @@ other. NOTE: The values also appear in pcre2_jit_compile.c. */
#define VSPACE_CASES VSPACE_BYTE_CASES
#endif /* EBCDIC */
/* ------ End of whitespace macros ------ */
/* -------------- End of whitespace macros -------------- */
/* PCRE2 is able to support several different kinds of newline (CR, LF, CRLF,
"any" and "anycrlf" at present). The following macros are used to package up
testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
modules to indicate in which datablock the parameters exist, and what the
start/end of string field names are. */
#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
#define NLTYPE_ANYCRLF 2 /* Newline is CR, LF, or CRLF */
/* This macro checks for a newline at the given position */
#define IS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) < NLBLOCK->PSEND && \
PRIV(is_newline)((p), NLBLOCK->nltype, NLBLOCK->PSEND, \
&(NLBLOCK->nllen), utf)) \
: \
((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
UCHAR21TEST(p) == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || UCHAR21TEST(p+1) == NLBLOCK->nl[1]) \
) \
)
/* This macro checks for a newline immediately preceding the given position */
#define WAS_NEWLINE(p) \
((NLBLOCK->nltype != NLTYPE_FIXED)? \
((p) > NLBLOCK->PSSTART && \
PRIV(was_newline)((p), NLBLOCK->nltype, NLBLOCK->PSSTART, \
&(NLBLOCK->nllen), utf)) \
: \
((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
UCHAR21TEST(p - NLBLOCK->nllen) == NLBLOCK->nl[0] && \
(NLBLOCK->nllen == 1 || UCHAR21TEST(p - NLBLOCK->nllen + 1) == NLBLOCK->nl[1]) \
) \
)
/* Private flags containing information about the compiled pattern. The first
three must not be changed, because whichever is set is actually the number of
bytes in a code unit in that mode. */
@ -296,16 +527,55 @@ bytes in a code unit in that mode. */
#define PCRE2_MODE_MASK (PCRE2_MODE8 | PCRE2_MODE16 | PCRE2_MODE32)
/* Magic number to provide a small check against being handed junk. */
#define MAGIC_NUMBER 0x50435245UL /* 'PCRE' */
/* This variable is used to detect a loaded regular expression
in different endianness. */
/* This value is used to detect a loaded regular expression in different
endianness. */
#define REVERSED_MAGIC_NUMBER 0x45524350UL /* 'ERCP' */
/* The maximum remaining length of subject we are prepared to search for a
req_unit match. */
#define REQ_UNIT_MAX 1000
/* Bit definitions for entries in the pcre_ctypes table. */
#define ctype_space 0x01
#define ctype_letter 0x02
#define ctype_digit 0x04
#define ctype_xdigit 0x08
#define ctype_word 0x10 /* alphanumeric or '_' */
#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
of bits for a class map. Some classes are built by combining these tables. */
#define cbit_space 0 /* [:space:] or \s */
#define cbit_xdigit 32 /* [:xdigit:] */
#define cbit_digit 64 /* [:digit:] or \d */
#define cbit_upper 96 /* [:upper:] */
#define cbit_lower 128 /* [:lower:] */
#define cbit_word 160 /* [:word:] or \w */
#define cbit_graph 192 /* [:graph:] */
#define cbit_print 224 /* [:print:] */
#define cbit_punct 256 /* [:punct:] */
#define cbit_cntrl 288 /* [:cntrl:] */
#define cbit_length 320 /* Length of the cbits table */
/* Offsets of the various tables from the base tables pointer, and
total length. */
#define lcc_offset 0
#define fcc_offset 256
#define cbits_offset 512
#define ctypes_offset (cbits_offset + cbit_length)
#define tables_length (ctypes_offset + 256)
/* -------------------- Character and string names ------------------------ */
@ -1432,6 +1702,17 @@ typedef struct pcre2_memctl {
void *memory_data;
} pcre2_memctl;
/* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
encountered. This is also used to identify subpatterns that contain recursive
back references to themselves, so that they can be made atomic. */
typedef struct open_capitem {
struct open_capitem *next; /* Chain link */
uint16_t number; /* Capture number */
uint16_t flag; /* Set TRUE if recursive back ref */
} open_capitem;
/* Layout of the UCP type table that translates property names into types and
codes. Each entry used to point directly to a name, but to reduce the number of
relocations in shared libraries, it now has an offset into a single string
@ -1481,13 +1762,52 @@ extern const int PRIV(ucp_typerange)[];
/* ----------------- Items that need PCRE2_CODE_UNIT_WIDTH ----------------- */
/* When this file is included by pcre2test, PCRE2_CODE_UNIT_WIDTH is not
defined, so the following items are omitted. */
#ifdef PCRE2_CODE_UNIT_WIDTH
/* Mode-dependent macros and private structures are defined in a separate file.
When compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we
include them at the appropriate width. When compiling pcre2test, however, that
macro is not set at this point because pcre2test needs to include them at all
supported widths. */
/* This is the largest non-UTF code point. */
#define MAX_NON_UTF_CHAR (0xffffffffU >> (32 - PCRE2_CODE_UNIT_WIDTH))
/* Internal shared data tables. These are tables that are used by more than one
of the exported public functions. They have to be "external" in the C sense,
but are not part of the PCRE2 public API. The data for these tables is in the
pcre2_tables.c module. Even though some of them are identical in each library,
they must have different names so that more than one library can be linked with
an application. UTF-8 tables are needed only when compiling the 8-bit library.
*/
#if PCRE2_CODE_UNIT_WIDTH == 8
extern const int PRIV(utf8_table1)[];
extern const int PRIV(utf8_table1_size);
extern const int PRIV(utf8_table2)[];
extern const int PRIV(utf8_table3)[];
extern const uint8_t PRIV(utf8_table4)[];
#endif
extern const uint8_t PRIV(default_tables)[];
extern const uint8_t PRIV(OP_lengths)[];
extern const uint32_t PRIV(hspace_list)[];
extern const uint32_t PRIV(vspace_list)[];
extern const ucp_type_table PRIV(utt)[];
extern const char PRIV(utt_names)[];
extern const size_t PRIV(utt_size);
/* Mode-dependent macros and hidden and private structures are defined in a
separate file so that pcre2test can include them at all supported widths. When
compiling the library, PCRE2_CODE_UNIT_WIDTH will be defined, and we can
include them at the appropriate width, after setting up suffix macros for the
private structures. */
#define compile_data PCRE2_SUFFIX(compile_data_)
#define branch_chain PCRE2_SUFFIX(branch_chain_)
#define named_group PCRE2_SUFFIX(named_group_)
#include "pcre2_intmodedep.h"
@ -1498,14 +1818,32 @@ from pcre2test, and must not be defined when no code unit width is available.
*/
/* Map the PRIV() names of the mode-dependent private functions onto external
symbols that carry a code-unit-width suffix. Every symbol uses the same _pcre2_
prefix as the macro that names it, so that the private symbols of the 8-bit,
16-bit and 32-bit libraries are uniformly named and distinct from each other. */

#define _pcre2_compile_context_init  PCRE2_SUFFIX(_pcre2_compile_context_init_)
#define _pcre2_find_bracket          PCRE2_SUFFIX(_pcre2_find_bracket_)
#define _pcre2_is_newline            PCRE2_SUFFIX(_pcre2_is_newline_)
#define _pcre2_match_context_init    PCRE2_SUFFIX(_pcre2_match_context_init_)
#define _pcre2_memctl_malloc         PCRE2_SUFFIX(_pcre2_memctl_malloc_)
#define _pcre2_ord2utf               PCRE2_SUFFIX(_pcre2_ord2utf_)
#define _pcre2_strcmp                PCRE2_SUFFIX(_pcre2_strcmp_)
#define _pcre2_strcmp_c8             PCRE2_SUFFIX(_pcre2_strcmp_c8_)
#define _pcre2_strlen                PCRE2_SUFFIX(_pcre2_strlen_)
#define _pcre2_strncmp               PCRE2_SUFFIX(_pcre2_strncmp_)
#define _pcre2_strncmp_c8            PCRE2_SUFFIX(_pcre2_strncmp_c8_)
#define _pcre2_valid_utf             PCRE2_SUFFIX(_pcre2_valid_utf_)
#define _pcre2_was_newline           PCRE2_SUFFIX(_pcre2_was_newline_)
extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
extern void _pcre2_match_context_init(pcre2_match_context *, BOOL);
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_memctl *);
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
#endif
extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL);
extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
extern BOOL _pcre2_is_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
extern void _pcre2_match_context_init(pcre2_match_context *, BOOL);
extern void *_pcre2_memctl_malloc(size_t, size_t, pcre2_memctl *);
extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *);
extern int _pcre2_strcmp(PCRE2_SPTR, PCRE2_SPTR);
extern int _pcre2_strcmp_c8(PCRE2_SPTR, const char *);
extern int _pcre2_strlen(PCRE2_SPTR);
extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t);
extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t);
extern int _pcre2_valid_utf(PCRE2_SPTR, int, size_t *);
extern BOOL _pcre2_was_newline(PCRE2_SPTR, int, PCRE2_SPTR, int *, BOOL);
#endif /* PCRE2_CODE_UNIT_WIDTH */
/* End of pcre2_internal.h */

View File

@ -42,21 +42,44 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test. Start by
undefining all the new macros defined herein so that they can be redefined for
multiple inclusions. */
#included multiple times for different code unit widths by pcre2test in order
to have access to the hidden structures at all supported widths.
Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre_printint.c file). We undefine them here so that they can be re-defined for
multiple inclusions. Not all of these are used in pcretest, but it's easier
just to undefine them all. */
#undef ACROSSCHAR
#undef BACKCHAR
#undef CU2BYTES
#undef FORWARDCHAR
#undef GET
#undef GET2
#undef GETCHAR
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef GETCHARTEST
#undef GET_EXTRALEN
#undef HAS_EXTRALEN
#undef IMM2_SIZE
#undef MAX_255
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCHAR
#undef PUT
#undef PUT2
#undef PUT2INC
#undef PUTCHAR
#undef PUTINC
/* ---------------------------MACROS ----------------------------- */
/* -------------------------- MACROS ----------------------------- */
/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
(always stored in big-endian order in 8-bit mode) by default. These are used,
@ -70,7 +93,6 @@ unit string is now handled by the macros that are defined here.
The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
values of 2 or 4 are also supported. */
/* ------------------- 8-bit support ------------------ */
#if PCRE2_CODE_UNIT_WIDTH == 8
@ -150,8 +172,8 @@ values of 2 or 4 are also supported. */
#error Unsupported compiling mode
#endif
/* -------------------------------------------------------*/
/* --------------- Other mode-specific macros ----------------- */
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
@ -166,7 +188,7 @@ arithmetic results in a signed value. Hence the cast. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define IMM2_SIZE 2
#define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1])
#define PUT2(a,n,d) { a[n] = (d) >> 8; a[(n)+1] = (d) & 255; }
#define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255
#else /* Code units are 16 or 32 bits */
#define IMM2_SIZE 1
@ -174,14 +196,338 @@ arithmetic results in a signed value. Hence the cast. */
#define PUT2(a,n,d) a[n] = d
#endif
/* Other macros that are different for 8-bit mode. The maximum length of a MARK
name must fit in one code unit; currently it is set to 255 or 65535. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAX_255(c) TRUE
#define MAX_MARK ((1u << 8) - 1)
#ifdef SUPPORT_UTF
#define SUPPORT_WIDE_CHARS
#endif /* SUPPORT_UTF */
#else /* Code units are 16 or 32 bits */
#define MAX_255(c) ((c) <= 255u)
#define MAX_MARK ((1u << 16) - 1)
#define SUPPORT_WIDE_CHARS
#endif
/* ----------------- Character-handling macros ----------------- */
/* There is a proposed future special "UTF-21" mode, in which only the lowest
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
high-order bits available to the application for other uses. In preparation for
the future implementation of this mode, there are macros that load a data item
and, if in this special mode, mask it to 21 bits. These macros all have names
starting with UCHAR21. In all other modes, including the normal 32-bit
library, the macros all have the same simple definitions. When the new mode is
implemented, it is expected that these definitions will be varied appropriately
using #ifdef when compiling the library that supports the special mode. */
#define UCHAR21(eptr) (*(eptr))
#define UCHAR21TEST(eptr) (*(eptr))
#define UCHAR21INC(eptr) (*(eptr)++)
#define UCHAR21INCTEST(eptr) (*(eptr)++)
/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF
/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCHAR(c) */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define PUTCHAR(c, p) (*p = c, 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define ACROSSCHAR(condition, eptr, action) */
#else /* SUPPORT_UTF */
/* ------------------- 8-bit support ------------------ */
#if PCRE2_CODE_UNIT_WIDTH == 8
/* The largest UTF code point that can be encoded as a single code unit. */
#define MAX_UTF_SINGLE_CU 127
/* Tests whether the code point needs extra characters to decode. */
#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)
/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3f])
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
#define NOT_FIRSTCHAR(c) (((c) & 0xc0) == 0x80)
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
if (c >= 0xc0) GETUTF8(c, eptr);
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
if (utf && c >= 0xc0) GETUTF8(c, eptr);
/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if (c >= 0xc0) GETUTF8INC(c, eptr);
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
if (utf && c >= 0xc0) GETUTF8INC(c, eptr);
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
if (utf && c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr++
/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) \
while((condition) && ((eptr) & 0xc0) == 0x80) action
/* Deposit a character into memory, returning the number of code units. */
#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
PRIV(ord2utf)(c,p) : (*p = c, 1))
/* ------------------- 16-bit support ------------------ */
#elif PCRE2_CODE_UNIT_WIDTH == 16
/* The largest UTF code point that can be encoded as a single code unit. */
#define MAX_UTF_SINGLE_CU 65535
/* Tests whether the code point needs extra characters to decode. */
#define HAS_EXTRALEN(c) (((c) & 0xfc00) == 0xd800)
/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. */
#define GET_EXTRALEN(c) 1
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
#define NOT_FIRSTCHAR(c) (((c) & 0xfc00) == 0xdc00)
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */
#define GETUTF16(c, eptr) \
{ c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; }
/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
if ((c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
if (utf && (c & 0xfc00) == 0xd800) GETUTF16(c, eptr);
/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */
#define GETUTF16INC(c, eptr) \
{ c = (((c & 0x3ff) << 10) | (*eptr++ & 0x3ff)) + 0x10000; }
/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
if (utf && (c & 0xfc00) == 0xd800) GETUTF16INC(c, eptr);
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */
#define GETUTF16LEN(c, eptr, len) \
{ c = (((c & 0x3ff) << 10) | (eptr[1] & 0x3ff)) + 0x10000; len++; }
/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
if ((c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
if (utf && (c & 0xfc00) == 0xd800) GETUTF16LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code. */
#define BACKCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr--
/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00) == 0xdc00) eptr++
/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) \
if ((condition) && ((eptr) & 0xfc00) == 0xdc00) action
/* Deposit a character into memory, returning the number of code units. */
#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
PRIV(ord2utf)(c,p) : (*p = c, 1))
/* ------------------- 32-bit support ------------------ */
#else
/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE_UCHAR unit. */
#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCHAR(c) (0)
/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */
#define GETCHAR(c, eptr) \
c = *(eptr);
/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *(eptr);
/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */
#define GETCHARINC(c, eptr) \
c = *((eptr)++);
/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *((eptr)++);
/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */
#define GETCHARLEN(c, eptr, len) \
GETCHAR(c, eptr)
/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */
#define GETCHARLENTEST(c, eptr, len) \
GETCHARTEST(c, eptr)
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.
These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
#define BACKCHAR(eptr) do { } while (0)
/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) do { } while (0)
/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) do { } while (0)
/* Deposit a character into memory, returning the number of code units. */
#define PUTCHAR(c, p) (*p = c, 1)
#endif /* UTF-32 character handling */
#endif /* SUPPORT_UTF */
/* Mode-dependent macros that have the same definition in all modes. */
/* Convert a count of code units into a count of bytes. The whole expansion is
parenthesized so that the macro can be used as an operand of a larger
expression (for example as a divisor) without precedence surprises. */
#define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE
#define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE
/* --------------------------- STRUCTURES ----------------------------- */
/* ----------------------- HIDDEN STRUCTURES ----------------------------- */
/* The real general context structure. At present it holds only data for custom
memory control. */
@ -195,7 +541,7 @@ typedef struct pcre2_real_general_context {
typedef struct pcre2_real_compile_context {
pcre2_memctl memctl;
int (*stack_guard)(uint32_t);
const unsigned char *tables;
const uint8_t *tables;
uint16_t bsr_convention;
uint16_t newline_convention;
uint32_t parens_nest_limit;
@ -217,11 +563,12 @@ typedef struct pcre2_real_match_context {
/* The real compiled code structure */
typedef struct pcre2_real_code {
pcre2_memctl memctl;
pcre2_memctl memctl; /* Memory control fields */
const uint8_t *tables; /* The character tables */
void *executable_jit; /* Pointer to JIT code */
uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */
size_t blocksize; /* Total (bytes) that was malloc-ed */
uint32_t magic_number; /* Paranoid and endianness check */
uint32_t size; /* Total (bytes) that was malloc-ed */
uint32_t compile_options; /* Options passed to pcre2_compile() */
uint32_t pattern_options; /* Options taken from the pattern */
uint32_t flags; /* Various state flags */
@ -239,14 +586,13 @@ typedef struct pcre2_real_code {
uint16_t name_count; /* Number of name entries in the table */
} pcre2_real_code;
/* The reat match data structure. */
/* The real match data structure. */
typedef struct pcre2_real_match_data {
pcre2_memctl memctl;
const pcre2_real_code *code; /* The pattern used for the match */
PCRE2_SPTR subject; /* The subject that was matched */
int rc; /* The return code from the match */
int utf_reason; /* Reason code for bad UTF */
size_t leftchar; /* Offset to leftmost code unit */
size_t rightchar; /* Offset to rightmost code unit */
size_t startchar; /* Offset to starting code unit */
@ -255,4 +601,71 @@ typedef struct pcre2_real_match_data {
size_t ovector[1]; /* The first field */
} pcre2_real_match_data;
/* ----------------------- PRIVATE STRUCTURES ----------------------------- */
/* These structures are not needed for pcre2test. */
#ifndef PCRE2_PCRE2TEST
/* Structure for maintaining a chain of pointers to the currently incomplete
branches, for testing for left recursion while compiling. */
typedef struct branch_chain {
struct branch_chain *outer;
PCRE2_UCHAR *current_branch;
} branch_chain;
/* Structure for building a list of named groups during the first pass of
compiling. */
typedef struct named_group {
PCRE2_SPTR name; /* Points to the name in the pattern */
int length; /* Length of the name */
uint32_t number; /* Group number */
} named_group;
/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. The state that would
otherwise be held in static variables is collected here and handed explicitly
to each function. The fields cover the character tables, the pattern and
workspace pointers, bookkeeping for named and numbered groups, nesting depths,
option and flag bits, and the newline convention.

NOTE(review): bracount is unsigned while final_bracount and top_backref are
plain int; confirm that the mixed signedness is intended before comparing
them directly. */
typedef struct compile_data {
pcre2_real_compile_context *cx; /* Points to the compile context */
const uint8_t *lcc; /* Points to lower casing table */
const uint8_t *fcc; /* Points to case-flipping table */
const uint8_t *cbits; /* Points to character type table */
const uint8_t *ctypes; /* Points to table of type maps */
PCRE2_SPTR start_workspace; /* The start of working space */
PCRE2_SPTR start_code; /* The start of the compiled code */
PCRE2_SPTR start_pattern; /* The start of the pattern */
PCRE2_SPTR end_pattern; /* The end of the pattern */
PCRE2_UCHAR *hwm; /* High watermark of workspace */
open_capitem *open_caps; /* Chain of open capture items */
named_group *named_groups; /* Points to vector in pre-compile */
PCRE2_UCHAR *name_table; /* The name/number table */
int names_found; /* Number of entries so far */
int name_entry_size; /* Size of each entry */
int named_group_list_size; /* Number of entries in the list */
int workspace_size; /* Size of workspace */
unsigned int bracount; /* Count of capturing parens as we compile */
int final_bracount; /* Saved value after first pass */
int max_lookbehind; /* Maximum lookbehind (characters) */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
unsigned int namedrefcount; /* Number of backreferences by name */
int parens_depth; /* Depth of nested parentheses */
int assert_depth; /* Depth of nested assertions */
uint32_t external_options; /* External (initial) options */
uint32_t external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL had_accept; /* (*ACCEPT) encountered */
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
BOOL dupnames; /* Duplicate names exist */
int nltype; /* Newline type */
int nllen; /* Newline string length */
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
} compile_data;
#endif /* PCRE2_PCRE2TEST */
/* End of pcre2_intmodedep.h */

213
src/pcre2_newline.c Normal file
View File

@ -0,0 +1,213 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains internal functions for testing newlines when more than
one kind of newline is to be recognized. When a newline is found, its length is
returned. In principle, we could implement several newline "types", each
referring to a different set of newline characters. At present, PCRE2 supports
only NLTYPE_FIXED, which gets handled without these functions, NLTYPE_ANYCRLF,
and NLTYPE_ANY. The full list of Unicode newline characters is taken from
http://unicode.org/unicode/reports/tr18/. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
/*************************************************
* Check for newline at given position *
*************************************************/
/* It is guaranteed that the initial value of ptr is less than the end of the
string that is being processed.
Arguments:
ptr pointer to possible newline
type the newline type
endptr pointer to the end of the string
lenptr where to return the length
utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
PRIV(is_newline)(PCRE2_SPTR ptr, int type, PCRE2_SPTR endptr, int *lenptr,
  BOOL utf)
{
uint32_t ch;

/* Pick up the character at ptr. In UTF mode the GETCHAR macro is used;
otherwise a single code unit is read directly. */

#ifdef SUPPORT_UTF
if (utf)
  {
  GETCHAR(ch, ptr);
  }
else
  {
  ch = *ptr;
  }
#else
(void)utf;
ch = *ptr;
#endif  /* SUPPORT_UTF */

/* Note that this function is called only for ANY or ANYCRLF. LF, and CR
optionally followed by LF, are treated identically for both types, so they
are dealt with first. */

if (ch == CHAR_LF)
  {
  *lenptr = 1;
  return TRUE;
  }

if (ch == CHAR_CR)
  {
  if (ptr < endptr - 1 && ptr[1] == CHAR_LF) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

/* For ANYCRLF nothing else counts as a newline. */

if (type == NLTYPE_ANYCRLF) return FALSE;

/* NLTYPE_ANY: check the remaining newline characters. */

switch(ch)
  {
#ifdef EBCDIC
  case CHAR_NEL:
#endif
  case CHAR_VT:
  case CHAR_FF:
  *lenptr = 1;
  return TRUE;

#ifndef EBCDIC
#if PCRE2_CODE_UNIT_WIDTH == 8
  case CHAR_NEL:
  *lenptr = utf? 2 : 1;
  return TRUE;

  case 0x2028:   /* LS */
  case 0x2029:   /* PS */
  *lenptr = 3;
  return TRUE;

#else  /* 16-bit or 32-bit code units */
  case CHAR_NEL:
  case 0x2028:   /* LS */
  case 0x2029:   /* PS */
  *lenptr = 1;
  return TRUE;
#endif
#endif  /* Not EBCDIC */

  default:
  return FALSE;
  }
}
/*************************************************
* Check for newline at previous position *
*************************************************/
/* It is guaranteed that the initial value of ptr is greater than the start of
the string that is being processed.
Arguments:
ptr pointer to possible newline
type the newline type
startptr pointer to the start of the string
lenptr where to return the length
utf TRUE if in utf mode
Returns: TRUE or FALSE
*/
BOOL
PRIV(was_newline)(PCRE2_SPTR ptr, int type, PCRE2_SPTR startptr, int *lenptr,
  BOOL utf)
{
uint32_t prev;

/* Step back to the preceding code unit, then pick up the character there. In
UTF mode BACKCHAR is applied before GETCHAR reads the character. */

ptr--;

#ifdef SUPPORT_UTF
if (utf)
  {
  BACKCHAR(ptr);
  GETCHAR(prev, ptr);
  }
else
  {
  prev = *ptr;
  }
#else
(void)utf;
prev = *ptr;
#endif  /* SUPPORT_UTF */

/* Note that this function is called only for ANY or ANYCRLF. LF (possibly
preceded by CR) and CR are treated identically for both types, so they are
dealt with first. */

if (prev == CHAR_LF)
  {
  if (ptr > startptr && ptr[-1] == CHAR_CR) *lenptr = 2; else *lenptr = 1;
  return TRUE;
  }

if (prev == CHAR_CR)
  {
  *lenptr = 1;
  return TRUE;
  }

/* For ANYCRLF nothing else counts as a newline. */

if (type == NLTYPE_ANYCRLF) return FALSE;

/* NLTYPE_ANY: check the remaining newline characters. */

switch(prev)
  {
#ifdef EBCDIC
  case CHAR_NEL:
#endif
  case CHAR_VT:
  case CHAR_FF:
  *lenptr = 1;
  return TRUE;

#ifndef EBCDIC
#if PCRE2_CODE_UNIT_WIDTH == 8
  case CHAR_NEL:
  *lenptr = utf? 2 : 1;
  return TRUE;

  case 0x2028:   /* LS */
  case 0x2029:   /* PS */
  *lenptr = 3;
  return TRUE;

#else  /* 16-bit or 32-bit code units */
  case CHAR_NEL:
  case 0x2028:   /* LS */
  case 0x2029:   /* PS */
  *lenptr = 1;
  return TRUE;
#endif
#endif  /* Not EBCDIC */

  default:
  return FALSE;
  }
}
/* End of pcre2_newline.c */

119
src/pcre2_ord2utf.c Normal file
View File

@ -0,0 +1,119 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This file contains a function that converts a Unicode character code point
into a UTF string. The behaviour is different for each code unit width. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
/* If SUPPORT_UTF is not defined, this function will never be called. Supply a
dummy function because some compilers do not like empty source modules. */
#ifndef SUPPORT_UTF
unsigned int
PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
{
/* Placeholder used when SUPPORT_UTF is not defined: nothing is written and
zero code units are reported. The arguments are referenced only to avoid
unused-parameter warnings. */
(void)buffer;
(void)cvalue;
return 0;
}
#else /* SUPPORT_UTF */
/*************************************************
* Convert code point to UTF *
*************************************************/
/*
Arguments:
cvalue the character value
buffer pointer to buffer for result
Returns: number of code units placed in the buffer
*/
unsigned int
PRIV(ord2utf)(uint32_t cvalue, PCRE2_UCHAR *buffer)
{
/* Convert to UTF-8 */

#if PCRE2_CODE_UNIT_WIDTH == 8
int extra = 0;
int k;

/* Find the first breakpoint in utf8_table1 that is not below the value; its
index is the number of bytes that follow the first byte. */

while (extra < PRIV(utf8_table1_size) &&
       (int)cvalue > PRIV(utf8_table1)[extra])
  extra++;

/* Fill in the trailing bytes from last to first, six bits at a time, then
combine what remains with the first-byte marker from utf8_table2. */

for (k = extra; k > 0; k--)
  {
  buffer[k] = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }
buffer[0] = PRIV(utf8_table2)[extra] | cvalue;
return extra + 1;

/* Convert to UTF-16 */

#elif PCRE2_CODE_UNIT_WIDTH == 16
if (cvalue > 0xffff)
  {
  /* Two code units are needed: split the offset from 0x10000 into a
  0xd800-based unit and a 0xdc00-based unit of ten bits each. */
  cvalue -= 0x10000;
  buffer[0] = 0xd800 | (cvalue >> 10);
  buffer[1] = 0xdc00 | (cvalue & 0x3ff);
  return 2;
  }
buffer[0] = (PCRE2_UCHAR)cvalue;
return 1;

/* Convert to UTF-32 */

#else
buffer[0] = (PCRE2_UCHAR)cvalue;
return 1;
#endif
}
#endif /* SUPPORT_UTF */
/* End of pcre2_ord2utf.c */

View File

@ -184,7 +184,7 @@ switch(what)
break;
case PCRE2_INFO_SIZE:
*((size_t *)where) = re->size;
*((size_t *)where) = re->blocksize;
break;
default: return PCRE2_ERROR_BADOPTION;

View File

@ -53,7 +53,7 @@ functions work only on 8-bit data. */
/*************************************************
* Compare two strings *
* Compare two zero-terminated PCRE2 strings *
*************************************************/
/*
@ -77,4 +77,105 @@ while (*str1 != '\0' || *str2 != '\0')
return 0;
}
/*************************************************
* Compare zero-terminated PCRE2 & 8-bit strings *
*************************************************/
/*
Arguments:
str1 first string
str2 second string
Returns: 0, 1, or -1
*/
int
PRIV(strcmp_c8)(PCRE2_SPTR str1, const char *str2)
{
/* Walk both strings in step. The first differing pair decides the result;
reaching the terminator in both at the same position means they are equal. */
for (;;)
  {
  PCRE2_UCHAR left = *str1++;
  PCRE2_UCHAR right = *str2++;
  if (left != right) return (left > right)? 1 : -1;
  if (left == 0) break;
  }
return 0;
}
/*************************************************
* Compare two PCRE2 strings, given a length *
*************************************************/
/*
Arguments:
str1 first string
str2 second string
len the length
Returns: 0, 1, or -1
*/
int
PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len)
{
size_t i;

/* Compare exactly len code units; zero code units get no special treatment.
The first differing pair decides the result. */
for (i = 0; i < len; i++)
  {
  PCRE2_UCHAR left = str1[i];
  PCRE2_UCHAR right = str2[i];
  if (left != right) return (left > right)? 1 : -1;
  }
return 0;
}
/*************************************************
* Compare PCRE2 string to 8-bit string by length *
*************************************************/
/* As the 8-bit string is almost always a literal, its type is specified as
'const char *'.
Arguments:
str1 first string
str2 second string
len the length
Returns: 0, 1, or -1
*/
int
PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len)
{
/* Compare exactly len items. Each character of the 8-bit string is converted
to PCRE2_UCHAR before it is compared with the corresponding code unit. */
for (; len != 0; len--)
  {
  PCRE2_UCHAR left = *str1++;
  PCRE2_UCHAR right = *str2++;
  if (left != right) return (left > right)? 1 : -1;
  }
return 0;
}
/*************************************************
* Find the length of a string *
*************************************************/
/*
Argument: the string
Returns: the length
*/
int
PRIV(strlen)(PCRE2_SPTR str)
{
int count;

/* Count code units up to, but not including, the terminating zero. */
for (count = 0; str[count] != 0; count++) {}
return count;
}
/* End of pcre2_string_utils.c */

View File

@ -41,22 +41,22 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. The tables are also #included by the pcre2test program,
which uses macros to change their names from _pcre2_xxx to xxxx, thereby
avoiding name clashes with the library. In this case, PCRE2_INCLUDED is
avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is
defined. */
#ifndef PCRE2_INCLUDED /* We're compiling the library */
#ifndef PCRE2_PCRE2TEST /* We're compiling the library */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_INCLUDED */
#endif /* PCRE2_PCRE2TEST */
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in pcre2_internal.h.
the definition is next to the definition of the opcodes in pcre2_internal.h.
This is mode-dependent, so is skipped when this file is included by pcre2test. */
#ifndef PCRE2_INCLUDED
#ifndef PCRE2_PCRE2TEST
const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS };
#endif
@ -71,14 +71,18 @@ const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST };
* Tables for UTF-8 support *
*************************************************/
/* These tables are required by pcre2test in 16- or 32-bit mode, as well
as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for
handling wide characters. */
#if defined PCRE2_PCRE2TEST || \
(defined SUPPORT_UTF && \
defined PCRE2_CODE_UNIT_WIDTH && \
PCRE2_CODE_UNIT_WIDTH == 8)
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. */
#if (defined SUPPORT_UTF && defined COMPILE_PCRE8) \
|| (defined PCRE2_INCLUDED && (defined SUPPORT_PCRE16 || defined SUPPORT_PCRE32))
/* These tables are also required by pcretest in 16- or 32-bit mode. */
const int PRIV(utf8_table1)[] =
{ 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
@ -99,7 +103,7 @@ const uint8_t PRIV(utf8_table4)[] = {
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
#endif /* (SUPPORT_UTF && COMPILE_PCRE8) || (PCRE2_INCLUDED && SUPPORT_PCRE[16|32])*/
#endif /* UTF-8 support needed */
#ifdef SUPPORT_UTF
@ -653,7 +657,7 @@ const ucp_type_table PRIV(utt)[] = {
{ 1042, PT_PC, ucp_Zs }
};
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
#endif /* SUPPORT_UTF */

View File

@ -8,7 +8,7 @@ table names from _pcre2_xxx to xxxx, thereby avoiding name clashes
with the library. At present, just one of these tables is actually
needed. */
#ifndef PCRE2_INCLUDED
#ifndef PCRE2_PCRE2TEST
#ifdef HAVE_CONFIG_H
#include "config.h"
@ -16,7 +16,7 @@ needed. */
#include "pcre2_internal.h"
#endif /* PCRE2_INCLUDED */
#endif /* PCRE2_PCRE2TEST */
/* Unicode character database. */
/* This file was autogenerated by the MultiStage2.py script. */
@ -78,7 +78,7 @@ const uint32_t PRIV(ucd_caseless_sets)[] = {
/* When #included in pcre2test, we don't need this large table. */
#ifndef PCRE2_INCLUDED
#ifndef PCRE2_PCRE2TEST
const ucd_record PRIV(ucd_records)[] = { /* 5016 bytes, record size 8 */
{ 9, 0, 2, 0, 0, }, /* 0 */
@ -3295,4 +3295,4 @@ const uint16_t PRIV(ucd_stage2)[] = { /* 51968 bytes, block = 128 */
#endif
#endif /* SUPPORT_UTF */
#endif /* PCRE2_INCLUDED */
#endif /* PCRE2_PCRE2TEST */

399
src/pcre2_valid_utf.c Normal file
View File

@ -0,0 +1,399 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains an internal function for validating UTF character
strings. */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#ifndef SUPPORT_UTF
/*************************************************
* Dummy function when UTF not supported *
*************************************************/
/* This function should never be called when UTF is not supported. */
int
PRIV(valid_utf)(PCRE2_SPTR string, int length, size_t *erroroffset)
{
/* Dummy version for builds without SUPPORT_UTF. It performs no checking,
leaves *erroroffset untouched, and always returns 0. The arguments are
referenced only to avoid unused-parameter warnings. */
(void)string;
(void)length;
(void)erroroffset;
return 0;
}
#else
/*************************************************
*           Validate a UTF string                *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
check that a supposed UTF string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying an
invalid string are then undefined.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated
  erroroffset  pointer to an error position offset variable

Returns:       == 0    if the string is a valid UTF string
               != 0    otherwise, setting the offset of the bad character
*/

int
PRIV(valid_utf)(PCRE2_SPTR string, int length, size_t *erroroffset)
{
register PCRE2_SPTR p;
register uint32_t c;

/* A negative length means the string is zero-terminated, so find its end. */

if (length < 0)
  {
  for (p = string; *p != 0; p++);
  length = (int)(p - string);
  }

/* ----------------- Check a UTF-8 string ----------------- */

#if PCRE2_CODE_UNIT_WIDTH == 8

/* Originally, this function checked according to RFC 2279, allowing for values
in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were
in the canonical format. Once somebody had pointed out RFC 3629 to me (it
obsoletes 2279), additional restrictions were applied. The values are now
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
characters is still checked. Error returns are as follows:

PCRE2_ERROR_UTF8_ERR1   Missing 1 byte at the end of the string
PCRE2_ERROR_UTF8_ERR2   Missing 2 bytes at the end of the string
PCRE2_ERROR_UTF8_ERR3   Missing 3 bytes at the end of the string
PCRE2_ERROR_UTF8_ERR4   Missing 4 bytes at the end of the string
PCRE2_ERROR_UTF8_ERR5   Missing 5 bytes at the end of the string
PCRE2_ERROR_UTF8_ERR6   2nd-byte's two top bits are not 0x80
PCRE2_ERROR_UTF8_ERR7   3rd-byte's two top bits are not 0x80
PCRE2_ERROR_UTF8_ERR8   4th-byte's two top bits are not 0x80
PCRE2_ERROR_UTF8_ERR9   5th-byte's two top bits are not 0x80
PCRE2_ERROR_UTF8_ERR10  6th-byte's two top bits are not 0x80
PCRE2_ERROR_UTF8_ERR11  5-byte character is not permitted by RFC 3629
PCRE2_ERROR_UTF8_ERR12  6-byte character is not permitted by RFC 3629
PCRE2_ERROR_UTF8_ERR13  4-byte character with value > 0x10ffff is not permitted
PCRE2_ERROR_UTF8_ERR14  3-byte character with value 0xd800-0xdfff is not permitted
PCRE2_ERROR_UTF8_ERR15  Overlong 2-byte sequence
PCRE2_ERROR_UTF8_ERR16  Overlong 3-byte sequence
PCRE2_ERROR_UTF8_ERR17  Overlong 4-byte sequence
PCRE2_ERROR_UTF8_ERR18  Overlong 5-byte sequence (won't ever occur)
PCRE2_ERROR_UTF8_ERR19  Overlong 6-byte sequence (won't ever occur)
PCRE2_ERROR_UTF8_ERR20  Isolated 0x80 byte (not within UTF-8 character)
PCRE2_ERROR_UTF8_ERR21  Byte with the illegal value 0xfe or 0xff
*/

for (p = string; length-- > 0; p++)
  {
  register uint32_t ab, d;

  c = *p;
  if (c < 128) continue;                /* ASCII character */

  if (c < 0xc0)                         /* Isolated 10xx xxxx byte */
    {
    *erroroffset = (int)(p - string);
    return PCRE2_ERROR_UTF8_ERR20;
    }

  if (c >= 0xfe)                        /* Invalid 0xfe or 0xff bytes */
    {
    *erroroffset = (int)(p - string);
    return PCRE2_ERROR_UTF8_ERR21;
    }

  ab = PRIV(utf8_table4)[c & 0x3f];     /* Number of additional bytes (1-5) */
  if (length < (int)ab)                 /* Missing bytes */
    {
    *erroroffset = (int)(p - string);
    switch(ab - length)
      {
      case 1: return PCRE2_ERROR_UTF8_ERR1;
      case 2: return PCRE2_ERROR_UTF8_ERR2;
      case 3: return PCRE2_ERROR_UTF8_ERR3;
      case 4: return PCRE2_ERROR_UTF8_ERR4;
      case 5: return PCRE2_ERROR_UTF8_ERR5;
      }
    }
  length -= ab;                         /* Length remaining */

  /* Check top bits in the second byte */

  if (((d = *(++p)) & 0xc0) != 0x80)
    {
    *erroroffset = (int)(p - string) - 1;
    return PCRE2_ERROR_UTF8_ERR6;
    }

  /* For each length, check that the remaining bytes start with the 0x80 bit
  set and not the 0x40 bit. Then check for an overlong sequence, and for the
  excluded range 0xd800 to 0xdfff. */

  switch (ab)
    {
    /* 2-byte character. No further bytes to check for 0x80. Check first byte
    for xx00 000x (overlong sequence). */

    case 1: if ((c & 0x3e) == 0)
      {
      *erroroffset = (int)(p - string) - 1;
      return PCRE2_ERROR_UTF8_ERR15;
      }
    break;

    /* 3-byte character. Check third byte for 0x80. Then check first 2 bytes
      for 1110 0000, xx0x xxxx (overlong sequence) or
          1110 1101, 1010 xxxx (0xd800 - 0xdfff) */

    case 2:
    if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR7;
      }
    if (c == 0xe0 && (d & 0x20) == 0)
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR16;
      }
    if (c == 0xed && d >= 0xa0)
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR14;
      }
    break;

    /* 4-byte character. Check 3rd and 4th bytes for 0x80. Then check first 2
       bytes for 1111 0000, xx00 xxxx (overlong sequence), then check for a
       character greater than 0x0010ffff (f4 8f bf bf) */

    case 3:
    if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR7;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
      {
      *erroroffset = (int)(p - string) - 3;
      return PCRE2_ERROR_UTF8_ERR8;
      }
    if (c == 0xf0 && (d & 0x30) == 0)
      {
      *erroroffset = (int)(p - string) - 3;
      return PCRE2_ERROR_UTF8_ERR17;
      }
    if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
      {
      *erroroffset = (int)(p - string) - 3;
      return PCRE2_ERROR_UTF8_ERR13;
      }
    break;

    /* 5-byte and 6-byte characters are not allowed by RFC 3629, and will be
    rejected by the length test below. However, we do the appropriate tests
    here so that overlong sequences get diagnosed, and also in case there is
    ever an option for handling these larger code points. */

    /* 5-byte character. Check 3rd, 4th, and 5th bytes for 0x80. Then check for
    1111 1000, xx00 0xxx */

    case 4:
    if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR7;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
      {
      *erroroffset = (int)(p - string) - 3;
      return PCRE2_ERROR_UTF8_ERR8;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
      {
      *erroroffset = (int)(p - string) - 4;
      return PCRE2_ERROR_UTF8_ERR9;
      }
    if (c == 0xf8 && (d & 0x38) == 0)
      {
      *erroroffset = (int)(p - string) - 4;
      return PCRE2_ERROR_UTF8_ERR18;
      }
    break;

    /* 6-byte character. Check 3rd-6th bytes for 0x80. Then check for
    1111 1100, xx00 00xx. */

    case 5:
    if ((*(++p) & 0xc0) != 0x80)     /* Third byte */
      {
      *erroroffset = (int)(p - string) - 2;
      return PCRE2_ERROR_UTF8_ERR7;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Fourth byte */
      {
      *erroroffset = (int)(p - string) - 3;
      return PCRE2_ERROR_UTF8_ERR8;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Fifth byte */
      {
      *erroroffset = (int)(p - string) - 4;
      return PCRE2_ERROR_UTF8_ERR9;
      }
    if ((*(++p) & 0xc0) != 0x80)     /* Sixth byte */
      {
      *erroroffset = (int)(p - string) - 5;
      return PCRE2_ERROR_UTF8_ERR10;
      }
    if (c == 0xfc && (d & 0x3c) == 0)
      {
      *erroroffset = (int)(p - string) - 5;
      return PCRE2_ERROR_UTF8_ERR19;
      }
    break;
    }

  /* Character is valid under RFC 2279, but 5-byte and 6-byte characters (4 or
  5 additional bytes) are excluded by RFC 3629. The pointer p is currently at
  the last byte of the character. */

  if (ab > 3)
    {
    *erroroffset = (int)(p - string) - ab;
    return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
    }
  }
return 0;

/* ----------------- Check a UTF-16 string ----------------- */

#elif PCRE2_CODE_UNIT_WIDTH == 16

/* There's not so much work, nor so many errors, for UTF-16.
PCRE2_ERROR_UTF16_ERR1  Missing low surrogate at the end of the string
PCRE2_ERROR_UTF16_ERR2  Invalid low surrogate
PCRE2_ERROR_UTF16_ERR3  Isolated low surrogate
*/

for (p = string; length-- > 0; p++)
  {
  c = *p;

  if ((c & 0xf800) != 0xd800)
    {
    /* Normal UTF-16 code point. Neither high nor low surrogate. */
    }
  else if ((c & 0x0400) == 0)
    {
    /* High surrogate. Must be followed by a low surrogate. */
    if (length == 0)
      {
      *erroroffset = p - string;
      return PCRE2_ERROR_UTF16_ERR1;
      }
    p++;
    length--;
    if ((*p & 0xfc00) != 0xdc00)
      {
      *erroroffset = p - string;
      return PCRE2_ERROR_UTF16_ERR2;
      }
    }
  else
    {
    /* Isolated low surrogate. Always an error. */
    *erroroffset = p - string;
    return PCRE2_ERROR_UTF16_ERR3;
    }
  }
return 0;

/* ----------------- Check a UTF-32 string ----------------- */

#else

/* There is very little to do for a UTF-32 string.
PCRE2_ERROR_UTF32_ERR1  Surrogate character
PCRE2_ERROR_UTF32_ERR2  Character > 0x10ffff
*/

for (p = string; length-- > 0; p++)
  {
  c = *p;
  if ((c & 0xfffff800u) != 0xd800u)
    {
    /* Normal UTF-32 code point. Neither high nor low surrogate. */
    if (c > 0x10ffffu)
      {
      *erroroffset = p - string;
      return PCRE2_ERROR_UTF32_ERR2;
      }
    }
  else
    {
    /* A surrogate */
    *erroroffset = p - string;
    return PCRE2_ERROR_UTF32_ERR1;
    }
  }
return 0;
#endif  /* CODE_UNIT_WIDTH */
}
#endif  /* SUPPORT_UTF */
/* End of pcre2_valid_utf.c */

View File

@ -108,7 +108,7 @@ static const int eint2[] = {
30, REG_ECTYPE, /* unknown POSIX class name */
32, REG_INVARG, /* this version of PCRE2 is not compiled with PCRE2_UTF8 support */
37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N, \U, or \u */
56, REG_INVARG, /* inconsistent NEWLINE options */
56, REG_INVARG, /* internal error: unknown newline setting */
67, REG_INVARG, /* this version of PCRE2 is not compiled with PCRE2_UCP support */
};
@ -148,6 +148,8 @@ regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
const char *message, *addmessage;
size_t length, addlength;
errcode -= COMPILE_ERROR_BASE;
message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
"unknown error code" : pstring[errcode];
length = strlen(message) + 1;
@ -224,6 +226,8 @@ preg->re_erroffset = erroffset;
if (preg->re_pcre2_code == NULL)
{
unsigned int i;
if (errorcode < 0) return REG_BADPAT; /* UTF error */
errorcode -= COMPILE_ERROR_BASE;
if (errorcode < (int)(sizeof(eint1)/sizeof(const int)))
return eint1[errorcode];
for (i = 0; i < sizeof(eint2)/(2*sizeof(const int)); i += 2)
@ -307,13 +311,15 @@ if (rc >= 0)
/* Unsuccessful match */
if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
return REG_INVARG;
switch(rc)
{
default: return REG_ASSERT;
case PCRE2_ERROR_BADMODE: return REG_INVARG;
case PCRE2_ERROR_BADMAGIC: return REG_INVARG;
case PCRE2_ERROR_BADOPTION: return REG_INVARG;
case PCRE2_ERROR_BADUTF: return REG_INVARG;
case PCRE2_ERROR_BADUTF_OFFSET: return REG_INVARG;
case PCRE2_ERROR_MATCHLIMIT: return REG_ESPACE;
case PCRE2_ERROR_NOMATCH: return REG_NOMATCH;

View File

@ -211,10 +211,10 @@ for building the library. */
#include "pcre2_internal.h"
/* We need access to some of the data tables that PCRE uses. Defining
PCRE2_INCLUDED makes some minor changes in the files. The previous definition
PCRE2_PCRE2TEST makes some minor changes in the files. The previous definition
of PRIV avoids name clashes. */
#define PCRE2_INCLUDED
#define PCRE2_PCRE2TEST
#include "pcre2_tables.c"
#include "pcre2_ucd.c"
@ -340,12 +340,14 @@ either on a pattern or a data line, so they must all be distinct. */
#define CTL_FULLBINCODE 0x00000200
#define CTL_GETALL 0x00000400
#define CTL_GLOBAL 0x00000800
#define CTL_INFO 0x00001000
#define CTL_JITVERIFY 0x00002000
#define CTL_LIMITS 0x00004000
#define CTL_MARK 0x00008000
#define CTL_MEMORY 0x00010000
#define CTL_POSIX 0x00020000
#define CTL_HEXPAT 0x00001000
#define CTL_INFO 0x00002000
#define CTL_JITVERIFY 0x00004000
#define CTL_LIMITS 0x00008000
#define CTL_MARK 0x00010000
#define CTL_MEMORY 0x00020000
#define CTL_PATLEN 0x00040000
#define CTL_POSIX 0x00080000
#define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */
#define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE) /* For testing */
@ -441,6 +443,7 @@ static modstruct modlist[] = {
{ "get", MOD_DAT, MOD_NN, DO(get_numbers), DO(get_names) },
{ "getall", MOD_DAT, MOD_CTL, CTL_GETALL, DO(control) },
{ "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) },
{ "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) },
{ "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) },
{ "jit", MOD_PAT, MOD_IND, 7, PO(jit) },
{ "jitstack", MOD_DAT, MOD_INT, 0, DO(jitstack) },
@ -475,6 +478,7 @@ static modstruct modlist[] = {
{ "tables", MOD_PAT, MOD_INT, 0, PO(tables_id) },
{ "ucp", MOD_PATP, MOD_OPT, PCRE2_UCP, PO(options) },
{ "ungreedy", MOD_PAT, MOD_OPT, PCRE2_UNGREEDY, PO(options) },
{ "use_length", MOD_PAT, MOD_CTL, CTL_PATLEN, PO(control) },
{ "utf", MOD_PATP, MOD_OPT, PCRE2_UTF, PO(options) }
};
@ -625,14 +629,9 @@ buffer is where all input lines are read. Its size is the same as pbuffer8.
Pattern lines are always copied to pbuffer8 for use in callouts, even if they
are actually compiled from pbuffer16 or pbuffer32. */
static int pbuffer8_size = 50000; /* Initial size, bytes */
static int pbuffer16_size = 0; /* Only set once needed */
static int pbuffer32_size = 0; /* Only set once needed */
static uint8_t *buffer = NULL;
static int pbuffer8_size = 50000; /* Initial size, bytes */
static uint8_t *pbuffer8 = NULL;
static uint16_t *pbuffer16 = NULL;
static uint32_t *pbuffer32 = NULL;
static uint8_t *buffer = NULL;
/* The dbuffer is where all processed data lines are put. In non-8-bit modes it
is cast as needed. For long data lines it grows as necessary. */
@ -655,6 +654,8 @@ pcre2_code_16 *compiled_code16;
pcre2_compile_context_16 *pat_context16, *default_pat_context16;
pcre2_match_context_16 *dat_context16, *default_dat_context16;
pcre2_match_data_16 *match_data16;
static int pbuffer16_size = 0; /* Only set once needed */
static uint16_t *pbuffer16 = NULL;
#endif
#ifdef SUPPORT_PCRE32
@ -662,6 +663,8 @@ pcre2_code_32 *compiled_code32;
pcre2_compile_context_32 *pat_context32, *default_pat_context32;
pcre2_match_context_32 *dat_context32, *default_dat_context32;
pcre2_match_data_32 *match_data32;
static int pbuffer32_size = 0; /* Only set once needed */
static uint32_t *pbuffer32 = NULL;
#endif
@ -997,10 +1000,10 @@ the three different cases. */
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
if (test_mode == G(G(PCRE,BITONE),_MODE)) \
a = G(pcre2_dfa-match_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \
a = G(pcre2_dfa_match_,BITONE)(G(b,BITONE),(G(PCRE2_SPTR,BITONE))c,d,e,f, \
G(g,BITONE),G(h,BITONE),i,j); \
else \
a = G(pcre2_dfa-match_,BITTWO)(G(b,BITTWO),(G(PCRE2_SPTR,BITTWO))c,d,e,f, \
a = G(pcre2_dfa_match_,BITTWO)(G(b,BITTWO),(G(PCRE2_SPTR,BITTWO))c,d,e,f, \
G(g,BITTWO),G(h,BITTWO),i,j)
#define PCRE2_GET_ERROR_MESSAGE(r,a,b) \
@ -1178,20 +1181,20 @@ the three different cases. */
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
G(a,8) = pcre2_compile_8(G(b,8),c,d,e,f,G(g,8))
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
G(a,8) = pcre2_dfa-match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8),i,j)
a = pcre2_dfa_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8),i,j)
#define PCRE2_GET_ERROR_MESSAGE(r,a,b) \
r = pcre2_get_error_message_8(a,G(b,8),G(G(b,8),_size))
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_8(G(a,8),b)
#define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
G(a,8) = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8))
a = pcre2_match_8(G(b,8),(PCRE2_SPTR8)c,d,e,f,G(g,8),G(h,8))
#define PCRE2_MAKETABLES(a) a = pcre2_maketables_8(NULL)
#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,8) = pcre2_match_data_create_8(b,c)
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_8(a)
#define PCRE2_PATTERN_INFO(a,b,c,d) G(a,8) = pcre2_pattern_info_8(G(b,8),c,d)
#define PCRE2_PRINTINT(a,b) pcre2_printint_8(compiled_code8,outfile,a)
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b) \
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_8(G(a,8))
#define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_8(G(b,8),c,d)
#define PCRE2_PRINTINT(a) pcre2_printint_8(compiled_code8,outfile,a)
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_8(G(b,8),G(c,8),(PCRE2_UCHAR8 *)d,e)
a = pcre2_substring_copy_byname_8(G(b,8),G(c,8),(PCRE2_UCHAR8 *)d,e)
#define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_8(G(b,8),c,(PCRE2_UCHAR8 *)d,e)
#define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_8((PCRE2_UCHAR8 *)a)
@ -1207,12 +1210,12 @@ the three different cases. */
#define SETFLD(x,y,z) G(x,8)->y = z
#define SETFLDVEC(x,y,v,z) G(x,8)->y[v] = z
#define SETOP(x,y,z) G(x,8) z y
#define SETCASTPTR(x,y) G(x,8) = (uint8_t) *)y
#define STRLEN(p) (int)strlen(p)
#define SETCASTPTR(x,y) G(x,8) = (uint8_t *)y
#define STRLEN(p) (int)strlen((char *)p)
#define SUB1(a,b) G(a,8)(G(b,8))
#define SUB2(a,b,c) G(a,8)(G(b,8),G(c,8))
#define TEST(x,r,y) (G(a,8) r (y))
#define TESTFLD(x,f,r,y) (G(a,8)->f r (y))
#define TEST(x,r,y) (G(x,8) r (y))
#define TESTFLD(x,f,r,y) (G(x,8)->f r (y))
/* ----- Only 16-bit mode is supported ----- */
@ -1231,20 +1234,20 @@ the three different cases. */
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
G(a,16) = pcre2_compile_16(G(b,16),c,d,e,f,G(g,16))
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
G(a,16) = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16),i,j)
a = pcre2_dfa_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16),i,j)
#define PCRE2_GET_ERROR_MESSAGE(r,a,b) \
r = pcre2_get_error_message_16(a,G(b,16),G(G(b,16),_size))
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_16(G(a,16),b)
#define PCRE2_MAKETABLES(a) a = pcre2_maketables_16(NULL)
#define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
G(a,16) = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16))
a = pcre2_match_16(G(b,16),(PCRE2_SPTR16)c,d,e,f,G(g,16),G(h,16))
#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,16) = pcre2_match_data_create_16(b,c)
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_16(a)
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_16(G(a,16))
/* The result is stored in the caller's variable exactly as named; only the
compiled-code argument takes the width suffix. */
#define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_16(G(b,16),c,d)
#define PCRE2_PRINTINT(a,b) pcre2_printint_16(compiled_code16,outfile,a)
#define PCRE2_PRINTINT(a) pcre2_printint_16(compiled_code16,outfile,a)
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_16(G(a,16),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_16(G(b,16),G(c,16),(PCRE2_UCHAR16 *)d,e);
a = pcre2_substring_copy_byname_16(G(b,16),G(c,16),(PCRE2_UCHAR16 *)d,e);
#define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_16(G(b,16),c,(PCRE2_UCHAR16 *)d,e);
#define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_16((PCRE2_UCHAR16 *)a)
@ -1260,12 +1263,12 @@ the three different cases. */
#define SETFLD(x,y,z) G(x,16)->y = z
#define SETFLDVEC(x,y,v,z) G(x,16)->y[v] = z
#define SETOP(x,y,z) G(x,16) z y
#define SETCASTPTR(x,y) G(x,16) = (uint16_t) *)y
#define SETCASTPTR(x,y) G(x,16) = (uint16_t *)y
#define STRLEN(p) (int)strlen16(p)
#define SUB1(a,b) G(a,16)(G(b,16))
#define SUB2(a,b,c) G(a,16)(G(b,16),G(c,16))
#define TEST(x,r,y) (G(a,16) r (y))
#define TESTFLD(x,f,r,y) (G(a,16)->f r (y))
#define TEST(x,r,y) (G(x,16) r (y))
#define TESTFLD(x,f,r,y) (G(x,16)->f r (y))
/* ----- Only 32-bit mode is supported ----- */
@ -1284,20 +1287,20 @@ the three different cases. */
#define PCRE2_COMPILE(a,b,c,d,e,f,g) \
G(a,32) = pcre2_compile_32(G(b,32),c,d,e,f,G(g,32))
#define PCRE2_DFA_MATCH(a,b,c,d,e,f,g,h,i,j) \
G(a,32) = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32),i,j)
a = pcre2_dfa_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32),i,j)
#define PCRE2_GET_ERROR_MESSAGE(r,a,b) \
r = pcre2_get_error_message_32(a,G(b,32),G(G(b,32),_size))
#define PCRE2_JIT_COMPILE(a,b) pcre2_jit_compile_32(G(a,32),b)
/* The match data argument (h) must be suffixed with G(); a lower-case g here
would be replaced by the macro's own seventh parameter. */
#define PCRE2_MATCH(a,b,c,d,e,f,g,h) \
G(a,32) = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32))
a = pcre2_match_32(G(b,32),(PCRE2_SPTR32)c,d,e,f,G(g,32),G(h,32))
#define PCRE2_MAKETABLES(a) a = pcre2_maketables_32(NULL)
#define PCRE2_MATCH_DATA_CREATE(a,b,c) G(a,32) = pcre2_match_data_create_32(b,c)
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_32(a)
#define PCRE2_MATCH_DATA_FREE(a) pcre2_match_data_free_32(G(a,32))
/* The result is stored in the caller's variable exactly as named; only the
compiled-code argument takes the width suffix. */
#define PCRE2_PATTERN_INFO(a,b,c,d) a = pcre2_pattern_info_32(G(b,32),c,d)
#define PCRE2_PRINTINT(a,b) pcre2_printint_32(compiled_code32,outfile,a)
#define PCRE2_PRINTINT(a) pcre2_printint_32(compiled_code32,outfile,a)
#define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b)
#define PCRE2_SUBSTRING_COPY_BYNAME(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_32(G(b,32),G(c,32),(PCRE2_UCHAR32 *)d,e);
a = pcre2_substring_copy_byname_32(G(b,32),G(c,32),(PCRE2_UCHAR32 *)d,e);
#define PCRE2_SUBSTRING_COPY_BYNUMBER(a,b,c,d,e) \
a = pcre2_substring_copy_bynumber_32(G(b,32),c,(PCRE2_UCHAR32 *)d,e);
#define PCRE2_SUBSTRING_FREE(a) pcre2_substring_free_32((PCRE2_UCHAR32 *)a)
@ -1313,12 +1316,12 @@ define PCRE2_SUBSTRING_GET_BYNUMBER(a,b,c,d) \
#define SETFLD(x,y,z) G(x,32)->y = z
#define SETFLDVEC(x,y,v,z) G(x,32)->y[v] = z
#define SETOP(x,y,z) G(x,32) z y
#define SETCASTPTR(x,y) G(x,32) = (uint32_t) *)y
#define SETCASTPTR(x,y) G(x,32) = (uint32_t *)y
/* Length of a zero-terminated 32-bit string, in code units, as an int. */
#define STRLEN(p) (int)strlen32(p)
#define SUB1(a,b) G(a,32)(G(b,32))
#define SUB2(a,b,c) G(a,32)(G(b,32),G(c,32))
#define TEST(x,r,y) (G(a,32) r (y))
#define TESTFLD(x,f,r,y) (G(a,32)->f r (y))
#define TEST(x,r,y) (G(x,32) r (y))
#define TESTFLD(x,f,r,y) (G(x,32)->f r (y))
#endif
@ -2669,7 +2672,7 @@ Returns: nothing
static void
show_compile_controls(uint32_t controls, const char *before, const char *after)
{
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
@ -2679,9 +2682,11 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
((controls & CTL_FLIPBYTES) != 0)? " flipbytes" : "",
((controls & CTL_FULLBINCODE) != 0)? " fullbincode" : "",
((controls & CTL_GLOBAL) != 0)? " global" : "",
((controls & CTL_HEXPAT) != 0)? " hex" : "",
((controls & CTL_INFO) != 0)? " info" : "",
((controls & CTL_JITVERIFY) != 0)? " jitverify" : "",
((controls & CTL_MARK) != 0)? " mark" : "",
((controls & CTL_PATLEN) != 0)? " use_length" : "",
((controls & CTL_POSIX) != 0)? " posix" : "",
after);
}
@ -2705,7 +2710,8 @@ Returns: nothing
static void
show_compile_options(uint32_t options, const char *before, const char *after)
{
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
((options & PCRE2_CASELESS) != 0)? " caseless" : "",
@ -2896,15 +2902,13 @@ if ((pat_patctl.control & CTL_INFO) != 0)
fprintf(outfile, "No options\n");
else
{
if (compile_options != 0)
show_compile_options(compile_options, "Compile options:", "\n");
if (pattern_options != 0)
show_compile_options(pattern_options, "Pattern options:", "\n");
show_compile_options(compile_options, "Compile options:", "\n");
show_compile_options(pattern_options, "Pattern options:", "\n");
}
if (jchanged) fprintf(outfile, "Duplicate name status changes\n");
if (bsr_convention != PCRE2_BSR_DEFAULT)
if (bsr_convention != BSR_DEFAULT)
fprintf(outfile, "\\R matches %s\n", (bsr_convention == PCRE2_BSR_UNICODE)?
"any Unicode newline" : "CR, LF, or CRLF");
@ -3272,25 +3276,63 @@ for(;;)
if (infile != stdin) fprintf(outfile, "%s", (char *)p);
}
/* If the first character after the delimiter is backslash, make
the pattern end with backslash. This is purely to provide a way
of testing for the error message when a pattern ends with backslash. */
/* If the first character after the delimiter is backslash, make the pattern
end with backslash. This is purely to provide a way of testing for the error
message when a pattern ends with backslash. */
if (p[1] == '\\') *p++ = '\\';
/* Terminate the pattern at the delimiter, and save a copy of the pattern
for callouts. */
/* Terminate the pattern at the delimiter, and compute the length. */
*p++ = 0;
patlen = p - buffer - 1;
strncpy((char *)pbuffer8, (char *)(buffer+1), patlen);
patlen = p - buffer - 2;
/* Look for modifiers and options after the final delimiter. If successful,
compile the pattern. */
/* Look for modifiers and options after the final delimiter. */
if (!decode_modifiers(p, CTX_PAT, &pat_patctl, NULL)) return PR_SKIP;
utf = (pat_patctl.options & PCRE2_UTF) != 0;
/* Now copy the pattern to pbuffer8 for use in 8-bit testing and for reflecting
in callouts. Convert to binary if required. */
/* Fill pbuffer8 either by decoding a hex-digit pattern or by copying the
pattern text as it stands. */
if ((pat_patctl.control & CTL_HEXPAT) != 0)
{
uint8_t *pp, *pt;
uint32_t c, d;
/* The hex control is rejected when the POSIX control is also set. */
if ((pat_patctl.control & CTL_POSIX) != 0)
{
fprintf(outfile, "** Hex patterns are not supported for the POSIX API\n");
return PR_SKIP;
}
/* Decode pairs of hex digits from the text after the opening delimiter,
writing one byte per pair to pbuffer8. White space is skipped only at a point
where a new pair could start, not between the two digits of a pair. */
pt = pbuffer8;
for (pp = buffer + 1; *pp != 0; pp++)
{
if (isspace(*pp)) continue;
c = toupper(*pp++);
/* Reaching the terminator here means the last character had no partner. */
if (*pp == 0)
{
fprintf(outfile, "** Odd number of digits in hex pattern.\n");
return PR_SKIP;
}
d = toupper(*pp);
if (!isxdigit(c) || !isxdigit(d))
{
fprintf(outfile, "** Non-hex-digit in hex pattern.\n");
return PR_SKIP;
}
/* Both characters were upper-cased above, so a letter digit is 'A' to 'F'. */
*pt++ = ((isdigit(c)? (c - '0') : (c - 'A' + 10)) << 4) +
(isdigit(d)? (d - '0') : (d - 'A' + 10));
}
/* Terminate the decoded bytes and replace patlen with the decoded count. */
*pt = 0;
patlen = pt - pbuffer8;
}
else
{
/* Copy patlen + 1 bytes so that the zero written over the closing delimiter
is copied as well. */
strncpy((char *)pbuffer8, (char *)(buffer+1), patlen + 1);
}
/* Sort out character tables */
if (pat_patctl.locale[0] != 0)
@ -3394,12 +3436,12 @@ modes. */
#ifdef SUPPORT_PCRE16
if (test_mode == PCRE16_MODE)
patlen = to16(pbuffer8, utf, (int)strlen((char *)pbuffer8));
patlen = to16(pbuffer8, utf, patlen);
#endif
#ifdef SUPPORT_PCRE32
if (test_mode == PCRE32_MODE)
patlen = to32(pbuffer8, utf, (int)strlen((char *)pbuffer8));
patlen = to32(pbuffer8, utf, patlen);
#endif
switch(patlen)
@ -3423,8 +3465,13 @@ switch(patlen)
break;
}
/* The pattern is now in pbuffer[8|16|32], with the length in patlen. Compile
many times when timing. */
/* The pattern is now in pbuffer[8|16|32], with the length in patlen. By
default, however, we pass a zero-terminated pattern. The length is passed only
if we had a hex pattern or if use_length was set. */
if ((pat_patctl.control & (CTL_PATLEN|CTL_HEXPAT)) == 0) patlen = -1;
/* Compile many times when timing. */
if (timeit > 0)
{
@ -3960,9 +4007,9 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
if (dat_datctl.cfail[0] != 0 || dat_datctl.cfail[1] != 0)
prmsg(&msg, "callout_fail");
if (dat_datctl.copy_numbers[0] != 0 || dat_datctl.copy_names[0] != 0)
if (dat_datctl.copy_numbers[0] >= 0 || dat_datctl.copy_names[0] != 0)
prmsg(&msg, "copy");
if (dat_datctl.get_numbers[0] != 0 || dat_datctl.get_names[0] != 0)
if (dat_datctl.get_numbers[0] >= 0 || dat_datctl.get_names[0] != 0)
prmsg(&msg, "get");
if (dat_datctl.jitstack != 0) prmsg(&msg, "jitstack");
@ -4059,6 +4106,9 @@ for (gmatched = 0;; gmatched++)
#ifdef FIXME
jit_was_used = FALSE;
Need to set newline and bsr in match context and allow them to be
set in the datctl block.
#endif
/* Adjust match_data according to size of offsets required. */
@ -4502,12 +4552,6 @@ if ((dat_datctl.control & CTL_DFA) != 0)
}
break;
case PCRE2_ERROR_BADUTF:
fprintf(outfile, "Error %d (bad UTF-%d string) offset=%d reason=%d\n",
capcount, test_mode, CASTFLD(int, match_data, startchar),
CASTFLD(int, match_data, utf_reason));
break;
case PCRE2_ERROR_BADUTF_OFFSET:
fprintf(outfile, "Error %d (bad UTF-%d offset)\n", capcount, test_mode);
break;