From 7efba85b56ccb0aebe3825f3a2767e0e83b4e7a2 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 16 Aug 2014 09:46:58 +0000 Subject: [PATCH] Change lengths to PCRE2_SIZE and revise OP_RECURSE processing. --- Makefile.am | 2 +- configure.ac | 6 +- src/config.h.generic | 131 +++++++-------- src/pcre2.h.in | 66 ++++---- src/pcre2_compile.c | 14 +- src/pcre2_config.c | 2 +- src/pcre2_context.c | 4 +- src/pcre2_dfa_match.c | 59 +++---- src/pcre2_internal.h | 4 +- src/pcre2_intmodedep.h | 66 ++++---- src/pcre2_jit_compile.c | 15 +- src/pcre2_jit_match.c | 17 +- src/pcre2_jit_misc.c | 40 ++++- src/pcre2_match.c | 348 ++++++++++++++++++++++++++-------------- src/pcre2_match_data.c | 10 +- src/pcre2_substring.c | 2 +- src/pcre2_valid_utf.c | 14 +- src/pcre2demo.c | 6 +- src/pcre2grep.c | 14 +- src/pcre2posix.c | 2 +- src/pcre2test.c | 32 ++-- testdata/testinput2 | 6 + testdata/testoutput2 | 8 + 23 files changed, 499 insertions(+), 369 deletions(-) diff --git a/Makefile.am b/Makefile.am index 18a4886..4b3ebb8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -429,7 +429,7 @@ endif # WITH_PCRE8 if WITH_JIT TESTS += pcre2_jit_test noinst_PROGRAMS += pcre2_jit_test -pcre2_jit_test_SOURCES = pcre2_jit_test.c +pcre2_jit_test_SOURCES = src/pcre2_jit_test.c pcre2_jit_test_CFLAGS = $(AM_CFLAGS) pcre2_jit_test_LDADD = if WITH_PCRE8 diff --git a/configure.ac b/configure.ac index 3130962..167d3b8 100644 --- a/configure.ac +++ b/configure.ac @@ -523,10 +523,10 @@ if test "$enable_utf" = "yes"; then fi if test "$enable_stack_for_recursion" = "no"; then - AC_DEFINE([NO_RECURSE], [], [ + AC_DEFINE([HEAP_MATCH_RECURSE], [], [ PCRE2 uses recursive function calls to handle backtracking while matching. This can sometimes be a problem on systems that have - stacks of limited size. Define NO_RECURSE to any value to get a + stacks of limited size. 
Define HEAP_MATCH_RECURSE to any value to get a version that doesn't use recursion in the match() function; instead it creates its own stack by steam using memory from the heap. For more detail, see the comments and other stuff just above the match() function.]) @@ -608,7 +608,7 @@ AC_DEFINE_UNQUOTED([MATCH_LIMIT_RECURSION], [$with_match_limit_recursion], [ increase the recursion depth. In some environments it is desirable to limit the depth of recursive calls of match() more strictly, in order to restrict the maximum amount of stack (or heap, if - NO_RECURSE is defined) that is used. The value of + HEAP_MATCH_RECURSE is defined) that is used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of match(). To have any useful effect, it must be less than the value of MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. diff --git a/src/config.h.generic b/src/config.h.generic index 58bbe20..67fa46a 100644 --- a/src/config.h.generic +++ b/src/config.h.generic @@ -1,13 +1,13 @@ /* src/config.h. Generated from config.h.in by configure. */ /* src/config.h.in. Generated from configure.ac by autoheader. */ -/* PCRE is written in Standard C, but there are a few non-standard things it +/* PCRE2 is written in Standard C, but there are a few non-standard things it can cope with, allowing it to run on SunOS4 and other "close to standard" systems. In environments that support the GNU autotools, config.h.in is converted into config.h by the "configure" script. In environments that use CMake, -config-cmake.in is converted into config.h. If you are going to build PCRE "by +config-cmake.in is converted into config.h. If you are going to build PCRE2 "by hand" without using "configure" or CMake, you should copy the distributed config.h.generic to config.h, and edit the macro definitions to be the way you need them. 
You must then add -DHAVE_CONFIG_H to all of your compile commands, @@ -24,31 +24,28 @@ macros are listed as a commented #undef in config.h.generic. Macros such as MATCH_LIMIT, whose actual value is relevant, have defaults defined, but are surrounded by #ifndef/#endif lines so that the value can be overridden by -D. -PCRE uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if +PCRE2 uses memmove() if HAVE_MEMMOVE is defined; otherwise it uses bcopy() if HAVE_BCOPY is defined. If your system has neither bcopy() nor memmove(), make sure both macros are undefined; an emulation function will then be used. */ /* By default, the \R escape sequence matches any Unicode line ending character or sequence of characters. If BSR_ANYCRLF is defined (to any value), this is changed so that backslash-R matches only CR, LF, or CRLF. - The build-time default can be overridden by the user of PCRE at runtime. */ + The build-time default can be overridden by the user of PCRE2 at runtime. + */ /* #undef BSR_ANYCRLF */ /* If you are compiling for a system that uses EBCDIC instead of ASCII - character codes, define this macro to any value. You must also edit the - NEWLINE macro below to set a suitable EBCDIC newline, commonly 21 (0x15). - On systems that can use "configure" or CMake to set EBCDIC, NEWLINE is - automatically adjusted. When EBCDIC is set, PCRE assumes that all input - strings are in EBCDIC. If you do not define this macro, PCRE will assume - input strings are ASCII or UTF-8/16/32 Unicode. It is not possible to build - a version of PCRE that supports both EBCDIC and UTF-8/16/32. */ + character codes, define this macro to any value. When EBCDIC is set, PCRE2 + assumes that all input strings are in EBCDIC. If you do not define this + macro, PCRE2 will assume input strings are ASCII or UTF-8/16/32 Unicode. It + is not possible to build a version of PCRE2 that supports both EBCDIC and + UTF-8/16/32. 
*/ /* #undef EBCDIC */ /* In an EBCDIC environment, define this macro to any value to arrange for the NL character to be 0x25 instead of the default 0x15. NL plays the role that - LF does in an ASCII/Unicode environment. The value must also be set in the - NEWLINE macro below. On systems that can use "configure" or CMake to set - EBCDIC_NL25, the adjustment of NEWLINE is automatic. */ + LF does in an ASCII/Unicode environment. */ /* #undef EBCDIC_NL25 */ /* Define to 1 if you have the `bcopy' function. */ @@ -126,11 +123,19 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to 1 if you have the header file. */ /* #undef HAVE_ZLIB_H */ +/* PCRE2 uses recursive function calls to handle backtracking while matching. + This can sometimes be a problem on systems that have stacks of limited + size. Define HEAP_MATCH_RECURSE to any value to get a version that doesn't + use recursion in the match() function; instead it creates its own stack by + steam using memory from the heap. For more detail, see the comments and + other stuff just above the match() function. */ +/* #undef HEAP_MATCH_RECURSE */ + /* The value of LINK_SIZE determines the number of bytes used to store links as offsets within the compiled regex. The default is 2, which allows for compiled patterns up to 64K long. This covers the vast majority of cases. - However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows - for longer patterns in extreme cases. */ + However, PCRE2 can also be compiled to use 3 or 4 bytes instead. This + allows for longer patterns in extreme cases. */ #ifndef LINK_SIZE #define LINK_SIZE 2 #endif @@ -144,7 +149,7 @@ sure both macros are undefined; an emulation function will then be used. */ /* The value of MATCH_LIMIT determines the default number of times the internal match() function can be called during a single execution of - pcre_exec(). There is a runtime interface for setting a different limit. + pcre2_match(). 
There is a runtime interface for setting a different limit. The limit exists in order to catch runaway regular expressions that take for ever to determine that they do not match. The default is set very large so that it does not accidentally catch legitimate cases. */ @@ -155,11 +160,11 @@ sure both macros are undefined; an emulation function will then be used. */ /* The above limit applies to all calls of match(), whether or not they increase the recursion depth. In some environments it is desirable to limit the depth of recursive calls of match() more strictly, in order to restrict - the maximum amount of stack (or heap, if NO_RECURSE is defined) that is - used. The value of MATCH_LIMIT_RECURSION applies only to recursive calls of - match(). To have any useful effect, it must be less than the value of - MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There is - a runtime method for setting a different limit. */ + the maximum amount of stack (or heap, if HEAP_MATCH_RECURSE is defined) + that is used. The value of MATCH_LIMIT_RECURSION applies only to recursive + calls of match(). To have any useful effect, it must be less than the value + of MATCH_LIMIT. The default is to use the same value as MATCH_LIMIT. There + is a runtime method for setting a different limit. */ #ifndef MATCH_LIMIT_RECURSION #define MATCH_LIMIT_RECURSION MATCH_LIMIT #endif @@ -178,27 +183,14 @@ sure both macros are undefined; an emulation function will then be used. */ #define MAX_NAME_SIZE 32 #endif -/* The value of NEWLINE determines the default newline character sequence. - PCRE client programs can override this by selecting other values at run - time. In ASCII environments, the value can be 10 (LF), 13 (CR), or 3338 - (CRLF); in EBCDIC environments the value can be 21 or 37 (LF), 13 (CR), or - 3349 or 3365 (CRLF) because there are two alternative codepoints (0x15 and - 0x25) that are used as the NL line terminator that is equivalent to ASCII - LF. 
In both ASCII and EBCDIC environments the value can also be -1 (ANY), - or -2 (ANYCRLF). */ -#ifndef NEWLINE -#define NEWLINE 10 +/* The value of NEWLINE_DEFAULT determines the default newline character + sequence. PCRE2 client programs can override this by selecting other values + at run time. The valid values are 1 (CR), 2 (LF), 3 (CRLF), 4 (ANY), and 5 + (ANYCRLF). */ +#ifndef NEWLINE_DEFAULT +#define NEWLINE_DEFAULT 2 #endif -/* PCRE uses recursive function calls to handle backtracking while matching. - This can sometimes be a problem on systems that have stacks of limited - size. Define NO_RECURSE to any value to get a version that doesn't use - recursion in the match() function; instead it creates its own stack by - steam using pcre_recurse_malloc() to obtain memory from the heap. For more - detail, see the comments and other stuff just above the match() function. - */ -/* #undef NO_RECURSE */ - /* Name of package */ #define PACKAGE "pcre2" @@ -209,7 +201,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_NAME "PCRE2" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "PCRE2 9.00-DEV" +#define PACKAGE_STRING "PCRE2 10.00-DEV" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "pcre2" @@ -218,7 +210,7 @@ sure both macros are undefined; an emulation function will then be used. */ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "9.00-DEV" +#define PACKAGE_VERSION "10.00-DEV" /* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested parentheses (of any kind) in a pattern. This limits the amount of system @@ -227,15 +219,13 @@ sure both macros are undefined; an emulation function will then be used. */ #define PARENS_NEST_LIMIT 250 #endif -/* #undef PCRE2_EXP_DEFN */ - -/* The value of PCREGREP_BUFSIZE determines the size of buffer used by - pcregrep to hold parts of the file it is searching. 
This is also the - minimum value. The actual amount of memory used by pcregrep is three times +/* The value of PCRE2GREP_BUFSIZE determines the size of buffer used by + pcre2grep to hold parts of the file it is searching. This is also the + minimum value. The actual amount of memory used by pcre2grep is three times this number, because it allows for the buffering of "before" and "after" lines. */ -#ifndef PCREGREP_BUFSIZE -#define PCREGREP_BUFSIZE 20480 +#ifndef PCRE2GREP_BUFSIZE +#define PCRE2GREP_BUFSIZE 20480 #endif /* If you are compiling for a system other than a Unix-like system or @@ -247,21 +237,10 @@ sure both macros are undefined; an emulation function will then be used. */ This macro apears at the start of every exported function that is part of the external API. It does not appear on functions that are "external" in the C sense, but which are internal to the library. */ -/* #undef PCRE_EXP_DEFN */ +/* #undef PCRE2_EXP_DEFN */ /* Define to any value if linking statically (TODO: make nice with Libtool) */ -/* #undef PCRE_STATIC */ - -/* When calling PCRE via the POSIX interface, additional working storage is - required for holding the pointers to capturing substrings because PCRE - requires three integers per substring, whereas the POSIX interface provides - only two. If the number of expected substrings is small, the wrapper - function uses space on the stack, because this is faster than using - malloc() for each call. The threshold above which the stack is no longer - used is defined by POSIX_MALLOC_THRESHOLD. */ -#ifndef POSIX_MALLOC_THRESHOLD -#define POSIX_MALLOC_THRESHOLD 10 -#endif +/* #undef PCRE2_STATIC */ /* Define to necessary symbol if this constant uses a non-standard name on your system. */ @@ -273,35 +252,35 @@ sure both macros are undefined; an emulation function will then be used. */ /* Define to any value to enable support for Just-In-Time compiling. 
*/ /* #undef SUPPORT_JIT */ -/* Define to any value to allow pcregrep to be linked with libbz2, so that it +/* Define to any value to allow pcre2grep to be linked with libbz2, so that it is able to handle .bz2 files. */ /* #undef SUPPORT_LIBBZ2 */ -/* Define to any value to allow pcretest to be linked with libedit. */ +/* Define to any value to allow pcre2test to be linked with libedit. */ /* #undef SUPPORT_LIBEDIT */ -/* Define to any value to allow pcretest to be linked with libreadline. */ +/* Define to any value to allow pcre2test to be linked with libreadline. */ /* #undef SUPPORT_LIBREADLINE */ -/* Define to any value to allow pcregrep to be linked with libz, so that it is - able to handle .gz files. */ +/* Define to any value to allow pcre2grep to be linked with libz, so that it + is able to handle .gz files. */ /* #undef SUPPORT_LIBZ */ -/* Define to any value to enable the 16 bit PCRE library. */ +/* Define to any value to enable the 16 bit PCRE2 library. */ /* #undef SUPPORT_PCRE16 */ -/* Define to any value to enable the 32 bit PCRE library. */ +/* Define to any value to enable JIT support in pcre2grep. */ +/* #undef SUPPORT_PCRE2GREP_JIT */ + +/* Define to any value to enable the 32 bit PCRE2 library. */ /* #undef SUPPORT_PCRE32 */ -/* Define to any value to enable the 8 bit PCRE library. */ +/* Define to any value to enable the 8 bit PCRE2 library. */ /* #undef SUPPORT_PCRE8 */ -/* Define to any value to enable JIT support in pcregrep. */ -/* #undef SUPPORT_PCREGREP_JIT */ - /* Define to any value to enable support for the UTF-8/16/32 Unicode encoding. This will work even in an EBCDIC environment, but it is incompatible with - the EBCDIC macro. That is, PCRE can support *either* EBCDIC code *or* + the EBCDIC macro. That is, PCRE2 can support *either* EBCDIC code *or* ASCII/UTF-8/16/32, but not both at once. */ /* #undef SUPPORT_UTF */ @@ -309,7 +288,7 @@ sure both macros are undefined; an emulation function will then be used. 
*/ /* #undef SUPPORT_VALGRIND */ /* Version number of package */ -#define VERSION "9.00-DEV" +#define VERSION "10.00-DEV" /* Define to empty if `const' does not conform to ANSI C. */ /* #undef const */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index d7194e7..5b14129 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -271,12 +271,14 @@ typedef const PCRE2_UCHAR8 *PCRE2_SPTR8; typedef const PCRE2_UCHAR16 *PCRE2_SPTR16; typedef const PCRE2_UCHAR32 *PCRE2_SPTR32; -/* Offsets in the pattern (for errors) and in the subject (after a match) are -unsigned 32-bit numbers. We also define a value to indicate "unset" in the -offset vector (ovector). */ +/* The PCRE2_SIZE type is used for all string lengths and offsets in PCRE2, +including pattern offsets for errors and subject offsets after a match. We +define special values to indicate zero-terminated strings and unset offsets in +the offset vector (ovector). */ -#define PCRE2_OFFSET PCRE2_UCHAR32 -#define PCRE2_UNSET (~(PCRE2_OFFSET)0) +#define PCRE2_SIZE size_t +#define PCRE2_ZERO_TERMINATED (~(PCRE2_SIZE)0) +#define PCRE2_UNSET (~(PCRE2_SIZE)0) /* Generic types for opaque structures and JIT callback functions. These declarations are defined in a macro that is expanded for each width later. 
*/ @@ -314,17 +316,17 @@ typedef struct pcre2_callout_block { \ int version; /* Identifies version of block */ \ /* ------------------------ Version 0 ------------------------------- */ \ uint32_t callout_number; /* Number compiled into pattern */ \ - PCRE2_OFFSET *offset_vector; /* The offset vector */ \ + PCRE2_SIZE *offset_vector; /* The offset vector */ \ PCRE2_SPTR subject; /* The subject being matched */ \ size_t subject_length; /* The length of the subject */ \ - PCRE2_OFFSET start_match; /* Offset to start of this match attempt */ \ - PCRE2_OFFSET current_position; /* Where we currently are in the subject */ \ + PCRE2_SIZE start_match; /* Offset to start of this match attempt */ \ + PCRE2_SIZE current_position; /* Where we currently are in the subject */ \ uint32_t capture_top; /* Max current capture */ \ uint32_t capture_last; /* Most recently closed capture */ \ void *callout_data; /* Data passed in with the call */ \ /* ------------------- Added for Version 1 -------------------------- */ \ - PCRE2_OFFSET pattern_position; /* Offset to next item in the pattern */ \ - PCRE2_OFFSET next_item_length; /* Length of next item in the pattern */ \ + PCRE2_SIZE pattern_position; /* Offset to next item in the pattern */ \ + PCRE2_SIZE next_item_length; /* Length of next item in the pattern */ \ /* ------------------- Added for Version 2 -------------------------- */ \ PCRE2_SPTR mark; /* Pointer to current mark or NULL */ \ /* ------------------------------------------------------------------ */ \ @@ -392,8 +394,8 @@ PCRE2_EXP_DECL int pcre2_set_recursion_memory_management( \ #define PCRE2_COMPILE_FUNCTIONS \ PCRE2_EXP_DECL \ - pcre2_code *pcre2_compile(PCRE2_SPTR, int, uint32_t, \ - int *, PCRE2_OFFSET *, pcre2_compile_context *); \ + pcre2_code *pcre2_compile(PCRE2_SPTR, PCRE2_SIZE, uint32_t, \ + int *, PCRE2_SIZE *, pcre2_compile_context *); \ PCRE2_EXP_DECL void pcre2_code_free(pcre2_code *); @@ -408,25 +410,25 @@ PCRE2_EXP_DECL int pcre2_pattern_info(const 
pcre2_code *, uint32_t, \ #define PCRE2_MATCH_FUNCTIONS \ PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create(uint32_t, \ - pcre2_general_context *); \ + pcre2_match_data *pcre2_match_data_create(uint32_t, \ + pcre2_general_context *); \ PCRE2_EXP_DECL \ - pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *, \ - pcre2_general_context *); \ -PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, \ - PCRE2_SPTR, int, PCRE2_OFFSET, uint32_t, \ - pcre2_match_data *, pcre2_match_context *, int *, \ - size_t); \ -PCRE2_EXP_DECL int pcre2_match(const pcre2_code *, \ - PCRE2_SPTR, int, PCRE2_OFFSET, uint32_t, \ - pcre2_match_data *, pcre2_match_context *); \ -PCRE2_EXP_DECL void pcre2_match_data_free(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_OFFSET pcre2_get_leftchar(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_SPTR pcre2_get_mark(pcre2_match_data *); \ -PCRE2_EXP_DECL uint32_t pcre2_get_ovector_count(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_OFFSET *pcre2_get_ovector_pointer(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_OFFSET pcre2_get_rightchar(pcre2_match_data *); \ -PCRE2_EXP_DECL PCRE2_OFFSET pcre2_get_startchar(pcre2_match_data *); + pcre2_match_data *pcre2_match_data_create_from_pattern(pcre2_code *, \ + pcre2_general_context *); \ +PCRE2_EXP_DECL int pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, \ + PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ + pcre2_match_data *, pcre2_match_context *, int *, \ + size_t); \ +PCRE2_EXP_DECL int pcre2_match(const pcre2_code *, \ + PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ + pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void pcre2_match_data_free(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_leftchar(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SPTR pcre2_get_mark(pcre2_match_data *); \ +PCRE2_EXP_DECL uint32_t pcre2_get_ovector_count(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE 
pcre2_get_rightchar(pcre2_match_data *); \ +PCRE2_EXP_DECL PCRE2_SIZE pcre2_get_startchar(pcre2_match_data *); /* Convenience functions for handling matched substrings. */ @@ -457,9 +459,9 @@ PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \ /* Functions for JIT processing */ #define PCRE2_JIT_FUNCTIONS \ -PCRE2_EXP_DECL void pcre2_jit_compile(pcre2_code *, uint32_t); \ +PCRE2_EXP_DECL int pcre2_jit_compile(pcre2_code *, uint32_t); \ PCRE2_EXP_DECL int pcre2_jit_match(const pcre2_code *, \ - PCRE2_SPTR, int, PCRE2_OFFSET, uint32_t, \ + PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, \ pcre2_match_data *, pcre2_match_context *, \ pcre2_jit_stack *); \ PCRE2_EXP_DECL void pcre2_jit_free_unused_memory(pcre2_general_context *);\ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 0a236db..5f5b7ea 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -7256,8 +7256,8 @@ Returns: pointer to compiled data block, or NULL on error, */ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION -pcre2_compile(PCRE2_SPTR pattern, int patlen, uint32_t options, - int *errorptr, PCRE2_OFFSET *erroroffset, pcre2_compile_context *ccontext) +pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options, + int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext) { BOOL utf; /* Set TRUE for UTF mode */ pcre2_real_code *re = NULL; /* What we will return */ @@ -7324,10 +7324,12 @@ if (ccontext == NULL) ccontext = &default_context; } -/* A negative pattern length means "zero-terminated". Otherwise, we make -a copy of the pattern and add a zero. */ +/* A zero-terminated pattern is indicated by the special length value +PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero, +to ensure that it is always possible to look one code unit beyond the end of +the pattern's characters. 
*/ -if (patlen < 0) patlen = PRIV(strlen)(pattern); else +if (patlen == PCRE2_ZERO_TERMINATED) patlen = PRIV(strlen)(pattern); else { if (patlen < COPIED_PATTERN_SIZE) copied_pattern = stack_copied_pattern; @@ -7473,7 +7475,7 @@ if (utf) goto HAD_ERROR; } if ((options & PCRE2_NO_UTF_CHECK) == 0 && - (errorcode = PRIV(valid_utf)(pattern, -1, erroroffset)) != 0) + (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) goto HAD_ERROR; } diff --git a/src/pcre2_config.c b/src/pcre2_config.c index f468087..c6b0cbe 100644 --- a/src/pcre2_config.c +++ b/src/pcre2_config.c @@ -136,7 +136,7 @@ switch (what) break; case PCRE2_CONFIG_STACKRECURSE: -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE *((int *)where) = 0; #else *((int *)where) = 1; diff --git a/src/pcre2_context.c b/src/pcre2_context.c index 3376ef6..564f642 100644 --- a/src/pcre2_context.c +++ b/src/pcre2_context.c @@ -164,7 +164,7 @@ if (defmemctl) mcontext->memctl.free = default_free; mcontext->memctl.memory_data = NULL; } -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE mcontext->stack_memctl = mcontext->memctl; #endif mcontext->callout = NULL; @@ -389,7 +389,7 @@ pcre2_set_recursion_memory_management(pcre2_match_context *mcontext, void *(*mymalloc)(size_t, void *), void (*myfree)(void *, void *), void *mydata) { -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE mcontext->stack_memctl.malloc = mymalloc; mcontext->stack_memctl.free = myfree; mcontext->stack_memctl.memory_data = mydata; diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index de6622c..267d956 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -365,8 +365,8 @@ internal_dfa_match( dfa_match_block *mb, PCRE2_SPTR this_start_code, PCRE2_SPTR current_subject, - PCRE2_OFFSET start_offset, - PCRE2_OFFSET *offsets, + PCRE2_SIZE start_offset, + PCRE2_SIZE *offsets, uint32_t offsetcount, int *workspace, int wscount, @@ -730,7 +730,7 @@ for (;;) else if (match_count > 0 && ++match_count * 2 > (int)offsetcount) match_count = 0; count = 
((match_count == 0)? (int)offsetcount : match_count * 2) - 2; - if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); + if (count > 0) memmove(offsets + 2, offsets, count * sizeof(PCRE2_SIZE)); if (offsetcount >= 2) { offsets[0] = (int)(current_subject - start_subject); @@ -2560,7 +2560,7 @@ for (;;) case OP_ASSERTBACK_NOT: { PCRE2_SPTR endasscode = code + GET(code, 1); - PCRE2_OFFSET local_offsets[2]; + PCRE2_SIZE local_offsets[2]; int rc; int local_workspace[1000]; @@ -2572,7 +2572,7 @@ for (;;) ptr, /* where we currently are */ (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ - sizeof(local_offsets)/sizeof(PCRE2_OFFSET), /* size of same */ + sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ @@ -2587,7 +2587,7 @@ for (;;) case OP_COND: case OP_SCOND: { - PCRE2_OFFSET local_offsets[1000]; + PCRE2_SIZE local_offsets[1000]; int local_workspace[1000]; int codelink = GET(code, 1); int condcode; @@ -2606,9 +2606,9 @@ for (;;) cb.callout_number = code[LINK_SIZE+2]; cb.offset_vector = offsets; cb.subject = start_subject; - cb.subject_length = (int)(end_subject - start_subject); - cb.start_match = (int)(current_subject - start_subject); - cb.current_position = (int)(ptr - start_subject); + cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject); + cb.start_match = (PCRE2_SIZE)(current_subject - start_subject); + cb.current_position = (PCRE2_SIZE)(ptr - start_subject); cb.pattern_position = GET(code, LINK_SIZE + 3); cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); cb.capture_top = 1; @@ -2664,7 +2664,7 @@ for (;;) ptr, /* where we currently are */ (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ - sizeof(local_offsets)/sizeof(PCRE2_OFFSET), /* size of same */ + sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* 
workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ @@ -2683,7 +2683,7 @@ for (;;) case OP_RECURSE: { dfa_recursion_info *ri; - PCRE2_OFFSET local_offsets[1000]; + PCRE2_SIZE local_offsets[1000]; int local_workspace[1000]; PCRE2_SPTR callpat = start_code + GET(code, 1); uint32_t recno = (callpat == mb->start_code)? 0 : @@ -2712,7 +2712,7 @@ for (;;) ptr, /* where we currently are */ (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ - sizeof(local_offsets)/sizeof(PCRE2_OFFSET), /* size of same */ + sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ @@ -2777,7 +2777,7 @@ for (;;) for (matched_count = 0;; matched_count++) { - PCRE2_OFFSET local_offsets[2]; + PCRE2_SIZE local_offsets[2]; int local_workspace[1000]; int rc = internal_dfa_match( @@ -2786,7 +2786,7 @@ for (;;) local_ptr, /* where we currently are */ (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ - sizeof(local_offsets)/sizeof(PCRE2_OFFSET), /* size of same */ + sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level */ @@ -2849,7 +2849,7 @@ for (;;) case OP_ONCE: case OP_ONCE_NC: { - PCRE2_OFFSET local_offsets[2]; + PCRE2_SIZE local_offsets[2]; int local_workspace[1000]; int rc = internal_dfa_match( @@ -2858,7 +2858,7 @@ for (;;) ptr, /* where we currently are */ (int)(ptr - start_subject), /* start offset */ local_offsets, /* offset vector */ - sizeof(local_offsets)/sizeof(PCRE2_OFFSET), /* size of same */ + sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ rlevel); /* function recursion level 
*/ @@ -2948,9 +2948,9 @@ for (;;) cb.callout_number = code[1]; cb.offset_vector = offsets; cb.subject = start_subject; - cb.subject_length = (int)(end_subject - start_subject); - cb.start_match = (int)(current_subject - start_subject); - cb.current_position = (int)(ptr - start_subject); + cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject); + cb.start_match = (PCRE2_SIZE)(current_subject - start_subject); + cb.current_position = (PCRE2_SIZE)(ptr - start_subject); cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); cb.capture_top = 1; @@ -3049,8 +3049,8 @@ Returns: > 0 => number of match offset pairs placed in offsets */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, int length, - PCRE2_OFFSET start_offset, uint32_t options, pcre2_match_data *match_data, +pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, int *workspace, size_t wscount) { const pcre2_real_code *re = (const pcre2_real_code *)code; @@ -3078,9 +3078,10 @@ is used below, and it expects NLBLOCK to be defined as a pointer. */ dfa_match_block actual_match_block; dfa_match_block *mb = &actual_match_block; -/* A negative length implies a zero-terminated subject string. */ +/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated +subject string. 
*/ -if (length < 0) length = PRIV(strlen)(subject); +if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); /* Plausibility checks */ @@ -3088,7 +3089,7 @@ if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL) return PCRE2_ERROR_NULL; if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE; -if ((int)start_offset > length) return PCRE2_ERROR_BADOFFSET; +if (start_offset > length) return PCRE2_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, return with PCRE2_ERROR_BADMAGIC. However, if the magic number is equal to @@ -3214,7 +3215,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) return match_data->rc; } #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && (int)start_offset < length && + if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(subject[start_offset])) return PCRE2_ERROR_BADUTFOFFSET; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ @@ -3466,12 +3467,12 @@ for (;;) { if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0) { - match_data->ovector[0] = (PCRE2_OFFSET)(start_match - subject); - match_data->ovector[1] = (PCRE2_OFFSET)(end_subject - subject); + match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject); + match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject); } - match_data->leftchar = (PCRE2_OFFSET)(mb->start_used_ptr - subject); + match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject); match_data->rightchar = 0; /* FIXME */ - match_data->startchar = (PCRE2_OFFSET)(start_match - subject); + match_data->startchar = (PCRE2_SIZE)(start_match - subject); match_data->rc = rc; return rc; } diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 6e295b3..f1caeaa 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1859,7 +1859,7 @@ extern void _pcre2_compile_context_init(pcre2_compile_context *, BOOL); extern PCRE2_SPTR 
_pcre2_find_bracket(PCRE2_SPTR, BOOL, int); extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); -extern int _pcre2_jit_get_size(void *); +extern size_t _pcre2_jit_get_size(void *); extern void _pcre2_match_context_init(pcre2_match_context *, BOOL); extern void *_pcre2_memctl_malloc(size_t, pcre2_memctl *); extern unsigned int _pcre2_ord2utf(uint32_t, PCRE2_UCHAR *); @@ -1869,7 +1869,7 @@ extern int _pcre2_strlen(PCRE2_SPTR); extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); extern int _pcre2_study(pcre2_real_code *); -extern int _pcre2_valid_utf(PCRE2_SPTR, int, PCRE2_OFFSET *); +extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index b2f895a..a6d518d 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -565,7 +565,7 @@ typedef struct pcre2_real_compile_context { typedef struct pcre2_real_match_context { pcre2_memctl memctl; -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE pcre2_memctl stack_memctl; #endif int (*callout)(pcre2_callout_block *); @@ -609,12 +609,12 @@ typedef struct pcre2_real_match_data { const pcre2_real_code *code; /* The pattern used for the match */ PCRE2_SPTR subject; /* The subject that was matched */ int rc; /* The return code from the match */ - PCRE2_OFFSET leftchar; /* Offset to leftmost code unit */ - PCRE2_OFFSET rightchar; /* Offset to rightmost code unit */ - PCRE2_OFFSET startchar; /* Offset to starting code unit */ + PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ + PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ + PCRE2_SIZE startchar; /* Offset to starting code unit */ PCRE2_SPTR mark; /* Pointer to last mark */ uint16_t oveccount; /* Number of pairs */ - PCRE2_OFFSET ovector[1]; /* The 
first field */ + PCRE2_SIZE ovector[1]; /* The first field */ } pcre2_real_match_data; @@ -686,12 +686,12 @@ typedef struct compile_block { call within the pattern; used by pcre_match(). */ typedef struct recursion_info { - struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ - unsigned int group_num; /* Number of group that was called */ - PCRE2_OFFSET *offset_save; /* Pointer to start of saved offsets */ - uint32_t saved_max; /* Number of saved offsets */ - uint32_t saved_capture_last; /* Last capture number */ - PCRE2_SPTR subject_position; /* Position at start of recursion */ + struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ + unsigned int group_num; /* Number of group that was called */ + PCRE2_SIZE *ovec_save; /* Pointer to start of saved ovector */ + uint32_t saved_max; /* Number of saved offsets */ + uint32_t saved_capture_last; /* Last capture number */ + PCRE2_SPTR subject_position; /* Position at start of recursion */ } recursion_info; /* A similar structure for pcre_dfa_match(). */ @@ -717,7 +717,7 @@ doing traditional NFA matching (pcre2_match() and friends). 
*/ typedef struct match_block { pcre2_memctl memctl; /* For general use */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE pcre2_memctl stack_memctl; /* For "stack" frames */ #endif uint32_t match_call_count; /* As it says */ @@ -728,11 +728,11 @@ typedef struct match_block { const uint8_t *lcc; /* Points to lower casing table */ const uint8_t *fcc; /* Points to case-flipping table */ const uint8_t *ctypes; /* Points to table of type maps */ - PCRE2_OFFSET *ovector; /* Pointer to the offset vector */ - PCRE2_OFFSET offset_end; /* One past the end */ - PCRE2_OFFSET offset_max; /* The maximum usable for return data */ - PCRE2_OFFSET start_offset; /* The start offset value */ - PCRE2_OFFSET end_offset_top; /* Highwater mark at end of match */ + PCRE2_SIZE *ovector; /* Pointer to the offset vector */ + PCRE2_SIZE offset_end; /* One past the end */ + PCRE2_SIZE offset_max; /* The maximum usable for return data */ + PCRE2_SIZE start_offset; /* The start offset value */ + PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ uint16_t partial; /* PARTIAL options */ uint16_t bsr_convention; /* \R interpretation */ uint16_t name_count; /* Number of names in name table */ @@ -760,7 +760,7 @@ typedef struct match_block { recursion_info *recursive; /* Linked list of recursion data */ void *callout_data; /* To pass back to callouts */ int (*callout)(pcre2_callout_block *); /* Callout function or NULL */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE void *match_frames_base; /* For remembering malloc'd frames */ #endif } match_block; @@ -769,22 +769,22 @@ typedef struct match_block { functions. 
*/ typedef struct dfa_match_block { - pcre2_memctl memctl; /* For general use */ - PCRE2_SPTR start_code; /* Start of the compiled pattern */ - PCRE2_SPTR start_subject ; /* Start of the subject string */ - PCRE2_SPTR end_subject; /* End of subject string */ - PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ - const uint8_t *tables; /* Character tables */ - PCRE2_OFFSET start_offset; /* The start offset value */ - uint32_t moptions; /* Match options */ - uint32_t poptions; /* Pattern options */ - uint32_t nltype; /* Newline type */ - uint32_t nllen; /* Newline string length */ - PCRE2_UCHAR nl[4]; /* Newline string when fixed */ - uint16_t bsr_convention; /* \R interpretation */ - void *callout_data; /* To pass back to callouts */ + pcre2_memctl memctl; /* For general use */ + PCRE2_SPTR start_code; /* Start of the compiled pattern */ + PCRE2_SPTR start_subject ; /* Start of the subject string */ + PCRE2_SPTR end_subject; /* End of subject string */ + PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ + const uint8_t *tables; /* Character tables */ + PCRE2_SIZE start_offset; /* The start offset value */ + uint32_t moptions; /* Match options */ + uint32_t poptions; /* Pattern options */ + uint32_t nltype; /* Newline type */ + uint32_t nllen; /* Newline string length */ + PCRE2_UCHAR nl[4]; /* Newline string when fixed */ + uint16_t bsr_convention; /* \R interpretation */ + void *callout_data; /* To pass back to callouts */ int (*callout)(pcre2_callout_block *); /* Callout function or NULL */ - dfa_recursion_info *recursive; /* Linked list of recursion data */ + dfa_recursion_info *recursive; /* Linked list of recursion data */ } dfa_match_block; #endif /* PCRE2_PCRE2TEST */ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index bb8dfe1..4aebfbf 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -50,11 +50,11 @@ POSSIBILITY OF SUCH DAMAGE. 
* JIT compile a Regular Expression * *************************************************/ -/* This function used JIT to convert a previously-compiled pattern into machine +/* This function uses JIT to convert a previously-compiled pattern into machine code. Arguments: - code a compiled pattern + code a compiled pattern options JIT option bits Returns: nothing @@ -62,16 +62,21 @@ Returns: nothing /* FIXME: this is currently a placeholder function */ -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_jit_compile(pcre2_code *code, uint32_t options) { #ifndef SUPPORT_JIT + (void)code; (void)options; +return PCRE2_ERROR_JIT_BADOPTION; + #else /* SUPPORT_JIT */ - -code=code; options = options; /* Dummy.... */ +/* Dummy code */ +code=code; +options = options; +return PCRE2_ERROR_JIT_BADOPTION; #endif /* SUPPORT_JIT */ } diff --git a/src/pcre2_jit_match.c b/src/pcre2_jit_match.c index c779387..847feb2 100644 --- a/src/pcre2_jit_match.c +++ b/src/pcre2_jit_match.c @@ -71,11 +71,12 @@ Returns: > 0 => success; value is the number of ovector pairs filled /* FIXME: this is currently a placeholder function */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, int length, - PCRE2_OFFSET start_offset, uint32_t options, pcre2_match_data *match_data, +pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext, pcre2_jit_stack *jit_stack) { #ifndef SUPPORT_JIT + (void)code; (void)subject; (void)length; @@ -85,17 +86,21 @@ pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, int length, (void)mcontext; (void)jit_stack; return PCRE2_ERROR_JIT_BADOPTION; + #else /* SUPPORT_JIT */ - /* Dummy code */ -code=code;subject=subject;length=length; -start_offset=start_offset; options=options; match_data=match_data; +code=code; +subject=subject; +length=length;
+start_offset=start_offset; +options=options; +match_data=match_data; mcontext=mcontext; jit_stack=jit_stack; return PCRE2_ERROR_JIT_BADOPTION; #endif /* SUPPORT_JIT */ -} +} /* End of pcre2_jit_match.c */ diff --git a/src/pcre2_jit_misc.c b/src/pcre2_jit_misc.c index 9c28c72..95ed670 100644 --- a/src/pcre2_jit_misc.c +++ b/src/pcre2_jit_misc.c @@ -54,14 +54,18 @@ POSSIBILITY OF SUCH DAMAGE. *************************************************/ -PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION pcre2_jit_free_unused_memory(pcre2_general_context *gcontext) { #ifndef SUPPORT_JIT + (void)gcontext; /* Suppress warning */ + #else /* SUPPORT_JIT */ -gcontext=gcontext; /* Dummy */ +/* Dummy code */ +gcontext=gcontext; + #endif /* SUPPORT_JIT */ } @@ -71,19 +75,25 @@ gcontext=gcontext; /* Dummy */ * Allocate a JIT stack * *************************************************/ -PCRE2_EXP_DEFN pcre2_jit_stack * PCRE2_CALL_CONVENTION -pcre2_jit_stack_alloc(pcre2_general_context *gcontext, size_t startsize, +PCRE2_EXP_DEFN pcre2_jit_stack * PCRE2_CALL_CONVENTION +pcre2_jit_stack_alloc(pcre2_general_context *gcontext, size_t startsize, size_t maxsize) { #ifndef SUPPORT_JIT + (void)gcontext; (void)startsize; (void)maxsize; return NULL; + #else /* SUPPORT_JIT */ -gcontext=gcontext;startsize=startsize;maxsize=maxsize; +/* Dummy code */ +gcontext=gcontext; +startsize=startsize; +maxsize=maxsize; return NULL; + #endif } @@ -97,12 +107,18 @@ pcre2_jit_stack_assign(const pcre2_code *code, pcre2_jit_callback callback, void *callback_data) { #ifndef SUPPORT_JIT + (void)code; (void)callback; (void)callback_data; + #else /* SUPPORT_JIT */ -code=code;callback=callback;callback_data=callback_data; +/* Dummy code */ +code=code; +callback=callback; +callback_data=callback_data; + #endif /* SUPPORT_JIT */ } @@ -115,10 +131,14 @@ PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION pcre2_jit_stack_free(pcre2_jit_stack *jit_stack) { #ifndef SUPPORT_JIT + (void)jit_stack; + #else 
/* SUPPORT_JIT */ +/* Dummy code */ jit_stack=jit_stack; + #endif /* SUPPORT_JIT */ } @@ -127,16 +147,20 @@ jit_stack=jit_stack; * Get size of JIT code * *************************************************/ -int +size_t PRIV(jit_get_size)(void *executable_jit) { #ifndef SUPPORT_JIT + (void)executable_jit; return 0; + #else /* SUPPORT_JIT */ +/* Dummy code */ executable_jit = executable_jit; -return 0; /* FIXME */ +return 0; + #endif } diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 1047f0a..4d56ec2 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -104,17 +104,18 @@ for any one of them can use a range. */ #define MATCH_BACKTRACK_MAX MATCH_THEN #define MATCH_BACKTRACK_MIN MATCH_COMMIT -/* Maximum number of ints of offset to save on the stack for recursive calls. -If the offset vector is bigger, malloc is used. This should be a multiple of 3, -because the offset vector is always a multiple of 3 long. */ - -#define REC_STACK_SAVE_MAX 30 - /* Min and max values for the common repeats; for the maxima, 0 => infinity */ static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; +/* Maximum number of ovector elements that can be saved on the system stack +when processing OP_RECURSE in non-HEAP_MATCH_RECURSE mode. If the ovector is +bigger, malloc() is used. This value should be a multiple of 3, because the +ovector length is always a multiple of 3. */ + +#define OP_RECURSE_STACK_SAVE_MAX 45 + /************************************************* @@ -129,11 +130,11 @@ subject bytes matched may be different to the number of reference bytes. 
Arguments: offset index into the offset vector eptr pointer into the subject - length length of reference to be matched (number of bytes) + length length of reference to be matched (number of code units) mb points to match block caseless TRUE if caseless -Returns: >= 0 the number of subject bytes matched +Returns: >= 0 the number of subject code units matched -1 no match -2 partial match; always given if at end subject */ @@ -230,7 +231,7 @@ return (int)(eptr - eptr_start); RECURSION IN THE match() FUNCTION The match() function is highly recursive, though not every recursive call -increases the recursive depth. Nevertheless, some regular expressions can cause +increases the recursion depth. Nevertheless, some regular expressions can cause it to recurse to a great depth. I was writing for Unix, so I just let it call itself recursively. This uses the stack for saving everything that has to be saved for a recursive call. On Unix, the stack can be large, and this works @@ -241,9 +242,9 @@ programs that use a lot of stack. (This despite the fact that every last chip has oodles of memory these days, and techniques for extending the stack have been known for decades.) So.... -There is a fudge, triggered by defining NO_RECURSE, which avoids recursive -calls by keeping local variables that need to be preserved in blocks of memory -obtained from malloc() instead instead of on the stack. Macros are used to +There is a fudge, triggered by defining HEAP_MATCH_RECURSE, which avoids +recursive calls by keeping local variables that need to be preserved in blocks +of memory on the heap instead of on the stack. Macros are used to achieve this so that the actual code doesn't look very different to what it always used to. @@ -274,11 +275,10 @@ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64, RM65, RM66, RM67 }; -/* These versions of the macros use the stack, as normal.
There are debugging -versions and production versions. Note that the "rw" argument of RMATCH isn't -actually used in this definition. */ +/* These versions of the macros use the stack, as normal. Note that the "rw" +argument of RMATCH isn't actually used in this definition. */ -#ifndef NO_RECURSE +#ifndef HEAP_MATCH_RECURSE #define REGISTER register #define RMATCH(ra,rb,rc,rd,re,rw) \ rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) @@ -350,10 +350,9 @@ typedef struct heapframe { eptrblock *Xeptrb; - PCRE2_OFFSET Xoffset; - PCRE2_OFFSET Xoffset_top; - PCRE2_OFFSET Xstacksave[REC_STACK_SAVE_MAX]; - PCRE2_OFFSET Xsave_offset1, Xsave_offset2, Xsave_offset3; + PCRE2_SIZE Xoffset; + PCRE2_SIZE Xoffset_top; + PCRE2_SIZE Xsave_offset1, Xsave_offset2, Xsave_offset3; uint32_t Xfc; uint32_t Xnumber; @@ -395,6 +394,99 @@ typedef struct heapframe { ***************************************************************************/ +/* When HEAP_MATCH_RECURSE is not defined, the match() function implements +backtrack points by calling itself recursively in all but one case. The one +special case is when processing OP_RECURSE, which specifies recursion in the +pattern. The entire ovector must be saved and restored while processing +OP_RECURSE. If the ovector is small enough, instead of calling match() +directly, op_recurse_ovecsave() is called. This function uses the system stack +to save the ovector while calling match() to process the pattern recursion. */ + +#ifndef HEAP_MATCH_RECURSE + +/* We need a prototype for match() because it is mutually recursive with +op_recurse_ovecsave(). 
*/ + +static int +match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, + PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth); + + +/************************************************* +* Process OP_RECURSE, stacking ovector * +*************************************************/ + +/* When this function is called, mb->recursive has already been updated to +point to a new recursion data block, and all its fields other than ovec_save +have been set. + +Arguments: + eptr pointer to current character in subject + callpat the recursion point in the pattern + mstart pointer to the current match start position (can be modified + by encountering \K) + offset_top current top pointer + mb pointer to "static" info block for the match + eptrb pointer to chain of blocks containing eptr at start of + brackets - for testing for empty matches + rdepth the recursion depth + +Returns: a match() return code +*/ + +static int +op_recurse_ovecsave(REGISTER PCRE2_SPTR eptr, PCRE2_SPTR callpat, + PCRE2_SPTR mstart, PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, + uint32_t rdepth) +{ +register int rrc; +BOOL cbegroup = *callpat >= OP_SBRA; +recursion_info *new_recursive = mb->recursive; +PCRE2_SIZE ovecsave[OP_RECURSE_STACK_SAVE_MAX]; + +/* Save the ovector */ + +new_recursive->ovec_save = ovecsave; +memcpy(ovecsave, mb->ovector, new_recursive->saved_max * sizeof(PCRE2_SIZE)); + +/* Do the recursion. After processing each alternative, restore the ovector +data and the last captured value. 
*/ + +do + { + if (cbegroup) mb->match_function_type = MATCH_CBEGROUP; + rrc = match(eptr, callpat + PRIV(OP_lengths)[*callpat], mstart, offset_top, + mb, eptrb, rdepth + 1); + memcpy(mb->ovector, new_recursive->ovec_save, + new_recursive->saved_max * sizeof(PCRE2_SIZE)); + mb->capture_last = new_recursive->saved_capture_last; + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) return rrc; + + /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a + recursion; they cause a NOMATCH for the entire recursion. These codes + are defined in a range that can be tested for. */ + + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) + return MATCH_NOMATCH; + + /* Any return code other than NOMATCH is an error. Otherwise, advance to the + next alternative or to the end of the recursing subpattern. If there were + nested recursions, mb->recursive might be changed, so reset it before + looping. */ + + if (rrc != MATCH_NOMATCH) return rrc; + mb->recursive = new_recursive; + callpat += GET(callpat, 1); + } +while (*callpat == OP_ALT); /* Loop for the alternatives */ + +/* None of the alternatives matched. */ + +return MATCH_NOMATCH; +} +#endif /* HEAP_MATCH_RECURSE */ + + /************************************************* * Match from current position * @@ -451,9 +543,8 @@ Returns: MATCH_MATCH if matched ) these values are >= 0 */ static int -match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, - PCRE2_SPTR mstart, PCRE2_OFFSET offset_top, match_block *mb, eptrblock *eptrb, - uint32_t rdepth) +match(REGISTER PCRE2_SPTR eptr, REGISTER PCRE2_SPTR ecode, PCRE2_SPTR mstart, + PCRE2_SIZE offset_top, match_block *mb, eptrblock *eptrb, uint32_t rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark some of them with @@ -475,7 +566,7 @@ whenever RMATCH() does a "recursion". See the macro definitions above. 
Putting the top-level on the stack rather than malloc-ing them all gives a performance boost in many cases where there is not much "recursion". */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE heapframe *frame = (heapframe *)mb->match_frames_base; /* Copy in the original argument variables */ @@ -535,7 +626,6 @@ HEAP_RECURSE: #define save_offset1 frame->Xsave_offset1 #define save_offset2 frame->Xsave_offset2 #define save_offset3 frame->Xsave_offset3 -#define stacksave frame->Xstacksave #define condition frame->Xcondition #define cur_is_word frame->Xcur_is_word @@ -543,11 +633,11 @@ HEAP_RECURSE: #define newptrb frame->Xnewptrb -/* When recursion is being used, local variables are allocated on the stack and -get preserved during recursion in the normal way. In this environment, fi and -i, and fc and c, can be the same variables. */ +/* When normal stack-based recursion is being used for match(), local variables +are allocated on the stack and get preserved during recursion in the usual way. +In this environment, fi and i, and fc and c, can be the same variables. 
*/ -#else /* NO_RECURSE not defined */ +#else /* HEAP_MATCH_RECURSE not defined */ #define fi i #define fc c @@ -569,9 +659,8 @@ PCRE2_SPTR pp; PCRE2_SPTR prev; PCRE2_SPTR saved_eptr; -PCRE2_OFFSET offset; -PCRE2_OFFSET stacksave[REC_STACK_SAVE_MAX]; -PCRE2_OFFSET save_offset1, save_offset2, save_offset3; +PCRE2_SIZE offset; +PCRE2_SIZE save_offset1, save_offset2, save_offset3; uint32_t number; uint32_t op; @@ -597,7 +686,7 @@ BOOL prev_is_word; eptrblock newptrb; recursion_info new_recursive; -#endif /* NO_RECURSE not defined */ +#endif /* HEAP_MATCH_RECURSE not defined */ /* To save space on the stack and in the heap frame, I have doubled up on some of the local variables that are used only in localised parts of the code, but @@ -622,19 +711,19 @@ prop_fail_result = 0; /* This label is used for tail recursion, which is used in a few cases even -when NO_RECURSE is not defined, in order to reduce the amount of stack that is -used. Thanks to Ian Taylor for noticing this possibility and sending the -original patch. */ +when HEAP_MATCH_RECURSE is not defined, in order to reduce the amount of stack +that is used. Thanks to Ian Taylor for noticing this possibility and sending +the original patch. */ TAIL_RECURSE: /* OK, now we can get on with the real code of the function. Recursive calls are specified by the macro RMATCH and RRETURN is used to return. When -NO_RECURSE is *not* defined, these just turn into a recursive call to match() -and a "return", respectively. However, RMATCH isn't like a function call -because it's quite a complicated macro. It has to be used in one particular -way. This shouldn't, however, impact performance when true recursion is being -used. */ +HEAP_MATCH_RECURSE is *not* defined, these just turn into a recursive call to +match() and a "return", respectively. However, RMATCH isn't like a function +call because it's quite a complicated macro. It has to be used in one +particular way. 
This shouldn't, however, impact performance when true recursion +is being used. */ #ifdef SUPPORT_UTF utf = (mb->poptions & PCRE2_UTF) != 0; @@ -668,7 +757,7 @@ if (mb->match_function_type == MATCH_CBEGROUP) mb->match_function_type = 0; } -/* Now start processing the opcodes. */ +/* Now, at last, we can start processing the opcodes. */ for (;;) { @@ -1205,9 +1294,9 @@ for (;;) cb.callout_number = ecode[1]; cb.offset_vector = mb->ovector; cb.subject = mb->start_subject; - cb.subject_length = (int)(mb->end_subject - mb->start_subject); - cb.start_match = (int)(mstart - mb->start_subject); - cb.current_position = (int)(eptr - mb->start_subject); + cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject); + cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject); + cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; @@ -1230,7 +1319,7 @@ for (;;) condition = FALSE; switch(condcode = *ecode) { - case OP_RREF: /* Numbered group recursion test */ + case OP_RREF: /* Numbered group recursion test */ if (mb->recursive != NULL) /* Not recursing => FALSE */ { uint32_t recno = GET2(ecode, 1); /* Recursion group number*/ @@ -1588,9 +1677,9 @@ for (;;) cb.callout_number = ecode[1]; cb.offset_vector = mb->ovector; cb.subject = mb->start_subject; - cb.subject_length = (int)(mb->end_subject - mb->start_subject); - cb.start_match = (int)(mstart - mb->start_subject); - cb.current_position = (int)(eptr - mb->start_subject); + cb.subject_length = (PCRE2_SIZE)(mb->end_subject - mb->start_subject); + cb.start_match = (PCRE2_SIZE)(mstart - mb->start_subject); + cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; @@ -1613,7 +1702,7 @@ for (;;) all the potential data. 
There may be up to 65535 such values, which is too large to put on the stack, but using malloc for small numbers seems expensive. As a compromise, the stack is used when there are no more than - REC_STACK_SAVE_MAX values to store; otherwise malloc is used. + OP_RECURSE_STACK_SAVE_MAX values to store; otherwise malloc is used. There are also other values that have to be saved. We use a chained sequence of blocks that actually live on the stack. Thanks to Robin Houston @@ -1626,12 +1715,11 @@ for (;;) uint32_t recno; callpat = mb->start_code + GET(ecode, 1); - recno = (callpat == mb->start_code)? 0 : - GET2(callpat, 1 + LINK_SIZE); + recno = (callpat == mb->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE); - /* Check for repeating a recursion without advancing the subject pointer. - This should catch convoluted mutual recursions. (Some simple cases are - caught at compile time.) */ + /* Check for repeating a pattern recursion without advancing the subject + pointer. This should catch convoluted mutual recursions. (Some simple + cases are caught at compile time.) */ for (ri = mb->recursive; ri != NULL; ri = ri->prevrec) if (recno == ri->group_num && eptr == ri->subject_position) @@ -1641,6 +1729,7 @@ for (;;) new_recursive.group_num = recno; new_recursive.saved_capture_last = mb->capture_last; + new_recursive.saved_max = mb->offset_end; new_recursive.subject_position = eptr; new_recursive.prevrec = mb->recursive; mb->recursive = &new_recursive; @@ -1649,78 +1738,93 @@ for (;;) ecode += 1 + LINK_SIZE; - /* Now save the offset data */ - - new_recursive.saved_max = mb->offset_end; - if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) - new_recursive.offset_save = stacksave; - else + /* When we are using the system stack for match() recursion we can call a + function that uses the system stack for preserving the ovector while + processing the pattern recursion, but only if the ovector is small + enough. 
*/ + +#ifndef HEAP_MATCH_RECURSE + if (new_recursive.saved_max <= OP_RECURSE_STACK_SAVE_MAX) { - new_recursive.offset_save = (PCRE2_OFFSET *) - (mb->memctl.malloc(new_recursive.saved_max * sizeof(PCRE2_OFFSET), - mb->memctl.memory_data)); - if (new_recursive.offset_save == NULL) RRETURN(PCRE2_ERROR_NOMEMORY); + rrc = op_recurse_ovecsave(eptr, callpat, mstart, offset_top, mb, + eptrb, rdepth); + mb->recursive = new_recursive.prevrec; + if (rrc != MATCH_MATCH && rrc != MATCH_ACCEPT) RRETURN(rrc); + + /* Set where we got to in the subject, and reset the start, in case + it was changed by \K. This *is* propagated back out of a recursion, + for Perl compatibility. */ + + eptr = mb->end_match_ptr; + mstart = mb->start_match_ptr; + break; /* End of processing OP_RECURSE */ } - memcpy(new_recursive.offset_save, mb->ovector, - new_recursive.saved_max * sizeof(PCRE2_OFFSET)); - - /* OK, now we can do the recursion. After processing each alternative, - restore the offset data and the last captured value. If there were nested - recursions, mb->recursive might be changed, so reset it before looping. - */ - +#endif + /* If the ovector is too big, or if we are using the heap for match() + recursion, we have to use the heap for saving the ovector. */ + + new_recursive.ovec_save = (PCRE2_SIZE *) + (mb->memctl.malloc(new_recursive.saved_max * sizeof(PCRE2_SIZE), + mb->memctl.memory_data)); + if (new_recursive.ovec_save == NULL) RRETURN(PCRE2_ERROR_NOMEMORY); + memcpy(new_recursive.ovec_save, mb->ovector, + new_recursive.saved_max * sizeof(PCRE2_SIZE)); + + /* Do the recursion. After processing each alternative, restore the + ovector data and the last captured value. This code has the same overall + logic as the code in the op_recurse_ovecsave() function, but is adapted + to use RMATCH/RRETURN and to release the heap block containing the saved + ovector. 
*/ + cbegroup = (*callpat >= OP_SBRA); do { if (cbegroup) mb->match_function_type = MATCH_CBEGROUP; RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, mb, eptrb, RM6); - memcpy(mb->ovector, new_recursive.offset_save, - new_recursive.saved_max * sizeof(PCRE2_OFFSET)); + memcpy(mb->ovector, new_recursive.ovec_save, + new_recursive.saved_max * sizeof(PCRE2_SIZE)); mb->capture_last = new_recursive.saved_capture_last; mb->recursive = new_recursive.prevrec; + if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) { - if (new_recursive.offset_save != stacksave) - mb->memctl.free(new_recursive.offset_save, mb->memctl.memory_data); - - /* Set where we got to in the subject, and reset the start in case + mb->memctl.free(new_recursive.ovec_save, mb->memctl.memory_data); + + /* Set where we got to in the subject, and reset the start, in case it was changed by \K. This *is* propagated back out of a recursion, for Perl compatibility. */ - + eptr = mb->end_match_ptr; mstart = mb->start_match_ptr; goto RECURSION_MATCHED; /* Exit loop; end processing */ } - + /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a recursion; they cause a NOMATCH for the entire recursion. These codes are defined in a range that can be tested for. */ - + if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX) - RRETURN(MATCH_NOMATCH); - - /* Any return code other than NOMATCH is an error. */ - - if (rrc != MATCH_NOMATCH) - { - if (new_recursive.offset_save != stacksave) - mb->memctl.free(new_recursive.offset_save, mb->memctl.memory_data); - RRETURN(rrc); + { + rrc = MATCH_NOMATCH; + goto RECURSION_RETURN; } - + + /* Any return code other than NOMATCH is an error. 
*/ + + if (rrc != MATCH_NOMATCH) goto RECURSION_RETURN; mb->recursive = &new_recursive; callpat += GET(callpat, 1); } while (*callpat == OP_ALT); - + + RECURSION_RETURN: mb->recursive = new_recursive.prevrec; - if (new_recursive.offset_save != stacksave) - mb->memctl.free(new_recursive.offset_save, mb->memctl.memory_data); - RRETURN(MATCH_NOMATCH); + mb->memctl.free(new_recursive.ovec_save, mb->memctl.memory_data); + RRETURN(rrc); } - - RECURSION_MATCHED: + + RECURSION_MATCHED: break; /* An alternation is the end of a branch; scan along to find the end of the @@ -1840,8 +1944,8 @@ for (;;) if (offset > offset_top) { - register PCRE2_OFFSET *iptr = mb->ovector + offset_top; - register PCRE2_OFFSET *iend = mb->ovector + offset; + register PCRE2_SIZE *iptr = mb->ovector + offset_top; + register PCRE2_SIZE *iend = mb->ovector + offset; while (iptr < iend) *iptr++ = PCRE2_UNSET; } @@ -6023,7 +6127,7 @@ for (;;) match(), the RRETURN() macro jumps here. The number that is saved in frame->Xwhere indicates which label we actually want to return to. */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE #define LBL(val) case val: goto L_RM##val; HEAP_RETURN: switch (frame->Xwhere) @@ -6048,7 +6152,7 @@ switch (frame->Xwhere) return PCRE2_ERROR_INTERNAL; } #undef LBL -#endif /* NO_RECURSE */ +#endif /* HEAP_MATCH_RECURSE */ } @@ -6058,7 +6162,7 @@ switch (frame->Xwhere) Undefine all the macros that were defined above to handle this. */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE #undef eptr #undef ecode #undef mstart @@ -6091,10 +6195,9 @@ Undefine all the macros that were defined above to handle this. */ #undef save_offset1 #undef save_offset2 #undef save_offset3 -#undef stacksave #undef newptrb -#endif /* NO_RECURSE */ +#endif /* HEAP_MATCH_RECURSE */ /* These two are defined as macros in both cases */ @@ -6105,7 +6208,7 @@ Undefine all the macros that were defined above to handle this. 
*/ ***************************************************************************/ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE /************************************************* * Release allocated heap frames * *************************************************/ @@ -6131,7 +6234,7 @@ while (nextframe != NULL) mb->stack_memctl.free(oldframe, mb->stack_memctl.memory_data); } } -#endif /* NO_RECURSE */ +#endif /* HEAP_MATCH_RECURSE */ @@ -6160,8 +6263,8 @@ Returns: > 0 => success; value is the number of ovector pairs filled */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION -pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, int length, - PCRE2_OFFSET start_offset, uint32_t options, pcre2_match_data *match_data, +pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, + PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data, pcre2_match_context *mcontext) { int rc; @@ -6198,23 +6301,24 @@ is used below, and it expects NLBLOCK to be defined as a pointer. */ match_block actual_match_block; match_block *mb = &actual_match_block; -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE heapframe frame_zero; frame_zero.Xprevframe = NULL; /* Marks the top level */ frame_zero.Xnextframe = NULL; /* None are allocated yet */ mb->match_frames_base = &frame_zero; #endif -/* A negative length implies a zero-terminated subject string. */ +/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated +subject string. */ -if (length < 0) length = PRIV(strlen)(subject); +if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); /* Plausibility checks */ if ((options & ~PUBLIC_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION; if (code == NULL || subject == NULL || match_data == NULL) return PCRE2_ERROR_NULL; -if ((int)start_offset > length) return PCRE2_ERROR_BADOFFSET; +if (start_offset > length) return PCRE2_ERROR_BADOFFSET; /* Check that the first field in the block is the magic number. If it is not, return with PCRE2_ERROR_BADMAGIC. 
However, if the magic number is equal to @@ -6261,7 +6365,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) return match_data->rc; } #if PCRE2_CODE_UNIT_WIDTH != 32 - if (start_offset > 0 && (int)start_offset < length && + if (start_offset > 0 && start_offset < length && NOT_FIRSTCHAR(subject[start_offset])) return PCRE2_ERROR_BADUTFOFFSET; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ @@ -6296,7 +6400,7 @@ if (mcontext == NULL) { mb->callout = NULL; mb->memctl = re->memctl; -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE mb->stack_memctl = re->memctl; #endif } @@ -6305,7 +6409,7 @@ else mb->callout = mcontext->callout; mb->callout_data = mcontext->callout_data; mb->memctl = mcontext->memctl; -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE mb->stack_memctl = mcontext->stack_memctl; #endif } @@ -6394,7 +6498,7 @@ offsets, and the top third is working space. */ if (re->top_backref >= match_data->oveccount) { ocount = re->top_backref * 3 + 3; - mb->ovector = (PCRE2_OFFSET *)(mb->memctl.malloc(ocount * sizeof(PCRE2_OFFSET), + mb->ovector = (PCRE2_SIZE *)(mb->memctl.malloc(ocount * sizeof(PCRE2_SIZE), mb->memctl.memory_data)); if (mb->ovector == NULL) return PCRE2_ERROR_NOMEMORY; using_temporary_offsets = TRUE; @@ -6417,8 +6521,8 @@ in case they inspect these fields. 
*/ if (ocount > 0) { - register PCRE2_OFFSET *iptr = mb->ovector + ocount; - register PCRE2_OFFSET *iend = iptr - re->top_bracket; + register PCRE2_SIZE *iptr = mb->ovector + ocount; + register PCRE2_SIZE *iend = iptr - re->top_bracket; if (iend < mb->ovector + 2) iend = mb->ovector + 2; while (--iptr >= iend) *iptr = PCRE2_UNSET; mb->ovector[0] = mb->ovector[1] = PCRE2_UNSET; @@ -6782,7 +6886,7 @@ for(;;) ENDLOOP: -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE release_match_heapframes(&frame_zero, mb); #endif @@ -6810,7 +6914,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) if (arg_offset_max >= 4) { memcpy(match_data->ovector + 2, mb->ovector + 2, - (arg_offset_max - 2) * sizeof(PCRE2_OFFSET)); + (arg_offset_max - 2) * sizeof(PCRE2_SIZE)); } if (mb->end_offset_top > arg_offset_max) mb->capture_last |= OVFLBIT; mb->memctl.free(mb->ovector, mb->memctl.memory_data); @@ -6834,7 +6938,7 @@ if (rc == MATCH_MATCH || rc == MATCH_ACCEPT) if (mb->end_offset_top/2 <= re->top_bracket) { - register PCRE2_OFFSET *iptr, *iend; + register PCRE2_SIZE *iptr, *iend; int resetcount = re->top_bracket + 1; if (resetcount > match_data->oveccount) resetcount = match_data->oveccount; iptr = match_data->ovector + mb->end_offset_top; diff --git a/src/pcre2_match_data.c b/src/pcre2_match_data.c index f297f64..8fe1a5a 100644 --- a/src/pcre2_match_data.c +++ b/src/pcre2_match_data.c @@ -55,7 +55,7 @@ PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext) { pcre2_match_data *yield = PRIV(memctl_malloc)( - sizeof(pcre2_match_data) + 3*oveccount*sizeof(PCRE2_OFFSET), + sizeof(pcre2_match_data) + 3*oveccount*sizeof(PCRE2_SIZE), (pcre2_memctl *)gcontext); yield->oveccount = oveccount; return yield; @@ -94,7 +94,7 @@ if (match_data != NULL) * Get left-most code unit in match * *************************************************/ -PCRE2_EXP_DEFN PCRE2_OFFSET PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN PCRE2_SIZE 
PCRE2_CALL_CONVENTION pcre2_get_leftchar(pcre2_match_data *match_data) { return match_data->leftchar; @@ -118,7 +118,7 @@ return match_data->mark; * Get pointer to ovector * *************************************************/ -PCRE2_EXP_DEFN PCRE2_OFFSET * PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN PCRE2_SIZE * PCRE2_CALL_CONVENTION pcre2_get_ovector_pointer(pcre2_match_data *match_data) { return match_data->ovector; @@ -142,7 +142,7 @@ return match_data->oveccount; * Get right-most code unit in match * *************************************************/ -PCRE2_EXP_DEFN PCRE2_OFFSET PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION pcre2_get_rightchar(pcre2_match_data *match_data) { return match_data->rightchar; @@ -154,7 +154,7 @@ return match_data->rightchar; * Get starting code unit in match * *************************************************/ -PCRE2_EXP_DEFN PCRE2_OFFSET PCRE2_CALL_CONVENTION +PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION pcre2_get_startchar(pcre2_match_data *match_data) { return match_data->startchar; diff --git a/src/pcre2_substring.c b/src/pcre2_substring.c index a69c902..c7f06ad 100644 --- a/src/pcre2_substring.c +++ b/src/pcre2_substring.c @@ -335,7 +335,7 @@ size_t *lensp; pcre2_memctl *memp; PCRE2_UCHAR **listp; PCRE2_UCHAR *sp; -PCRE2_OFFSET *ovector; +PCRE2_SIZE *ovector; if ((count = match_data->rc) < 0) return count; diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c index 6cdaf81..d7c22ca 100644 --- a/src/pcre2_valid_utf.c +++ b/src/pcre2_valid_utf.c @@ -58,7 +58,7 @@ strings. */ /* This function should never be called when UTF is not supported. */ int -PRIV(valid_utf)(PCRE2_SPTR string, int length, PCRE2_OFFSET *erroroffset) +PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) { (void)string; (void)length; @@ -81,7 +81,7 @@ invalid string are then undefined. 
Arguments: string points to the string - length length of string, or -1 if the string is zero-terminated + length length of string errp pointer to an error position offset variable Returns: == 0 if the string is a valid UTF string @@ -89,17 +89,11 @@ Returns: == 0 if the string is a valid UTF string */ int -PRIV(valid_utf)(PCRE2_SPTR string, int length, PCRE2_OFFSET *erroroffset) +PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) { register PCRE2_SPTR p; register uint32_t c; -if (length < 0) - { - for (p = string; *p != 0; p++); - length = (int)(p - string); - } - /* ----------------- Check a UTF-8 string ----------------- */ #if PCRE2_CODE_UNIT_WIDTH == 8 @@ -155,7 +149,7 @@ for (p = string; length-- > 0; p++) } ab = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes (1-5) */ - if (length < (int)ab) /* Missing bytes */ + if (length < ab) /* Missing bytes */ { *erroroffset = (int)(p - string); switch(ab - length) diff --git a/src/pcre2demo.c b/src/pcre2demo.c index e3846af..6153ffa 100644 --- a/src/pcre2demo.c +++ b/src/pcre2demo.c @@ -77,8 +77,8 @@ int utf8; uint32_t option_bits; uint32_t newline; -PCRE2_OFFSET erroroffset; -PCRE2_OFFSET *ovector; +PCRE2_SIZE erroroffset; +PCRE2_SIZE *ovector; size_t subject_length; pcre2_match_data *match_data; @@ -314,7 +314,7 @@ crlf_is_newline = newline == PCRE2_NEWLINE_ANY || for (;;) { uint32_t options = 0; /* Normally no options */ - PCRE2_OFFSET start_offset = ovector[1]; /* Start at end of previous match */ + PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ /* If the previous match was for an empty string, we are finished if we are at the end of the subject. 
Otherwise, arrange to run another match at the diff --git a/src/pcre2grep.c b/src/pcre2grep.c index 6cefbdd..e4d48d2 100644 --- a/src/pcre2grep.c +++ b/src/pcre2grep.c @@ -179,7 +179,7 @@ static uint32_t recursion_limit = 0; static pcre2_compile_context *compile_context; static pcre2_match_context *match_context; static pcre2_match_data *match_data; -static PCRE2_OFFSET *offsets; +static PCRE2_SIZE *offsets; static BOOL count_only = FALSE; static BOOL do_colour = FALSE; @@ -1736,14 +1736,14 @@ while (ptr < endptr) if (line_offsets) fprintf(stdout, "%d,%d\n", (int)(matchptr + offsets[0] - ptr), - offsets[1] - offsets[0]); + (int)(offsets[1] - offsets[0])); /* Handle --file-offsets */ else if (file_offsets) fprintf(stdout, "%d,%d\n", (int)(filepos + matchptr + offsets[0] - ptr), - offsets[1] - offsets[0]); + (int)(offsets[1] - offsets[0])); /* Handle --only-matching, which may occur many times */ @@ -2451,7 +2451,7 @@ compile_pattern(patstr *p, int options, int popts, int fromfile, const char *fromtext, int count) { unsigned char buffer[PATBUFSIZE]; -PCRE2_OFFSET erroffset; +PCRE2_SIZE erroffset; char *ps = p->string; unsigned int patlen = strlen(ps); int errcode; @@ -2485,16 +2485,16 @@ pcre2_get_error_message(errcode, buffer, PATBUFSIZE); if (fromfile) { fprintf(stderr, "pcre2grep: Error in regex in line %d of %s " - "at offset %d: %s\n", count, fromtext, erroffset, buffer); + "at offset %d: %s\n", count, fromtext, (int)erroffset, buffer); } else { if (count == 0) fprintf(stderr, "pcre2grep: Error in %s regex at offset %d: %s\n", - fromtext, erroffset, buffer); + fromtext, (int)erroffset, buffer); else fprintf(stderr, "pcre2grep: Error in %s %s regex at offset %d: %s\n", - ordin(count), fromtext, erroffset, buffer); + ordin(count), fromtext, (int)erroffset, buffer); } return FALSE; diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 31d8228..42e7967 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -203,7 +203,7 @@ Returns: 0 on success PCRE2POSIX_EXP_DEFN 
int PCRE2_CALL_CONVENTION regcomp(regex_t *preg, const char *pattern, int cflags) { -PCRE2_OFFSET erroffset; +PCRE2_SIZE erroffset; int errorcode; int options = 0; int re_nsub = 0; diff --git a/src/pcre2test.c b/src/pcre2test.c index 6fbb129..2afedf1 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -401,7 +401,7 @@ typedef struct modstruct { uint16_t which; uint16_t type; uint32_t value; - PCRE2_OFFSET offset; + PCRE2_SIZE offset; } modstruct; static modstruct modlist[] = { @@ -1758,7 +1758,7 @@ free(block); /* For recursion malloc/free, to test stacking calls */ -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE static void *my_stack_malloc(size_t size, void *data) { void *block = malloc(size); @@ -1775,7 +1775,7 @@ if (show_memory) fprintf(outfile, "stack_free %p\n", block); free(block); } -#endif /* NO_RECURSE */ +#endif /* HEAP_MATCH_RECURSE */ /************************************************* @@ -2422,7 +2422,7 @@ static void * check_modifier(modstruct *m, int ctx, patctl *pctl, datctl *dctl, uint32_t c) { void *field = NULL; -PCRE2_OFFSET offset = m->offset; +PCRE2_SIZE offset = m->offset; if (restrict_for_perl_test) switch(m->which) { @@ -2448,7 +2448,7 @@ switch (m->which) /* Fall through for something that can also be in a match context. In this case the offset is taken from the other field. */ - offset = (PCRE2_OFFSET)(m->value); + offset = (PCRE2_SIZE)(m->value); case MOD_CTM: /* Match context modifier */ if (ctx == CTX_DEFDAT) field = PTR(default_dat_context); @@ -3310,7 +3310,7 @@ uint8_t *p = buffer; const uint8_t *use_tables; unsigned int delimiter = *p++; int patlen, errorcode; -PCRE2_OFFSET erroroffset; +PCRE2_SIZE erroroffset; /* Initialize the context and pattern/data controls for this test from the defaults. */ @@ -4403,7 +4403,7 @@ for (gmatched = 0;; gmatched++) { int i; uint8_t *nptr; - PCRE2_OFFSET *ovector; + PCRE2_SIZE *ovector; /* This is a check against a lunatic return value. 
*/ @@ -4439,8 +4439,8 @@ for (gmatched = 0;; gmatched++) ovector = FLD(match_data, ovector); for (i = 0; i < 2*capcount; i += 2) { - PCRE2_OFFSET start = ovector[i]; - PCRE2_OFFSET end = ovector[i+1]; + PCRE2_SIZE start = ovector[i]; + PCRE2_SIZE end = ovector[i+1]; if (start > end) { @@ -4643,7 +4643,7 @@ for (gmatched = 0;; gmatched++) else if (capcount == PCRE2_ERROR_PARTIAL) { - PCRE2_OFFSET leftchar = FLD(match_data, leftchar); + PCRE2_SIZE leftchar = FLD(match_data, leftchar); fprintf(outfile, "Partial match"); if (leftchar != FLD(match_data, startchar)) fprintf(outfile, " at offset %d", (int)FLD(match_data, startchar)); @@ -4685,8 +4685,8 @@ for (gmatched = 0;; gmatched++) else if (g_notempty != 0) /* There was a previous null match */ { uint16_t nl = FLD(compiled_code, newline_convention); - PCRE2_OFFSET start_offset = dat_datctl.offset; /* Where the match was */ - PCRE2_OFFSET end_offset = start_offset + 1; + PCRE2_SIZE start_offset = dat_datctl.offset; /* Where the match was */ + PCRE2_SIZE end_offset = start_offset + 1; if ((nl == PCRE2_NEWLINE_CRLF || nl == PCRE2_NEWLINE_ANY || nl == PCRE2_NEWLINE_ANYCRLF) && @@ -4765,7 +4765,7 @@ for (gmatched = 0;; gmatched++) if ((dat_datctl.control & CTL_ANYGLOB) == 0) break; else { - PCRE2_OFFSET end_offset = FLD(match_data, ovector)[1]; + PCRE2_SIZE end_offset = FLD(match_data, ovector)[1]; /* We must now set up for the next iteration of a global search. 
If we have matched an empty string, first check to see if we are at the end of the @@ -5278,7 +5278,7 @@ if (test_mode == PCRE8_MODE) default_dat_context8 = pcre2_match_context_create_8(general_context8); dat_context8 = pcre2_match_context_create_8(general_context8); match_data8 = pcre2_match_data_create_8(max_oveccount, general_context8); -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE (void)pcre2_set_recursion_memory_management_8(default_dat_context8, &my_stack_malloc, &my_stack_free, NULL); #endif @@ -5295,7 +5295,7 @@ if (test_mode == PCRE16_MODE) default_dat_context16 = pcre2_match_context_create_16(general_context16); dat_context16 = pcre2_match_context_create_16(general_context16); match_data16 = pcre2_match_data_create_16(max_oveccount, general_context16); -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE (void)pcre2_set_recursion_memory_management_16(default_dat_context16, &my_stack_malloc, &my_stack_free, NULL); #endif @@ -5312,7 +5312,7 @@ if (test_mode == PCRE32_MODE) default_dat_context32 = pcre2_match_context_create_32(general_context32); dat_context32 = pcre2_match_context_create_32(general_context32); match_data32 = pcre2_match_data_create_32(max_oveccount, general_context32); -#ifdef NO_RECURSE +#ifdef HEAP_MATCH_RECURSE (void)pcre2_set_recursion_memory_management_32(default_dat_context32, &my_stack_malloc, &my_stack_free, NULL); #endif diff --git a/testdata/testinput2 b/testdata/testinput2 index 20b198c..f0ab080 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4042,5 +4042,11 @@ a random value. /Ix abxxx ca cd + +# This should test both paths for processing OP_RECURSE. + +/(?(R)a+|(?R)b)/ + aaaabcde + aaaabcde\=ovector=100 # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 497885b..f190234 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -13671,5 +13671,13 @@ No match cd 0: 0+ cd + +# This should test both paths for processing OP_RECURSE. 
+ +/(?(R)a+|(?R)b)/ + aaaabcde + 0: aaaab + aaaabcde\=ovector=100 + 0: aaaab # End of testinput2