From 9fcdf2cc6f76308dba8813e49939abab7011871d Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 26 Nov 2014 16:51:53 +0000 Subject: [PATCH] Add user data to recursion guard; get ready for RC1 (again) --- ChangeLog | 2 +- NEWS | 2 +- configure.ac | 2 +- doc/html/pcre2_config.html | 20 +- doc/html/pcre2_pattern_info.html | 2 +- .../pcre2_set_compile_recursion_guard.html | 13 +- doc/html/pcre2api.html | 185 ++++++++++-------- doc/html/pcre2callout.html | 21 +- doc/html/pcre2limits.html | 27 +-- doc/pcre2.txt | 177 +++++++++-------- doc/pcre2_set_compile_recursion_guard.3 | 13 +- doc/pcre2api.3 | 27 +-- maint/README | 41 ++-- src/pcre2.h.in | 3 +- src/pcre2_compile.c | 3 +- src/pcre2_context.c | 4 +- src/pcre2_dfa_match.c | 4 +- src/pcre2_internal.h | 6 +- src/pcre2_intmodedep.h | 27 +-- src/pcre2_match.c | 4 +- src/pcre2test.c | 92 ++++----- 21 files changed, 362 insertions(+), 313 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4307adc..8c641d1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,7 @@ Change Log for PCRE2 -------------------- -Version 10.00 24-November-2014 +Version 10.00 28-November-2014 ------------------------------ Version 10.00 is the first release of PCRE2, a revised API for the PCRE diff --git a/NEWS b/NEWS index 30c7f5a..a63cd2b 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,7 @@ News about PCRE2 releases ------------------------- -Version 10.00 24-November-2014 +Version 10.00 28-November-2014 ------------------------------ Version 10.00 is the first release of PCRE2, a revised API for the PCRE diff --git a/configure.ac b/configure.ac index 20c3c4d..bfb7cf9 100644 --- a/configure.ac +++ b/configure.ac @@ -11,7 +11,7 @@ dnl be defined as -RC2, for example. For real releases, it should be empty. 
m4_define(pcre2_major, [10]) m4_define(pcre2_minor, [00]) m4_define(pcre2_prerelease, [-RC1]) -m4_define(pcre2_date, [2014-11-24]) +m4_define(pcre2_date, [2014-11-28]) # NOTE: The CMakeLists.txt file searches for the above variables in the first # 50 lines of this file. Please update that if the variables above are moved. diff --git a/doc/html/pcre2_config.html b/doc/html/pcre2_config.html index d65d70d..a51b0c7 100644 --- a/doc/html/pcre2_config.html +++ b/doc/html/pcre2_config.html @@ -39,14 +39,12 @@ code units; for other types of data it is in bytes.

If where is not NULL, for PCRE2_CONFIG_JITTARGET, PCRE2_CONFIG_UNICODE_VERSION, and PCRE2_CONFIG_VERSION it must point to a -buffer that is large enough to hold the string. For PCRE2_CONFIG_MATCHLIMIT, -PCRE2_CONFIG_PARENSLIMIT, and PCRE2_CONFIG_RECURSIONLIMIT it must point to an -unsigned long int variable, and for all other codes to an int variable. The -available codes are: +buffer that is large enough to hold the string. For all other codes it must +point to a uint32_t integer variable. The available codes are:

   PCRE2_CONFIG_BSR             Indicates what \R matches by default:
-                                 0    all Unicode line endings
-                                 1    CR, LF, or CRLF only
+                                 PCRE2_BSR_UNICODE
+                                 PCRE2_BSR_ANYCRLF
   PCRE2_CONFIG_JIT             Availability of just-in-time compiler
                                 support (1=yes 0=no)
   PCRE2_CONFIG_JITTARGET       Information about the target archi-
@@ -54,11 +52,11 @@ available codes are:
   PCRE2_CONFIG_LINKSIZE        Configured internal link size (2, 3, 4)
   PCRE2_CONFIG_MATCHLIMIT      Default internal resource limit
   PCRE2_CONFIG_NEWLINE         Code for the default newline sequence:
-                                 1    for CR
-                                 2    for LF
-                                 3    for CRLF
-                                 4    for ANY
-                                 5    for ANYCRLF
+                                 PCRE2_NEWLINE_CR
+                                 PCRE2_NEWLINE_LF
+                                 PCRE2_NEWLINE_CRLF
+                                 PCRE2_NEWLINE_ANY
+                                 PCRE2_NEWLINE_ANYCRLF
   PCRE2_CONFIG_PARENSLIMIT     Default parentheses nesting limit
   PCRE2_CONFIG_RECURSIONLIMIT  Internal recursion depth limit
   PCRE2_CONFIG_STACKRECURSE    Recursion implementation (1=stack
diff --git a/doc/html/pcre2_pattern_info.html b/doc/html/pcre2_pattern_info.html
index 63b9870..6aae36e 100644
--- a/doc/html/pcre2_pattern_info.html
+++ b/doc/html/pcre2_pattern_info.html
@@ -78,7 +78,7 @@ the requested information, in bytes. The following information is available:
 The where argument must point to an unsigned 32-bit integer (uint32_t
 variable), except for the following what values:
 
-  PCRE2_INFO_FIRSTBITMAP     const uint8_t
+  PCRE2_INFO_FIRSTBITMAP     const uint8_t *
   PCRE2_INFO_JITSIZE         size_t
   PCRE2_INFO_NAMETABLE       PCRE2_SPTR
   PCRE2_INFO_SIZE            size_t
diff --git a/doc/html/pcre2_set_compile_recursion_guard.html b/doc/html/pcre2_set_compile_recursion_guard.html
index 8fa3edc..c09942c 100644
--- a/doc/html/pcre2_set_compile_recursion_guard.html
+++ b/doc/html/pcre2_set_compile_recursion_guard.html
@@ -20,7 +20,7 @@ SYNOPSIS
 

int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard_function)(uint32_t)); + int (*guard_function)(uint32_t, void *), void *user_data);


DESCRIPTION @@ -28,11 +28,12 @@ DESCRIPTION

This function defines, within a compile context, a function that is called whenever pcre2_compile() starts to compile a parenthesized part of a -pattern. The argument to the function gives the current depth of parenthesis -nesting. The function should return zero if all is well, or non-zero to force -an error. This feature is provided so that applications can check the available -system stack space, in order to avoid running out. The result of this function -is always zero. +pattern. The first argument to the function gives the current depth of +parenthesis nesting, and the second is user data that is supplied when the +function is set up. The callout function should return zero if all is well, or +non-zero to force an error. This feature is provided so that applications can +check the available system stack space, in order to avoid running out. The +result of pcre2_set_compile_recursion_guard() is always zero.

There is a complete description of the PCRE2 native API in the diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index 2ed6968..d126af9 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -24,31 +24,32 @@ please consult the man page, in case the conversion went wrong.

  • PCRE2 NATIVE API AUXILIARY FUNCTIONS
  • PCRE2 8-BIT, 16-BIT, AND 32-BIT LIBRARIES
  • PCRE2 API OVERVIEW -
  • NEWLINES -
  • MULTITHREADING -
  • PCRE2 CONTEXTS -
  • CHECKING BUILD-TIME OPTIONS -
  • COMPILING A PATTERN -
  • COMPILATION ERROR CODES -
  • JUST-IN-TIME (JIT) COMPILATION -
  • LOCALE SUPPORT -
  • INFORMATION ABOUT A COMPILED PATTERN -
  • THE MATCH DATA BLOCK -
  • MATCHING A PATTERN: THE TRADITIONAL FUNCTION -
  • NEWLINE HANDLING WHEN MATCHING -
  • HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS -
  • OTHER INFORMATION ABOUT A MATCH -
  • ERROR RETURNS FROM pcre2_match() -
  • EXTRACTING CAPTURED SUBSTRINGS BY NUMBER -
  • EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS -
  • EXTRACTING CAPTURED SUBSTRINGS BY NAME -
  • CREATING A NEW STRING WITH SUBSTITUTIONS -
  • DUPLICATE SUBPATTERN NAMES -
  • FINDING ALL POSSIBLE MATCHES AT ONE POSITION -
  • MATCHING A PATTERN: THE ALTERNATIVE FUNCTION -
  • SEE ALSO -
  • AUTHOR -
  • REVISION +
  • STRING LENGTHS AND OFFSETS +
  • NEWLINES +
  • MULTITHREADING +
  • PCRE2 CONTEXTS +
  • CHECKING BUILD-TIME OPTIONS +
  • COMPILING A PATTERN +
  • COMPILATION ERROR CODES +
  • JUST-IN-TIME (JIT) COMPILATION +
  • LOCALE SUPPORT +
  • INFORMATION ABOUT A COMPILED PATTERN +
  • THE MATCH DATA BLOCK +
  • MATCHING A PATTERN: THE TRADITIONAL FUNCTION +
  • NEWLINE HANDLING WHEN MATCHING +
  • HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS +
  • OTHER INFORMATION ABOUT A MATCH +
  • ERROR RETURNS FROM pcre2_match() +
  • EXTRACTING CAPTURED SUBSTRINGS BY NUMBER +
  • EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS +
  • EXTRACTING CAPTURED SUBSTRINGS BY NAME +
  • CREATING A NEW STRING WITH SUBSTITUTIONS +
  • DUPLICATE SUBPATTERN NAMES +
  • FINDING ALL POSSIBLE MATCHES AT ONE POSITION +
  • MATCHING A PATTERN: THE ALTERNATIVE FUNCTION +
  • SEE ALSO +
  • AUTHOR +
  • REVISION

    #include <pcre2.h> @@ -148,7 +149,7 @@ document for an overview of all the PCRE2 documentation.

    int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard_function)(uint32_t)); + int (*guard_function)(uint32_t, void *), void *user_data);


    PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS

    @@ -164,7 +165,7 @@ document for an overview of all the PCRE2 documentation.

    int pcre2_set_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_callout_block *), + int (*callout_function)(pcre2_callout_block *, void *), void *callout_data);

    @@ -424,8 +425,18 @@ matched. Finally, there are functions for finding out information about a compiled pattern (pcre2_pattern_info()) and about the configuration with which PCRE2 was built (pcre2_config()). +

    +
    STRING LENGTHS AND OFFSETS
    +

    +The PCRE2 API uses string lengths and offsets into strings of code units in +several places. These values are always of type PCRE2_SIZE, which is an +unsigned integer type, currently always defined as size_t. The largest +value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved +as a special indicator for zero-terminated strings and unset offsets. +Therefore, the longest string that can be handled is one less than this +maximum.

    -
    NEWLINES
    +
    NEWLINES

    PCRE2 supports five different conventions for indicating line breaks in strings: a single CR (carriage return) character, a single LF (linefeed) @@ -460,7 +471,7 @@ The choice of newline convention does not affect the interpretation of the \n or \r escape sequences, nor does it affect what \R matches; this has its own separate convention.

    -
    MULTITHREADING
    +
    MULTITHREADING

    In a multithreaded application it is important to keep thread-specific data separate from data that can be shared between threads. The PCRE2 library code @@ -505,7 +516,7 @@ storing the results of a match. This includes details of what was matched, as well as additional information such as the name of a (*MARK) setting. Each thread must provide its own version of this memory.

    -
    PCRE2 CONTEXTS
    +
    PCRE2 CONTEXTS

    Some PCRE2 functions have a lot of parameters, many of which are used only by specialist applications, for example, those that use custom memory management @@ -636,7 +647,7 @@ This parameter adjusts the limit, set when PCRE2 is built (default 250), on the depth of parenthesis nesting in a pattern. This limit stops rogue patterns using up too much system stack when being compiled. int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard_function)(uint32_t)); + int (*guard_function)(uint32_t, void *), void *user_data);

    There is at least one application that runs PCRE2 in threads with very limited @@ -644,8 +655,14 @@ system stack, where running out of stack is to be avoided at all costs. The parenthesis limit above cannot take account of how much stack is actually available. For a finer control, you can supply a function that is called whenever pcre2_compile() starts to compile a parenthesized part of a -pattern. The argument to the function gives the current depth of nesting. The -function should return zero if all is well, or non-zero to force an error. +pattern. This function can check the actual stack size (or anything else that +it wants to, of course). +

    +

    +The first argument to the callout function gives the current depth of +nesting, and the second is user data that is set up by the last argument of +pcre2_set_compile_recursion_guard(). The callout function should return +zero if all is well, or non-zero to force an error.


    The match context @@ -679,7 +696,7 @@ A match context is created with default values for its parameters. These can be changed by calling the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data is detected. int pcre2_set_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_callout_block *), + int (*callout_function)(pcre2_callout_block *, void *), void *callout_data);

    @@ -780,7 +797,7 @@ exit so that they can be re-used when possible during the match. In the absence of these functions, the normal custom memory management functions are used, if supplied, otherwise the system functions.

    -
    CHECKING BUILD-TIME OPTIONS
    +
    CHECKING BUILD-TIME OPTIONS

    int pcre2_config(uint32_t what, void *where);

    @@ -807,15 +824,15 @@ available:
       PCRE2_CONFIG_BSR
     
    -The output is an integer whose value indicates what character sequences the \R -escape sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R -matches any Unicode line ending sequence; a value of PCRE2_BSR_ANYCRLF means -that \R matches only CR, LF, or CRLF. The default can be overridden when a -pattern is compiled. +The output is a uint32_t integer whose value indicates what character +sequences the \R escape sequence matches by default. A value of +PCRE2_BSR_UNICODE means that \R matches any Unicode line ending sequence; a +value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The +default can be overridden when a pattern is compiled.
       PCRE2_CONFIG_JIT
     
    -The output is an integer that is set to one if support for just-in-time +The output is a uint32_t integer that is set to one if support for just-in-time compiling is available; otherwise it is set to zero.
       PCRE2_CONFIG_JITTARGET
    @@ -831,12 +848,13 @@ for the terminating zero.
     
       PCRE2_CONFIG_LINKSIZE
     
    -The output is an integer that contains the number of bytes used for internal -linkage in compiled regular expressions. When PCRE2 is configured, the value -can be set to 2, 3, or 4, with the default being 2. This is the value that is -returned by pcre2_config(). However, when the 16-bit library is compiled, -a value of 3 is rounded up to 4, and when the 32-bit library is compiled, -internal linkages always use 4 bytes, so the configured value is not relevant. +The output is a uint32_t integer that contains the number of bytes used for +internal linkage in compiled regular expressions. When PCRE2 is configured, the +value can be set to 2, 3, or 4, with the default being 2. This is the value +that is returned by pcre2_config(). However, when the 16-bit library is +compiled, a value of 3 is rounded up to 4, and when the 32-bit library is +compiled, internal linkages always use 4 bytes, so the configured value is not +relevant.

    The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all @@ -846,14 +864,14 @@ be compiled by those two libraries, but at the expense of slower matching.

       PCRE2_CONFIG_MATCHLIMIT
     
    -The output is an unsigned long integer that gives the default limit for the -number of internal matching function calls in a pcre2_match() execution. -Further details are given with pcre2_match() below. +The output is a uint32_t integer that gives the default limit for the number of +internal matching function calls in a pcre2_match() execution. Further +details are given with pcre2_match() below.
       PCRE2_CONFIG_NEWLINE
     
    -The output is an integer whose value specifies the default character sequence -that is recognized as meaning "newline". The values are: +The output is a uint32_t integer whose value specifies the default character +sequence that is recognized as meaning "newline". The values are:
       PCRE2_NEWLINE_CR       Carriage return (CR)
       PCRE2_NEWLINE_LF       Linefeed (LF)
    @@ -866,7 +884,7 @@ operating system.
     
       PCRE2_CONFIG_PARENSLIMIT
     
    -The output is an unsigned long integer that gives the maximum depth of nesting +The output is a uint32_t integer that gives the maximum depth of nesting of parentheses (of any kind) in a pattern. This limit is imposed to cap the amount of system stack used when a pattern is compiled. It is specified when PCRE2 is built; the default is 250. This limit does not take into account the @@ -875,16 +893,15 @@ over compilation stack usage, see pcre2_set_compile_recursion_guard().
       PCRE2_CONFIG_RECURSIONLIMIT
     
    -The output is an unsigned long integer that gives the default limit for the -depth of recursion when calling the internal matching function in a -pcre2_match() execution. Further details are given with -pcre2_match() below. +The output is a uint32_t integer that gives the default limit for the depth of +recursion when calling the internal matching function in a pcre2_match() +execution. Further details are given with pcre2_match() below.
       PCRE2_CONFIG_STACKRECURSE
     
    -The output is an integer that is set to one if internal recursion when running -pcre2_match() is implemented by recursive function calls that use the -system stack to remember their state. This is the usual way that PCRE2 is +The output is a uint32_t integer that is set to one if internal recursion when +running pcre2_match() is implemented by recursive function calls that use +the system stack to remember their state. This is the usual way that PCRE2 is compiled. The output is zero if PCRE2 was compiled to use blocks of data on the heap instead of recursive function calls.
    @@ -900,8 +917,8 @@ string plus one unit for the terminating zero.
     
       PCRE2_CONFIG_UNICODE
     
    -The output is an integer that is set to one if Unicode support is available; -otherwise it is set to zero. Unicode support implies UTF support. +The output is a uint32_t integer that is set to one if Unicode support is +available; otherwise it is set to zero. Unicode support implies UTF support.
       PCRE2_CONFIG_VERSION
     
    @@ -912,7 +929,7 @@ the PCRE2 version string, zero-terminated. The number of code units used is returned. This is the length of the string plus one unit for the terminating zero.

    -
    COMPILING A PATTERN
    +
    COMPILING A PATTERN

    pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, @@ -1267,7 +1284,7 @@ the behaviour of PCRE2 are given in the pcre2unicode page.

    -
    COMPILATION ERROR CODES
    +
    COMPILATION ERROR CODES

    There are over 80 positive error codes that pcre2_compile() may return if it finds an error in the pattern. There are also some negative error codes that @@ -1277,7 +1294,7 @@ are used for invalid UTF strings. These are the same as given by page. The pcre2_get_error_message() function can be called to obtain a textual error message from any error code.

    -
    JUST-IN-TIME (JIT) COMPILATION
    +
    JUST-IN-TIME (JIT) COMPILATION

    int pcre2_jit_compile(pcre2_code *code, uint32_t options);
    @@ -1315,7 +1332,7 @@ patterns to be analyzed, and for one-off matches and simple patterns the benefit of faster execution might be offset by a much slower compilation time. Most, but not all patterns can be optimized by the JIT compiler.

    -
    LOCALE SUPPORT
    +
    LOCALE SUPPORT

    PCRE2 handles caseless matching, and determines whether characters are letters, digits, or whatever, by reference to a set of tables, indexed by character code @@ -1371,7 +1388,7 @@ is saved with the compiled pattern, and the same tables are used by compilation, and matching all happen in the same locale, but different patterns can be processed in different locales.

    -
    INFORMATION ABOUT A COMPILED PATTERN
    +
    INFORMATION ABOUT A COMPILED PATTERN

    int pcre2_pattern_info(const pcre2_code *code, uint32_t what, void *where);

    @@ -1660,7 +1677,7 @@ getting memory in which to place the compiled data is the value returned by this option plus the size of the pcre2_code structure. Processing a pattern with the JIT compiler does not alter the value returned by this option.

    -
    THE MATCH DATA BLOCK
    +
    THE MATCH DATA BLOCK

    pcre2_match_data *pcre2_match_data_create(uint32_t ovecsize, pcre2_general_context *gcontext); @@ -1712,7 +1729,7 @@ and other match data below.

    -
    MATCHING A PATTERN: THE TRADITIONAL FUNCTION
    +
    MATCHING A PATTERN: THE TRADITIONAL FUNCTION

    int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -1926,7 +1943,7 @@ examples, in the pcre2partial documentation.

    -
    NEWLINE HANDLING WHEN MATCHING
    +
    NEWLINE HANDLING WHEN MATCHING

    When PCRE2 is built, a default newline convention is set; this is usually the standard convention for the operating system. The default can be overridden in @@ -1961,7 +1978,7 @@ LF in the characters that it matches. Notwithstanding the above, anomalous effects may still occur when CRLF is a valid newline sequence and explicit \r or \n escapes appear in the pattern.

    -
    HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS
    +
    HOW PCRE2_MATCH() RETURNS A STRING AND CAPTURED SUBSTRINGS

    uint32_t pcre2_get_ovector_count(pcre2_match_data *match_data);
    @@ -2051,7 +2068,7 @@ parentheses, no more than ovector[0] to ovector[2n+1] are set by pcre2_match(). The other elements retain whatever values they previously had.

    -
    OTHER INFORMATION ABOUT A MATCH
    +
    OTHER INFORMATION ABOUT A MATCH

    PCRE2_SPTR pcre2_get_mark(pcre2_match_data *match_data);
    @@ -2081,7 +2098,7 @@ UTF character when UTF checking fails. Details are given in the pcre2unicode page.

    -
    ERROR RETURNS FROM pcre2_match()
    +
    ERROR RETURNS FROM pcre2_match()

    If pcre2_match() fails, it returns a negative number. This can be converted to a text string by calling pcre2_get_error_message(). Negative @@ -2190,7 +2207,7 @@ is attempted.

    The internal recursion limit was reached.

    -
    EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
    +
    EXTRACTING CAPTURED SUBSTRINGS BY NUMBER

    int pcre2_substring_length_bynumber(pcre2_match_data *match_data, unsigned int number, PCRE2_SIZE *length); @@ -2262,7 +2279,7 @@ no capturing group of that number in the pattern, or because the group with that number did not participate in the match, or because the ovector was too small to capture that group.

    -
    EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS
    +
    EXTRACTING A LIST OF ALL CAPTURED SUBSTRINGS

    int pcre2_substring_list_get(pcre2_match_data *match_data, PCRE2_UCHAR ***listptr, PCRE2_SIZE **lengthsptr); @@ -2297,7 +2314,7 @@ can be distinguished from a genuine zero-length substring by inspecting the appropriate offset in the ovector, which contains PCRE2_UNSET for unset substrings.

    -
    EXTRACTING CAPTURED SUBSTRINGS BY NAME
    +
    EXTRACTING CAPTURED SUBSTRINGS BY NAME

    int pcre2_substring_number_from_name(const pcre2_code *code, PCRE2_SPTR name); @@ -2349,7 +2366,7 @@ names are not included in the compiled code. The matching process uses only numbers. For this reason, the use of different names for subpatterns of the same number causes an error at compile time.

    -
    CREATING A NEW STRING WITH SUBSTITUTIONS
    +
    CREATING A NEW STRING WITH SUBSTITUTIONS

    int pcre2_substitute(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -2410,7 +2427,7 @@ straight back. PCRE2_ERROR_BADREPLACEMENT is returned for an invalid replacement string (unrecognized sequence following a dollar sign), and PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough.

    -
    DUPLICATE SUBPATTERN NAMES
    +
    DUPLICATE SUBPATTERN NAMES

    int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); @@ -2455,7 +2472,7 @@ The format of the name table is described above in the section entitled Given all the relevant entries for the name, you can extract each of their numbers, and hence the captured data.

    -
    FINDING ALL POSSIBLE MATCHES AT ONE POSITION
    +
    FINDING ALL POSSIBLE MATCHES AT ONE POSITION

    The traditional matching function uses a similar algorithm to Perl, which stops when it finds the first match at a given point in the subject. If you want to @@ -2473,7 +2490,7 @@ substring. Then return 1, which forces pcre2_match() to backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH.

    -
    MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
    +
    MATCHING A PATTERN: THE ALTERNATIVE FUNCTION

    int pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, PCRE2_SIZE startoffset, @@ -2647,13 +2664,13 @@ some plausibility checks are made on the contents of the workspace, which should contain data about the previous partial match. If any of these checks fail, this error is given.

    -
    SEE ALSO
    +
    SEE ALSO

    pcre2build(3), pcre2callout(3), pcre2demo(3), pcre2matching(3), pcre2partial(3), pcre2posix(3), pcre2sample(3), pcre2stack(3), pcre2unicode(3).

    -
    AUTHOR
    +
    AUTHOR

    Philip Hazel
    @@ -2662,9 +2679,9 @@ University Computing Service Cambridge, England.

    -
    REVISION
    +
    REVISION

    -Last updated: 23 November 2014 +Last updated: 26 November 2014
    Copyright © 1997-2014 University of Cambridge.
    diff --git a/doc/html/pcre2callout.html b/doc/html/pcre2callout.html index 4d0238a..e6894da 100644 --- a/doc/html/pcre2callout.html +++ b/doc/html/pcre2callout.html @@ -26,7 +26,7 @@ please consult the man page, in case the conversion went wrong. #include <pcre2.h>

    -int (*pcre2_callout)(pcre2_callout_block *); +int (*pcre2_callout)(pcre2_callout_block *, void *);


    DESCRIPTION

    @@ -137,14 +137,17 @@ callouts such as the example above are obeyed.

    During matching, when PCRE2 reaches a callout point, if an external function is set in the match context, it is called. This applies to both normal and DFA -matching. The only argument to the callout function is a pointer to a -pcre2_callout block. This structure contains the following fields: +matching. The first argument to the callout function is a pointer to a +pcre2_callout block. The second argument is the void * callout data that +was supplied when the callout was set up by calling pcre2_set_callout() +(see the +pcre2api +documentation). The callout block structure contains the following fields:

       uint32_t      version;
       uint32_t      callout_number;
       uint32_t      capture_top;
       uint32_t      capture_last;
    -  void         *callout_data;
       PCRE2_SIZE   *offset_vector;
       PCRE2_SPTR    mark;
       PCRE2_SPTR    subject;
    @@ -203,14 +206,6 @@ substrings have been captured, the value of capture_last is 0. This is
     always the case for the DFA matching functions.
     

    -The callout_data field contains a value that is passed to a matching -function specifically so that it can be passed back in callouts. It is set in -the match context when the callout is set up by calling -pcre2_set_callout() (see the -pcre2api -documentation). -

    -

    The pattern_position field contains the offset to the next item to be matched in the pattern string.

    @@ -259,7 +254,7 @@ Cambridge, England.


    REVISION

    -Last updated: 23 November 2014 +Last updated: 25 November 2014
    Copyright © 1997-2014 University of Cambridge.
    diff --git a/doc/html/pcre2limits.html b/doc/html/pcre2limits.html index 29aca8c..b1c06f5 100644 --- a/doc/html/pcre2limits.html +++ b/doc/html/pcre2limits.html @@ -32,6 +32,21 @@ However, the speed of execution is slower. In the 32-bit library, the internal linkage size is always 4.

    +The maximum length (in code units) of a subject string is one less than the +largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned +integer type, usually defined as size_t. Its maximum value (that is +~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated strings +and unset offsets. +

    +

    +Note that when using the traditional matching function, PCRE2 uses recursion to +handle subpatterns and indefinite repetition. This means that the available +stack space may limit the size of a subject string that can be processed by +certain patterns. For a discussion of stack issues, see the +pcre2stack +documentation. +

    +

    All values in repeating quantifiers must be less than 65536.

    @@ -55,16 +70,6 @@ maximum number of named subpatterns is 10000. The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries.

    -

    -The maximum length of a subject string is the largest number a PCRE2_SIZE -variable can hold. PCRE2_SIZE is an unsigned integer type, usually defined as -size_t. However, when using the traditional matching function, PCRE2 uses -recursion to handle subpatterns and indefinite repetition. This means that the -available stack space may limit the size of a subject string that can be -processed by certain patterns. For a discussion of stack issues, see the -pcre2stack -documentation. -


    AUTHOR
    @@ -80,7 +85,7 @@ Cambridge, England. REVISION

    -Last updated: 29 September 2014 +Last updated: 25 November 2014
    Copyright © 1997-2014 University of Cambridge.
    diff --git a/doc/pcre2.txt b/doc/pcre2.txt index 7712acb..c603c98 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -248,7 +248,7 @@ PCRE2 NATIVE API COMPILE CONTEXT FUNCTIONS uint32_t value); int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard_function)(uint32_t)); + int (*guard_function)(uint32_t, void *), void *user_data); PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS @@ -262,7 +262,7 @@ PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS void pcre2_match_context_free(pcre2_match_context *mcontext); int pcre2_set_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_callout_block *), + int (*callout_function)(pcre2_callout_block *, void *), void *callout_data); int pcre2_set_match_limit(pcre2_match_context *mcontext, @@ -492,6 +492,17 @@ PCRE2 API OVERVIEW which PCRE2 was built (pcre2_config()). +STRING LENGTHS AND OFFSETS + + The PCRE2 API uses string lengths and offsets into strings of code + units in several places. These values are always of type PCRE2_SIZE, + which is an unsigned integer type, currently always defined as size_t. + The largest value that can be stored in such a type (that is + ~(PCRE2_SIZE)0) is reserved as a special indicator for zero-terminated + strings and unset offsets. Therefore, the longest string that can be + handled is one less than this maximum. + + NEWLINES PCRE2 supports five different conventions for indicating line breaks in @@ -694,16 +705,20 @@ PCRE2 CONTEXTS rogue patterns using up too much system stack when being compiled. int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard_function)(uint32_t)); + int (*guard_function)(uint32_t, void *), void *user_data); There is at least one application that runs PCRE2 in threads with very limited system stack, where running out of stack is to be avoided at all costs. The parenthesis limit above cannot take account of how much stack is actually available. 
For a finer control, you can supply a function that is called whenever pcre2_compile() starts to compile a - parenthesized part of a pattern. The argument to the function gives the - current depth of nesting. The function should return zero if all is - well, or non-zero to force an error. + parenthesized part of a pattern. This function can check the actual + stack size (or anything else that it wants to, of course). + + The first argument to the callout function gives the current depth of + nesting, and the second is user data that is set up by the last argu- + ment of pcre2_set_compile_recursion_guard(). The callout function + should return zero if all is well, or non-zero to force an error. The match context @@ -734,7 +749,7 @@ PCRE2 CONTEXTS on success, or PCRE2_ERROR_BADDATA if invalid data is detected. int pcre2_set_callout(pcre2_match_context *mcontext, - int (*callout_function)(pcre2_callout_block *), + int (*callout_function)(pcre2_callout_block *, void *), void *callout_data); This sets up a "callout" function, which PCRE2 will call at specified @@ -853,16 +868,16 @@ CHECKING BUILD-TIME OPTIONS PCRE2_CONFIG_BSR - The output is an integer whose value indicates what character sequences - the \R escape sequence matches by default. A value of PCRE2_BSR_UNICODE - means that \R matches any Unicode line ending sequence; a value of - PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The - default can be overridden when a pattern is compiled. + The output is a uint32_t integer whose value indicates what character + sequences the \R escape sequence matches by default. A value of + PCRE2_BSR_UNICODE means that \R matches any Unicode line ending + sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, + LF, or CRLF. The default can be overridden when a pattern is compiled. PCRE2_CONFIG_JIT - The output is an integer that is set to one if support for just-in-time - compiling is available; otherwise it is set to zero. 
+ The output is a uint32_t integer that is set to one if support for + just-in-time compiling is available; otherwise it is set to zero. PCRE2_CONFIG_JITTARGET @@ -877,13 +892,13 @@ CHECKING BUILD-TIME OPTIONS PCRE2_CONFIG_LINKSIZE - The output is an integer that contains the number of bytes used for - internal linkage in compiled regular expressions. When PCRE2 is config- - ured, the value can be set to 2, 3, or 4, with the default being 2. - This is the value that is returned by pcre2_config(). However, when the - 16-bit library is compiled, a value of 3 is rounded up to 4, and when - the 32-bit library is compiled, internal linkages always use 4 bytes, - so the configured value is not relevant. + The output is a uint32_t integer that contains the number of bytes used + for internal linkage in compiled regular expressions. When PCRE2 is + configured, the value can be set to 2, 3, or 4, with the default being + 2. This is the value that is returned by pcre2_config(). However, when + the 16-bit library is compiled, a value of 3 is rounded up to 4, and + when the 32-bit library is compiled, internal linkages always use 4 + bytes, so the configured value is not relevant. The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most massive patterns, since it allows the size of the @@ -893,14 +908,15 @@ CHECKING BUILD-TIME OPTIONS PCRE2_CONFIG_MATCHLIMIT - The output is an unsigned long integer that gives the default limit for - the number of internal matching function calls in a pcre2_match() exe- - cution. Further details are given with pcre2_match() below. + The output is a uint32_t integer that gives the default limit for the + number of internal matching function calls in a pcre2_match() execu- + tion. Further details are given with pcre2_match() below. PCRE2_CONFIG_NEWLINE - The output is an integer whose value specifies the default character - sequence that is recognized as meaning "newline". 
The values are: + The output is a uint32_t integer whose value specifies the default + character sequence that is recognized as meaning "newline". The values + are: PCRE2_NEWLINE_CR Carriage return (CR) PCRE2_NEWLINE_LF Linefeed (LF) @@ -908,33 +924,34 @@ CHECKING BUILD-TIME OPTIONS PCRE2_NEWLINE_ANY Any Unicode line ending PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF - The default should normally correspond to the standard sequence for + The default should normally correspond to the standard sequence for your operating system. PCRE2_CONFIG_PARENSLIMIT - The output is an unsigned long integer that gives the maximum depth of - nesting of parentheses (of any kind) in a pattern. This limit is - imposed to cap the amount of system stack used when a pattern is com- - piled. It is specified when PCRE2 is built; the default is 250. This - limit does not take into account the stack that may already be used by - the calling application. For finer control over compilation stack - usage, see pcre2_set_compile_recursion_guard(). + The output is a uint32_t integer that gives the maximum depth of nest- + ing of parentheses (of any kind) in a pattern. This limit is imposed to + cap the amount of system stack used when a pattern is compiled. It is + specified when PCRE2 is built; the default is 250. This limit does not + take into account the stack that may already be used by the calling + application. For finer control over compilation stack usage, see + pcre2_set_compile_recursion_guard(). PCRE2_CONFIG_RECURSIONLIMIT - The output is an unsigned long integer that gives the default limit for - the depth of recursion when calling the internal matching function in a - pcre2_match() execution. Further details are given with pcre2_match() + The output is a uint32_t integer that gives the default limit for the + depth of recursion when calling the internal matching function in a + pcre2_match() execution. Further details are given with pcre2_match() below. 
PCRE2_CONFIG_STACKRECURSE - The output is an integer that is set to one if internal recursion when - running pcre2_match() is implemented by recursive function calls that - use the system stack to remember their state. This is the usual way - that PCRE2 is compiled. The output is zero if PCRE2 was compiled to use - blocks of data on the heap instead of recursive function calls. + The output is a uint32_t integer that is set to one if internal recur- + sion when running pcre2_match() is implemented by recursive function + calls that use the system stack to remember their state. This is the + usual way that PCRE2 is compiled. The output is zero if PCRE2 was com- + piled to use blocks of data on the heap instead of recursive function + calls. PCRE2_CONFIG_UNICODE_VERSION @@ -948,8 +965,8 @@ CHECKING BUILD-TIME OPTIONS PCRE2_CONFIG_UNICODE - The output is an integer that is set to one if Unicode support is - available; otherwise it is set to zero. Unicode support implies UTF + The output is a uint32_t integer that is set to one if Unicode support + is available; otherwise it is set to zero. Unicode support implies UTF support. PCRE2_CONFIG_VERSION @@ -2605,7 +2622,7 @@ AUTHOR REVISION - Last updated: 23 November 2014 + Last updated: 26 November 2014 Copyright (c) 1997-2014 University of Cambridge. ------------------------------------------------------------------------------ @@ -3076,7 +3093,7 @@ SYNOPSIS #include - int (*pcre2_callout)(pcre2_callout_block *); + int (*pcre2_callout)(pcre2_callout_block *, void *); DESCRIPTION @@ -3183,15 +3200,16 @@ THE CALLOUT INTERFACE During matching, when PCRE2 reaches a callout point, if an external function is set in the match context, it is called. This applies to - both normal and DFA matching. The only argument to the callout function - is a pointer to a pcre2_callout block. This structure contains the fol- - lowing fields: + both normal and DFA matching. 
The first argument to the callout func- + tion is a pointer to a pcre2_callout block. The second argument is the + void * callout data that was supplied when the callout was set up by + calling pcre2_set_callout() (see the pcre2api documentation). The call- + out block structure contains the following fields: uint32_t version; uint32_t callout_number; uint32_t capture_top; uint32_t capture_last; - void *callout_data; PCRE2_SIZE *offset_vector; PCRE2_SPTR mark; PCRE2_SPTR subject; @@ -3242,28 +3260,23 @@ THE CALLOUT INTERFACE substrings. If no substrings have been captured, the value of cap- ture_last is 0. This is always the case for the DFA matching functions. - The callout_data field contains a value that is passed to a matching - function specifically so that it can be passed back in callouts. It is - set in the match context when the callout is set up by calling - pcre2_set_callout() (see the pcre2api documentation). - - The pattern_position field contains the offset to the next item to be + The pattern_position field contains the offset to the next item to be matched in the pattern string. - The next_item_length field contains the length of the next item to be + The next_item_length field contains the length of the next item to be matched in the pattern string. When the callout immediately precedes an - alternation bar, a closing parenthesis, or the end of the pattern, the - length is zero. When the callout precedes an opening parenthesis, the + alternation bar, a closing parenthesis, or the end of the pattern, the + length is zero. When the callout precedes an opening parenthesis, the length is that of the entire subpattern. - The pattern_position and next_item_length fields are intended to help - in distinguishing between different automatic callouts, which all have + The pattern_position and next_item_length fields are intended to help + in distinguishing between different automatic callouts, which all have the same callout number. 
However, they are set for all callouts. In callouts from pcre2_match() the mark field contains a pointer to the - zero-terminated name of the most recently passed (*MARK), (*PRUNE), or - (*THEN) item in the match, or NULL if no such items have been passed. - Instances of (*PRUNE) or (*THEN) without a name do not obliterate a + zero-terminated name of the most recently passed (*MARK), (*PRUNE), or + (*THEN) item in the match, or NULL if no such items have been passed. + Instances of (*PRUNE) or (*THEN) without a name do not obliterate a previous (*MARK). In callouts from the DFA matching function this field always contains NULL. @@ -3271,16 +3284,16 @@ THE CALLOUT INTERFACE RETURN VALUES The external callout function returns an integer to PCRE2. If the value - is zero, matching proceeds as normal. If the value is greater than - zero, matching fails at the current point, but the testing of other + is zero, matching proceeds as normal. If the value is greater than + zero, matching fails at the current point, but the testing of other matching possibilities goes ahead, just as if a lookahead assertion had failed. If the value is less than zero, the match is abandoned, and the matching function returns the negative value. - Negative values should normally be chosen from the set of - PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a - standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is - reserved for use by callout functions; it will never be used by PCRE2 + Negative values should normally be chosen from the set of + PCRE2_ERROR_xxx values. In particular, PCRE2_ERROR_NOMATCH forces a + standard "no match" failure. The error number PCRE2_ERROR_CALLOUT is + reserved for use by callout functions; it will never be used by PCRE2 itself. @@ -3293,7 +3306,7 @@ AUTHOR REVISION - Last updated: 23 November 2014 + Last updated: 25 November 2014 Copyright (c) 1997-2014 University of Cambridge. 
------------------------------------------------------------------------------ @@ -3891,6 +3904,18 @@ SIZE AND OTHER LIMITATIONS of execution is slower. In the 32-bit library, the internal linkage size is always 4. + The maximum length (in code units) of a subject string is one less than + the largest number a PCRE2_SIZE variable can hold. PCRE2_SIZE is an + unsigned integer type, usually defined as size_t. Its maximum value + (that is ~(PCRE2_SIZE)0) is reserved as a special indicator for zero- + terminated strings and unset offsets. + + Note that when using the traditional matching function, PCRE2 uses + recursion to handle subpatterns and indefinite repetition. This means + that the available stack space may limit the size of a subject string + that can be processed by certain patterns. For a discussion of stack + issues, see the pcre2stack documentation. + All values in repeating quantifiers must be less than 65536. There is no limit to the number of parenthesized subpatterns, but there @@ -3913,14 +3938,6 @@ SIZE AND OTHER LIMITATIONS (*THEN) verb is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries. - The maximum length of a subject string is the largest number a - PCRE2_SIZE variable can hold. PCRE2_SIZE is an unsigned integer type, - usually defined as size_t. However, when using the traditional matching - function, PCRE2 uses recursion to handle subpatterns and indefinite - repetition. This means that the available stack space may limit the - size of a subject string that can be processed by certain patterns. For - a discussion of stack issues, see the pcre2stack documentation. - AUTHOR @@ -3931,7 +3948,7 @@ AUTHOR REVISION - Last updated: 29 September 2014 + Last updated: 25 November 2014 Copyright (c) 1997-2014 University of Cambridge. 
------------------------------------------------------------------------------ diff --git a/doc/pcre2_set_compile_recursion_guard.3 b/doc/pcre2_set_compile_recursion_guard.3 index 96f1566..0575f94 100644 --- a/doc/pcre2_set_compile_recursion_guard.3 +++ b/doc/pcre2_set_compile_recursion_guard.3 @@ -8,7 +8,7 @@ PCRE2 - Perl-compatible regular expressions (revised API) .PP .nf .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, -.B " int (*\fIguard_function\fP)(uint32_t));" +.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .fi . .SH DESCRIPTION @@ -16,11 +16,12 @@ PCRE2 - Perl-compatible regular expressions (revised API) .sp This function defines, within a compile context, a function that is called whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a -pattern. The argument to the function gives the current depth of parenthesis -nesting. The function should return zero if all is well, or non-zero to force -an error. This feature is provided so that applications can check the available -system stack space, in order to avoid running out. The result of this function -is always zero. +pattern. The first argument to the function gives the current depth of +parenthesis nesting, and the second is user data that is supplied when the +function is set up. The callout function should return zero if all is well, or +non-zero to force an error. This feature is provided so that applications can +check the available system stack space, in order to avoid running out. The +result of \fBpcre2_set_compile_recursion_guard()\fP is always zero. 
.P There is a complete description of the PCRE2 native API in the .\" HREF diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 67f5802..2b758b4 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "25 November 2014" "PCRE2 10.00" +.TH PCRE2API 3 "26 November 2014" "PCRE2 10.00" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -97,7 +97,7 @@ document for an overview of all the PCRE2 documentation. .B " uint32_t \fIvalue\fP);" .sp .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, -.B " int (*\fIguard_function\fP)(uint32_t));" +.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .fi . . @@ -375,11 +375,11 @@ PCRE2 was built (\fBpcre2_config()\fP). .sp The PCRE2 API uses string lengths and offsets into strings of code units in several places. These values are always of type PCRE2_SIZE, which is an -unsigned integer type, currently always defined as \fIsize_t\fP. The largest -value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved -as a special indicator for zero-terminated strings and unset offsets. -Therefore, the longest string that can be handled is one less than this -maximum. +unsigned integer type, currently always defined as \fIsize_t\fP. The largest +value that can be stored in such a type (that is ~(PCRE2_SIZE)0) is reserved +as a special indicator for zero-terminated strings and unset offsets. +Therefore, the longest string that can be handled is one less than this +maximum. . . .\" HTML @@ -612,7 +612,7 @@ using up too much system stack when being compiled. 
.sp .nf .B int pcre2_set_compile_recursion_guard(pcre2_compile_context *\fIccontext\fP, -.B " int (*\fIguard_function\fP)(uint32_t));" +.B " int (*\fIguard_function\fP)(uint32_t, void *), void *\fIuser_data\fP);" .fi .sp There is at least one application that runs PCRE2 in threads with very limited @@ -620,8 +620,13 @@ system stack, where running out of stack is to be avoided at all costs. The parenthesis limit above cannot take account of how much stack is actually available. For a finer control, you can supply a function that is called whenever \fBpcre2_compile()\fP starts to compile a parenthesized part of a -pattern. The argument to the function gives the current depth of nesting. The -function should return zero if all is well, or non-zero to force an error. +pattern. This function can check the actual stack size (or anything else that +it wants to, of course). +.P +The first argument to the callout function gives the current depth of +nesting, and the second is user data that is set up by the last argument of +\fBpcre2_set_compile_recursion_guard()\fP. The callout function should return +zero if all is well, or non-zero to force an error. . . .\" HTML @@ -2726,6 +2731,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 25 November 2014 +Last updated: 26 November 2014 Copyright (c) 1997-2014 University of Cambridge. .fi diff --git a/maint/README b/maint/README index cd47da2..d7e6db5 100644 --- a/maint/README +++ b/maint/README @@ -37,11 +37,11 @@ pcre2_chartables.c.non-standard README This file. -Unicode.tables The files in this directory (CaseFolding.txt, +Unicode.tables The files in this directory (CaseFolding.txt, DerivedGeneralCategory.txt, GraphemeBreakProperty.txt, Scripts.txt and UnicodeData.txt) were downloaded from the Unicode web site. They contain information about Unicode - characters and scripts. + characters and scripts. 
ucptest.c A short C program for testing the Unicode property macros that do lookups in the pcre2_ucd.c data, mainly useful after @@ -87,21 +87,21 @@ Note also that both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode script names. -Preparing for a PCRE release -============================ +Preparing for a PCRE2 release +============================= This section contains a checklist of things that I consult before building a distribution for a new release. . Ensure that the version number and version date are correct in configure.ac. -. Update the library version numbers in configure.ac according to the rules +. Update the library version numbers in configure.ac according to the rules given below. . If new build options have been added, ensure that they are added to the CMake files as well as to the autoconf files. The relevant files are CMakeLists.txt - and config-cmake.h.in. After making a release tarball, test it out with CMake - if there have been changes here. + and config-cmake.h.in. After making a release tarball, test it out with CMake + if there have been changes here. . Run ./autogen.sh to ensure everything is up-to-date. @@ -112,7 +112,7 @@ distribution for a new release. different configurations, and it also runs some of them with valgrind, all of which can take quite some time. -. Run perltest.sh on the test data for tests 1 and 4. The output should match +. Run perltest.sh on the test data for tests 1 and 4. The output should match the PCRE2 test output, apart from the version identification at the start of each test. The other tests are not Perl-compatible (they use various PCRE2-specific features or options). @@ -122,7 +122,7 @@ distribution for a new release. may see a number of "pcre2_memmove defined but not used" warnings for the modules in which there is no call to memmove(). These can be ignored. -. Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE, +. 
Documentation: check AUTHORS, ChangeLog (check version and date), LICENCE, NEWS (check version and date), NON-AUTOTOOLS-BUILD, and README. Many of these won't need changing, but over the long term things do change. @@ -133,15 +133,15 @@ distribution for a new release. pcre2test to increase the stack size for test 2. Since I retired I can no longer do this, but instead I rely on putting out release candidates for folks on the pcre-dev list to test. - + . The buildbots at http://buildfarm.opencsw.org/ do some automated testing - of PCRE2 and should be checked before putting out a release. - + of PCRE2 and should be checked before putting out a release. + Updating version info for libtool ================================= -This set of rules for updating library version information came from a web page +This set of rules for updating library version information came from a web page whose URL I have forgotten. The version information consists of three parts: (current, revision, age). @@ -194,7 +194,7 @@ and the zipball. Double-check with "svn status", then create an SVN tagged copy: svn copy svn://vcs.exim.org/pcre2/code/trunk \ - svn://vcs.exim.org/pcre2/code/tags/pcre-10.xx + svn://vcs.exim.org/pcre2/code/tags/pcre2-10.xx When the new release is out, don't forget to tell webmaster@pcre.org and the mailing list. Also, update the list of version numbers in Bugzilla (edit @@ -255,7 +255,7 @@ very sensible; some are rather wacky. Some have been on this list for years. . An option to convert results into character offsets and character lengths. -. An option for pcre2grep to scan only the start of a file. I am not keen - +. An option for pcre2grep to scan only the start of a file. I am not keen - this is the job of "head". . A (non-Unix) user wanted pcregrep options to (a) list a file name just once, @@ -282,14 +282,14 @@ very sensible; some are rather wacky. Some have been on this list for years. . Callouts with arguments: (?Cn:ARG) for instance. -. 
Write a function that generates random matching strings for a compiled +. Write a function that generates random matching strings for a compiled pattern. . Pcre2grep: an option to specify the output line separator, either as a string or select from a fixed list. This is not straightforward, because at the moment it outputs whatever is in the input file. -. Improve the code for duplicate checking in pcre_dfa_match(). An incomplete, +. Improve the code for duplicate checking in pcre2_dfa_match(). An incomplete, non-thread-safe patch showed that this can help performance for patterns where there are many alternatives. However, a simple thread-safe implementation that I tried made things worse in many simple cases, so this @@ -303,7 +303,12 @@ very sensible; some are rather wacky. Some have been on this list for years. . Instead of having #ifdef HAVE_CONFIG_H in each module, put #include "something" and the the #ifdef appears only in one place, in "something". +. Implement something like (?(R2+)... to check outer recursions. + +. If Perl ever supports the POSIX notation [[.something.]] PCRE2 should try + to follow. 
+ Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 18 November 2014 +Last updated: 26 November 2014 diff --git a/src/pcre2.h.in b/src/pcre2.h.in index 0993297..6c50601 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -368,7 +368,8 @@ PCRE2_EXP_DECL int pcre2_set_newline(pcre2_compile_context *, uint32_t); \ PCRE2_EXP_DECL int pcre2_set_parens_nest_limit(pcre2_compile_context *, \ uint32_t); \ PCRE2_EXP_DECL int pcre2_set_compile_recursion_guard(\ - pcre2_compile_context *, int (*)(uint32_t)); \ + pcre2_compile_context *, int (*)(uint32_t, void *), \ + void *); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ PCRE2_EXP_DECL \ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index e1eb5b9..57753e9 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -6646,7 +6646,8 @@ branch_chain bc; /* If set, call the external function that checks for stack availability. */ -if (cb->cx->stack_guard != NULL && cb->cx->stack_guard(cb->parens_depth)) +if (cb->cx->stack_guard != NULL && + cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data)) { *errorcodeptr= ERR33; return FALSE; diff --git a/src/pcre2_context.c b/src/pcre2_context.c index 7300c1f..1fe3d4c 100644 --- a/src/pcre2_context.c +++ b/src/pcre2_context.c @@ -133,6 +133,7 @@ when no context is supplied to the compile function. 
*/ const pcre2_compile_context PRIV(default_compile_context) = { { default_malloc, default_free, NULL }, NULL, + NULL, PRIV(default_tables), BSR_DEFAULT, NEWLINE_DEFAULT, @@ -320,9 +321,10 @@ return 0; PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, - int (*guard)(uint32_t)) + int (*guard)(uint32_t, void *), void *user_data) { ccontext->stack_guard = guard; +ccontext->stack_guard_data = user_data; return 0; } diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 40d8744..8b82115 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -2623,7 +2623,7 @@ for (;;) cb.current_position = (PCRE2_SIZE)(ptr - start_subject); cb.pattern_position = GET(code, LINK_SIZE + 3); cb.next_item_length = GET(code, 3 + 2*LINK_SIZE); - if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) + if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) return rrc; /* Abandon */ } if (rrc > 0) break; /* Fail this thread */ @@ -2970,7 +2970,7 @@ for (;;) cb.current_position = (PCRE2_SIZE)(ptr - start_subject); cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); - if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) + if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0) return rrc; /* Abandon */ } if (rrc == 0) diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index b9499e6..8f541dd 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1877,10 +1877,10 @@ is available. 
*/ #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) -extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, +extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); -extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, +extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern void _pcre2_jit_free(void *, pcre2_memctl *); extern size_t _pcre2_jit_get_size(void *); @@ -1895,7 +1895,7 @@ extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); extern int _pcre2_study(pcre2_real_code *); extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); -extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, +extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); #endif /* PCRE2_CODE_UNIT_WIDTH */ diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 72771b7..bb8e6fc 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -552,41 +552,42 @@ code that uses them is simpler because it assumes this. */ memory control. */ typedef struct pcre2_real_general_context { - pcre2_memctl memctl; + pcre2_memctl memctl; } pcre2_real_general_context; /* The real compile context structure */ typedef struct pcre2_real_compile_context { - pcre2_memctl memctl; - int (*stack_guard)(uint32_t); + pcre2_memctl memctl; + int (*stack_guard)(uint32_t, void *); + void *stack_guard_data; const uint8_t *tables; - uint16_t bsr_convention; - uint16_t newline_convention; - uint32_t parens_nest_limit; + uint16_t bsr_convention; + uint16_t newline_convention; + uint32_t parens_nest_limit; } pcre2_real_compile_context; /* The real match context structure. 
*/ typedef struct pcre2_real_match_context { - pcre2_memctl memctl; + pcre2_memctl memctl; #ifdef HEAP_MATCH_RECURSE - pcre2_memctl stack_memctl; + pcre2_memctl stack_memctl; #endif #ifdef SUPPORT_JIT pcre2_jit_callback jit_callback; void *jit_callback_data; #endif - int (*callout)(pcre2_callout_block *, void *); - void *callout_data; - uint32_t match_limit; - uint32_t recursion_limit; + int (*callout)(pcre2_callout_block *, void *); + void *callout_data; + uint32_t match_limit; + uint32_t recursion_limit; } pcre2_real_match_context; /* The real compiled code structure */ typedef struct pcre2_real_code { - pcre2_memctl memctl; /* Memory control fields */ + pcre2_memctl memctl; /* Memory control fields */ const uint8_t *tables; /* The character tables */ void *executable_jit; /* Pointer to JIT code */ uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 2053415..c567dfe 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -1319,7 +1319,7 @@ for (;;) cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); - if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) + if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } @@ -1723,7 +1723,7 @@ for (;;) cb.current_position = (PCRE2_SIZE)(eptr - mb->start_subject); cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); - if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) + if ((rrc = mb->callout(&cb, mb->callout_data)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } diff --git a/src/pcre2test.c b/src/pcre2test.c index f585310..f9ebff3 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -943,13 +943,13 @@ are supported. 
*/ else \ pcre2_set_character_tables_32(G(a,32),b) -#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \ +#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ if (test_mode == PCRE8_MODE) \ - pcre2_set_compile_recursion_guard_8(G(a,8),b); \ + pcre2_set_compile_recursion_guard_8(G(a,8),b,c); \ else if (test_mode == PCRE16_MODE) \ - pcre2_set_compile_recursion_guard_16(G(a,16),b); \ + pcre2_set_compile_recursion_guard_16(G(a,16),b,c); \ else \ - pcre2_set_compile_recursion_guard_32(G(a,32),b) + pcre2_set_compile_recursion_guard_32(G(a,32),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) \ if (test_mode == PCRE8_MODE) \ @@ -1315,11 +1315,11 @@ the three different cases. */ else \ G(pcre2_set_character_tables_,BITTWO)(G(a,BITTWO),b) -#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \ +#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ - G(pcre2_set_compile_recursion_guard_,BITONE)(G(a,BITONE),b); \ + G(pcre2_set_compile_recursion_guard_,BITONE)(G(a,BITONE),b,c); \ else \ - G(pcre2_set_compile_recursion_guard_,BITTWO)(G(a,BITTWO),b) + G(pcre2_set_compile_recursion_guard_,BITTWO)(G(a,BITTWO),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) \ if (test_mode == G(G(PCRE,BITONE),_MODE)) \ @@ -1512,8 +1512,8 @@ the three different cases. */ #define PCRE2_SET_CALLOUT(a,b,c) \ pcre2_set_callout_8(G(a,8),(int (*)(pcre2_callout_block_8 *, void *))b,c) #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_8(G(a,8),b) -#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \ - pcre2_set_compile_recursion_guard_8(G(a,8),b) +#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ + pcre2_set_compile_recursion_guard_8(G(a,8),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_8(G(a,8),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_8(G(a,8),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_8(G(a,8),b) @@ -1593,8 +1593,8 @@ the three different cases. 
*/ #define PCRE2_SET_CALLOUT(a,b,c) \ pcre2_set_callout_16(G(a,16),(int (*)(pcre2_callout_block_16 *, void *))b,c); #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_16(G(a,16),b) -#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \ - pcre2_set_compile_recursion_guard_16(G(a,16),b) +#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ + pcre2_set_compile_recursion_guard_16(G(a,16),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_16(G(a,16),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_16(G(a,16),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_16(G(a,16),b) @@ -1674,8 +1674,8 @@ the three different cases. */ #define PCRE2_SET_CALLOUT(a,b,c) \ pcre2_set_callout_32(G(a,32),(int (*)(pcre2_callout_block_32 *, void *))b,c); #define PCRE2_SET_CHARACTER_TABLES(a,b) pcre2_set_character_tables_32(G(a,32),b) -#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b) \ - pcre2_set_compile_recursion_guard_32(G(a,32),b) +#define PCRE2_SET_COMPILE_RECURSION_GUARD(a,b,c) \ + pcre2_set_compile_recursion_guard_32(G(a,32),b,c) #define PCRE2_SET_MATCH_LIMIT(a,b) pcre2_set_match_limit_32(G(a,32),b) #define PCRE2_SET_PARENS_NEST_LIMIT(a,b) pcre2_set_parens_nest_limit_32(G(a,32),b) #define PCRE2_SET_RECURSION_LIMIT(a,b) pcre2_set_recursion_limit_32(G(a,32),b) @@ -2104,8 +2104,9 @@ Returns: non-zero to kill the compilation */ static int -stack_guard(uint32_t depth) +stack_guard(uint32_t depth, void *user_data) { +(void)user_data; return depth > pat_patctl.stackguard_test; } @@ -3827,7 +3828,7 @@ PCRE2_SET_CHARACTER_TABLES(pat_context, use_tables); if (pat_patctl.stackguard_test != 0) { - PCRE2_SET_COMPILE_RECURSION_GUARD(pat_context, stack_guard); + PCRE2_SET_COMPILE_RECURSION_GUARD(pat_context, stack_guard, NULL); } /* Handle compiling via the POSIX interface, which doesn't support the @@ -5686,13 +5687,13 @@ Returns: nothing */ static void -print_newline_config(unsigned int rc, BOOL isc) +print_newline_config(uint32_t optval, 
BOOL isc) { if (!isc) printf(" Newline sequence is "); -if (rc < sizeof(newlines)/sizeof(char *)) - printf("%s\n", newlines[rc]); +if (optval < sizeof(newlines)/sizeof(char *)) + printf("%s\n", newlines[optval]); else - printf("a non-standard value: %d\n", rc); + printf("a non-standard value: %d\n", optval); } @@ -5769,8 +5770,7 @@ Returns: the return code static int c_option(const char *arg) { -unsigned long int lrc; -int rc; +uint32_t optval; int yield = 0; if (arg != NULL) @@ -5789,8 +5789,8 @@ if (arg != NULL) switch (coptlist[i].type) { case CONF_BSR: - (void)PCRE2_CONFIG(coptlist[i].value, &rc); - printf("%s\n", rc? "ANYCRLF" : "ANY"); + (void)PCRE2_CONFIG(coptlist[i].value, &optval); + printf("%s\n", optval? "ANYCRLF" : "ANY"); break; case CONF_FIX: @@ -5799,8 +5799,8 @@ if (arg != NULL) break; case CONF_FIZ: - rc = coptlist[i].value; - printf("%d\n", rc); + optval = coptlist[i].value; + printf("%d\n", optval); break; case CONF_INT: @@ -5809,8 +5809,8 @@ if (arg != NULL) break; case CONF_NL: - (void)PCRE2_CONFIG(coptlist[i].value, &rc); - print_newline_config(rc, TRUE); + (void)PCRE2_CONFIG(coptlist[i].value, &optval); + print_newline_config(optval, TRUE); break; } @@ -5822,7 +5822,7 @@ if (arg != NULL) char ucname[16]; strcpy(ucname, coptlist[i].name); for (i = 0; ucname[i] != 0; i++) ucname[i] = toupper[ucname[i]; - vms_setsymbol(ucname, 0, rc); + vms_setsymbol(ucname, 0, optval); } #endif @@ -5848,8 +5848,8 @@ printf(" 16-bit support\n"); printf(" 32-bit support\n"); #endif -(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &rc); -if (rc != 0) +(void)PCRE2_CONFIG(PCRE2_CONFIG_UNICODE, &optval); +if (optval != 0) { printf(" UTF and UCP support ("); print_unicode_version(stdout); @@ -5857,8 +5857,8 @@ if (rc != 0) } else printf(" No Unicode support\n"); -(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &rc); -if (rc != 0) +(void)PCRE2_CONFIG(PCRE2_CONFIG_JIT, &optval); +if (optval != 0) { printf(" Just-in-time compiler support: "); print_jit_target(stdout); @@ -5869,21 +5869,21 
@@ else printf(" No just-in-time compiler support\n"); } -(void)PCRE2_CONFIG(PCRE2_CONFIG_NEWLINE, &rc); -print_newline_config(rc, FALSE); -(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &rc); -printf(" \\R matches %s\n", rc? "CR, LF, or CRLF only" : +(void)PCRE2_CONFIG(PCRE2_CONFIG_NEWLINE, &optval); +print_newline_config(optval, FALSE); +(void)PCRE2_CONFIG(PCRE2_CONFIG_BSR, &optval); +printf(" \\R matches %s\n", optval? "CR, LF, or CRLF only" : "all Unicode newlines"); -(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &rc); -printf(" Internal link size = %d\n", rc); -(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &lrc); -printf(" Parentheses nest limit = %ld\n", lrc); -(void)PCRE2_CONFIG(PCRE2_CONFIG_MATCHLIMIT, &lrc); -printf(" Default match limit = %ld\n", lrc); -(void)PCRE2_CONFIG(PCRE2_CONFIG_RECURSIONLIMIT, &lrc); -printf(" Default recursion depth limit = %ld\n", lrc); -(void)PCRE2_CONFIG(PCRE2_CONFIG_STACKRECURSE, &rc); -printf(" Match recursion uses %s", rc? "stack" : "heap"); +(void)PCRE2_CONFIG(PCRE2_CONFIG_LINKSIZE, &optval); +printf(" Internal link size = %d\n", optval); +(void)PCRE2_CONFIG(PCRE2_CONFIG_PARENSLIMIT, &optval); +printf(" Parentheses nest limit = %d\n", optval); +(void)PCRE2_CONFIG(PCRE2_CONFIG_MATCHLIMIT, &optval); +printf(" Default match limit = %d\n", optval); +(void)PCRE2_CONFIG(PCRE2_CONFIG_RECURSIONLIMIT, &optval); +printf(" Default recursion depth limit = %d\n", optval); +(void)PCRE2_CONFIG(PCRE2_CONFIG_STACKRECURSE, &optval); +printf(" Match recursion uses %s", optval? "stack" : "heap"); printf("\n"); return 0;