From 49a7eada2d459af97babf86ce7e40059993123c4 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 20 Feb 2015 11:20:40 +0000 Subject: [PATCH] Final source file tidies for 10.10-RC1 --- ChangeLog | 36 ++++++++++++++++++------------------ NEWS | 13 +++++++++++++ RunTest | 20 ++++++++++---------- configure.ac | 8 ++++---- doc/html/pcre2pattern.html | 21 +++++++++++++++------ doc/html/pcre2syntax.html | 3 ++- doc/pcre2pattern.3 | 8 ++++---- doc/pcre2syntax.3 | 2 +- src/pcre2.h.generic | 19 ++++++++++--------- src/pcre2_auto_possess.c | 10 +++++----- src/pcre2_compile.c | 32 ++++++++++++++++---------------- src/pcre2_error.c | 2 +- src/pcre2_intmodedep.h | 8 ++++---- src/pcre2_jit_compile.c | 4 ++-- src/pcre2_study.c | 4 ++-- 15 files changed, 107 insertions(+), 83 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8cd61c7..d7e0554 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,8 @@ Change Log for PCRE2 -------------------- -Version 10.10 xx-xxx-2015 -------------------------- +Version 10.10 20-February-2015 +------------------------------ 1. When a pattern is compiled, it remembers the highest back reference so that when matching, if the ovector is too small, extra memory can be obtained to @@ -32,27 +32,27 @@ in order to avoid accessing uninitialized data when serializing. 5. The (*NO_JIT) feature is implemented. 6. If a bug that caused pcre2_compile() to use more memory than allocated was -triggered when using valgrind, the code in (3) above passed a stupidly large +triggered when using valgrind, the code in (3) above passed a stupidly large value to valgrind. This caused a crash instead of an "internal error" return. -7. A reference to a duplicated named group (either a back reference or a test -for being set in a conditional) that occurred in a part of the pattern where -PCRE2_DUPNAMES was not set caused the amount of memory needed for the pattern +7. A reference to a duplicated named group (either a back reference or a test +for being set in a conditional) that occurred in a part of the pattern where +PCRE2_DUPNAMES was not set caused the amount of memory needed for the pattern to be incorrectly calculated, leading to overwriting. -8. A mutually recursive set of back references such as (\2)(\1) caused a -segfault at compile time (while trying to find the minimum matching length). +8. A mutually recursive set of back references such as (\2)(\1) caused a +segfault at compile time (while trying to find the minimum matching length). The infinite loop is now broken (with the minimum length unset, that is, zero). -9. If an assertion that was used as a condition was quantified with a minimum -of zero, matching went wrong. In particular, if the whole group had unlimited +9. If an assertion that was used as a condition was quantified with a minimum +of zero, matching went wrong. In particular, if the whole group had unlimited repetition and could match an empty string, a segfault was likely. The pattern (?(?=0)?)+ is an example that caused this. Perl allows assertions to be quantified, but not if they are being used as conditions, so the above pattern is faulted by Perl. PCRE2 has now been changed so that it also rejects such patterns. -10. The error message for an invalid quantifier has been changed from "nothing +10. The error message for an invalid quantifier has been changed from "nothing to repeat" to "quantifier does not follow a repeatable item". 11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but @@ -62,14 +62,14 @@ infinite loop. Now it generates an "internal error" error. This is a tidyup, not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an undefined outcome. -12. A UTF pattern containing a "not" match of a non-ASCII character and a +12. A UTF pattern containing a "not" match of a non-ASCII character and a subroutine reference could loop at compile time. Example: /[^\xff]((?1))/. -13. The locale test (RunTest 3) has been upgraded. It now checks that a locale -that is found in the output of "locale -a" can actually be set by pcre2test -before it is accepted. Previously, in an environment where a locale was listed -but would not set (an example does exist), the test would "pass" without -actually doing anything. Also the fr_CA locale has been added to the list of +13. The locale test (RunTest 3) has been upgraded. It now checks that a locale +that is found in the output of "locale -a" can actually be set by pcre2test +before it is accepted. Previously, in an environment where a locale was listed +but would not set (an example does exist), the test would "pass" without +actually doing anything. Also the fr_CA locale has been added to the list of locales that can be used. 14. Fixed a bug in pcre2_substitute(). If a replacement string ended in a @@ -77,7 +77,7 @@ capturing group number without parentheses, the last character was incorrectly literally included at the end of the replacement string. 15. A possessive capturing group such as (a)*+ with a minimum repeat of zero -failed to allow the zero-repeat case if pcre2_match() was called with an +failed to allow the zero-repeat case if pcre2_match() was called with an ovector too small to capture the group. diff --git a/NEWS b/NEWS index c3d4d91..3478bf4 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,19 @@ News about PCRE2 releases ------------------------- +Version 10.10 20-February-2015 +------------------------------ + +1. Serialization and de-serialization functions have been added to the API, +making it possible to save and restore sets of compiled patterns, though +restoration must be done in the same environment that was used for compilation. + +2. The (*NO_JIT) feature has been added; this makes it possible for a pattern +creator to specify that JIT is not to be used. + +3. A number of bugs have been fixed. + + Version 10.00 05-January-2015 ----------------------------- diff --git a/RunTest b/RunTest index 1fbda5e..0b3f5ba 100755 --- a/RunTest +++ b/RunTest @@ -466,20 +466,20 @@ for bmode in "$test8" "$test16" "$test32"; do # output matches any one of the alternative output files. if [ $do3 = yes ] ; then - locale= - + locale= + # In some environments locales that are listed by the "locale -a" # command do not seem to work with setlocale(). Therefore, we do # a preliminary test to see if pcre2test can set one before going - # on to use it. + # on to use it. - for loc in 'fr_FR' 'french' 'fr' 'fr_CA'; do + for loc in 'fr_FR' 'french' 'fr' 'fr_CA'; do locale -a | grep "^$loc\$" >/dev/null if [ $? -eq 0 ] ; then echo "/a/locale=$loc" | \ $sim $valgrind ./pcre2test -q $bmode | \ grep "Failed to set locale" >/dev/null - if [ $? -ne 0 ] ; then + if [ $? -ne 0 ] ; then locale=$loc if [ "$locale" = "fr_FR" ] ; then infile=$testdata/testinput3 @@ -495,11 +495,11 @@ for bmode in "$test8" "$test16" "$test32"; do sed "s/fr_FR/$loc/" $testdata/testoutput3 >test3output sed "s/fr_FR/$loc/" $testdata/testoutput3A >test3outputA sed "s/fr_FR/$loc/" $testdata/testoutput3B >test3outputB - fi - break - fi + fi + break + fi fi - done + done if [ "$locale" != "" ] ; then echo $title3 "(using '$locale' locale)" @@ -528,7 +528,7 @@ for bmode in "$test8" "$test16" "$test32"; do else echo "Cannot test locale-specific features - none of the 'fr_FR', 'fr_CA'," echo "'fr' or 'french' locales can be set, or the \"locale\" command is" - echo "not available to check for them." + echo "not available to check for them." echo " " fi fi diff --git a/configure.ac b/configure.ac index b9ba4e1..b474be4 100644 --- a/configure.ac +++ b/configure.ac @@ -11,15 +11,15 @@ dnl be defined as -RC2, for example. For real releases, it should be empty. m4_define(pcre2_major, [10]) m4_define(pcre2_minor, [10]) m4_define(pcre2_prerelease, [-RC1]) -m4_define(pcre2_date, [2014-01-13]) +m4_define(pcre2_date, [2015-02-20]) # NOTE: The CMakeLists.txt file searches for the above variables in the first # 50 lines of this file. Please update that if the variables above are moved. # Libtool shared library interface versions (current:revision:age) -m4_define(libpcre2_8_version, [0:0:0]) -m4_define(libpcre2_16_version, [0:0:0]) -m4_define(libpcre2_32_version, [0:0:0]) +m4_define(libpcre2_8_version, [1:0:1]) +m4_define(libpcre2_16_version, [1:0:1]) +m4_define(libpcre2_32_version, [1:0:1]) m4_define(libpcre2_posix_version, [0:0:0]) AC_PREREQ(2.57) diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index dccb648..4e7f87b 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -162,6 +162,14 @@ of arbitrary characters). For more details, see the documentation.


+Disabling JIT compilation +
+

+If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by +the application to apply the JIT optimization by calling +pcre2_jit_compile() is ignored. +

+
Setting match and recursion limits

@@ -1715,8 +1723,8 @@ items: the \R escape sequence an escape such as \d or \pL that matches a single character a character class - a back reference (see next section) - a parenthesized subpattern (including assertions) + a back reference + a parenthesized subpattern (including most assertions) a subroutine call to a subpattern (recursive or otherwise) The general repetition quantifier specifies a minimum and maximum number of @@ -2126,10 +2134,11 @@ capturing is carried out only for positive assertions. (Perl sometimes, but not always, does do capturing in negative assertions.)

-For compatibility with Perl, assertion subpatterns may be repeated; though +For compatibility with Perl, most assertion subpatterns may be repeated; though it makes no sense to assert the same thing several times, the side effect of -capturing parentheses may occasionally be useful. In practice, there only three -cases: +capturing parentheses may occasionally be useful. However, an assertion that +forms the condition for a conditional subpattern may not be quantified. In +practice, for other assertions, there only three cases:

(1) If the quantifier is {0}, the assertion is never obeyed during matching. @@ -3249,7 +3258,7 @@ Cambridge, England.


REVISION

-Last updated: 26 January 2015 +Last updated: 28 January 2015
Copyright © 1997-2015 University of Cambridge.
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html index 373b5aa..1b7237f 100644 --- a/doc/html/pcre2syntax.html +++ b/doc/html/pcre2syntax.html @@ -417,6 +417,7 @@ appear. (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) + (*NO_JIT) disable JIT optimization (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*UTF) set appropriate UTF mode for the library in use (*UCP) set PCRE2_UCP (use Unicode properties for \d etc) @@ -554,7 +555,7 @@ Cambridge, England.


REVISION

-Last updated: 02 January 2015 +Last updated: 26 January 2015
Copyright © 1997-2015 University of Cambridge.
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 5f9ba78..e0d9b49 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -133,9 +133,9 @@ documentation. .SS "Disabling JIT compilation" .rs .sp -If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by -the application to apply the JIT optimization by calling -\fBpcre2_jit_compile()\fP is ignored. +If a pattern that starts with (*NO_JIT) is successfully compiled, an attempt by +the application to apply the JIT optimization by calling +\fBpcre2_jit_compile()\fP is ignored. . . .SS "Setting match and recursion limits" @@ -2154,7 +2154,7 @@ always, does do capturing in negative assertions.) .P For compatibility with Perl, most assertion subpatterns may be repeated; though it makes no sense to assert the same thing several times, the side effect of -capturing parentheses may occasionally be useful. However, an assertion that +capturing parentheses may occasionally be useful. However, an assertion that forms the condition for a conditional subpattern may not be quantified. In practice, for other assertions, there only three cases: .sp diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index 2802c89..f7e231c 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -390,7 +390,7 @@ appear. (*NOTEMPTY_ATSTART) set PCRE2_NOTEMPTY_ATSTART when matching (*NO_AUTO_POSSESS) no auto-possessification (PCRE2_NO_AUTO_POSSESS) (*NO_DOTSTAR_ANCHOR) no .* anchoring (PCRE2_NO_DOTSTAR_ANCHOR) - (*NO_JIT) disable JIT optimization + (*NO_JIT) disable JIT optimization (*NO_START_OPT) no start-match optimization (PCRE2_NO_START_OPTIMIZE) (*UTF) set appropriate UTF mode for the library in use (*UCP) set PCRE2_UCP (use Unicode properties for \ed etc) diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index 05ad575..0620ed9 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -44,7 +44,7 @@ POSSIBILITY OF SUCH DAMAGE. #define PCRE2_MAJOR 10 #define PCRE2_MINOR 10 #define PCRE2_PRERELEASE -RC1 -#define PCRE2_DATE 2014-01-13 +#define PCRE2_DATE 2015-02-20 /* When an application links to a PCRE DLL in Windows, the symbols that are imported have to be identified as such. When building PCRE2, the appropriate @@ -198,11 +198,13 @@ greater than zero. */ #define PCRE2_ERROR_UTF32_ERR1 (-27) #define PCRE2_ERROR_UTF32_ERR2 (-28) -/* Error codes for pcre2[_dfa]_match(), substring extraction functions, and -context functions. */ +/* Error codes for pcre2[_dfa]_match(), substring extraction functions, context +functions, and serializing functions. They are in numerical order. Originally +they were in alphabetical order too, but now that PCRE2 is released, the +numbers must not be changed. */ #define PCRE2_ERROR_BADDATA (-29) -#define PCRE2_ERROR_BADLENGTH (-30) +#define PCRE2_ERROR_MIXEDTABLES (-30) /* Name was changed */ #define PCRE2_ERROR_BADMAGIC (-31) #define PCRE2_ERROR_BADMODE (-32) #define PCRE2_ERROR_BADOFFSET (-33) @@ -458,13 +460,12 @@ PCRE2_EXP_DECL int pcre2_substring_list_get(pcre2_match_data *, \ /* Functions for serializing / deserializing compiled patterns. */ #define PCRE2_SERIALIZE_FUNCTIONS \ -PCRE2_EXP_DECL int pcre2_serialize_encode(const pcre2_code **, \ - PCRE2_SIZE, uint8_t **, PCRE2_SIZE *, \ +PCRE2_EXP_DECL int32_t pcre2_serialize_encode(const pcre2_code **, \ + int32_t, uint8_t **, PCRE2_SIZE *, \ pcre2_general_context *); \ -PCRE2_EXP_DECL int pcre2_serialize_decode(pcre2_code **, PCRE2_SIZE, \ +PCRE2_EXP_DECL int32_t pcre2_serialize_decode(pcre2_code **, int32_t, \ const uint8_t *, pcre2_general_context *); \ -PCRE2_EXP_DECL int pcre2_serialize_get_number_of_codes(const uint8_t *, \ - PCRE2_SIZE *); \ +PCRE2_EXP_DECL int32_t pcre2_serialize_get_number_of_codes(const uint8_t *); \ PCRE2_EXP_DECL void pcre2_serialize_free(uint8_t *); diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c index 0e050e6..15dd770 100644 --- a/src/pcre2_auto_possess.c +++ b/src/pcre2_auto_possess.c @@ -1090,8 +1090,8 @@ but some compilers complain about an unreachable statement. */ *************************************************/ /* Replaces single character iterations with their possessive alternatives -if appropriate. This function modifies the compiled opcode! Hitting a -non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a +if appropriate. This function modifies the compiled opcode! Hitting a +non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a bad UTF string was compiled with PCRE2_NO_UTF_CHECK. Arguments: @@ -1114,9 +1114,9 @@ uint32_t list[8]; for (;;) { c = *code; - - if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */ - + + if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */ + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) { c -= get_repeat_base(c) - OP_STAR; diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index a55ab65..a79b2b8 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -5208,7 +5208,7 @@ for (;; ptr++) /* For conditions that are assertions, check the syntax, and then exit the switch. This will take control down to where bracketed groups are processed. The assertion will be handled as part of the group, - but we need to identify this case because the conditional assertion may + but we need to identify this case because the conditional assertion may not be quantifier. */ if (tempptr[1] == CHAR_QUESTION_MARK && @@ -5216,9 +5216,9 @@ for (;; ptr++) tempptr[2] == CHAR_EXCLAMATION_MARK || tempptr[2] == CHAR_LESS_THAN_SIGN)) { - cb->iscondassert = TRUE; + cb->iscondassert = TRUE; break; - } + } /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all need to skip at least 1+IMM2_SIZE bytes at the start of the group. */ @@ -5771,10 +5771,10 @@ for (;; ptr++) /* If duplicate names are permitted, we have to allow for a named reference to a duplicated name (this cannot be determined until the - second pass). This needs an extra data item. Counting named back - references and incrementing the count at the end does not work - because it does not account for duplication of groups containing such - references. Nor does checking for PCRE2_DUPNAMES because that need + second pass). This needs an extra data item. Counting named back + references and incrementing the count at the end does not work + because it does not account for duplication of groups containing such + references. Nor does checking for PCRE2_DUPNAMES because that need not be set at the point of reference. */ *lengthptr += IMM2_SIZE; @@ -6132,11 +6132,11 @@ for (;; ptr++) } /* All assertions used not to be repeatable, but this was changed for Perl - compatibility. All kinds can now be repeated except for assertions that are + compatibility. All kinds can now be repeated except for assertions that are conditions (Perl also forbids these to be repeated). We copy code into a non-register variable (tempcode) in order to be able to pass its address - because some compilers complain otherwise. At the start of a conditional - group whose condition is an assertion, cb->iscondassert is set. We unset it + because some compilers complain otherwise. At the start of a conditional + group whose condition is an assertion, cb->iscondassert is set. We unset it here so as to allow assertions later in the group to be quantified. */ if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && @@ -6145,8 +6145,8 @@ for (;; ptr++) previous = NULL; cb->iscondassert = FALSE; } - else previous = code; - + else previous = code; + *code = bravalue; tempcode = code; tempreqvary = cb->req_varyopt; /* Save value before bracket */ @@ -6917,7 +6917,7 @@ for (;;) } /* Fill in the ket */ - + *code = OP_KET; PUT(code, 1, (int)(code - start_bracket)); code += 1 + LINK_SIZE; @@ -7725,7 +7725,7 @@ if (re == NULL) errorcode = ERR21; goto HAD_ERROR; } - + re->memctl = ccontext->memctl; re->tables = tables; re->executable_jit = NULL; @@ -7831,7 +7831,7 @@ if (usedlength > length) errorcode = ERR23; else /* Fill in any forward references that are required. There may be repeated references; optimize for them, as searching a large regex takes time. The -test of errorcode inside the loop means that nothing is done if it is already +test of errorcode inside the loop means that nothing is done if it is already non-zero. */ if (cb.hwm > cb.start_workspace) @@ -7878,7 +7878,7 @@ if (errorcode == 0) PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; } - } + } /* If there were any lookbehind assertions that contained OP_RECURSE (recursions or subroutine calls), a flag is set for them to be checked here, diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 3cd8792..07d92de 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -159,7 +159,7 @@ static const char compile_error_texts[] = "character code point value in \\u.... sequence is too large\0" "digits missing in \\x{} or \\o{}\0" "syntax error in (?(VERSION condition\0" - /* 80 */ + /* 80 */ "internal error: unknown opcode in auto_possessify()\0" ; diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 7172acf..f38581f 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -585,9 +585,9 @@ typedef struct pcre2_real_match_context { } pcre2_real_match_context; /* The real compiled code structure. The type for the blocksize field is -defined specially because it is required in pcre2_serialize_decode() when -copying the size from possibly unaligned memory into a variable of the same -type. Use a macro rather than a typedef to avoid compiler warnings when this +defined specially because it is required in pcre2_serialize_decode() when +copying the size from possibly unaligned memory into a variable of the same +type. Use a macro rather than a typedef to avoid compiler warnings when this file is included multiple times by pcre2test. */ #undef CODE_BLOCKSIZE_TYPE @@ -695,7 +695,7 @@ typedef struct compile_block { BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ BOOL check_lookbehind; /* Lookbehinds need later checking */ BOOL dupnames; /* Duplicate names exist */ - BOOL iscondassert; /* Next assert is a condition */ + BOOL iscondassert; /* Next assert is a condition */ } compile_block; /* Structure for keeping the properties of the in-memory stack used diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 0306db6..f6d2a68 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -10325,7 +10325,7 @@ Arguments: options JIT option bits Returns: 0: success or (*NOJIT) was used - <0: an error code + <0: an error code */ #define PUBLIC_JIT_COMPILE_OPTIONS \ @@ -10351,7 +10351,7 @@ if (code == NULL) if ((options & ~PUBLIC_JIT_COMPILE_OPTIONS) != 0) return PCRE2_ERROR_JIT_BADOPTION; - + if ((re->flags & PCRE2_NOJIT) != 0) return 0; functions = (executable_functions *)re->executable_jit; diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 16e61d7..3f93a12 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -77,9 +77,9 @@ Arguments: utf UTF flag Returns: the minimum length - -1 \C in UTF-8 mode + -1 \C in UTF-8 mode or (*ACCEPT) - or too much back reference recursion + or too much back reference recursion -2 internal error (missing capturing bracket) -3 internal error (opcode not listed) */