From 56084cc92240a8fcc4c79ba7f759f4a88ea6273a Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 26 Oct 2016 16:59:22 +0000 Subject: [PATCH] Fix *MARK length check in UTF mode (it was checking characters, not code units). --- ChangeLog | 12 +++++++++--- doc/pcre2limits.3 | 7 ++++--- src/pcre2_compile.c | 10 +++++++--- testdata/testinput10 | 2 ++ testdata/testinput9 | 2 ++ testdata/testoutput10 | 3 +++ testdata/testoutput9 | 3 +++ 7 files changed, 30 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index 422245c..7ca7f24 100644 --- a/ChangeLog +++ b/ChangeLog @@ -40,6 +40,15 @@ parenthesis item, not the length of the whole group. A length of zero is now given only for a callout at the end of the pattern. Automatic callouts are no longer inserted before and after explicit callouts in the pattern. +Some bugs in the refactored code were subsequently fixed before release: + + (a) An overall recursion such as (?0) inside a lookbehind assertion was not + being diagnosed as an error. + + (b) In utf mode, the length of a *MARK (or other verb) name was being checked + in characters instead of code units, which could lead to bad code being + compiled, leading to unpredictable behaviour. + 4. Back references are now permitted in lookbehind assertions when there are no duplicated group numbers (that is, (?| has not been used), and, if the reference is by name, there is only one group of that name. The referenced @@ -96,9 +105,6 @@ only when PCRE2_NO_START_OPTIMIZE was *not* set: 16. The "offset" modifier in pcre2test was not being ignored (as documented) when the POSIX API was in use. -17. An overall recursion such as (?0) inside a lookbehind assertion was not -being diagnosed as an error. - Version 10.22 29-July-2016 -------------------------- diff --git a/doc/pcre2limits.3 b/doc/pcre2limits.3 index fdb49fa..573c8d3 100644 --- a/doc/pcre2limits.3 +++ b/doc/pcre2limits.3 @@ -1,4 +1,4 @@ -.TH PCRE2LIMITS 3 "29 September 2016" "PCRE2 10.23" +.TH PCRE2LIMITS 3 "26 October 2016" "PCRE2 10.23" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SIZE AND OTHER LIMITATIONS" @@ -55,7 +55,8 @@ The maximum length of name for a named subpattern is 32 code units, and the maximum number of named subpatterns is 10000. .P The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb -is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries. +is 255 code units for the 8-bit library and 65535 code units for the 16-bit and +32-bit libraries. .P The maximum length of a string argument to a callout is the largest number a 32-bit unsigned integer can hold. @@ -75,6 +76,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 29 September 2016 +Last updated: 26 October 2016 Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 8bb4251..a76ca0f 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2161,6 +2161,7 @@ BOOL negate_class; BOOL okquantifier = FALSE; PCRE2_SPTR name; PCRE2_SPTR ptrend = cb->end_pattern; +PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */ named_group *ng; nest_save *top_nest = NULL; nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); @@ -2248,8 +2249,10 @@ while (ptr < ptrend) case CHAR_RIGHT_PARENTHESIS: inverbname = FALSE; + /* This is the length in characters */ verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); - if (verbnamelength > MAX_MARK) + /* But the limit on the length is in code units */ + if (ptr - verbnamestart - 1 > MAX_MARK) { ptr--; errorcode = ERR76; @@ -3149,6 +3152,7 @@ while (ptr < ptrend) *parsed_pattern++ = verbs[i].meta + ((verbs[i].meta != META_MARK)? 0x00010000u:0); verblengthptr = parsed_pattern++; + verbnamestart = ptr; inverbname = TRUE; } else /* No verb "name" argument */ @@ -8503,7 +8507,7 @@ for (;; pptr++) if (META_CODE(*gptr) == META_BIGVALUE) gptr++; else if (*gptr == (META_CAPTURE | group)) break; } - + gptrend = parsed_skip(gptr, PSKIP_KET); if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED; /* Local recursion */ for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break; @@ -8862,7 +8866,7 @@ if (pattern == NULL) *errorptr = ERR16; return NULL; } - + /* Check that all undefined public option bits are zero. */ if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) diff --git a/testdata/testinput10 b/testdata/testinput10 index 4b80778..a1806ae 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -454,4 +454,6 @@ \= Expect no match 123 +/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf + # End of testinput10 diff --git a/testdata/testinput9 b/testdata/testinput9 index 9a26f5f..7be4b15 100644 --- a/testdata/testinput9 +++ b/testdata/testinput9 @@ -258,4 +258,6 @@ /(*MARK:a\x{100}b)z/alt_verbnames +/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/ + # End of testinput9 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 0c1e9b2..3c35f0b 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1564,4 +1564,7 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1 123 No match +/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf +Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN) + # End of testinput10 diff --git a/testdata/testoutput9 b/testdata/testoutput9 index 99ee77a..6b014e5 100644 --- a/testdata/testoutput9 +++ b/testdata/testoutput9 @@ -364,4 +364,7 @@ Failed: error 177 at offset 7: character code point value in \u.... sequence is /(*MARK:a\x{100}b)z/alt_verbnames Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large +/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/ +Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN) + # End of testinput9