Fix *MARK length check in UTF mode (it was checking characters, not code units).
This commit is contained in:
Philip.Hazel 2016-10-26 16:59:22 +00:00
parent 2821076981
commit 56084cc922
7 changed files with 30 additions and 9 deletions

View File

@ -40,6 +40,15 @@ parenthesis item, not the length of the whole group. A length of zero is now
given only for a callout at the end of the pattern. Automatic callouts are no given only for a callout at the end of the pattern. Automatic callouts are no
longer inserted before and after explicit callouts in the pattern. longer inserted before and after explicit callouts in the pattern.
Some bugs in the refactored code were subsequently fixed before release:
(a) An overall recursion such as (?0) inside a lookbehind assertion was not
being diagnosed as an error.
(b) In utf mode, the length of a *MARK (or other verb) name was being checked
in characters instead of code units, which could lead to bad code being
compiled, leading to unpredictable behaviour.
4. Back references are now permitted in lookbehind assertions when there are 4. Back references are now permitted in lookbehind assertions when there are
no duplicated group numbers (that is, (?| has not been used), and, if the no duplicated group numbers (that is, (?| has not been used), and, if the
reference is by name, there is only one group of that name. The referenced reference is by name, there is only one group of that name. The referenced
@ -96,9 +105,6 @@ only when PCRE2_NO_START_OPTIMIZE was *not* set:
16. The "offset" modifier in pcre2test was not being ignored (as documented) 16. The "offset" modifier in pcre2test was not being ignored (as documented)
when the POSIX API was in use. when the POSIX API was in use.
17. An overall recursion such as (?0) inside a lookbehind assertion was not
being diagnosed as an error.
Version 10.22 29-July-2016 Version 10.22 29-July-2016
-------------------------- --------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2LIMITS 3 "29 September 2016" "PCRE2 10.23" .TH PCRE2LIMITS 3 "26 October 2016" "PCRE2 10.23"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "SIZE AND OTHER LIMITATIONS" .SH "SIZE AND OTHER LIMITATIONS"
@ -55,7 +55,8 @@ The maximum length of name for a named subpattern is 32 code units, and the
maximum number of named subpatterns is 10000. maximum number of named subpatterns is 10000.
.P .P
The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb The maximum length of a name in a (*MARK), (*PRUNE), (*SKIP), or (*THEN) verb
is 255 for the 8-bit library and 65535 for the 16-bit and 32-bit libraries. is 255 code units for the 8-bit library and 65535 code units for the 16-bit and
32-bit libraries.
.P .P
The maximum length of a string argument to a callout is the largest number a The maximum length of a string argument to a callout is the largest number a
32-bit unsigned integer can hold. 32-bit unsigned integer can hold.
@ -75,6 +76,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 29 September 2016 Last updated: 26 October 2016
Copyright (c) 1997-2016 University of Cambridge. Copyright (c) 1997-2016 University of Cambridge.
.fi .fi

View File

@ -2161,6 +2161,7 @@ BOOL negate_class;
BOOL okquantifier = FALSE; BOOL okquantifier = FALSE;
PCRE2_SPTR name; PCRE2_SPTR name;
PCRE2_SPTR ptrend = cb->end_pattern; PCRE2_SPTR ptrend = cb->end_pattern;
PCRE2_SPTR verbnamestart = NULL; /* Value avoids compiler warning */
named_group *ng; named_group *ng;
nest_save *top_nest = NULL; nest_save *top_nest = NULL;
nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
@ -2248,8 +2249,10 @@ while (ptr < ptrend)
case CHAR_RIGHT_PARENTHESIS: case CHAR_RIGHT_PARENTHESIS:
inverbname = FALSE; inverbname = FALSE;
/* This is the length in characters */
verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1); verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
if (verbnamelength > MAX_MARK) /* But the limit on the length is in code units */
if (ptr - verbnamestart - 1 > MAX_MARK)
{ {
ptr--; ptr--;
errorcode = ERR76; errorcode = ERR76;
@ -3149,6 +3152,7 @@ while (ptr < ptrend)
*parsed_pattern++ = verbs[i].meta + *parsed_pattern++ = verbs[i].meta +
((verbs[i].meta != META_MARK)? 0x00010000u:0); ((verbs[i].meta != META_MARK)? 0x00010000u:0);
verblengthptr = parsed_pattern++; verblengthptr = parsed_pattern++;
verbnamestart = ptr;
inverbname = TRUE; inverbname = TRUE;
} }
else /* No verb "name" argument */ else /* No verb "name" argument */

View File

@ -454,4 +454,6 @@
\= Expect no match \= Expect no match
123 123
/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf
# End of testinput10 # End of testinput10

2
testdata/testinput9 vendored
View File

@ -258,4 +258,6 @@
/(*MARK:a\x{100}b)z/alt_verbnames /(*MARK:a\x{100}b)z/alt_verbnames
/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/
# End of testinput9 # End of testinput9

View File

@ -1564,4 +1564,7 @@ Failed: error -22: UTF-8 error: isolated byte with 0x80 bit set at offset 1
123 123
No match No match
/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf
Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
# End of testinput10 # End of testinput10

View File

@ -364,4 +364,7 @@ Failed: error 177 at offset 7: character code point value in \u.... sequence is
/(*MARK:a\x{100}b)z/alt_verbnames /(*MARK:a\x{100}b)z/alt_verbnames
Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large Failed: error 134 at offset 14: character code point value in \x{} or \o{} is too large
/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/
Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)
# End of testinput9 # End of testinput9