From 2caf22dc61c7baf06c9f3a8d9980a62b3d86f8a7 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 3 Nov 2015 17:38:00 +0000 Subject: [PATCH] Forbid \K patterns that end before they start in pcre2_substitute(). --- ChangeLog | 3 +++ doc/pcre2api.3 | 13 ++++++++----- src/pcre2.h | 1 + src/pcre2.h.in | 1 + src/pcre2_error.c | 8 +++++--- src/pcre2_substitute.c | 41 ++++++++++++++++++++++++----------------- testdata/testinput2 | 3 +++ testdata/testoutput2 | 4 ++++ 8 files changed, 49 insertions(+), 25 deletions(-) diff --git a/ChangeLog b/ChangeLog index 82a77d7..c43d02c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -257,6 +257,9 @@ as /(?<=(a)(?-1))x/ which have a recursion within a backreference. 74. Give an error if a lookbehind assertion is longer than 65535 code units. +75. Give an error in pcre2_substitute() if a match ends before it starts (as a +result of the use of \K). + Version 10.20 30-June-2015 -------------------------- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index dc6daeb..50d9606 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "16 October 2015" "PCRE2 10.21" +.TH PCRE2API 3 "03 November 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -2666,7 +2666,9 @@ same number causes an error at compile time. This function calls \fBpcre2_match()\fP and then makes a copy of the subject string in \fIoutputbuffer\fP, replacing the part that was matched with the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This can -be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. +be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. Matches in +which a \eK item in a lookahead in the pattern causes the match to end before +it starts are not supported, and give rise to an error return. 
.P The first seven arguments of \fBpcre2_substitute()\fP are the same as for \fBpcre2_match()\fP, except that the partial matching options are not @@ -2769,8 +2771,9 @@ are passed straight back. PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE (invalid escape sequence), -PCRE2_ERROR_REPMISSING_BRACE (closing curly bracket not found), and -PCRE2_BADSUBSTITUTION (syntax error in extended group substitution). As for all +PCRE2_ERROR_REPMISSINGBRACE (closing curly bracket not found), +PCRE2_ERROR_BADSUBSTITUTION (syntax error in extended group substitution), and +PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started). As for all PCRE2 errors, a text message that describes the error can be obtained by calling \fBpcre2_get_error_message()\fP. . @@ -3066,6 +3069,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 16 October 2015 +Last updated: 03 November 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2.h b/src/pcre2.h index 8bc0345..e5425c0 100644 --- a/src/pcre2.h +++ b/src/pcre2.h @@ -240,6 +240,7 @@ numbers must not be changed. */ #define PCRE2_ERROR_BADREPESCAPE (-57) #define PCRE2_ERROR_REPMISSINGBRACE (-58) #define PCRE2_ERROR_BADSUBSTITUTION (-59) +#define PCRE2_ERROR_BADSUBSPATTERN (-60) /* Request types for pcre2_pattern_info() */ diff --git a/src/pcre2.h.in b/src/pcre2.h.in index fa559ad..d77994d 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -240,6 +240,7 @@ numbers must not be changed. 
*/ #define PCRE2_ERROR_BADREPESCAPE (-57) #define PCRE2_ERROR_REPMISSINGBRACE (-58) #define PCRE2_ERROR_BADSUBSTITUTION (-59) +#define PCRE2_ERROR_BADSUBSPATTERN (-60) /* Request types for pcre2_pattern_info() */ diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 2c1caaa..c8e7afb 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -170,8 +170,8 @@ static const char compile_error_texts[] = "(?| and/or (?J: or (?x: parentheses are too deeply nested\0" /* 85 */ "using \\C is disabled in this PCRE2 library\0" - "regular expression is too complicated\0" - "lookbehind assertion is too long\0" + "regular expression is too complicated\0" + "lookbehind assertion is too long\0" ; /* Match-time and UTF error texts are in the same format. */ @@ -247,7 +247,9 @@ static const char match_error_texts[] = "offset limit set without PCRE2_USE_OFFSET_LIMIT\0" "bad escape sequence in replacement string\0" "expected closing curly bracket in replacement string\0" - "bad substitution in replacement string\0" + "bad substitution in replacement string\0" + /* 60 */ + "match with end before start is not supported\0" ; diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index 1c60381..b861ba5 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -55,7 +55,7 @@ POSSIBILITY OF SUCH DAMAGE. /* In extended mode, we recognize ${name:+set text:unset text} and similar constructions. This requires the identification of unescaped : and } characters. This function scans for such. It must deal with nested ${ -constructions. The pointer to the text is updated, either to the required end +constructions. The pointer to the text is updated, either to the required end character, or to where an error was detected. 
Arguments: @@ -107,7 +107,7 @@ for (; ptr < ptrend; ptr++) else if (*ptr == CHAR_BACKSLASH) { - int erc; + int erc; int errorcode = 0; uint32_t ch; @@ -279,10 +279,10 @@ do rc = pcre2_match(code, subject, length, start_offset, options|goptions, match_data, mcontext); - + #ifdef SUPPORT_UNICODE if (utf) options |= PCRE2_NO_UTF_CHECK; /* Only need to check once */ -#endif +#endif /* Any error other than no match returns the error code. No match when not doing the special after-empty-match global rematch, or when at the end of the @@ -320,7 +320,14 @@ do continue; } - /* Handle a successful match. */ + /* Handle a successful match. Matches that use \K to end before they start + are not supported. */ + + if (ovector[1] < ovector[0]) + { + rc = PCRE2_ERROR_BADSUBSPATTERN; + goto EXIT; + } subs++; if (rc == 0) rc = ovector_count; @@ -409,14 +416,14 @@ do next = *ptr; if (next < CHAR_0 || next > CHAR_9) break; group = group * 10 + next - CHAR_0; - + /* A check for a number greater than the hightest captured group is sufficient here; no need for a separate overflow check. */ - + if (group > code->top_bracket) { rc = PCRE2_ERROR_NOSUBSTRING; - goto PTREXIT; + goto PTREXIT; } } } @@ -439,7 +446,7 @@ do if (inparens) { - + if (extended && !star && ptr < repend - 2 && next == CHAR_COLON) { special = *(++ptr); @@ -501,8 +508,8 @@ do else { PCRE2_SPTR subptr, subptrend; - - /* Find a number for a named group. In case there are duplicate names, + + /* Find a number for a named group. In case there are duplicate names, search for the first one that is set. */ if (group < 0) @@ -516,18 +523,18 @@ do if (ng < ovector_count) { if (group < 0) group = ng; /* First in ovector */ - if (ovector[ng*2] != PCRE2_UNSET) + if (ovector[ng*2] != PCRE2_UNSET) { group = ng; /* First that is set */ break; - } + } } } - - /* If group is still negative, it means we did not find a group that + + /* If group is still negative, it means we did not find a group that is in the ovector. 
Just set the first group. */ - - if (group < 0) group = GET2(first, 0); + + if (group < 0) group = GET2(first, 0); } rc = pcre2_substring_length_bynumber(match_data, group, &sublength); diff --git a/testdata/testinput2 b/testdata/testinput2 index 426b5bf..5142943 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4596,4 +4596,7 @@ B)x/alt_verbnames,mark /(?