From 299e587f9bb7f841ec323a6d3871053ee0278f43 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 13 Nov 2015 16:52:26 +0000 Subject: [PATCH] Don't split CRLF in pcre2_substitute() when it's a valid newline sequence. --- ChangeLog | 3 +++ doc/pcre2api.3 | 9 +++++++-- doc/pcre2pattern.3 | 20 ++++++++++++++++---- src/pcre2_substitute.c | 16 +++++++++++++++- testdata/testinput2 | 17 +++++++++++++++++ testdata/testoutput2 | 29 +++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index d6d57b2..e373ec9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -296,6 +296,9 @@ not dereferencing it) while handling lookbehind assertions. 87. Failure to get memory for the match data in regcomp() is now given as a regcomp() error instead of waiting for regexec() to pick it up. +88. In pcre2_substitute(), ensure that CRLF is not split when it is a valid +newline sequence. + Version 10.20 30-June-2015 -------------------------- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 9d71c93..a7452bf 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "10 November 2015" "PCRE2 10.21" +.TH PCRE2API 3 "13 November 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -2729,6 +2729,11 @@ simultaneous substitutions, as this \fBpcre2test\fP example shows: There is an additional option, PCRE2_SUBSTITUTE_GLOBAL, which causes the function to iterate over the subject string, replacing every matching substring. If this is not set, only the first matching substring is replaced. +If any matched substring has zero length, after the substitution has happened, +an attempt to find a non-empty match at the same position is performed. If this +is not successful, the current position is advanced by one character except +when CRLF is a valid newline sequence and the next two characters are CR, LF. +In this case, the current position is advanced by two characters. .P A second additional option, PCRE2_SUBSTITUTE_EXTENDED, causes extra processing to be applied to the replacement string. Without this option, only the dollar @@ -3087,6 +3092,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 10 November 2015 +Last updated: 13 November 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 88631e2..019a7a5 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "10 November 2015" "PCRE2 10.21" +.TH PCRE2PATTERN 3 "13 November 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -671,8 +671,8 @@ below. This particular group matches either the two-character sequence CR followed by LF, or one of the single characters LF (linefeed, U+000A), VT (vertical tab, U+000B), FF (form feed, U+000C), CR (carriage return, U+000D), or NEL (next -line, U+0085). The two-character sequence is treated as a single unit that -cannot be split. +line, U+0085). Because this is an atomic group, the two-character sequence is +treated as a single unit that cannot be split. .P In other modes, two additional characters whose codepoints are greater than 255 are added: LS (line separator, U+2028) and PS (paragraph separator, U+2029). @@ -1183,6 +1183,18 @@ patterns that are anchored in single line mode because all branches start with when the \fIstartoffset\fP argument of \fBpcre2_match()\fP is non-zero. The PCRE2_DOLLAR_ENDONLY option is ignored if PCRE2_MULTILINE is set. .P +When the newline convention (see +.\" HTML +.\" +"Newline conventions" +.\" +below) recognizes the two-character sequence CRLF as a newline, this is +preferred, even if the single characters CR and LF are also recognized as +newlines. For example, if the newline convention is "any", a multiline mode +circumflex matches before "xyz" in the string "abc\er\enxyz" rather than after +CR, even though CR on its own is a valid newline. (It also matches at the very +start of the string, of course.) +.P Note that the sequences \eA, \eZ, and \ez can be used to match the start and end of the subject in both modes, and if all branches of a pattern start with \eA it is always anchored, whether or not PCRE2_MULTILINE is set. @@ -3413,6 +3425,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 10 November 2015 +Last updated: 13 November 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index 9ece6f6..94a329e 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -296,8 +296,22 @@ do if (rc != PCRE2_ERROR_NOMATCH) goto EXIT; if (goptions == 0 || start_offset >= length) break; + /* Advance by one code point. Then, if CRLF is a valid newline sequence and + we have advanced into the middle of it, advance one more code point. In + other words, do not start in the middle of CRLF, even if CR and LF on their + own are valid newlines. */ + save_start = start_offset++; - if ((code->overall_options & PCRE2_UTF) != 0) + if (subject[start_offset-1] == CHAR_CR && + code->newline_convention != PCRE2_NEWLINE_CR && + code->newline_convention != PCRE2_NEWLINE_LF && + start_offset < length && + subject[start_offset] == CHAR_LF) + start_offset++; + + /* Otherwise, in UTF mode, advance past any secondary code points. */ + + else if ((code->overall_options & PCRE2_UTF) != 0) { #if PCRE2_CODE_UNIT_WIDTH == 8 while (start_offset < length && (subject[start_offset] & 0xc0) == 0x80) diff --git a/testdata/testinput2 b/testdata/testinput2 index c6ea772..d8ded88 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -498,6 +498,9 @@ /^ab\n/Igm,aftertext ab\nab\ncd +/^/gm,newline=any + a\rb\nc\r\nxyz\=aftertext + /abc/I /abc|bac/I @@ -4659,4 +4662,18 @@ a)"xI /(?|(a)|())/BI +# Test CRLF handling in empty string substitutions + +/^$/gm,newline=anycrlf,replace=- + X\r\n\r\nY + +/^$/gm,newline=crlf,replace=- + X\r\n\r\nY + +/^$/gm,newline=any,replace=- + X\r\n\r\nY + +"(*ANYCRLF)(?m)^(.*[^0-9\r\n].*|)$"g,replace=NaN + 15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20 + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 2807628..7d560c2 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1316,6 +1316,17 @@ Subject length lower bound = 3 0: ab\x0a 0+ cd +/^/gm,newline=any + a\rb\nc\r\nxyz\=aftertext + 0: + 0+ a\x0db\x0ac\x0d\x0axyz + 0: + 0+ b\x0ac\x0d\x0axyz + 0: + 0+ c\x0d\x0axyz + 0: + 0+ xyz + /abc/I Capturing subpattern count = 0 First code unit = 'a' @@ -14842,4 +14853,22 @@ Capturing subpattern count = 1 May match empty string Subject length lower bound = 0 +# Test CRLF handling in empty string substitutions + +/^$/gm,newline=anycrlf,replace=- + X\r\n\r\nY + 1: X\x0d\x0a-\x0d\x0aY + +/^$/gm,newline=crlf,replace=- + X\r\n\r\nY + 1: X\x0d\x0a-\x0d\x0aY + +/^$/gm,newline=any,replace=- + X\r\n\r\nY + 1: X\x0d\x0a-\x0d\x0aY + +"(*ANYCRLF)(?m)^(.*[^0-9\r\n].*|)$"g,replace=NaN + 15\r\nfoo\r\n20\r\nbar\r\nbaz\r\n\r\n20 + 4: 15\x0d\x0aNaN\x0d\x0a20\x0d\x0aNaN\x0d\x0aNaN\x0d\x0aNaN\x0d\x0a20 + # End of testinput2