From 7a6e8a445450e3007774a4f7d5dac23a0ef43587 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Mon, 1 Jan 2018 14:12:35 +0000 Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the first code unit of a newline sequence. --- ChangeLog | 12 +++++++++--- src/pcre2_dfa_match.c | 21 +++++++++++++++------ src/pcre2_match.c | 21 +++++++++++++++------ testdata/testinput2 | 10 ++++++++++ testdata/testinput6 | 10 ++++++++++ testdata/testoutput2 | 13 +++++++++++++ testdata/testoutput6 | 13 +++++++++++++ 7 files changed, 85 insertions(+), 15 deletions(-) diff --git a/ChangeLog b/ChangeLog index 250b107..86e2e6d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -100,11 +100,17 @@ code for -g that handles the case when \K in an assertion causes the match to end at the original start point. Also arranged for it to detect when \K causes the end of a match to be before its start. -24. Similar to 23 above, strange things (including loops) could happen in -pcre2grep when \K was used in an assertion when --colour was used or in -multiline mode. The "end at original start point" bug is fixed, and if the end +24. Similar to 23 above, strange things (including loops) could happen in +pcre2grep when \K was used in an assertion when --colour was used or in +multiline mode. The "end at original start point" bug is fixed, and if the end point is found to be before the start point, they are swapped. +25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT +matching (both pcre2_match() and pcre2_dfa_match()) and the matched string +started with the first code unit of a newline sequence, matching failed because +the search for the first code unit stopped before rather than after the first +code unit of a newline in the subject string. + Version 10.30 14-August-2017 ---------------------------- diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index b0cd866..9c1d805 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2017 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -3367,9 +3367,11 @@ for (;;) /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the - first newline. Implement this by temporarily adjusting end_subject so that - we stop the optimization scans for a first code unit at a newline. If the - match fails at the newline, later code breaks this loop. */ + first newline following the start of matching. Temporarily adjust + end_subject so that we stop the optimization scans for a first code unit + immediately after the first character of a newline (the first code unit can + legitimately be a newline). If the match fails at the newline, later code + breaks this loop. */ if (firstline) { @@ -3377,7 +3379,7 @@ for (;;) #ifdef SUPPORT_UNICODE if (utf) { - while (t < mb->end_subject && !IS_NEWLINE(t)) + while (t < end_subject && !IS_NEWLINE(t)) { t++; ACROSSCHAR(t < end_subject, *t, t++); @@ -3385,7 +3387,14 @@ for (;;) } else #endif - while (t < mb->end_subject && !IS_NEWLINE(t)) t++; + while (t < end_subject && !IS_NEWLINE(t)) t++; + + /* Note that we only need to advance by one code unit if we found a + newline. If the newline is CRLF, a first code unit of LF should not + match, because it is not at or before the newline. Similarly, only the + first code unit of a Unicode newline might be relevant. */ + + if (t < end_subject) t++; end_subject = t; } diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 265fade..8872345 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6367,9 +6367,11 @@ for(;;) /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the - first newline. Implement this by temporarily adjusting end_subject so that - we stop the optimization scans for a first code unit at a newline. If the - match fails at the newline, later code breaks this loop. */ + first newline following the start of matching. Temporarily adjust + end_subject so that we stop the optimization scans for a first code unit + immediately after the first character of a newline (the first code unit can + legitimately be a newline). If the match fails at the newline, later code + breaks this loop. */ if (firstline) { @@ -6377,7 +6379,7 @@ for(;;) #ifdef SUPPORT_UNICODE if (utf) { - while (t < mb->end_subject && !IS_NEWLINE(t)) + while (t < end_subject && !IS_NEWLINE(t)) { t++; ACROSSCHAR(t < end_subject, *t, t++); @@ -6385,7 +6387,14 @@ for(;;) } else #endif - while (t < mb->end_subject && !IS_NEWLINE(t)) t++; + while (t < end_subject && !IS_NEWLINE(t)) t++; + + /* Note that we only need to advance by one code unit if we found a + newline. If the newline is CRLF, a first code unit of LF should not + match, because it is not at or before the newline. Similarly, only the + first code unit of a Unicode newline might be relevant. */ + + if (t < end_subject) t++; end_subject = t; } @@ -6648,7 +6657,7 @@ for(;;) cb.start_match = (PCRE2_SIZE)(start_match - subject); cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; - + mb->start_used_ptr = start_match; mb->last_used_ptr = start_match; mb->match_call_count = 0; diff --git a/testdata/testinput2 b/testdata/testinput2 index 942ec45..fe8efbf 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5395,4 +5395,14 @@ a)"xI \= Expect no match aac\=callout_extra +/\n/firstline + xyz\nabc + +/\nabc/firstline + xyz\nabc + +/\x{0a}abc/firstline,newline=crlf +\= Expect no match + xyz\r\nabc + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index ce2e082..614c3a0 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -4932,4 +4932,14 @@ /(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?);); +/\n/firstline + xyz\nabc + +/\nabc/firstline + xyz\nabc + +/\x{0a}abc/firstline,newline=crlf +\= Expect no match + xyz\r\nabc + # End of testinput6 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index b7177ce..62ec12f 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -16440,6 +16440,19 @@ Callout (15): 'XXX' ^^ b No match +/\n/firstline + xyz\nabc + 0: \x0a + +/\nabc/firstline + xyz\nabc + 0: \x0aabc + +/\x{0a}abc/firstline,newline=crlf +\= Expect no match + xyz\r\nabc +No match + # End of testinput2 Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 60e8349..998f20b 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7753,4 +7753,17 @@ No match .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?);); Failed: error -47: match limit exceeded +/\n/firstline + xyz\nabc + 0: \x0a + +/\nabc/firstline + xyz\nabc + 0: \x0aabc + +/\x{0a}abc/firstline,newline=crlf +\= Expect no match + xyz\r\nabc +No match + # End of testinput6