Previous FIRSTLINE patch was broken. Fix it.

This commit is contained in:
Philip.Hazel 2018-01-01 14:54:06 +00:00
parent 7a6e8a4454
commit 807f37095d
6 changed files with 43 additions and 37 deletions

View File

@@ -3363,8 +3363,6 @@ for (;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
(options & PCRE2_DFA_RESTART) == 0) (options & PCRE2_DFA_RESTART) == 0)
{ {
PCRE2_SPTR save_end_subject = end_subject;
/* If firstline is TRUE, the start of the match is constrained to the first /* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the line of a multiline string. That is, the match must be before or at the
first newline following the start of matching. Temporarily adjust first newline following the start of matching. Temporarily adjust
@@ -3388,13 +3386,6 @@ for (;;)
else else
#endif #endif
while (t < end_subject && !IS_NEWLINE(t)) t++; while (t < end_subject && !IS_NEWLINE(t)) t++;
/* Note that we only need to advance by one code unit if we found a
newline. If the newline is CRLF, a first code unit of LF should not
match, because it is not at or before the newline. Similarly, only the
first code unit of a Unicode newline might be relevant. */
if (t < end_subject) t++;
end_subject = t; end_subject = t;
} }
@@ -3466,14 +3457,18 @@ for (;;)
#endif #endif
} }
/* If we can't find the required code unit, break the bumpalong loop, /* If we can't find the required code unit, having reached the true end
to force a match failure, except when doing partial matching, when we of the subject, break the bumpalong loop, to force a match failure,
let the next cycle run at the end of the subject. To see why, consider except when doing partial matching, when we let the next cycle run at
the pattern /(?<=abc)def/, which partially matches "abc", even though the end of the subject. To see why, consider the pattern /(?<=abc)def/,
the string does not contain the starting character "d". */ which partially matches "abc", even though the string does not contain
the starting character "d". If we have not reached the true end of the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
we also let the cycle run, because the matching string is legitimately
allowed to start with the first code unit of a newline. */
if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 && if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
start_match >= end_subject) start_match >= mb->end_subject)
break; break;
} }
@@ -3532,7 +3527,7 @@ for (;;)
/* Restore fudged end_subject */ /* Restore fudged end_subject */
end_subject = save_end_subject; end_subject = mb->end_subject;
/* The following two optimizations are disabled for partial matching. */ /* The following two optimizations are disabled for partial matching. */

View File

@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2017 University of Cambridge New API code Copyright (c) 2015-2018 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -6363,15 +6363,11 @@ for(;;)
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
{ {
PCRE2_SPTR save_end_subject = end_subject;
/* If firstline is TRUE, the start of the match is constrained to the first /* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the line of a multiline string. That is, the match must be before or at the
first newline following the start of matching. Temporarily adjust first newline following the start of matching. Temporarily adjust
end_subject so that we stop the optimization scans for a first code unit end_subject so that we stop the scans for a first code unit at a newline.
immediately after the first character of a newline (the first code unit can If the match fails at the newline, later code breaks the loop. */
legitimately be a newline). If the match fails at the newline, later code
breaks this loop. */
if (firstline) if (firstline)
{ {
@@ -6388,13 +6384,6 @@ for(;;)
else else
#endif #endif
while (t < end_subject && !IS_NEWLINE(t)) t++; while (t < end_subject && !IS_NEWLINE(t)) t++;
/* Note that we only need to advance by one code unit if we found a
newline. If the newline is CRLF, a first code unit of LF should not
match, because it is not at or before the newline. Similarly, only the
first code unit of a Unicode newline might be relevant. */
if (t < end_subject) t++;
end_subject = t; end_subject = t;
} }
@@ -6470,13 +6459,17 @@ for(;;)
#endif #endif
} }
/* If we can't find the required code unit, break the bumpalong loop, /* If we can't find the required code unit, having reached the true end
to force a match failure, except when doing partial matching, when we of the subject, break the bumpalong loop, to force a match failure,
let the next cycle run at the end of the subject. To see why, consider except when doing partial matching, when we let the next cycle run at
the pattern /(?<=abc)def/, which partially matches "abc", even though the end of the subject. To see why, consider the pattern /(?<=abc)def/,
the string does not contain the starting character "d". */ which partially matches "abc", even though the string does not contain
the starting character "d". If we have not reached the true end of the
subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
we also let the cycle run, because the matching string is legitimately
allowed to start with the first code unit of a newline. */
if (!mb->partial && start_match >= end_subject) if (!mb->partial && start_match >= mb->end_subject)
{ {
rc = MATCH_NOMATCH; rc = MATCH_NOMATCH;
break; break;
@@ -6538,7 +6531,7 @@ for(;;)
/* Restore fudged end_subject */ /* Restore fudged end_subject */
end_subject = save_end_subject; end_subject = mb->end_subject;
/* The following two optimizations must be disabled for partial matching. */ /* The following two optimizations must be disabled for partial matching. */

4
testdata/testinput2 vendored
View File

@@ -5405,4 +5405,8 @@ a)"xI
\= Expect no match \= Expect no match
xyz\r\nabc xyz\r\nabc
/[abc]/firstline
\= Expect no match
\na
# End of testinput2 # End of testinput2

4
testdata/testinput6 vendored
View File

@@ -4942,4 +4942,8 @@
\= Expect no match \= Expect no match
xyz\r\nabc xyz\r\nabc
/[abc]/firstline
\= Expect no match
\na
# End of testinput6 # End of testinput6

View File

@@ -16453,6 +16453,11 @@ No match
xyz\r\nabc xyz\r\nabc
No match No match
/[abc]/firstline
\= Expect no match
\na
No match
# End of testinput2 # End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data Error -62: bad serialized data

View File

@@ -7766,4 +7766,9 @@ Failed: error -47: match limit exceeded
xyz\r\nabc xyz\r\nabc
No match No match
/[abc]/firstline
\= Expect no match
\na
No match
# End of testinput6 # End of testinput6