Fix PCRE2_FIRSTLINE bug when a pattern match starts with the first code unit of a newline sequence.
This commit is contained in:
Philip.Hazel 2018-01-01 14:12:35 +00:00
parent 800d884bce
commit 7a6e8a4454
7 changed files with 85 additions and 15 deletions

View File

@ -100,11 +100,17 @@ code for -g that handles the case when \K in an assertion causes the match to
end at the original start point. Also arranged for it to detect when \K causes end at the original start point. Also arranged for it to detect when \K causes
the end of a match to be before its start. the end of a match to be before its start.
24. Similar to 23 above, strange things (including loops) could happen in 24. Similar to 23 above, strange things (including loops) could happen in
pcre2grep when \K was used in an assertion when --colour was used or in pcre2grep when \K was used in an assertion when --colour was used or in
multiline mode. The "end at original start point" bug is fixed, and if the end multiline mode. The "end at original start point" bug is fixed, and if the end
point is found to be before the start point, they are swapped. point is found to be before the start point, they are swapped.
25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT
matching (both pcre2_match() and pcre2_dfa_match()) and the matched string
started with the first code unit of a newline sequence, matching failed because
the search for the first code unit stopped before rather than after the first
code unit of a newline in the subject string.
Version 10.30 14-August-2017 Version 10.30 14-August-2017
---------------------------- ----------------------------

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2017 University of Cambridge New API code Copyright (c) 2016-2018 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -3367,9 +3367,11 @@ for (;;)
/* If firstline is TRUE, the start of the match is constrained to the first /* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the line of a multiline string. That is, the match must be before or at the
first newline. Implement this by temporarily adjusting end_subject so that first newline following the start of matching. Temporarily adjust
we stop the optimization scans for a first code unit at a newline. If the end_subject so that we stop the optimization scans for a first code unit
match fails at the newline, later code breaks this loop. */ immediately after the first character of a newline (the first code unit can
legitimately be a newline). If the match fails at the newline, later code
breaks this loop. */
if (firstline) if (firstline)
{ {
@ -3377,7 +3379,7 @@ for (;;)
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf)
{ {
while (t < mb->end_subject && !IS_NEWLINE(t)) while (t < end_subject && !IS_NEWLINE(t))
{ {
t++; t++;
ACROSSCHAR(t < end_subject, *t, t++); ACROSSCHAR(t < end_subject, *t, t++);
@ -3385,7 +3387,14 @@ for (;;)
} }
else else
#endif #endif
while (t < mb->end_subject && !IS_NEWLINE(t)) t++; while (t < end_subject && !IS_NEWLINE(t)) t++;
/* Note that we only need to advance by one code unit if we found a
newline. If the newline is CRLF, a first code unit of LF should not
match, because it is not at or before the newline. Similarly, only the
first code unit of a Unicode newline might be relevant. */
if (t < end_subject) t++;
end_subject = t; end_subject = t;
} }

View File

@ -6367,9 +6367,11 @@ for(;;)
/* If firstline is TRUE, the start of the match is constrained to the first /* If firstline is TRUE, the start of the match is constrained to the first
line of a multiline string. That is, the match must be before or at the line of a multiline string. That is, the match must be before or at the
first newline. Implement this by temporarily adjusting end_subject so that first newline following the start of matching. Temporarily adjust
we stop the optimization scans for a first code unit at a newline. If the end_subject so that we stop the optimization scans for a first code unit
match fails at the newline, later code breaks this loop. */ immediately after the first character of a newline (the first code unit can
legitimately be a newline). If the match fails at the newline, later code
breaks this loop. */
if (firstline) if (firstline)
{ {
@ -6377,7 +6379,7 @@ for(;;)
#ifdef SUPPORT_UNICODE #ifdef SUPPORT_UNICODE
if (utf) if (utf)
{ {
while (t < mb->end_subject && !IS_NEWLINE(t)) while (t < end_subject && !IS_NEWLINE(t))
{ {
t++; t++;
ACROSSCHAR(t < end_subject, *t, t++); ACROSSCHAR(t < end_subject, *t, t++);
@ -6385,7 +6387,14 @@ for(;;)
} }
else else
#endif #endif
while (t < mb->end_subject && !IS_NEWLINE(t)) t++; while (t < end_subject && !IS_NEWLINE(t)) t++;
/* Note that we only need to advance by one code unit if we found a
newline. If the newline is CRLF, a first code unit of LF should not
match, because it is not at or before the newline. Similarly, only the
first code unit of a Unicode newline might be relevant. */
if (t < end_subject) t++;
end_subject = t; end_subject = t;
} }
@ -6648,7 +6657,7 @@ for(;;)
cb.start_match = (PCRE2_SIZE)(start_match - subject); cb.start_match = (PCRE2_SIZE)(start_match - subject);
cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH; cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
mb->start_used_ptr = start_match; mb->start_used_ptr = start_match;
mb->last_used_ptr = start_match; mb->last_used_ptr = start_match;
mb->match_call_count = 0; mb->match_call_count = 0;

10
testdata/testinput2 vendored
View File

@ -5395,4 +5395,14 @@ a)"xI
\= Expect no match \= Expect no match
aac\=callout_extra aac\=callout_extra
/\n/firstline
xyz\nabc
/\nabc/firstline
xyz\nabc
/\x{0a}abc/firstline,newline=crlf
\= Expect no match
xyz\r\nabc
# End of testinput2 # End of testinput2

10
testdata/testinput6 vendored
View File

@ -4932,4 +4932,14 @@
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor /(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?);); .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
/\n/firstline
xyz\nabc
/\nabc/firstline
xyz\nabc
/\x{0a}abc/firstline,newline=crlf
\= Expect no match
xyz\r\nabc
# End of testinput6 # End of testinput6

13
testdata/testoutput2 vendored
View File

@ -16440,6 +16440,19 @@ Callout (15): 'XXX'
^^ b ^^ b
No match No match
/\n/firstline
xyz\nabc
0: \x0a
/\nabc/firstline
xyz\nabc
0: \x0aabc
/\x{0a}abc/firstline,newline=crlf
\= Expect no match
xyz\r\nabc
No match
# End of testinput2 # End of testinput2
Error -65: PCRE2_ERROR_BADDATA (unknown error number) Error -65: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data Error -62: bad serialized data

13
testdata/testoutput6 vendored
View File

@ -7753,4 +7753,17 @@ No match
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?);); .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00 \x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
Failed: error -47: match limit exceeded Failed: error -47: match limit exceeded
/\n/firstline
xyz\nabc
0: \x0a
/\nabc/firstline
xyz\nabc
0: \x0aabc
/\x{0a}abc/firstline,newline=crlf
\= Expect no match
xyz\r\nabc
No match
# End of testinput6 # End of testinput6