Fix PCRE2_FIRSTLINE bug when a pattern match starts with the first code unit of
a newline sequence.
This commit is contained in:
parent
800d884bce
commit
7a6e8a4454
12
ChangeLog
12
ChangeLog
|
@ -100,11 +100,17 @@ code for -g that handles the case when \K in an assertion causes the match to
|
||||||
end at the original start point. Also arranged for it to detect when \K causes
|
end at the original start point. Also arranged for it to detect when \K causes
|
||||||
the end of a match to be before its start.
|
the end of a match to be before its start.
|
||||||
|
|
||||||
24. Similar to 23 above, strange things (including loops) could happen in
|
24. Similar to 23 above, strange things (including loops) could happen in
|
||||||
pcre2grep when \K was used in an assertion when --colour was used or in
|
pcre2grep when \K was used in an assertion when --colour was used or in
|
||||||
multiline mode. The "end at original start point" bug is fixed, and if the end
|
multiline mode. The "end at original start point" bug is fixed, and if the end
|
||||||
point is found to be before the start point, they are swapped.
|
point is found to be before the start point, they are swapped.
|
||||||
|
|
||||||
|
25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT
|
||||||
|
matching (both pcre2_match() and pcre2_dfa_match()) and the matched string
|
||||||
|
started with the first code unit of a newline sequence, matching failed because
|
||||||
|
the search for the first code unit stopped before rather than after the first
|
||||||
|
code unit of a newline in the subject string.
|
||||||
|
|
||||||
|
|
||||||
Version 10.30 14-August-2017
|
Version 10.30 14-August-2017
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||||
|
|
||||||
Written by Philip Hazel
|
Written by Philip Hazel
|
||||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||||
New API code Copyright (c) 2016-2017 University of Cambridge
|
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -3367,9 +3367,11 @@ for (;;)
|
||||||
|
|
||||||
/* If firstline is TRUE, the start of the match is constrained to the first
|
/* If firstline is TRUE, the start of the match is constrained to the first
|
||||||
line of a multiline string. That is, the match must be before or at the
|
line of a multiline string. That is, the match must be before or at the
|
||||||
first newline. Implement this by temporarily adjusting end_subject so that
|
first newline following the start of matching. Temporarily adjust
|
||||||
we stop the optimization scans for a first code unit at a newline. If the
|
end_subject so that we stop the optimization scans for a first code unit
|
||||||
match fails at the newline, later code breaks this loop. */
|
immediately after the first code unit of a newline (the first code unit can
|
||||||
|
legitimately be a newline). If the match fails at the newline, later code
|
||||||
|
breaks this loop. */
|
||||||
|
|
||||||
if (firstline)
|
if (firstline)
|
||||||
{
|
{
|
||||||
|
@ -3377,7 +3379,7 @@ for (;;)
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf)
|
||||||
{
|
{
|
||||||
while (t < mb->end_subject && !IS_NEWLINE(t))
|
while (t < end_subject && !IS_NEWLINE(t))
|
||||||
{
|
{
|
||||||
t++;
|
t++;
|
||||||
ACROSSCHAR(t < end_subject, *t, t++);
|
ACROSSCHAR(t < end_subject, *t, t++);
|
||||||
|
@ -3385,7 +3387,14 @@ for (;;)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
while (t < end_subject && !IS_NEWLINE(t)) t++;
|
||||||
|
|
||||||
|
/* Note that we only need to advance by one code unit if we found a
|
||||||
|
newline. If the newline is CRLF, a first code unit of LF should not
|
||||||
|
match, because it is not at or before the newline. Similarly, only the
|
||||||
|
first code unit of a Unicode newline might be relevant. */
|
||||||
|
|
||||||
|
if (t < end_subject) t++;
|
||||||
end_subject = t;
|
end_subject = t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6367,9 +6367,11 @@ for(;;)
|
||||||
|
|
||||||
/* If firstline is TRUE, the start of the match is constrained to the first
|
/* If firstline is TRUE, the start of the match is constrained to the first
|
||||||
line of a multiline string. That is, the match must be before or at the
|
line of a multiline string. That is, the match must be before or at the
|
||||||
first newline. Implement this by temporarily adjusting end_subject so that
|
first newline following the start of matching. Temporarily adjust
|
||||||
we stop the optimization scans for a first code unit at a newline. If the
|
end_subject so that we stop the optimization scans for a first code unit
|
||||||
match fails at the newline, later code breaks this loop. */
|
immediately after the first code unit of a newline (the first code unit can
|
||||||
|
legitimately be a newline). If the match fails at the newline, later code
|
||||||
|
breaks this loop. */
|
||||||
|
|
||||||
if (firstline)
|
if (firstline)
|
||||||
{
|
{
|
||||||
|
@ -6377,7 +6379,7 @@ for(;;)
|
||||||
#ifdef SUPPORT_UNICODE
|
#ifdef SUPPORT_UNICODE
|
||||||
if (utf)
|
if (utf)
|
||||||
{
|
{
|
||||||
while (t < mb->end_subject && !IS_NEWLINE(t))
|
while (t < end_subject && !IS_NEWLINE(t))
|
||||||
{
|
{
|
||||||
t++;
|
t++;
|
||||||
ACROSSCHAR(t < end_subject, *t, t++);
|
ACROSSCHAR(t < end_subject, *t, t++);
|
||||||
|
@ -6385,7 +6387,14 @@ for(;;)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
|
while (t < end_subject && !IS_NEWLINE(t)) t++;
|
||||||
|
|
||||||
|
/* Note that we only need to advance by one code unit if we found a
|
||||||
|
newline. If the newline is CRLF, a first code unit of LF should not
|
||||||
|
match, because it is not at or before the newline. Similarly, only the
|
||||||
|
first code unit of a Unicode newline might be relevant. */
|
||||||
|
|
||||||
|
if (t < end_subject) t++;
|
||||||
end_subject = t;
|
end_subject = t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6648,7 +6657,7 @@ for(;;)
|
||||||
|
|
||||||
cb.start_match = (PCRE2_SIZE)(start_match - subject);
|
cb.start_match = (PCRE2_SIZE)(start_match - subject);
|
||||||
cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
|
cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
|
||||||
|
|
||||||
mb->start_used_ptr = start_match;
|
mb->start_used_ptr = start_match;
|
||||||
mb->last_used_ptr = start_match;
|
mb->last_used_ptr = start_match;
|
||||||
mb->match_call_count = 0;
|
mb->match_call_count = 0;
|
||||||
|
|
|
@ -5395,4 +5395,14 @@ a)"xI
|
||||||
\= Expect no match
|
\= Expect no match
|
||||||
aac\=callout_extra
|
aac\=callout_extra
|
||||||
|
|
||||||
|
/\n/firstline
|
||||||
|
xyz\nabc
|
||||||
|
|
||||||
|
/\nabc/firstline
|
||||||
|
xyz\nabc
|
||||||
|
|
||||||
|
/\x{0a}abc/firstline,newline=crlf
|
||||||
|
\= Expect no match
|
||||||
|
xyz\r\nabc
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -4932,4 +4932,14 @@
|
||||||
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
|
/(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
|
||||||
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
||||||
|
|
||||||
|
/\n/firstline
|
||||||
|
xyz\nabc
|
||||||
|
|
||||||
|
/\nabc/firstline
|
||||||
|
xyz\nabc
|
||||||
|
|
||||||
|
/\x{0a}abc/firstline,newline=crlf
|
||||||
|
\= Expect no match
|
||||||
|
xyz\r\nabc
|
||||||
|
|
||||||
# End of testinput6
|
# End of testinput6
|
||||||
|
|
|
@ -16440,6 +16440,19 @@ Callout (15): 'XXX'
|
||||||
^^ b
|
^^ b
|
||||||
No match
|
No match
|
||||||
|
|
||||||
|
/\n/firstline
|
||||||
|
xyz\nabc
|
||||||
|
0: \x0a
|
||||||
|
|
||||||
|
/\nabc/firstline
|
||||||
|
xyz\nabc
|
||||||
|
0: \x0aabc
|
||||||
|
|
||||||
|
/\x{0a}abc/firstline,newline=crlf
|
||||||
|
\= Expect no match
|
||||||
|
xyz\r\nabc
|
||||||
|
No match
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -65: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
|
@ -7753,4 +7753,17 @@ No match
|
||||||
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
.*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
|
||||||
Failed: error -47: match limit exceeded
|
Failed: error -47: match limit exceeded
|
||||||
|
|
||||||
|
/\n/firstline
|
||||||
|
xyz\nabc
|
||||||
|
0: \x0a
|
||||||
|
|
||||||
|
/\nabc/firstline
|
||||||
|
xyz\nabc
|
||||||
|
0: \x0aabc
|
||||||
|
|
||||||
|
/\x{0a}abc/firstline,newline=crlf
|
||||||
|
\= Expect no match
|
||||||
|
xyz\r\nabc
|
||||||
|
No match
|
||||||
|
|
||||||
# End of testinput6
|
# End of testinput6
|
||||||
|
|
Loading…
Reference in New Issue