From 7a6e8a445450e3007774a4f7d5dac23a0ef43587 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Mon, 1 Jan 2018 14:12:35 +0000
Subject: [PATCH] Fix PCRE2_FIRSTLINE bug when a pattern match starts with the
 first code unit of a newline sequence.

---
 ChangeLog             | 12 +++++++++---
 src/pcre2_dfa_match.c | 21 +++++++++++++++------
 src/pcre2_match.c     | 21 +++++++++++++++------
 testdata/testinput2   | 10 ++++++++++
 testdata/testinput6   | 10 ++++++++++
 testdata/testoutput2  | 13 +++++++++++++
 testdata/testoutput6  | 13 +++++++++++++
 7 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 250b107..86e2e6d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -100,11 +100,17 @@ code for -g that handles the case when \K in an assertion causes the match to
 end at the original start point. Also arranged for it to detect when \K causes
 the end of a match to be before its start.
 
-24. Similar to 23 above, strange things (including loops) could happen in 
-pcre2grep when \K was used in an assertion when --colour was used or in 
-multiline mode. The "end at original start point" bug is fixed, and if the end 
+24. Similar to 23 above, strange things (including loops) could happen in
+pcre2grep when \K was used in an assertion when --colour was used or in
+multiline mode. The "end at original start point" bug is fixed, and if the end
 point is found to be before the start point, they are swapped.
 
+25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT
+matching (both pcre2_match() and pcre2_dfa_match()) and the matched string
+started with the first code unit of a newline sequence, matching failed because
+the search for the first code unit stopped before rather than after the first
+code unit of a newline in the subject string.
+
 
 Version 10.30 14-August-2017
 ----------------------------
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index b0cd866..9c1d805 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2017 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -3367,9 +3367,11 @@ for (;;)
 
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */
 
     if (firstline)
       {
@@ -3377,7 +3379,7 @@ for (;;)
 #ifdef SUPPORT_UNICODE
       if (utf)
         {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
           {
           t++;
           ACROSSCHAR(t < end_subject, *t, t++);
@@ -3385,7 +3387,14 @@ for (;;)
         }
       else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
       end_subject = t;
       }
 
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 265fade..8872345 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -6367,9 +6367,11 @@ for(;;)
 
     /* If firstline is TRUE, the start of the match is constrained to the first
     line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */
 
     if (firstline)
       {
@@ -6377,7 +6379,7 @@ for(;;)
 #ifdef SUPPORT_UNICODE
       if (utf)
         {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
           {
           t++;
           ACROSSCHAR(t < end_subject, *t, t++);
@@ -6385,7 +6387,14 @@ for(;;)
         }
       else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
       end_subject = t;
       }
 
@@ -6648,7 +6657,7 @@ for(;;)
 
   cb.start_match = (PCRE2_SIZE)(start_match - subject);
   cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
-    
+
   mb->start_used_ptr = start_match;
   mb->last_used_ptr = start_match;
   mb->match_call_count = 0;
diff --git a/testdata/testinput2 b/testdata/testinput2
index 942ec45..fe8efbf 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5395,4 +5395,14 @@ a)"xI
 \= Expect no match
     aac\=callout_extra 
 
+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput2
diff --git a/testdata/testinput6 b/testdata/testinput6
index ce2e082..614c3a0 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -4932,4 +4932,14 @@
 /(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
 
+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput6
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index b7177ce..62ec12f 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -16440,6 +16440,19 @@ Callout (15): 'XXX'
      ^^     b
 No match
 
+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index 60e8349..998f20b 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -7753,4 +7753,17 @@ No match
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
 Failed: error -47: match limit exceeded
 
+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput6