Fix PCRE2_FIRSTLINE bug when a pattern match starts with the first code unit of

a newline sequence.
2018-01-01 14:12:35 +00:00 · 2018-01-01 14:12:35 +00:00 · 7a6e8a4454
parent 800d884bce
commit 7a6e8a4454
7 changed files with 85 additions and 15 deletions
--- a/12
+++ b/12
@ -100,11 +100,17 @@ code for -g that handles the case when \K in an assertion causes the match to
 end at the original start point. Also arranged for it to detect when \K causes
 the end of a match to be before its start.

-24. Similar to 23 above, strange things (including loops) could happen in 
-pcre2grep when \K was used in an assertion when --colour was used or in 
-multiline mode. The "end at original start point" bug is fixed, and if the end 
+24. Similar to 23 above, strange things (including loops) could happen in
+pcre2grep when \K was used in an assertion when --colour was used or in
+multiline mode. The "end at original start point" bug is fixed, and if the end
 point is found to be before the start point, they are swapped.

+25. When PCRE2_FIRSTLINE without PCRE2_NO_START_OPTIMIZE was used in non-JIT
+matching (both pcre2_match() and pcre2_dfa_match()) and the matched string
+started with the first code unit of a newline sequence, matching failed because
+the search for the first code unit stopped before rather than after the first
+code unit of a newline in the subject string.
+

 Version 10.30 14-August-2017
 ----------------------------
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2017 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@ -3367,9 +3367,11 @@ for (;;)

    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */

    if (firstline)
      {
@ -3377,7 +3379,7 @@ for (;;)
 #ifdef SUPPORT_UNICODE
      if (utf)
        {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
          {
          t++;
          ACROSSCHAR(t < end_subject, *t, t++);
@ -3385,7 +3387,14 @@ for (;;)
        }
      else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
      end_subject = t;
      }

--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@ -6367,9 +6367,11 @@ for(;;)

    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans for a first code unit at a newline. If the
-    match fails at the newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */

    if (firstline)
      {
@ -6377,7 +6379,7 @@ for(;;)
 #ifdef SUPPORT_UNICODE
      if (utf)
        {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
          {
          t++;
          ACROSSCHAR(t < end_subject, *t, t++);
@ -6385,7 +6387,14 @@ for(;;)
        }
      else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
+
+      /* Note that we only need to advance by one code unit if we found a
+      newline. If the newline is CRLF, a first code unit of LF should not
+      match, because it is not at or before the newline. Similarly, only the
+      first code unit of a Unicode newline might be relevant. */
+
+      if (t < end_subject) t++;
      end_subject = t;
      }

@ -6648,7 +6657,7 @@ for(;;)

  cb.start_match = (PCRE2_SIZE)(start_match - subject);
  cb.callout_flags |= PCRE2_CALLOUT_STARTMATCH;
-    
+
  mb->start_used_ptr = start_match;
  mb->last_used_ptr = start_match;
  mb->match_call_count = 0;
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5395,4 +5395,14 @@ a)"xI
 \= Expect no match
    aac\=callout_extra 

+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput2
--- a/testdata/testinput6
+++ b/testdata/testinput6
@ -4932,4 +4932,14 @@
 /(*LIMIT_MATCH=100).*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););/no_dotstar_anchor
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););

+/\n/firstline
+    xyz\nabc
+
+/\nabc/firstline
+    xyz\nabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+
 # End of testinput6
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -16440,6 +16440,19 @@ Callout (15): 'XXX'
     ^^     b
 No match

+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput2
 Error -65: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@ -7753,4 +7753,17 @@ No match
 .*(?![|H]?.*(?![|H]?););.*(?![|H]?.*(?![|H]?););\x00\x00\x00\x00\x00\x00\x00(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?!(?![|);)?.*(![|H]?);)?.*(?![|H]?);)?.*(?![|H]?);)?.*(?![|H]););![|H]?););[|H]?);|H]?);)\x00\x00\x00\x00\x00\x00H]?););?![|H]?);)?.*(?![|H]?););[||H]?);)?.*(?![|H]?););[|H]?);(?![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););;[\x00\x00\x00\x00\x00\x00\x00![|H]?););![|H]?););[|H]?);|H]?);)?.*(?![|H]?););
 Failed: error -47: match limit exceeded

+/\n/firstline
+    xyz\nabc
+ 0: \x0a
+
+/\nabc/firstline
+    xyz\nabc
+ 0: \x0aabc
+
+/\x{0a}abc/firstline,newline=crlf
+\= Expect no match
+    xyz\r\nabc
+No match
+
 # End of testinput6