From e3e4131379eb19b261a04edd60b05bff17fa3e81 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Wed, 5 Nov 2014 16:05:19 +0000
Subject: [PATCH] Fix bug for (*ACCEPT) inside a capturing group.

---
 ChangeLog            |  7 +++++++
 src/pcre2_match.c    | 17 ++++++++++++++---
 src/pcre2test.c      | 26 ++++++++++++++++++++++----
 testdata/testinput1  |  3 +++
 testdata/testoutput1 |  9 +++++++++
 5 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cdca10a..eecc01c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -44,4 +44,11 @@ strings matched by the repetition are not all the same length.
 information. This applied to any pattern with a group that matched no
 characters, for example: /(?:(?=.)|(?<!x))a/.
 
+7. When an (*ACCEPT) is triggered inside capturing parentheses, it arranges for
+those parentheses to be closed with whatever has been captured so far. However,
+it was failing to mark any other groups between the highest capture so far and
+the current group as "unset". Thus, the ovector for those groups contained
+whatever was previously there. An example is the pattern /(x)|((*ACCEPT))/ when
+matched against "abcd".
+
 ****
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index 94309ee..b66dd35 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -1465,7 +1465,18 @@ for (;;)
       mb->ovector[offset] =
         mb->ovector[mb->offset_end - number];
       mb->ovector[offset+1] = eptr - mb->start_subject;
-      if (offset_top <= offset) offset_top = offset + 2;
+
+      /* If this group is at or above the current high water mark, ensure that
+      any groups between the current high water mark and this group are marked
+      unset and then update the high water mark. */
+
+      if (offset >= offset_top)
+        {
+        register PCRE2_SIZE *iptr = mb->ovector + offset_top;
+        register PCRE2_SIZE *iend = mb->ovector + offset;
+        while (iptr < iend) *iptr++ = PCRE2_UNSET;
+        offset_top = offset + 2;
+        }
       }
     ecode += 1 + IMM2_SIZE;
     break;
@@ -6321,18 +6332,18 @@ while (nextframe != NULL)
 *           Match a Regular Expression           *
 *************************************************/
 
-/* This function applies a compiled re to a subject string and picks out
+/* This function applies a compiled pattern to a subject string and picks out
 portions of the string if it matches. Two elements in the vector are set for
 each substring: the offsets to the start and end of the substring.
 
 Arguments:
-  context         points a PCRE2 context
   code            points to the compiled expression
   subject         points to the subject string
   length          length of subject string (may contain binary zeros)
   start_offset    where to start in the subject string
   options         option bits
   match_data      points to a match_data block
+  mcontext        points to a PCRE2 context
 
 Returns:          > 0 => success; value is the number of ovector pairs filled
                   = 0 => success, but ovector is not big enough
diff --git a/src/pcre2test.c b/src/pcre2test.c
index 66b2e19..7146733 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -163,6 +163,7 @@ void vms_setsymbol( char *, char *, int );
 #define CFAIL_UNSET UINT32_MAX  /* Unset value for cfail fields */
 #define DFA_WS_DIMENSION 1000   /* Size of DFA workspace */
 #define DEFAULT_OVECCOUNT 15    /* Default ovector count */
+#define JUNK_OFFSET 0xdeadbeef  /* For initializing ovector */
 #define LOOPREPEAT 500000       /* Default loop count for timing */
 #define VERSION_SIZE 64         /* Size of buffer for the version strings */
 
@@ -4685,12 +4686,18 @@ else
 
 for (gmatched = 0;; gmatched++)
   {
+  PCRE2_SIZE j;
   int capcount;
   PCRE2_SIZE *ovector;
   PCRE2_SIZE ovecsave[2];
 
   ovector = FLD(match_data, ovector);
 
+  /* Fill the ovector with junk to detect elements that do not get set
+  when they should be. */
+    
+  for (j = 0; j < 2*dat_datctl.oveccount; j++) ovector[j] = JUNK_OFFSET;
+
   /* When matching is via pcre2_match(), we will detect the use of JIT via the
   stack callback function. */
 
@@ -4786,7 +4793,7 @@ for (gmatched = 0;; gmatched++)
       {
       PCRE2_SET_CALLOUT(dat_context, NULL, NULL);  /* No callout */
       }
-
+      
     /* Run a single DFA or NFA match. */
 
     if ((dat_datctl.control & CTL_DFA) != 0)
@@ -4887,14 +4894,27 @@ for (gmatched = 0;; gmatched++)
         fprintf(outfile, "Start of matched string is beyond its end - "
           "displaying from end to start.\n");
         }
-
+        
       fprintf(outfile, "%2d: ", i/2);
+
+      /* Check for an unset group */
+
       if (start == PCRE2_UNSET)
         {
         fprintf(outfile, "<unset>\n");
         continue;
         }
 
+      /* Check for silly offsets, in particular, values that have not been set
+      when they should have been. The casts match the %lx conversions. */
+
+      if (start > ulen || end > ulen)
+        {
+        fprintf(outfile, "ERROR: bad value(s) for offset(s): 0x%lx 0x%lx\n",
+          (unsigned long int)start, (unsigned long int)end);
+        continue;
+        }
+
       /* When JIT is not being used, ALLUSEDTEXT may be set. (It if is set with
       JIT, it is disabled above, with a comment.) When the match is done by the
       interpreter, leftchar and rightchar are available, and if ALLUSEDTEXT is
@@ -4918,7 +4938,6 @@ for (gmatched = 0;; gmatched++)
 
         if (showallused)
           {
-          PCRE2_SIZE j;
           PCHARS(lleft, pp, leftchar, start - leftchar, utf, outfile);
           PCHARS(lmiddle, pp, start, end - start, utf, outfile);
           PCHARS(lright, pp, end, rightchar - end, utf, outfile);
@@ -4944,7 +4963,6 @@ for (gmatched = 0;; gmatched++)
             fprintf(outfile, " (JIT)");
           if (startchar != start)
             {
-            PCRE2_SIZE j;
             fprintf(outfile, "\n    ");
             for (j = 0; j < lleft; j++) fprintf(outfile, "^");
             }
diff --git a/testdata/testinput1 b/testdata/testinput1
index 2aea25d..fa6c203 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5702,4 +5702,7 @@ name)/mark
    abd
    xyd 
 
+/(?:((abcd))|(((?:(?:(?:(?:abc|(?:abcdef))))b)abcdefghi)abc)|((*ACCEPT)))/
+    1234abcd
+
 # End of testinput1 
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index 8240536..6bd2e9e 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9403,4 +9403,13 @@ No match
    xyd 
  0: d
 
+/(?:((abcd))|(((?:(?:(?:(?:abc|(?:abcdef))))b)abcdefghi)abc)|((*ACCEPT)))/
+    1234abcd
+ 0: 
+ 1: <unset>
+ 2: <unset>
+ 3: <unset>
+ 4: <unset>
+ 5: 
+
 # End of testinput1