From 911b8009849ef4cfea5853d17f53f5ec84d8a0c7 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Tue, 27 Jan 2015 17:21:32 +0000
Subject: [PATCH] Fix incorrect size calculation when a reference to a
 duplicate name occurs in a part of the pattern where PCRE2_DUPNAMES is not
 set.

---
 ChangeLog              |  9 +++++++
 src/pcre2_compile.c    | 54 ++++++++++++++++++------------------------
 src/pcre2_intmodedep.h |  1 -
 testdata/testinput2    |  4 ++++
 testdata/testoutput2   |  4 ++++
 5 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 4ba5dc7..9f704de 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,6 +31,15 @@ in order to avoid accessing uninitialized data when serializing.
 
 5. The (*NO_JIT) feature is implemented.
 
+6. If a bug that caused pcre2_compile() to use more memory than allocated was
+triggered when using valgrind, the code in (3) above passed a stupidly large 
+value to valgrind. This caused a crash instead of an "internal error" return.
+
+7. A reference to a duplicated named group (either a back reference or a test 
+for being set in a conditional) that occurred in a part of the pattern where 
+PCRE2_DUPNAMES was not set caused the amount of memory needed for the pattern 
+to be incorrectly calculated, leading to overwriting.
+
 
 Version 10.00 05-January-2015
 -----------------------------
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index bf9795e..ffd2a2b 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -5296,9 +5296,10 @@ for (;; ptr++)
           }
 
         /* Otherwise we expect to read a name; anything else is an error. When
-        a name is one of a number of duplicates, a different opcode is used and
-        it needs more memory. Unfortunately we cannot tell whether a name is a
-        duplicate in the first pass, so we have to allow for more memory. */
+        the referenced name is one of a number of duplicates, a different
+        opcode is used and it needs more memory. Unfortunately we cannot tell
+        whether this is the case in the first pass, so we have to allow for
+        more memory always. */
 
         else
           {
@@ -5318,8 +5319,7 @@ for (;; ptr++)
             ptr++;
             }
           namelen = (int)(ptr - name);
-          if (lengthptr != NULL && (options & PCRE2_DUPNAMES) != 0)
-            *lengthptr += IMM2_SIZE;
+          if (lengthptr != NULL) *lengthptr += IMM2_SIZE;
           }
 
         /* Check the terminator */
@@ -5736,15 +5736,15 @@ for (;; ptr++)
             }
           recno = (i < cb->names_found)? ng->number : 0;
 
-          /* Count named back references. */
-
-          if (!is_recurse) cb->namedrefcount++;
-
           /* If duplicate names are permitted, we have to allow for a named
           reference to a duplicated name (this cannot be determined until the
-          second pass). This needs an extra 16-bit data item. */
+          second pass). This needs an extra data item. Counting named back 
+          references and incrementing the count at the end does not work 
+          because it does not account for duplication of groups containing such 
+          references. Nor does checking for PCRE2_DUPNAMES because that need 
+          not be set at the point of reference. */
 
-          if ((options & PCRE2_DUPNAMES) != 0) *lengthptr += IMM2_SIZE;
+          *lengthptr += IMM2_SIZE;
           }
 
         /* In the real compile, search the name table. We check the name
@@ -6874,7 +6874,7 @@ for (;;)
       }
 
     /* Fill in the ket */
-
+    
     *code = OP_KET;
     PUT(code, 1, (int)(code - start_bracket));
     code += 1 + LINK_SIZE;
@@ -7479,7 +7479,6 @@ cb.name_entry_size = 0;
 cb.name_table = NULL;
 cb.named_groups = named_groups;
 cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
-cb.namedrefcount = 0;
 cb.names_found = 0;
 cb.open_caps = NULL;
 cb.parens_depth = 0;
@@ -7668,13 +7667,6 @@ if (length > MAX_PATTERN_SIZE)
   goto HAD_ERROR;
   }
 
-/* If there are groups with duplicate names and there are also references by
-name, we must allow for the possibility of named references to duplicated
-groups. These require an extra data item each. */
-
-if (cb.dupnames && cb.namedrefcount > 0)
-  length += cb.namedrefcount * IMM2_SIZE * sizeof(PCRE2_UCHAR);
-
 /* Compute the size of, and then get and initialize, the data block for storing
 the compiled pattern and names table. Integer overflow should no longer be
 possible because nowadays we limit the maximum value of cb.names_found and
@@ -7689,7 +7681,7 @@ if (re == NULL)
   errorcode = ERR21;
   goto HAD_ERROR;
   }
-
+  
 re->memctl = ccontext->memctl;
 re->tables = tables;
 re->executable_jit = NULL;
@@ -7775,22 +7767,22 @@ if (cb.had_accept)
   }
 
 /* If we have not reached end of pattern after a successful compile, there's an
-excess bracket. Otherwise, fill in the final opcode and check for disastrous
-overflow. */
+excess bracket. Fill in the final opcode and check for disastrous overflow.
+If no overflow, but the estimated length exceeds the really used length, adjust
+the value of re->blocksize, and if valgrind support is configured, mark the
+extra allocated memory as unaddressable, so that any out-of-bound reads can be
+detected. */
 
 if (errorcode == 0 && ptr < cb.end_pattern) errorcode = ERR22;
 *code++ = OP_END;
 usedlength = code - codestart;
-if (usedlength > length) errorcode = ERR23;
-
-/* If the estimated length exceeds the really used length, adjust the value of
-re->blocksize, and if valgrind support is configured, mark the extra allocated
-memory as unaddressable, so that any out-of-bound reads can be detected. */
-
-re->blocksize -= CU2BYTES(length - usedlength);
+if (usedlength > length) errorcode = ERR23; else
+  {
+  re->blocksize -= CU2BYTES(length - usedlength);
 #ifdef SUPPORT_VALGRIND
-VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
+  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
 #endif
+  }
 
 /* Fill in any forward references that are required. There may be repeated
 references; optimize for them, as searching a large regex takes time. */
diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
index 44acb89..4e57ed4 100644
--- a/src/pcre2_intmodedep.h
+++ b/src/pcre2_intmodedep.h
@@ -677,7 +677,6 @@ typedef struct compile_block {
   uint32_t final_bracount;         /* Saved value after first pass */
   uint32_t top_backref;            /* Maximum back reference */
   uint32_t backref_map;            /* Bitmap of low back refs */
-  uint32_t namedrefcount;          /* Number of backreferences by name */
   uint32_t nltype;                 /* Newline type */
   uint32_t nllen;                  /* Newline string length */
   PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
diff --git a/testdata/testinput2 b/testdata/testinput2
index afd04cc..ad6f0a3 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4134,4 +4134,8 @@ a random value. /Ix
     aa123\=ovector=2
     aa123\=ovector=3
 
+/(?<N111>(?J)(?<N111>1(111111)11|)1|1|)(?(<N111>)1)/
+
+/(?<N>(?J)(?<N>))(?-J)\k<N>/
+
 # End of testinput2 
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 5588fdb..da01b4e 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -13893,4 +13893,8 @@ Matched, but too many substrings
  1: <unset>
  2: a
 
+/(?<N111>(?J)(?<N111>1(111111)11|)1|1|)(?(<N111>)1)/
+
+/(?<N>(?J)(?<N>))(?-J)\k<N>/
+
 # End of testinput2