From 4d35b44b438a49873737cc263f568ffdc12bb2d9 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Thu, 23 Apr 2015 17:28:39 +0000
Subject: [PATCH] Fix compatibility issues for \8 and \9.

---
 ChangeLog            |  3 +++
 doc/pcre2pattern.3   | 31 +++++++++++++++---------------
 doc/pcre2syntax.3    | 45 +++++++++++++++++++++++++++-----------------
 src/pcre2_compile.c  | 24 ++++++++++++++---------
 testdata/testinput1  |  6 ++++++
 testdata/testinput2  | 11 +++++++++++
 testdata/testoutput1 | 20 ++++++++++++++++++++
 testdata/testoutput2 | 30 +++++++++++++++++++++++++++++
 8 files changed, 129 insertions(+), 41 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index cfcb9b6..5125c90 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -94,6 +94,9 @@ fuzzer: see http://lcamtuf.coredump.cx/afl/.
 
 23. Added the PCRE2_ALT_CIRCUMFLEX option.
 
+24. Adjust the treatment of \8 and \9 to be the same as the current Perl 
+behaviour.
+
 
 Version 10.10 06-March-2015
 ---------------------------
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
index 9a18ca5..b0f27d9 100644
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "22 April 2015" "PCRE2 10.20"
+.TH PCRE2PATTERN 3 "23 April 2015" "PCRE2 10.20"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -387,11 +387,13 @@ numbers, and \eg{} to specify back references. The following paragraphs
 describe the old, ambiguous syntax.
 .P
 The handling of a backslash followed by a digit other than 0 is complicated,
-and Perl has changed in recent releases, causing PCRE2 also to change. Outside
-a character class, PCRE2 reads the digit and any following digits as a decimal
-number. If the number is less than 8, or if there have been at least that many
-previous capturing left parentheses in the expression, the entire sequence is
-taken as a \fIback reference\fP. A description of how this works is given
+and Perl has changed over time, causing PCRE2 also to change.
+.P
+Outside a character class, PCRE2 reads the digit and any following digits as a
+decimal number. If the number is less than 10, or begins with the digit 8 or 9,
+or if there are at least that many previous capturing left parentheses in the
+expression, the entire sequence is taken as a \fIback reference\fP. A
+description of how this works is given
 .\" HTML <a href="#backreferences">
 .\" </a>
 later,
@@ -399,14 +401,14 @@ later,
 following the discussion of
 .\" HTML <a href="#subpattern">
 .\" </a>
-parenthesized subpatterns.
+parenthesized subpatterns. 
 .\"
+Otherwise, up to three octal digits are read to form a character code.
 .P
-Inside a character class, or if the decimal number following \e is greater than
-7 and there have not been that many capturing subpatterns, PCRE2 handles \e8
-and \e9 as the literal characters "8" and "9", and otherwise re-reads up to
-three octal digits following the backslash, using them to generate a data
-character. Any subsequent digits stand for themselves. For example:
+Inside a character class, PCRE2 handles \e8 and \e9 as the literal characters
+"8" and "9", and otherwise reads up to three octal digits following the
+backslash, using them to generate a data character. Any subsequent digits stand
+for themselves. For example, outside a character class:
 .sp
   \e040   is another way of writing an ASCII space
 .\" JOIN
@@ -425,8 +427,7 @@ character. Any subsequent digits stand for themselves. For example:
   \e377   might be a back reference, otherwise
             the value 255 (decimal)
 .\" JOIN
-  \e81    is either a back reference, or the two
-            characters "8" and "1"
+  \e81    is always a back reference
 .sp
 Note that octal values of 100 or greater that are specified using this syntax
 must not be introduced by a leading zero, because no more than three octal
@@ -3337,6 +3338,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 22 April 2015
+Last updated: 23 April 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3
index bd5eabc..398be1e 100644
--- a/doc/pcre2syntax.3
+++ b/doc/pcre2syntax.3
@@ -1,4 +1,4 @@
-.TH PCRE2SYNTAX 3 "22 April 2015" "PCRE2 10.20"
+.TH PCRE2SYNTAX 3 "23 April 2015" "PCRE2 10.20"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@@ -19,7 +19,7 @@ documentation. This document contains a quick-reference summary of the syntax.
   \eQ...\eE    treat enclosed characters as literal
 .
 .
-.SH "CHARACTERS"
+.SH "ESCAPED CHARACTERS"
 .rs
 .sp
   \ea         alarm, that is, the BEL character (hex 07)
@@ -32,17 +32,28 @@ documentation. This document contains a quick-reference summary of the syntax.
   \e0dd       character with octal code 0dd
   \eddd       character with octal code ddd, or backreference
   \eo{ddd..}  character with octal code ddd..
-  \eU         "U" if PCRE2_ALT_BSUX is set (otherwise is an error) 
+  \eU         "U" if PCRE2_ALT_BSUX is set (otherwise is an error)
   \euhhhh     character with hex code hhhh (if PCRE2_ALT_BSUX is set)
-  \exhh       character with hex code hh 
+  \exhh       character with hex code hh
   \ex{hhh..}  character with hex code hhh..
 .sp
-Note that \e0dd is always an octal code, and that \e8 and \e9 are the literal
-characters "8" and "9". When \ex is not followed by {, from zero to two 
-hexadecimal digits are read, but if PCRE2_ALT_BSUX is set, \ex must be followed 
-by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise 
-it matches a literal "x". Likewise, if \eu (in ALT_BSUX mode) is not followed 
-by four hexadecimal digits, it matches a literal "u".
+Note that \e0dd is always an octal code. The treatment of backslash followed by
+a non-zero digit is complicated; for details see the section
+.\" HTML <a href="pcre2pattern.html#digitsafterbackslash">
+.\" </a>
+"Non-printing characters"
+.\"
+in the
+.\" HREF
+\fBpcre2pattern\fP
+.\"
+documentation.
+.P
+When \ex is not followed by {, from zero to two hexadecimal digits are read,
+but if PCRE2_ALT_BSUX is set, \ex must be followed by two hexadecimal digits to
+be recognized as a hexadecimal escape; otherwise it matches a literal "x".
+Likewise, if \eu (in ALT_BSUX mode) is not followed by four hexadecimal digits,
+it matches a literal "u".
 .
 .
 .SH "CHARACTER TYPES"
@@ -329,7 +340,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
   \eB          not a word boundary
   ^           start of subject
                 also after an internal newline in multiline mode
-                (after any newline if PCRE2_ALT_CIRCUMFLEX is set) 
+                (after any newline if PCRE2_ALT_CIRCUMFLEX is set)
   \eA          start of subject
   $           end of subject
                 also before newline at end of subject
@@ -407,8 +418,8 @@ appear.
   (*UCP)          set PCRE2_UCP (use Unicode properties for \ed etc)
 .sp
 Note that LIMIT_MATCH and LIMIT_RECURSION can only reduce the value of the
-limits set by the caller of pcre2_match(), not increase them. The application 
-can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or 
+limits set by the caller of pcre2_match(), not increase them. The application
+can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or
 PCRE2_NEVER_UCP options, respectively, at compile time.
 .
 .
@@ -530,9 +541,9 @@ pattern is not anchored.
   (?Cn)           callout with numerical data n
   (?C"text")      callout with string data
 .sp
-The allowed string delimiters are ` ' " ^ % # $ (which are the same for the 
-start and the end), and the starting delimiter { matched with the ending 
-delimiter }. To encode the ending delimiter within the string, double it.   
+The allowed string delimiters are ` ' " ^ % # $ (which are the same for the
+start and the end), and the starting delimiter { matched with the ending
+delimiter }. To encode the ending delimiter within the string, double it.
 .
 .
 .SH "SEE ALSO"
@@ -556,6 +567,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 22 April 2015
+Last updated: 23 April 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index c2405eb..582ca7f 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -1868,9 +1868,9 @@ else
     Outside a character class, the digits are read as a decimal number. If the
     number is less than 10, or if there are that many previous extracting left
     brackets, it is a back reference. Otherwise, up to three octal digits are
-    read to form an escaped byte. Thus \123 is likely to be octal 123 (cf
-    \0123, which is octal 012 followed by the literal 3). If the octal value is
-    greater than 377, the least significant 8 bits are taken.
+    read to form an escaped character code. Thus \123 is likely to be octal 123
+    (cf \0123, which is octal 012 followed by the literal 3). If the octal
+    value is greater than 377, the least significant 8 bits are taken.
 
     Inside a character class, \ followed by a digit is always either a literal
     8 or 9 or an octal number. */
@@ -1899,18 +1899,24 @@ else
         *errorcodeptr = ERR61;
         break;
         }
-      if (s < 10 || s <= cb->bracount)  /* Check for back reference */
+        
+      /* \1 to \9 are always back references. \8x and \9x are always back
+      references too; \1x to \7x are back references only if there are at
+      least that many previous captures, and otherwise octal escapes. */
+ 
+      if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
         {
-        escape = -s;
+        escape = -s;     /* Indicates a back reference */
         break;
         }
       ptr = oldptr;      /* Put the pointer back and fall through */
       }
 
-    /* Handle a digit following \ when the number is not a back reference. If
-    the first digit is 8 or 9, Perl used to generate a binary zero byte and
-    then treat the digit as a following literal. At least by Perl 5.18 this
-    changed so as not to insert the binary zero. */
+    /* Handle a digit following \ when the number is not a back reference, or 
+    we are within a character class. If the first digit is 8 or 9, Perl used to
+    generate a binary zero byte and then treat the digit as a following
+    literal. At least by Perl 5.18 this changed so as not to insert the binary
+    zero. */
 
     if ((c = *ptr) >= CHAR_8) break;
 
diff --git a/testdata/testinput1 b/testdata/testinput1
index e5a0d0f..345df9c 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5715,4 +5715,10 @@ name)/mark
 "(?1)(?#?'){8}(a)"
     baaaaaaaaac
 
+/((((((((((((x))))))))))))\12/
+    xx
+
+/A[\8]B[\9]C/
+    A8B9C
+
 # End of testinput1 
diff --git a/testdata/testinput2 b/testdata/testinput2
index 9116d37..f7e9191 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4279,4 +4279,15 @@ a random value. /Ix
 /^/gm,alt_circumflex
     \n\n\n
 
+/((((((((x))))))))\81/
+    xx1
+
+/((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))\80/
+    xx
+
+/\80/
+
+/A\8B\9C/
+    A8B9C
+
 # End of testinput2 
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index bd5c222..f85a151 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9427,4 +9427,24 @@ No match
  0: aaaaaaaaa
  1: a
 
+/((((((((((((x))))))))))))\12/
+    xx
+ 0: xx
+ 1: x
+ 2: x
+ 3: x
+ 4: x
+ 5: x
+ 6: x
+ 7: x
+ 8: x
+ 9: x
+10: x
+11: x
+12: x
+
+/A[\8]B[\9]C/
+    A8B9C
+ 0: A8B9C
+
 # End of testinput1 
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 3ce9b71..47cf1cb 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -14318,4 +14318,34 @@ No match
  0: 
  0: 
 
+/((((((((x))))))))\81/
+Failed: error 115 at offset 20: reference to non-existent subpattern
+    xx1
+
+/((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))\80/
+    xx
+Matched, but too many substrings
+ 0: xx
+ 1: x
+ 2: x
+ 3: x
+ 4: x
+ 5: x
+ 6: x
+ 7: x
+ 8: x
+ 9: x
+10: x
+11: x
+12: x
+13: x
+14: x
+
+/\80/
+Failed: error 115 at offset 3: reference to non-existent subpattern
+
+/A\8B\9C/
+Failed: error 115 at offset 7: reference to non-existent subpattern
+    A8B9C
+
 # End of testinput2