From 7de013bac3e1f6b2a199a5fe795991a6144d3865 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Fri, 4 Jan 2019 16:41:32 +0000
Subject: [PATCH] Fix issues with BAD_ESCAPE_IS_LITERAL in character classes.

---
 ChangeLog              | 10 ++++++++++
 doc/html/pcre2api.html | 13 ++++++++-----
 doc/pcre2.txt          | 12 ++++++++----
 doc/pcre2api.3         | 15 +++++++++------
 src/pcre2_compile.c    | 40 +++++++++++++++++++++-------------------
 src/pcre2_error.c      |  4 ++--
 testdata/testinput2    | 14 +++++++++++++-
 testdata/testoutput2   | 30 ++++++++++++++++++++++++------
 8 files changed, 95 insertions(+), 43 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cf04444..7689f25 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -102,6 +102,16 @@ for the stack as it needs for -bigstack.
 
 25. Insert a cast in pcre2_dfa_match.c to suppress a compiler warning.
 
+26. With PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL set, escape sequences such as \s 
+which are valid in character classes, but not as the end of ranges, were being 
+treated as literals. An example is [_-\s] (but not [\s-_] because that gave an 
+error at the *start* of a range). Now an "invalid range" error is given 
+independently of PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL.
+
+27. Related to 26 above, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL was affecting known
+escape sequences such as \X when they appeared invalidly in a character class.
+Now the option applies only to unrecognized or malformed escape sequences.
+
 
 Version 10.32 10-September-2018
 -------------------------------
diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html
index 9e02fda..018a077 100644
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@@ -1870,11 +1870,14 @@ always causes an error in Perl.
 </P>
 <P>
 If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to
-<b>pcre2_compile()</b>, all unrecognized or erroneous escape sequences are
+<b>pcre2_compile()</b>, all unrecognized or malformed escape sequences are
 treated as single-character escapes. For example, \j is a literal "j" and
 \x{2z} is treated as the literal string "x{2z}". Setting this option means
-that typos in patterns may go undetected and have unexpected results. This is a
-dangerous option. Use with care.
+that typos in patterns may go undetected and have unexpected results. Also note 
+that a sequence such as [\N{] is interpreted as a malformed attempt at
+[\N{...}] and so is treated as [N{] whereas [\N] gives an error because an
+unqualified \N is a valid escape sequence but is not supported in a character
+class. To reiterate: this is a dangerous option. Use with great care.
 <pre>
   PCRE2_EXTRA_ESCAPED_CR_IS_LF
 </pre>
@@ -3782,9 +3785,9 @@ Cambridge, England.
 </P>
 <br><a name="SEC42" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 27 November 2018
+Last updated: 04 January 2019
 <br>
-Copyright &copy; 1997-2018 University of Cambridge.
+Copyright &copy; 1997-2019 University of Cambridge.
 <br>
 <p>
 Return to the <a href="index.html">PCRE2 index page</a>.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index fa5ef88..0a54e89 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -1846,11 +1846,15 @@ COMPILING A PATTERN
        Perl.
 
        If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL  extra  option  is  passed  to
-       pcre2_compile(),  all  unrecognized  or  erroneous escape sequences are
+       pcre2_compile(),  all  unrecognized  or  malformed escape sequences are
        treated as single-character escapes. For example, \j is a  literal  "j"
        and  \x{2z}  is  treated  as  the  literal string "x{2z}". Setting this
        option means that typos in patterns may go undetected  and  have  unex-
-       pected results. This is a dangerous option. Use with care.
+       pected  results. Also note that a sequence such as [\N{] is interpreted
+       as a malformed attempt at [\N{...}] and so is treated as  [N{]  whereas
+       [\N]  gives  an  error  because  an  unqualified  \N  is a valid escape
+       sequence but is not supported in a character class. To reiterate:  this
+       is a dangerous option. Use with great care.
 
          PCRE2_EXTRA_ESCAPED_CR_IS_LF
 
@@ -3654,8 +3658,8 @@ AUTHOR
 
 REVISION
 
-       Last updated: 27 November 2018
-       Copyright (c) 1997-2018 University of Cambridge.
+       Last updated: 04 January 2019
+       Copyright (c) 1997-2019 University of Cambridge.
 ------------------------------------------------------------------------------
  
  
diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
index 5d64bc9..f20de25 100644
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@@ -1,4 +1,4 @@
-.TH PCRE2API 3 "27 November 2018" "PCRE2 10.33"
+.TH PCRE2API 3 "04 January 2019" "PCRE2 10.33"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@@ -1825,11 +1825,14 @@ Perl's warning switch is enabled. However, a malformed octal number after \eo{
 always causes an error in Perl.
 .P
 If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to
-\fBpcre2_compile()\fP, all unrecognized or erroneous escape sequences are
+\fBpcre2_compile()\fP, all unrecognized or malformed escape sequences are
 treated as single-character escapes. For example, \ej is a literal "j" and
 \ex{2z} is treated as the literal string "x{2z}". Setting this option means
-that typos in patterns may go undetected and have unexpected results. This is a
-dangerous option. Use with care.
+that typos in patterns may go undetected and have unexpected results. Also note 
+that a sequence such as [\eN{] is interpreted as a malformed attempt at
+[\eN{...}] and so is treated as [N{] whereas [\eN] gives an error because an
+unqualified \eN is a valid escape sequence but is not supported in a character
+class. To reiterate: this is a dangerous option. Use with great care.
 .sp
   PCRE2_EXTRA_ESCAPED_CR_IS_LF
 .sp
@@ -3790,6 +3793,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 27 November 2018
-Copyright (c) 1997-2018 University of Cambridge.
+Last updated: 04 January 2019
+Copyright (c) 1997-2019 University of Cambridge.
 .fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 95ffeba..eb45210 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2019 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -3346,9 +3346,9 @@ while (ptr < ptrend)
         tempptr = ptr;
         escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode,
           options, TRUE, cb);
+
         if (errorcode != 0)
           {
-          CLASS_ESCAPE_FAILED:
           if ((cb->cx->extra_options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
             goto FAILED;
           ptr = tempptr;
@@ -3359,30 +3359,32 @@ while (ptr < ptrend)
           escape = 0;                 /* Treat as literal character */
           }
 
-        if (escape == 0)  /* Escaped character code point is in c */
+        switch(escape)
           {
+          case 0:  /* Escaped character code point is in c */
           char_is_literal = FALSE;
           goto CLASS_LITERAL;
-          }
 
-        /* These three escapes do not alter the class range state. */
-
-        if (escape == ESC_b)
-          {
-          c = CHAR_BS;   /* \b is backspace in a class */
+          case ESC_b:
+          c = CHAR_BS;    /* \b is backspace in a class */
           char_is_literal = FALSE;
           goto CLASS_LITERAL;
-          }
 
-        else if (escape == ESC_Q)
-          {
+          case ESC_Q:
           inescq = TRUE;  /* Enter literal mode */
           goto CLASS_CONTINUE;
-          }
 
-        else if (escape == ESC_E)  /* Ignore orphan \E */
+          case ESC_E:     /* Ignore orphan \E */
           goto CLASS_CONTINUE;
 
+          case ESC_B:     /* Always an error in a class */
+          case ESC_R:
+          case ESC_X:
+          errorcode = ERR7;
+          ptr--;
+          goto FAILED;
+          }
+
         /* The second part of a range can be a single-character escape
         sequence (detected above), but not any of the other escapes. Perl
         treats a hyphen as a literal in such circumstances. However, in Perl's
@@ -3392,7 +3394,7 @@ while (ptr < ptrend)
         if (class_range_state == RANGE_STARTED)
           {
           errorcode = ERR50;
-          goto CLASS_ESCAPE_FAILED;
+          goto FAILED;  /* Not CLASS_ESCAPE_FAILED; always an error */
           }
 
         /* Of the remaining escapes, only those that define characters are
@@ -3402,8 +3404,8 @@ while (ptr < ptrend)
         switch(escape)
           {
           case ESC_N:
-          errorcode = ERR71;  /* Not supported in a class */
-          goto CLASS_ESCAPE_FAILED;
+          errorcode = ERR71;
+          goto FAILED;
 
           case ESC_H:
           case ESC_h:
@@ -3466,14 +3468,14 @@ while (ptr < ptrend)
             }
 #else
           errorcode = ERR45;
-          goto CLASS_ESCAPE_FAILED;
+          goto FAILED;
 #endif
           break;  /* End \P and \p */
 
           default:    /* All others are not allowed in a class */
           errorcode = ERR7;
           ptr--;
-          goto CLASS_ESCAPE_FAILED;
+          goto FAILED;
           }
 
         /* Perl gives a warning unless a following hyphen is the last character
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
index 1aefe7b..4c8127c 100644
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
      Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2019 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -71,7 +71,7 @@ static const unsigned char compile_error_texts[] =
   /* 5 */
   "number too big in {} quantifier\0"
   "missing terminating ] for character class\0"
-  "invalid escape sequence in character class\0"
+  "escape sequence is invalid in character class\0"
   "range out of order in character class\0"
   "quantifier does not follow a repeatable item\0"
   /* 10 */
diff --git a/testdata/testinput2 b/testdata/testinput2
index d471284..a6e44dd 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5304,10 +5304,22 @@ a)"xI
 
 /\N{\c/IB,bad_escape_is_literal
 
-/[\j\x{z}\o\gA-\Nb-\g]/B,bad_escape_is_literal
+/[\j\x{z}\o\gAb\g]/B,bad_escape_is_literal
 
 /[Q-\N]/B,bad_escape_is_literal
 
+/[\s-_]/bad_escape_is_literal
+
+/[_-\s]/bad_escape_is_literal
+
+/[\B\R\X]/B
+
+/[\B\R\X]/B,bad_escape_is_literal
+
+/[A-\BP-\RV-\X]/B
+
+/[A-\BP-\RV-\X]/B,bad_escape_is_literal
+
 # ----------------------------------------------------------------------
 
 /a\b(c/literal
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 47eabf3..ecc5d21 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -135,13 +135,13 @@ Failed: error 105 at offset 7: number too big in {} quantifier
 Failed: error 106 at offset 5: missing terminating ] for character class
 
 /[\B]/B
-Failed: error 107 at offset 2: invalid escape sequence in character class
+Failed: error 107 at offset 2: escape sequence is invalid in character class
 
 /[\R]/B
-Failed: error 107 at offset 2: invalid escape sequence in character class
+Failed: error 107 at offset 2: escape sequence is invalid in character class
 
 /[\X]/B
-Failed: error 107 at offset 2: invalid escape sequence in character class
+Failed: error 107 at offset 2: escape sequence is invalid in character class
 
 /[z-a]/
 Failed: error 108 at offset 3: range out of order in character class
@@ -16224,16 +16224,34 @@ First code unit = 'N'
 Last code unit = 'c'
 Subject length lower bound = 3
 
-/[\j\x{z}\o\gA-\Nb-\g]/B,bad_escape_is_literal
+/[\j\x{z}\o\gAb\g]/B,bad_escape_is_literal
 ------------------------------------------------------------------
         Bra
-        [A-Nb-gjoxz{}]
+        [Abgjoxz{}]
         Ket
         End
 ------------------------------------------------------------------
 
 /[Q-\N]/B,bad_escape_is_literal
-Failed: error 108 at offset 4: range out of order in character class
+Failed: error 150 at offset 5: invalid range in character class
+
+/[\s-_]/bad_escape_is_literal
+Failed: error 150 at offset 3: invalid range in character class
+
+/[_-\s]/bad_escape_is_literal
+Failed: error 150 at offset 5: invalid range in character class
+
+/[\B\R\X]/B
+Failed: error 107 at offset 2: escape sequence is invalid in character class
+
+/[\B\R\X]/B,bad_escape_is_literal
+Failed: error 107 at offset 2: escape sequence is invalid in character class
+
+/[A-\BP-\RV-\X]/B
+Failed: error 107 at offset 4: escape sequence is invalid in character class
+
+/[A-\BP-\RV-\X]/B,bad_escape_is_literal
+Failed: error 107 at offset 4: escape sequence is invalid in character class
 
 # ----------------------------------------------------------------------