From fe0a16fe8f94d2674b9d201e488dfd31c4893f74 Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Fri, 24 Jul 2015 18:18:05 +0000
Subject: [PATCH] Make EBCDIC [a-z] type ranges Perl compatible.

---
 ChangeLog           |  4 +++
 doc/pcre2pattern.3  | 19 +++++++---
 src/pcre2_compile.c | 84 ++++++++++++++++++++++++++++++++++++---------
 3 files changed, 85 insertions(+), 22 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index c6d3480..6ea8aae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
 very pedantic coding infelicities and a buffer overflow while checking a UTF-8 
 string if the final multi-byte UTF-8 character was truncated.
 
+22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
+class, where both values are literal letters in the same case, omit the 
+non-letter EBCDIC code points within the range.
+
 
 Version 10.20 30-June-2015
 --------------------------
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
index 04325c7..9f0ff4f 100644
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
 where a range ending character is expected. For example, [z-\exff] is valid,
 but [A-\ed] and [A-[:digit:]] are not.
 .P
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
-can include any characters that are valid for the current mode.
+Ranges normally include all code points between the start and end characters,
+inclusive. They can also be used for code points specified numerically, for
+example [\e000-\e037]. Ranges can include any characters that are valid for the
+current mode.
+.P
+There is a special case in EBCDIC environments for ranges whose end points are 
+both specified as literal letters in the same case. For compatibility with 
+Perl, EBCDIC code points within the range that are not letters are omitted. For 
+example, [h-k] matches only four characters, even though the codes for h and k 
+are 0x88 and 0x92, a range of 11 code points. However, if the range is 
+specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
+are included.
 .P
 If a range that includes letters is used when caseless matching is set, it
 matches the letters in either case. For example, [W-c] is equivalent to
@@ -3367,6 +3376,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 17 July 2015
+Last updated: 24 July 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 08ea585..c25970f 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
         goto FAILED;
         }
       break;
-      
+
       /* Conditional group */
 
       case CHAR_LEFT_PARENTHESIS:
       if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
-        {  
+        {
         nest_depth++;
         ptr += 2;
-        break; 
+        break;
         }
-        
+
       /* Must be an assertion or a callout */
- 
+
       switch(ptr[4])
        {
        case CHAR_LESS_THAN_SIGN:
-       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) 
+       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
          goto MISSING_ASSERTION;
-       /* Fall through */       
+       /* Fall through */
 
        case CHAR_C:
        case CHAR_EXCLAMATION_MARK:
        case CHAR_EQUALS_SIGN:
        ptr++;
        break;
-       
+
        default:
-       MISSING_ASSERTION: 
-       ptr += 3;            /* To improve error message */         
+       MISSING_ASSERTION:
+       ptr += 3;            /* To improve error message */
        errorcode = ERR28;
-       goto FAILED; 
-       }      
+       goto FAILED;
+       }
       break;
 
       case CHAR_COLON:
@@ -3939,7 +3939,7 @@ for (;; ptr++)
       {
       nestptr = ptr + 7;
       ptr = sub_start_of_word;  /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
       continue;
       }
 
@@ -3947,7 +3947,7 @@ for (;; ptr++)
       {
       nestptr = ptr + 7;
       ptr = sub_end_of_word;    /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
       continue;
       }
 
@@ -4046,6 +4046,9 @@ for (;; ptr++)
     for(;;)
       {
       PCRE2_SPTR oldptr;
+#ifdef EBCDIC
+      BOOL range_is_literal = TRUE;
+#endif
 
       if (c == CHAR_NULL && ptr >= cb->end_pattern)
         {
@@ -4226,7 +4229,13 @@ for (;; ptr++)
         {
         escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
         if (*errorcodeptr != 0) goto FAILED;
-        if (escape == 0) c = ec;               /* Escaped single char */
+        if (escape == 0)    /* Escaped single char */
+          {
+          c = ec;
+#ifdef EBCDIC
+          range_is_literal = FALSE;
+#endif
+          }
         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
         else if (escape == ESC_N)          /* \N is not supported in a class */
           {
@@ -4430,7 +4439,9 @@ for (;; ptr++)
             int descape;
             descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
             if (*errorcodeptr != 0) goto FAILED;
-
+#ifdef EBCDIC
+            range_is_literal = FALSE;
+#endif
             /* 0 means a character was put into d; \b is backspace; any other
             special causes an error. */
 
@@ -4476,9 +4487,48 @@ for (;; ptr++)
 
         if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
 
+        /* In an EBCDIC environment, Perl treats alphabetic ranges specially
+        because there are holes in the encoding, and simply using the range A-Z
+        (for example) would include the characters in the holes. This applies
+        only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
+
+#ifdef EBCDIC
+        if (range_is_literal &&
+             (cb->ctypes[c] & ctype_letter) != 0 &&
+             (cb->ctypes[d] & ctype_letter) != 0 &&
+             (c <= CHAR_z) == (d <= CHAR_z))
+          {
+          uint32_t uc = (c <= CHAR_z)? 0 : 64;
+          uint32_t C = c - uc;
+          uint32_t D = d - uc;
+
+          if (C <= CHAR_i)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_i)? D : CHAR_i) + uc);
+            C = CHAR_j;
+            }
+
+          if (C <= D && C <= CHAR_r)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_r)? D : CHAR_r) + uc);
+            C = CHAR_s;
+            }
+
+          if (C <= D)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                D + uc);
+            }
+          }
+        else
+#endif
         class_has_8bitchar +=
           add_to_class(classbits, &class_uchardata, options, cb, c, d);
-
         goto CONTINUE_CLASS;   /* Go get the next char in the class */
         }