Make EBCDIC [a-z] type ranges Perl compatible.

2015-07-24 18:18:05 +00:00 · 2015-07-24 18:18:05 +00:00 · fe0a16fe8f
parent 1bcfb856f5
commit fe0a16fe8f
3 changed files with 85 additions and 22 deletions
--- a/4
+++ b/4
@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
 very pedantic coding infelicities and a buffer overflow while checking a UTF-8 
 string if the final multi-byte UTF-8 character was truncated.

+22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
+class, where both values are literal letters in the same case, omit the 
+non-letter EBCDIC code points within the range.
+

 Version 10.20 30-June-2015
 --------------------------
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
 where a range ending character is expected. For example, [z-\exff] is valid,
 but [A-\ed] and [A-[:digit:]] are not.
 .P
-Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
-can include any characters that are valid for the current mode.
+Ranges normally include all code points between the start and end characters,
+inclusive. They can also be used for code points specified numerically, for
+example [\e000-\e037]. Ranges can include any characters that are valid for the
+current mode.
+.P
+There is a special case in EBCDIC environments for ranges whose end points are 
+both specified as literal letters in the same case. For compatibility with 
+Perl, EBCDIC code points within the range that are not letters are omitted. For 
+example, [h-k] matches only four characters, even though the codes for h and k 
+are 0x88 and 0x92, a range of 11 code points. However, if the range is 
+specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
+are included.
 .P
 If a range that includes letters is used when caseless matching is set, it
 matches the letters in either case. For example, [W-c] is equivalent to
@ -3367,6 +3376,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 17 July 2015
+Last updated: 24 July 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
        goto FAILED;
        }
      break;
-      
+
      /* Conditional group */

      case CHAR_LEFT_PARENTHESIS:
      if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
-        {  
+        {
        nest_depth++;
        ptr += 2;
-        break; 
+        break;
        }
-        
+
      /* Must be an assertion or a callout */
- 
+
      switch(ptr[4])
       {
       case CHAR_LESS_THAN_SIGN:
-       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) 
+       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
         goto MISSING_ASSERTION;
-       /* Fall through */       
+       /* Fall through */

       case CHAR_C:
       case CHAR_EXCLAMATION_MARK:
       case CHAR_EQUALS_SIGN:
       ptr++;
       break;
-       
+
       default:
-       MISSING_ASSERTION: 
-       ptr += 3;            /* To improve error message */         
+       MISSING_ASSERTION:
+       ptr += 3;            /* To improve error message */
       errorcode = ERR28;
-       goto FAILED; 
-       }      
+       goto FAILED;
+       }
      break;

      case CHAR_COLON:
@ -3939,7 +3939,7 @@ for (;; ptr++)
      {
      nestptr = ptr + 7;
      ptr = sub_start_of_word;  /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
      continue;
      }

@ -3947,7 +3947,7 @@ for (;; ptr++)
      {
      nestptr = ptr + 7;
      ptr = sub_end_of_word;    /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
      continue;
      }

@ -4046,6 +4046,9 @@ for (;; ptr++)
    for(;;)
      {
      PCRE2_SPTR oldptr;
+#ifdef EBCDIC
+      BOOL range_is_literal = TRUE;
+#endif

      if (c == CHAR_NULL && ptr >= cb->end_pattern)
        {
@ -4226,7 +4229,13 @@ for (;; ptr++)
        {
        escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
        if (*errorcodeptr != 0) goto FAILED;
-        if (escape == 0) c = ec;               /* Escaped single char */
+        if (escape == 0)    /* Escaped single char */
+          {
+          c = ec;
+#ifdef EBCDIC
+          range_is_literal = FALSE;
+#endif
+          }
        else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
        else if (escape == ESC_N)          /* \N is not supported in a class */
          {
@ -4430,7 +4439,9 @@ for (;; ptr++)
            int descape;
            descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
            if (*errorcodeptr != 0) goto FAILED;
-
+#ifdef EBCDIC
+            range_is_literal = FALSE;
+#endif
            /* 0 means a character was put into d; \b is backspace; any other
            special causes an error. */

@ -4476,9 +4487,48 @@ for (;; ptr++)

        if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;

+        /* In an EBCDIC environment, Perl treats alphabetic ranges specially
+        because there are holes in the encoding, and simply using the range A-Z
+        (for example) would include the characters in the holes. This applies
+        only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
+
+#ifdef EBCDIC
+        if (range_is_literal &&
+             (cb->ctypes[c] & ctype_letter) != 0 &&
+             (cb->ctypes[d] & ctype_letter) != 0 &&
+             (c <= CHAR_z) == (d <= CHAR_z))
+          {
+          uint32_t uc = (c <= CHAR_z)? 0 : 64;
+          uint32_t C = c - uc;
+          uint32_t D = d - uc;
+
+          if (C <= CHAR_i)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_i)? D : CHAR_i) + uc);
+            C = CHAR_j;
+            }
+
+          if (C <= D && C <= CHAR_r)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                ((D < CHAR_r)? D : CHAR_r) + uc);
+            C = CHAR_s;
+            }
+
+          if (C <= D)
+            {
+            class_has_8bitchar +=
+              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
+                D + uc);
+            }
+          }
+        else
+#endif
        class_has_8bitchar +=
          add_to_class(classbits, &class_uchardata, options, cb, c, d);
-
        goto CONTINUE_CLASS;   /* Go get the next char in the class */
        }