Make EBCDIC [a-z] type ranges Perl compatible.

2015-07-24 18:18:05 +00:00 · 2015-07-24 18:18:05 +00:00 · fe0a16fe8f
parent 1bcfb856f5
commit fe0a16fe8f
3 changed files with 85 additions and 22 deletions
--- a/4
+++ b/4
@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
 very pedantic coding infelicities and a buffer overflow while checking a UTF-8 
 string if the final multi-byte UTF-8 character was truncated.
 22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
 class, where both values are literal letters in the same case, omit the 
 non-letter EBCDIC code points within the range.
 Version 10.20 30-June-2015
 --------------------------
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
 where a range ending character is expected. For example, [z-\exff] is valid,
 but [A-\ed] and [A-[:digit:]] are not.
 .P
-Ranges operate in the collating sequence of character values. They can also be
+Ranges normally include all code points between the start and end characters,
-used for characters specified numerically, for example [\e000-\e037]. Ranges
+inclusive. They can also be used for code points specified numerically, for
-can include any characters that are valid for the current mode.
+example [\e000-\e037]. Ranges can include any characters that are valid for the
 current mode.
 .P
 There is a special case in EBCDIC environments for ranges whose end points are 
 both specified as literal letters in the same case. For compatibility with 
 Perl, EBCDIC code points within the range that are not letters are omitted. For 
 example, [h-k] matches only four characters, even though the codes for h and k 
 are 0x88 and 0x92, a range of 11 code points. However, if the range is 
 specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
 are included.
 .P
 If a range that includes letters is used when caseless matching is set, it
 matches the letters in either case. For example, [W-c] is equivalent to
@ -3367,6 +3376,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 17 July 2015
+Last updated: 24 July 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
        goto FAILED;
        }
      break;
-      
+
      /* Conditional group */
      case CHAR_LEFT_PARENTHESIS:
      if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
-        {  
+        {
        nest_depth++;
        ptr += 2;
-        break; 
+        break;
        }
-        
+
      /* Must be an assertion or a callout */
- 
+
      switch(ptr[4])
       {
       case CHAR_LESS_THAN_SIGN:
-       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) 
+       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
         goto MISSING_ASSERTION;
-       /* Fall through */       
+       /* Fall through */
       case CHAR_C:
       case CHAR_EXCLAMATION_MARK:
       case CHAR_EQUALS_SIGN:
       ptr++;
       break;
-       
+
       default:
-       MISSING_ASSERTION: 
+       MISSING_ASSERTION:
-       ptr += 3;            /* To improve error message */         
+       ptr += 3;            /* To improve error message */
       errorcode = ERR28;
-       goto FAILED; 
+       goto FAILED;
-       }      
+       }
      break;
      case CHAR_COLON:
@ -3939,7 +3939,7 @@ for (;; ptr++)
      {
      nestptr = ptr + 7;
      ptr = sub_start_of_word;  /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
      continue;
      }
@ -3947,7 +3947,7 @@ for (;; ptr++)
      {
      nestptr = ptr + 7;
      ptr = sub_end_of_word;    /* Do not combine these statements; clang's */
-      ptr--;                    /* sanitizer moans about a negative index. */ 
+      ptr--;                    /* sanitizer moans about a negative index. */
      continue;
      }
@ -4046,6 +4046,9 @@ for (;; ptr++)
    for(;;)
      {
      PCRE2_SPTR oldptr;
 #ifdef EBCDIC
      BOOL range_is_literal = TRUE;
 #endif
      if (c == CHAR_NULL && ptr >= cb->end_pattern)
        {
@ -4226,7 +4229,13 @@ for (;; ptr++)
        {
        escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
        if (*errorcodeptr != 0) goto FAILED;
-        if (escape == 0) c = ec;               /* Escaped single char */
+        if (escape == 0)    /* Escaped single char */
          {
          c = ec;
 #ifdef EBCDIC
          range_is_literal = FALSE;
 #endif
          }
        else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
        else if (escape == ESC_N)          /* \N is not supported in a class */
          {
@ -4430,7 +4439,9 @@ for (;; ptr++)
            int descape;
            descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
            if (*errorcodeptr != 0) goto FAILED;
-
+#ifdef EBCDIC
            range_is_literal = FALSE;
 #endif
            /* 0 means a character was put into d; \b is backspace; any other
            special causes an error. */
@ -4476,9 +4487,48 @@ for (;; ptr++)
        if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
        /* In an EBCDIC environment, Perl treats alphabetic ranges specially
        because there are holes in the encoding, and simply using the range A-Z
        (for example) would include the characters in the holes. This applies
        only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
 #ifdef EBCDIC
        if (range_is_literal &&
             (cb->ctypes[c] & ctype_letter) != 0 &&
             (cb->ctypes[d] & ctype_letter) != 0 &&
             (c <= CHAR_z) == (d <= CHAR_z))
          {
          uint32_t uc = (c <= CHAR_z)? 0 : 64;
          uint32_t C = c - uc;
          uint32_t D = d - uc;
          if (C <= CHAR_i)
            {
            class_has_8bitchar +=
              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
                ((D < CHAR_i)? D : CHAR_i) + uc);
            C = CHAR_j;
            }
          if (C <= D && C <= CHAR_r)
            {
            class_has_8bitchar +=
              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
                ((D < CHAR_r)? D : CHAR_r) + uc);
            C = CHAR_s;
            }
          if (C <= D)
            {
            class_has_8bitchar +=
              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
                D + uc);
            }
          }
        else
 #endif
        class_has_8bitchar +=
          add_to_class(classbits, &class_uchardata, options, cb, c, d);
        goto CONTINUE_CLASS;   /* Go get the next char in the class */
        }