Make EBCDIC [a-z] type ranges Perl compatible.

This commit is contained in:
Philip.Hazel 2015-07-24 18:18:05 +00:00
parent 1bcfb856f5
commit fe0a16fe8f
3 changed files with 85 additions and 22 deletions

View File

@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
very pedantic coding infelicities and a buffer overflow while checking a UTF-8 very pedantic coding infelicities and a buffer overflow while checking a UTF-8
string if the final multi-byte UTF-8 character was truncated. string if the final multi-byte UTF-8 character was truncated.
22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
class, where both values are literal letters in the same case, omit the
non-letter EBCDIC code points within the range.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21" .TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS" .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
where a range ending character is expected. For example, [z-\exff] is valid, where a range ending character is expected. For example, [z-\exff] is valid,
but [A-\ed] and [A-[:digit:]] are not. but [A-\ed] and [A-[:digit:]] are not.
.P .P
Ranges operate in the collating sequence of character values. They can also be Ranges normally include all code points between the start and end characters,
used for characters specified numerically, for example [\e000-\e037]. Ranges inclusive. They can also be used for code points specified numerically, for
can include any characters that are valid for the current mode. example [\e000-\e037]. Ranges can include any characters that are valid for the
current mode.
.P
There is a special case in EBCDIC environments for ranges whose end points are
both specified as literal letters in the same case. For compatibility with
Perl, EBCDIC code points within the range that are not letters are omitted. For
example, [h-k] matches only four characters, even though the codes for h and k
are 0x88 and 0x92, a range of 11 code points. However, if the range is
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
are included.
.P .P
If a range that includes letters is used when caseless matching is set, it If a range that includes letters is used when caseless matching is set, it
matches the letters in either case. For example, [W-c] is equivalent to matches the letters in either case. For example, [W-c] is equivalent to
@ -3367,6 +3376,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 17 July 2015 Last updated: 24 July 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
goto FAILED; goto FAILED;
} }
break; break;
/* Conditional group */ /* Conditional group */
case CHAR_LEFT_PARENTHESIS: case CHAR_LEFT_PARENTHESIS:
if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */ if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */
{ {
nest_depth++; nest_depth++;
ptr += 2; ptr += 2;
break; break;
} }
/* Must be an assertion or a callout */ /* Must be an assertion or a callout */
switch(ptr[4]) switch(ptr[4])
{ {
case CHAR_LESS_THAN_SIGN: case CHAR_LESS_THAN_SIGN:
if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN) if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
goto MISSING_ASSERTION; goto MISSING_ASSERTION;
/* Fall through */ /* Fall through */
case CHAR_C: case CHAR_C:
case CHAR_EXCLAMATION_MARK: case CHAR_EXCLAMATION_MARK:
case CHAR_EQUALS_SIGN: case CHAR_EQUALS_SIGN:
ptr++; ptr++;
break; break;
default: default:
MISSING_ASSERTION: MISSING_ASSERTION:
ptr += 3; /* To improve error message */ ptr += 3; /* To improve error message */
errorcode = ERR28; errorcode = ERR28;
goto FAILED; goto FAILED;
} }
break; break;
case CHAR_COLON: case CHAR_COLON:
@ -3939,7 +3939,7 @@ for (;; ptr++)
{ {
nestptr = ptr + 7; nestptr = ptr + 7;
ptr = sub_start_of_word; /* Do not combine these statements; clang's */ ptr = sub_start_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */ ptr--; /* sanitizer moans about a negative index. */
continue; continue;
} }
@ -3947,7 +3947,7 @@ for (;; ptr++)
{ {
nestptr = ptr + 7; nestptr = ptr + 7;
ptr = sub_end_of_word; /* Do not combine these statements; clang's */ ptr = sub_end_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */ ptr--; /* sanitizer moans about a negative index. */
continue; continue;
} }
@ -4046,6 +4046,9 @@ for (;; ptr++)
for(;;) for(;;)
{ {
PCRE2_SPTR oldptr; PCRE2_SPTR oldptr;
#ifdef EBCDIC
BOOL range_is_literal = TRUE;
#endif
if (c == CHAR_NULL && ptr >= cb->end_pattern) if (c == CHAR_NULL && ptr >= cb->end_pattern)
{ {
@ -4226,7 +4229,13 @@ for (;; ptr++)
{ {
escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb); escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED; if (*errorcodeptr != 0) goto FAILED;
if (escape == 0) c = ec; /* Escaped single char */ if (escape == 0) /* Escaped single char */
{
c = ec;
#ifdef EBCDIC
range_is_literal = FALSE;
#endif
}
else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
else if (escape == ESC_N) /* \N is not supported in a class */ else if (escape == ESC_N) /* \N is not supported in a class */
{ {
@ -4430,7 +4439,9 @@ for (;; ptr++)
int descape; int descape;
descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb); descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED; if (*errorcodeptr != 0) goto FAILED;
#ifdef EBCDIC
range_is_literal = FALSE;
#endif
/* 0 means a character was put into d; \b is backspace; any other /* 0 means a character was put into d; \b is backspace; any other
special causes an error. */ special causes an error. */
@ -4476,9 +4487,48 @@ for (;; ptr++)
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
because there are holes in the encoding, and simply using the range A-Z
(for example) would include the characters in the holes. This applies
only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
#ifdef EBCDIC
if (range_is_literal &&
(cb->ctypes[c] & ctype_letter) != 0 &&
(cb->ctypes[d] & ctype_letter) != 0 &&
(c <= CHAR_z) == (d <= CHAR_z))
{
uint32_t uc = (c <= CHAR_z)? 0 : 64;
uint32_t C = c - uc;
uint32_t D = d - uc;
if (C <= CHAR_i)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
((D < CHAR_i)? D : CHAR_i) + uc);
C = CHAR_j;
}
if (C <= D && C <= CHAR_r)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
((D < CHAR_r)? D : CHAR_r) + uc);
C = CHAR_s;
}
if (C <= D)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
D + uc);
}
}
else
#endif
class_has_8bitchar += class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, c, d); add_to_class(classbits, &class_uchardata, options, cb, c, d);
goto CONTINUE_CLASS; /* Go get the next char in the class */ goto CONTINUE_CLASS; /* Go get the next char in the class */
} }