Make EBCDIC [a-z] type ranges Perl compatible.

This commit is contained in:
Philip.Hazel 2015-07-24 18:18:05 +00:00
parent 1bcfb856f5
commit fe0a16fe8f
3 changed files with 85 additions and 22 deletions

View File

@ -81,6 +81,10 @@ This bug was discovered by Karl Skomski with the LLVM fuzzer.
very pedantic coding infelicities and a buffer overflow while checking a UTF-8
string if the final multi-byte UTF-8 character was truncated.
22. For Perl compatibility in EBCDIC environments, ranges such as a-z in a
class, where both values are literal letters in the same case, omit the
non-letter EBCDIC code points within the range.
Version 10.20 30-June-2015
--------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "17 July 2015" "PCRE2 10.21"
.TH PCRE2PATTERN 3 "24 July 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -1324,9 +1324,18 @@ sequence other than one that defines a single character appears at a point
where a range ending character is expected. For example, [z-\exff] is valid,
but [A-\ed] and [A-[:digit:]] are not.
.P
Ranges operate in the collating sequence of character values. They can also be
used for characters specified numerically, for example [\e000-\e037]. Ranges
can include any characters that are valid for the current mode.
Ranges normally include all code points between the start and end characters,
inclusive. They can also be used for code points specified numerically, for
example [\e000-\e037]. Ranges can include any characters that are valid for the
current mode.
.P
There is a special case in EBCDIC environments for ranges whose end points are
both specified as literal letters in the same case. For compatibility with
Perl, EBCDIC code points within the range that are not letters are omitted. For
example, [h-k] matches only four characters, even though the codes for h and k
are 0x88 and 0x92, a range of 11 code points. However, if the range is
specified numerically, for example, [\ex88-\ex92] or [h-\ex92], all code points
are included.
.P
If a range that includes letters is used when caseless matching is set, it
matches the letters in either case. For example, [W-c] is equivalent to
@ -3367,6 +3376,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 17 July 2015
Last updated: 24 July 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -3323,38 +3323,38 @@ for (; ptr < cb->end_pattern; ptr++)
goto FAILED;
}
break;
/* Conditional group */
case CHAR_LEFT_PARENTHESIS:
if (ptr[3] != CHAR_QUESTION_MARK) /* Not assertion or callout */
{
{
nest_depth++;
ptr += 2;
break;
break;
}
/* Must be an assertion or a callout */
switch(ptr[4])
{
case CHAR_LESS_THAN_SIGN:
if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
goto MISSING_ASSERTION;
/* Fall through */
/* Fall through */
case CHAR_C:
case CHAR_EXCLAMATION_MARK:
case CHAR_EQUALS_SIGN:
ptr++;
break;
default:
MISSING_ASSERTION:
ptr += 3; /* To improve error message */
MISSING_ASSERTION:
ptr += 3; /* To improve error message */
errorcode = ERR28;
goto FAILED;
}
goto FAILED;
}
break;
case CHAR_COLON:
@ -3939,7 +3939,7 @@ for (;; ptr++)
{
nestptr = ptr + 7;
ptr = sub_start_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */
ptr--; /* sanitizer moans about a negative index. */
continue;
}
@ -3947,7 +3947,7 @@ for (;; ptr++)
{
nestptr = ptr + 7;
ptr = sub_end_of_word; /* Do not combine these statements; clang's */
ptr--; /* sanitizer moans about a negative index. */
ptr--; /* sanitizer moans about a negative index. */
continue;
}
@ -4046,6 +4046,9 @@ for (;; ptr++)
for(;;)
{
PCRE2_SPTR oldptr;
#ifdef EBCDIC
BOOL range_is_literal = TRUE;
#endif
if (c == CHAR_NULL && ptr >= cb->end_pattern)
{
@ -4226,7 +4229,13 @@ for (;; ptr++)
{
escape = check_escape(&ptr, &ec, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED;
if (escape == 0) c = ec; /* Escaped single char */
if (escape == 0) /* Escaped single char */
{
c = ec;
#ifdef EBCDIC
range_is_literal = FALSE;
#endif
}
else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
else if (escape == ESC_N) /* \N is not supported in a class */
{
@ -4430,7 +4439,9 @@ for (;; ptr++)
int descape;
descape = check_escape(&ptr, &d, errorcodeptr, options, TRUE, cb);
if (*errorcodeptr != 0) goto FAILED;
#ifdef EBCDIC
range_is_literal = FALSE;
#endif
/* 0 means a character was put into d; \b is backspace; any other
special causes an error. */
@ -4476,9 +4487,48 @@ for (;; ptr++)
if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
/* In an EBCDIC environment, Perl treats alphabetic ranges specially
because there are holes in the encoding, and simply using the range A-Z
(for example) would include the characters in the holes. This applies
only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
#ifdef EBCDIC
if (range_is_literal &&
(cb->ctypes[c] & ctype_letter) != 0 &&
(cb->ctypes[d] & ctype_letter) != 0 &&
(c <= CHAR_z) == (d <= CHAR_z))
{
uint32_t uc = (c <= CHAR_z)? 0 : 64;
uint32_t C = c - uc;
uint32_t D = d - uc;
if (C <= CHAR_i)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
((D < CHAR_i)? D : CHAR_i) + uc);
C = CHAR_j;
}
if (C <= D && C <= CHAR_r)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
((D < CHAR_r)? D : CHAR_r) + uc);
C = CHAR_s;
}
if (C <= D)
{
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, C + uc,
D + uc);
}
}
else
#endif
class_has_8bitchar +=
add_to_class(classbits, &class_uchardata, options, cb, c, d);
goto CONTINUE_CLASS; /* Go get the next char in the class */
}