From 149aa29209ad9cec64314484e46a37d92f210305 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 12 Jun 2015 16:25:23 +0000 Subject: [PATCH] Fix \a and \e in pcre2test, and \a in pcre2_compile, on EBCDIC platforms. --- ChangeLog | 6 ++++ src/pcre2_compile.c | 4 +-- src/pcre2_internal.h | 68 +++++++++++++++++++++++++------------------- src/pcre2test.c | 4 +-- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 37b565d..766f6b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -155,6 +155,12 @@ being treated as a literal 'l' instead of causing an error. an empty string was repeated, it was not identified as matching an empty string itself. For example: /^(?:(?(1)x|)+)+$()/. +40. In an EBCDIC environment, pcre2test was mishandling the escape sequences +\a and \e in test subject lines. + +41. In an EBCDIC environment, \a in a pattern was converted to the ASCII +instead of the EBCDIC value. + Version 10.10 06-March-2015 --------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 80c2d08..9ad36d0 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -296,7 +296,7 @@ static const short int escapes[] = { -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, - CHAR_GRAVE_ACCENT, 7, + CHAR_GRAVE_ACCENT, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, @@ -328,7 +328,7 @@ because it is defined as 'a', which of course picks up the ASCII value. */ #endif static const short int escapes[] = { -/* 80 */ 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, +/* 80 */ ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, /* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index e2a9252..c6d1427 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1192,31 +1192,6 @@ only. 
*/ /* -------------------- Definitions for compiled patterns -------------------*/ -/* Escape items that are just an encoding of a particular data value. */ - -#ifndef ESC_e -#define ESC_e CHAR_ESC -#endif - -#ifndef ESC_f -#define ESC_f CHAR_FF -#endif - -#ifndef ESC_n -#define ESC_n CHAR_LF -#endif - -#ifndef ESC_r -#define ESC_r CHAR_CR -#endif - -/* We can't officially use ESC_t because it is a POSIX reserved identifier -(presumably because of all the others like size_t). */ - -#ifndef ESC_tee -#define ESC_tee CHAR_HT -#endif - /* Codes for different types of Unicode property */ #define PT_ANY 0 /* Any property - matches all chars */ @@ -1255,13 +1230,46 @@ contain characters with values greater than 255. */ #define XCL_PROP 3 /* Unicode property (2-byte property code follows) */ #define XCL_NOTPROP 4 /* Unicode inverted property (ditto) */ +/* Escape items that are just an encoding of a particular data value. These +appear in the escapes[] table in pcre2_compile.c as positive numbers. */ + +#ifndef ESC_a +#define ESC_a CHAR_BEL +#endif + +#ifndef ESC_e +#define ESC_e CHAR_ESC +#endif + +#ifndef ESC_f +#define ESC_f CHAR_FF +#endif + +#ifndef ESC_n +#define ESC_n CHAR_LF +#endif + +#ifndef ESC_r +#define ESC_r CHAR_CR +#endif + +/* We can't officially use ESC_t because it is a POSIX reserved identifier +(presumably because of all the others like size_t). */ + +#ifndef ESC_tee +#define ESC_tee CHAR_HT +#endif + /* These are escaped items that aren't just an encoding of a particular data value such as \n. They must have non-zero values, as check_escape() returns 0 -for a data character. Also, they must appear in the same order as in the -opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it -corresponds to "." in DOTALL mode rather than an escape sequence. It is also -used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In -non-DOTALL mode, "." behaves like \N. +for a data character. 
In the escapes[] table in pcre2_compile.c their values +are negated in order to distinguish them from data values. + +They must appear here in the same order as in the opcode definitions below, up +to ESC_z. There's a dummy for OP_ALLANY because it corresponds to "." in DOTALL +mode rather than an escape sequence. It is also used for [^] in JavaScript +compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves +like \N. The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc. when PCRE_UCP is set and replacement of \d etc by \p sequences is required. diff --git a/src/pcre2test.c b/src/pcre2test.c index 97ba5bb..1759a22 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -5181,9 +5181,9 @@ while ((c = *p++) != 0) else switch ((c = *p++)) { case '\\': break; - case 'a': c = 7; break; + case 'a': c = CHAR_BEL; break; case 'b': c = '\b'; break; - case 'e': c = 27; break; + case 'e': c = CHAR_ESC; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break;