From 7927ac0ee31b60954c9c0ad42236b25498c4f264 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 13 May 2017 17:46:27 +0000 Subject: [PATCH] Updates to experimental conversion code. --- src/pcre2_convert.c | 49 ++++++++++++++++++++++++++++++++++--------- testdata/testinput24 | 17 +++++++++++++++ testdata/testoutput24 | 29 +++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c index c391521..d4e44b4 100644 --- a/src/pcre2_convert.c +++ b/src/pcre2_convert.c @@ -118,7 +118,9 @@ PCRE2_UCHAR *pp = p; PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */ PCRE2_SIZE convlength = 0; +uint32_t bracount = 0; uint32_t posix_class_state = POSIX_CLASS_NOT_STARTED; +uint32_t lastspecial = 0; BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0; BOOL inclass = FALSE; BOOL nextisliteral = FALSE; @@ -130,7 +132,13 @@ BOOL nextisliteral = FALSE; *bufflenptr = plength; -/* Now scan the input */ +/* Now scan the input. In non-extended patterns, an initial asterisk is treated +as literal. Still figuring out what happens in extended patterns... */ + +if (plength > 0 && *posix == CHAR_ASTERISK) + { + if (!extended) nextisliteral = TRUE; + } while (plength > 0) { @@ -262,35 +270,56 @@ while (plength > 0) { if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH); if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; - *p++ = *posix++; + lastspecial = *p++ = *posix++; plength--; } else nextisliteral = TRUE; break; + case CHAR_RIGHT_PARENTHESIS: + if (!extended || bracount == 0) goto ESCAPE_LITERAL; + bracount--; + goto COPY_SPECIAL; + + case CHAR_LEFT_PARENTHESIS: + bracount++; + /* Fall through */ + case CHAR_QUESTION_MARK: case CHAR_PLUS: case CHAR_LEFT_CURLY_BRACKET: case CHAR_RIGHT_CURLY_BRACKET: case CHAR_VERTICAL_LINE: - case CHAR_LEFT_PARENTHESIS: - case CHAR_RIGHT_PARENTHESIS: - if (!extended) PUTCHARS(STR_BACKSLASH); + if (!extended) goto ESCAPE_LITERAL; /* Fall through */ - case CHAR_ASTERISK: case CHAR_DOT: - case CHAR_CIRCUMFLEX_ACCENT: case CHAR_DOLLAR_SIGN: + COPY_SPECIAL: + lastspecial = c; if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; - *p++ = sc; - break; - + *p++ = c; + break; + + case CHAR_ASTERISK: + if (lastspecial != CHAR_ASTERISK) goto COPY_SPECIAL; + break; /* Ignore second and subsequent asterisks */ + + case CHAR_CIRCUMFLEX_ACCENT: + if (extended || + lastspecial == 0 || + lastspecial == CHAR_LEFT_PARENTHESIS || + lastspecial == CHAR_VERTICAL_LINE) + goto COPY_SPECIAL; + /* Fall through */ + default: if (c < 256 && strchr("\\{}?*+[]()|.^$", c) != NULL) { + ESCAPE_LITERAL: PUTCHARS(STR_BACKSLASH); } + lastspecial = 0xff; /* Indicates nothing special */ if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; memcpy(p, posix - clength, CU2BYTES(clength)); p += clength; diff --git a/testdata/testinput24 b/testdata/testinput24 index c98fd25..f18277b 100644 --- a/testdata/testinput24 +++ b/testdata/testinput24 @@ -247,6 +247,11 @@ \= Expect no match aab +/(ab)c)d]/ + Xabc)d]Y + +/a***b/ + #pattern convert=unset #pattern convert=posix_basic @@ -261,6 +266,18 @@ /^how to \^how to/ +/*abc/ + X*abcY + +/**abc/ + XabcY + X*abcY + X**abcY + +/^b\(c^d\)\(^e^f\)/ + +/a***b/ + #pattern convert=unset /abc/ diff --git a/testdata/testoutput24 b/testdata/testoutput24 index bd5a8d2..86ed7a5 100644 --- a/testdata/testoutput24 +++ b/testdata/testoutput24 @@ -396,6 +396,15 @@ No match aab No match +/(ab)c)d]/ +(ab)c\)d\] + Xabc)d]Y + 0: abc)d] + 1: ab + +/a***b/ +a*b + #pattern convert=unset #pattern convert=posix_basic @@ -417,6 +426,26 @@ how.to how\.to /^how to \^how to/ ^how to \^how to +/*abc/ +\*abc + X*abcY + 0: *abc + +/**abc/ +\**abc + XabcY + 0: abc + X*abcY + 0: *abc + X**abcY + 0: **abc + +/^b\(c^d\)\(^e^f\)/ +^b(c\^d)(^e\^f) + +/a***b/ +a*b + #pattern convert=unset /abc/