From d9c33d0708aa5ecc8cd09daa21c462e77e439b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Herczeg?= Date: Wed, 17 May 2017 13:44:24 +0000 Subject: [PATCH] Reworking bash glob conversion. --- src/pcre2_convert.c | 355 +++++++++--------------------------------- testdata/testinput24 | 10 +- testdata/testoutput24 | 24 +-- 3 files changed, 77 insertions(+), 312 deletions(-) diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c index 6cd988b..254e572 100644 --- a/src/pcre2_convert.c +++ b/src/pcre2_convert.c @@ -693,109 +693,6 @@ context->output = output; context->output_size = output_size; } -/* Bash glob reading modes. */ - -#define PCRE2_BASH_GLOB_NORMAL 0 -#define PCRE2_BASH_GLOB_QUOTED 1 -#define PCRE2_BASH_GLOB_DOUBLE_QUOTED 2 -#define PCRE2_BASH_GLOB_BACKSLASH 3 - -/* Maximum nesting level of enclosed groups. */ - -#define PCRE2_BASH_GLOB_MAX_NESTING 16 - -typedef struct pcre2_bash_glob_context { - PCRE2_SPTR pattern; - PCRE2_SPTR pattern_end; - pcre2_output_context out; - int read_mode; - BOOL is_control_char; -} pcre2_bash_glob_context; - -/* Read the next character from the glob. If the character - is a control character context->is_control_char is set - to TRUE. Otherwise this field is FALSE. - -Arguments: - context the bash glob context - utf TRUE if UTF -*/ - -static BOOL -convert_glob_bash_read(pcre2_bash_glob_context *context, BOOL utf) -{ -while (TRUE) - { - if (context->pattern >= context->pattern_end) - return FALSE; - - context->pattern++; - -#ifdef SUPPORT_UNICODE - /* Intermediate unicode octets are always normal characters. */ - if (utf && NOT_FIRSTCU(context->pattern[-1])) - { - context->is_control_char = FALSE; - return TRUE; - } -#endif - - if (context->read_mode == PCRE2_BASH_GLOB_QUOTED) - { - if (context->pattern[-1] != CHAR_APOSTROPHE) - return TRUE; - - context->read_mode = PCRE2_BASH_GLOB_NORMAL; - continue; - } - else if (context->read_mode == PCRE2_BASH_GLOB_DOUBLE_QUOTED) - { - if (context->pattern[-1] == CHAR_BACKSLASH && - context->pattern < context->pattern_end && - (context->pattern[0] == CHAR_QUOTATION_MARK || - context->pattern[0] == CHAR_BACKSLASH)) - { - context->pattern++; - return TRUE; - } - else if (context->pattern[-1] != CHAR_QUOTATION_MARK) - return TRUE; - - context->read_mode = PCRE2_BASH_GLOB_NORMAL; - continue; - } - - context->is_control_char = FALSE; - - if (context->pattern[-1] == CHAR_APOSTROPHE) - { - context->read_mode = PCRE2_BASH_GLOB_QUOTED; - continue; - } - - if (context->pattern[-1] == CHAR_QUOTATION_MARK) - { - context->read_mode = PCRE2_BASH_GLOB_DOUBLE_QUOTED; - continue; - } - - if (context->pattern[-1] == CHAR_BACKSLASH) - { - if (context->pattern < context->pattern_end) - { - context->pattern++; - return TRUE; - } - - context->read_mode = PCRE2_BASH_GLOB_BACKSLASH; - return FALSE; - } - - context->is_control_char = TRUE; - return TRUE; - } -} - /* Prints a wildcard into the output. @@ -806,30 +703,24 @@ Arguments: */ static void -convert_glob_bash_wildcard(pcre2_bash_glob_context *context, - PCRE2_UCHAR separator, BOOL after_sep) +convert_glob_bash_wildcard(pcre2_output_context *out, + PCRE2_UCHAR separator) { int len = 2; -context->out.out_str[0] = CHAR_LEFT_SQUARE_BRACKET; -context->out.out_str[1] = CHAR_CIRCUMFLEX_ACCENT; +out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; +out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; if (separator == CHAR_BACKSLASH) { - context->out.out_str[2] = CHAR_BACKSLASH; + out->out_str[2] = CHAR_BACKSLASH; len = 3; } -if (after_sep) - { - context->out.out_str[len] = CHAR_DOT; - len++; - } +convert_glob_bash_write_str(out, len); -convert_glob_bash_write_str(&context->out, len); - -convert_glob_bash_write(&context->out, separator); -convert_glob_bash_write(&context->out, CHAR_RIGHT_SQUARE_BRACKET); +convert_glob_bash_write(out, separator); +convert_glob_bash_write(out, CHAR_RIGHT_SQUARE_BRACKET); } @@ -851,204 +742,98 @@ Returns: 0 => success */ static int -convert_glob_bash(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength, +convert_glob_bash(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength, BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) { -pcre2_bash_glob_context context; -uint8_t group_types[PCRE2_BASH_GLOB_MAX_NESTING]; -int nesting_level, result; -BOOL after_sep = TRUE; +pcre2_output_context out; +PCRE2_SPTR pattern_start = pattern; +PCRE2_SPTR pattern_end = pattern + plength; +int result; PCRE2_UCHAR c; /* Initialize default for error offset as end of input. */ -context.pattern = pattern; -context.pattern_end = pattern + plength; -context.read_mode = PCRE2_BASH_GLOB_NORMAL; -context.out.output = use_buffer; -context.out.output_end = use_buffer + use_length; -context.out.output_size = 0; +out.output = use_buffer; +out.output_end = use_buffer + use_length; +out.output_size = 0; -context.out.out_str[0] = CHAR_BACKSLASH; -context.out.out_str[1] = CHAR_A; -convert_glob_bash_write_str(&context.out, 2); +out.out_str[0] = CHAR_LEFT_PARENTHESIS; +out.out_str[1] = CHAR_QUESTION_MARK; +out.out_str[2] = CHAR_s; +out.out_str[3] = CHAR_RIGHT_PARENTHESIS; +out.out_str[4] = CHAR_BACKSLASH; +out.out_str[5] = CHAR_A; +convert_glob_bash_write_str(&out, 6); -nesting_level = 0; result = 0; -while (convert_glob_bash_read(&context, utf)) +while (pattern < pattern_end) { - c = context.pattern[-1]; + c = *pattern++; - if (context.is_control_char) + if (c == CHAR_ASTERISK) { - if (c == CHAR_LEFT_PARENTHESIS) + if (pattern != pattern_start + 1) { - /* ! Unexpected open parenthesis ! */ - result = ERROR_END_BACKSLASH; - break; + out.out_str[0] = CHAR_LEFT_PARENTHESIS; + out.out_str[1] = CHAR_ASTERISK; + out.out_str[2] = CHAR_C; + out.out_str[3] = CHAR_O; + out.out_str[4] = CHAR_M; + out.out_str[5] = CHAR_M; + out.out_str[6] = CHAR_I; + out.out_str[7] = CHAR_T; + convert_glob_bash_write_str(&out, 8); + convert_glob_bash_write(&out, CHAR_RIGHT_PARENTHESIS); } - if (c == CHAR_RIGHT_PARENTHESIS) - { - if (nesting_level == 0) - { - /* ! Unexpected open parenthesis ! */ - result = ERROR_END_BACKSLASH; - break; - } - - c = group_types[--nesting_level]; - - convert_glob_bash_write(&context.out, CHAR_RIGHT_PARENTHESIS); - if (c != CHAR_COMMERCIAL_AT) - { - convert_glob_bash_write(&context.out, c); - convert_glob_bash_write(&context.out, CHAR_QUESTION_MARK); - } - - after_sep = FALSE; - continue; - } - - if (c == CHAR_VERTICAL_LINE && nesting_level > 0) - { - convert_glob_bash_write(&context.out, CHAR_VERTICAL_LINE); - - after_sep = FALSE; - continue; - } - - if ((c == CHAR_QUESTION_MARK || c == CHAR_ASTERISK || - c == CHAR_PLUS || c == CHAR_COMMERCIAL_AT) && - context.pattern < context.pattern_end && - context.pattern[0] == CHAR_LEFT_PARENTHESIS) - { - if (nesting_level >= PCRE2_BASH_GLOB_MAX_NESTING) - { - result = ERROR_TOO_DEEP_NESTING; - break; - } - - if (after_sep) - { - context.out.out_str[0] = CHAR_LEFT_PARENTHESIS; - context.out.out_str[1] = CHAR_QUESTION_MARK; - context.out.out_str[2] = CHAR_EXCLAMATION_MARK; - context.out.out_str[3] = CHAR_BACKSLASH; - context.out.out_str[4] = CHAR_DOT; - context.out.out_str[5] = CHAR_RIGHT_PARENTHESIS; - convert_glob_bash_write_str(&context.out, 6); - } - - context.pattern++; - group_types[nesting_level++] = (uint8_t) c; - - context.out.out_str[0] = CHAR_LEFT_PARENTHESIS; - context.out.out_str[1] = CHAR_QUESTION_MARK; - context.out.out_str[2] = CHAR_COLON; - convert_glob_bash_write_str(&context.out, 3); - - after_sep = FALSE; - continue; - } - - if (c == CHAR_ASTERISK) - { - if (nesting_level == 0 && context.pattern != pattern + 1) - { - context.out.out_str[0] = CHAR_LEFT_PARENTHESIS; - context.out.out_str[1] = CHAR_ASTERISK; - context.out.out_str[2] = CHAR_C; - context.out.out_str[3] = CHAR_O; - context.out.out_str[4] = CHAR_M; - context.out.out_str[5] = CHAR_M; - context.out.out_str[6] = CHAR_I; - context.out.out_str[7] = CHAR_T; - convert_glob_bash_write_str(&context.out, 8); - convert_glob_bash_write(&context.out, CHAR_RIGHT_PARENTHESIS); - } - - if (after_sep) - { - context.out.out_str[0] = CHAR_LEFT_PARENTHESIS; - context.out.out_str[1] = CHAR_QUESTION_MARK; - context.out.out_str[2] = CHAR_COLON; - convert_glob_bash_write_str(&context.out, 3); - - convert_glob_bash_wildcard(&context, ccontext->glob_separator, TRUE); - convert_glob_bash_wildcard(&context, ccontext->glob_separator, FALSE); - - context.out.out_str[0] = CHAR_ASTERISK; - context.out.out_str[1] = CHAR_QUESTION_MARK; - context.out.out_str[2] = CHAR_RIGHT_PARENTHESIS; - context.out.out_str[3] = CHAR_QUESTION_MARK; - context.out.out_str[4] = CHAR_QUESTION_MARK; - convert_glob_bash_write_str(&context.out, 5); - } - else - { - convert_glob_bash_wildcard(&context, ccontext->glob_separator, FALSE); - context.out.out_str[0] = CHAR_ASTERISK; - context.out.out_str[1] = CHAR_QUESTION_MARK; - convert_glob_bash_write_str(&context.out, 2); - } - - after_sep = FALSE; - continue; - } - - if (c == CHAR_QUESTION_MARK) - { - convert_glob_bash_wildcard(&context, - ccontext->glob_separator, after_sep); - - after_sep = FALSE; - continue; - } - } - - after_sep = (c == ccontext->glob_separator); - - if (after_sep && nesting_level > 0) - { - context.out.out_str[0] = CHAR_LEFT_PARENTHESIS; - context.out.out_str[1] = CHAR_ASTERISK; - context.out.out_str[2] = CHAR_F; - context.out.out_str[3] = CHAR_RIGHT_PARENTHESIS; - convert_glob_bash_write_str(&context.out, 4); - - after_sep = FALSE; + convert_glob_bash_wildcard(&out, ccontext->glob_separator); + out.out_str[0] = CHAR_ASTERISK; + out.out_str[1] = CHAR_QUESTION_MARK; + convert_glob_bash_write_str(&out, 2); continue; } - if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) - convert_glob_bash_write(&context.out, CHAR_BACKSLASH); + if (c == CHAR_QUESTION_MARK) + { + convert_glob_bash_wildcard(&out, ccontext->glob_separator); + continue; + } - convert_glob_bash_write(&context.out, c); + if (c == CHAR_BACKSLASH) + { + if (pattern >= pattern_end) + { + result = ERROR_END_BACKSLASH; + break; + } + c = *pattern++; + } + + if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) + convert_glob_bash_write(&out, CHAR_BACKSLASH); + + convert_glob_bash_write(&out, c); } if (result == 0) { - /* ! Unexpected end of input ! */ - if (nesting_level > 0 || context.read_mode != PCRE2_BASH_GLOB_NORMAL) - result = ERROR_MISSING_CLOSING_PARENTHESIS; - else - { - context.out.out_str[0] = CHAR_BACKSLASH; - context.out.out_str[1] = CHAR_z; - context.out.out_str[2] = CHAR_NULL; - convert_glob_bash_write_str(&context.out, 3); - } + out.out_str[0] = CHAR_BACKSLASH; + out.out_str[1] = CHAR_z; + out.out_str[2] = CHAR_NULL; + convert_glob_bash_write_str(&out, 3); + + if (!dummyrun && out.output_size != (out.output - use_buffer)) + result = PCRE2_ERROR_NOMEMORY; } if (result != 0) { - *bufflenptr = context.out.output - use_buffer; + *bufflenptr = pattern - pattern_start; return result; } -*bufflenptr = context.out.output_size - 1; +*bufflenptr = out.output_size - 1; return 0; } @@ -1139,7 +924,7 @@ for (i = 0; i < 2; i++) break; case PCRE2_CONVERT_GLOB_BASH: - rc = convert_glob_bash(pattype, pattern, plength, utf, use_buffer, use_length, + rc = convert_glob_bash(options, pattern, plength, utf, use_buffer, use_length, bufflenptr, dummyrun, ccontext); break; diff --git a/testdata/testinput24 b/testdata/testinput24 index 73ff1a3..1a72534 100644 --- a/testdata/testinput24 +++ b/testdata/testinput24 @@ -222,20 +222,12 @@ /A\B\\C\D/ -/A'B'C'''\'''D''/ - -/A""B"\\\"\C"''""/ - -/'\{}?*+[]()|.^$'/ +/\\{}\?\*+\[\]()|.^$/ /*a*\/*b*/ /?a?\/?b?/ -/a|b@(a|b)*\/@(a|b)/ - -/\/@(a\/|b\/)\// - #pattern convert=unset #pattern convert=posix_extended diff --git a/testdata/testoutput24 b/testdata/testoutput24 index fc5d4bc..d3f195a 100644 --- a/testdata/testoutput24 +++ b/testdata/testoutput24 @@ -345,31 +345,19 @@ No match # Non control character checking // -\A\z +(?s)\A\z /A\B\\C\D/ -\AAB\\CD\z +(?s)\AAB\\CD\z -/A'B'C'''\'''D''/ -\AABC\\D\z - -/A""B"\\\"\C"''""/ -\AAB\\"\\C\z - -/'\{}?*+[]()|.^$'/ -\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z +/\\{}\?\*+\[\]()|.^$/ +(?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z /*a*\/*b*/ -\A(?:[^./][^/]*?)??a(*COMMIT)[^/]*?/(*COMMIT)(?:[^./][^/]*?)??b(*COMMIT)[^/]*?\z +(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z /?a?\/?b?/ -\A[^./]a[^/]/[^./]b[^/]\z - -/a|b@(a|b)*\/@(a|b)/ -\Aa\|b(?:a|b)(*COMMIT)[^/]*?/(?!\.)(?:a|b)\z - -/\/@(a\/|b\/)\// -\A/(?!\.)(?:a(*F)|b(*F))/\z +(?s)\A[^/]a[^/]/[^/]b[^/]\z #pattern convert=unset