Support character classes in glob conversion.

This commit is contained in:
Zoltán Herczeg 2017-05-18 06:46:22 +00:00
parent dfc9712bcd
commit ac12e979b3
3 changed files with 270 additions and 20 deletions

View File

@ -58,7 +58,7 @@ POSSIBILITY OF SUCH DAMAGE.
#define ERROR_END_BACKSLASH 101
#define ERROR_MISSING_SQUARE_BRACKET 106
#define ERROR_MISSING_CLOSING_PARENTHESIS 114
#define ERROR_TOO_DEEP_NESTING 119
#define ERROR_UNKNOWN_POSIX_CLASS 130
#define ERROR_NO_UNICODE 132
/* Generated pattern fragments */
@ -651,34 +651,34 @@ typedef struct pcre2_output_context {
/* Write a character into the output.
Arguments:
context the bash glob context
out output context
chr the next character
*/
static void
convert_glob_bash_write(pcre2_output_context *context, PCRE2_UCHAR chr)
convert_glob_bash_write(pcre2_output_context *out, PCRE2_UCHAR chr)
{
context->output_size++;
out->output_size++;
if (context->output < context->output_end)
*context->output++ = chr;
if (out->output < out->output_end)
*out->output++ = chr;
}
/* Write a string into the output.
Arguments:
context the bash glob context
length length of context->out_str
out output context
length length of out->out_str
*/
static void
convert_glob_bash_write_str(pcre2_output_context *context, PCRE2_SIZE length)
convert_glob_bash_write_str(pcre2_output_context *out, PCRE2_SIZE length)
{
uint8_t *out_str = context->out_str;
PCRE2_UCHAR *output = context->output;
PCRE2_SPTR output_end = context->output_end;
PCRE2_SIZE output_size = context->output_size;
uint8_t *out_str = out->out_str;
PCRE2_UCHAR *output = out->output;
PCRE2_SPTR output_end = out->output_end;
PCRE2_SIZE output_size = out->output_size;
do
{
@ -689,17 +689,219 @@ do
}
while (--length != 0);
context->output = output;
context->output_size = output_size;
out->output = output;
out->output_size = output_size;
}
/* Parse a POSIX class name (the part following "[:") and copy it to the
output as "[:name:]".

Arguments:
  from           on entry, points to the first character after "[:"
                 on success, updated to point just past the closing ":]"
                 on failure, updated to the position reported as the
                   error offset
  pattern_end    end of pattern
  out            output context

Returns:         0 => success
                 ERROR_MISSING_SQUARE_BRACKET => the name is not terminated
                   by ":]" or contains a character outside a-z
                 ERROR_UNKNOWN_POSIX_CLASS => well formed, but the name is
                   not in the table of known classes
*/

static int
convert_glob_bash_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
  pcre2_output_context *out)
{
/* Known class names; every entry, including the last, ends with a colon. */
static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:"
  "graph:lower:print:punct:space:upper:word:xdigit:";
PCRE2_SPTR name_start = *from;
PCRE2_SPTR ptr = name_start;
const char *entry;

/* Emit the opening "[:" before the name itself is examined. */

out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_COLON;
convert_glob_bash_write_str(out, 2);

/* Copy the lowercase letters of the name until ":]" is reached. */

for (;;)
  {
  PCRE2_UCHAR ch;

  if (ptr >= pattern_end)
    {
    *from = ptr;
    return ERROR_MISSING_SQUARE_BRACKET;
    }

  ch = *ptr++;

  /* Every POSIX class name consists of lowercase letters only. */

  if (ch >= CHAR_a && ch <= CHAR_z)
    {
    convert_glob_bash_write(out, ch);
    continue;
    }

  if (ch == CHAR_COLON && ptr < pattern_end &&
      *ptr == CHAR_RIGHT_SQUARE_BRACKET)
    break;

  /* Anything else, including a colon not followed by ']', is malformed. */

  *from = ptr;
  return ERROR_MISSING_SQUARE_BRACKET;
  }

/* Here ptr points at the ']' of the terminating ":]". */

*from = ptr + 1;

/* Compare the name, including its terminating colon, with each table
entry in turn. */

for (entry = posix_classes; *entry != CHAR_NULL; entry++)
  {
  ptr = name_start;

  while (*ptr == (PCRE2_UCHAR) *entry)
    {
    if (*ptr == CHAR_COLON)
      {
      /* The whole name matched: emit the closing ":]". */
      out->out_str[0] = CHAR_COLON;
      out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET;
      convert_glob_bash_write_str(out, 2);
      return 0;
      }
    ptr++;
    entry++;
    }

  /* Mismatch: move to the colon that ends this entry; the loop increment
  then steps over it to the start of the next entry. */

  while (*entry != CHAR_COLON) entry++;
  }

return ERROR_UNKNOWN_POSIX_CLASS;
}
/* Parse a glob bracket expression and write the equivalent regex class.

A leading '!' or '^' negates the class. A negated class is written as
"[^<sep>...]"; any other class is written as "(?!<sep>)[...]". In both
forms <sep> is the separator, preceded by a backslash when it is listed in
pcre2_escaped_literals.

A '-' or ']' in the first position of the class is written escaped. Inside
the class a backslash makes the following character literal, "[:" starts a
POSIX class, and '[', ']', '\' and '-' are always written with a preceding
backslash. Because '-' is always escaped, it reaches the output as a
literal character and never as a range operator.

Arguments:
  from           on entry, points to the first character of the bracket
                   expression body (the character checked for '!' or '^')
                 on success, updated to point just past the closing ']'
                 on failure, updated to the position reported as the
                   error offset
  pattern_end    end of pattern
  out            output context
  separator      glob separator

Returns:         0 => success
                !0 => error code
*/

static int
convert_glob_bash_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
  pcre2_output_context *out, PCRE2_UCHAR separator)
{
PCRE2_SPTR ptr = *from;
PCRE2_UCHAR ch;
int negated;
int prefix_len = 0;
int rc;

if (ptr >= pattern_end)
  {
  *from = ptr;
  return ERROR_MISSING_SQUARE_BRACKET;
  }

ch = *ptr;
negated = (ch == CHAR_EXCLAMATION_MARK || ch == CHAR_CIRCUMFLEX_ACCENT);

/* Build the prefix that keeps the separator out of the class. */

if (negated)
  {
  out->out_str[prefix_len++] = CHAR_LEFT_SQUARE_BRACKET;
  out->out_str[prefix_len++] = CHAR_CIRCUMFLEX_ACCENT;
  }
else
  {
  out->out_str[prefix_len++] = CHAR_LEFT_PARENTHESIS;
  out->out_str[prefix_len++] = CHAR_QUESTION_MARK;
  out->out_str[prefix_len++] = CHAR_EXCLAMATION_MARK;
  }

if (separator < 128 && strchr(pcre2_escaped_literals, separator) != NULL)
  out->out_str[prefix_len++] = CHAR_BACKSLASH;

convert_glob_bash_write_str(out, prefix_len);
convert_glob_bash_write(out, separator);

if (!negated)
  {
  /* Close the lookahead and open the class itself. */
  out->out_str[0] = CHAR_RIGHT_PARENTHESIS;
  out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET;
  convert_glob_bash_write_str(out, 2);
  }
else
  {
  /* Step over the negation character and fetch the first class member. */
  ptr++;
  if (ptr >= pattern_end)
    {
    *from = ptr;
    return ERROR_MISSING_SQUARE_BRACKET;
    }
  ch = *ptr;
  }

/* A '-' or ']' in the first position is a literal member of the class. */

if (ch == CHAR_MINUS || ch == CHAR_RIGHT_SQUARE_BRACKET)
  {
  convert_glob_bash_write(out, CHAR_BACKSLASH);
  convert_glob_bash_write(out, ch);
  ptr++;
  }

while (ptr < pattern_end)
  {
  ch = *ptr++;

  if (ch == CHAR_RIGHT_SQUARE_BRACKET)
    {
    convert_glob_bash_write(out, ch);
    *from = ptr;
    return 0;
    }

  if (ch == CHAR_LEFT_SQUARE_BRACKET && ptr < pattern_end &&
      *ptr == CHAR_COLON)
    {
    /* "[:" starts a POSIX class; the class parser begins after the colon
    and leaves *from just past the closing ":]" on success. */
    *from = ptr + 1;
    rc = convert_glob_bash_parse_class(from, pattern_end, out);
    if (rc != 0) return rc;
    ptr = *from;
    continue;
    }

  if (ch == CHAR_BACKSLASH)
    {
    /* A backslash makes the next character literal. */
    if (ptr >= pattern_end)
      {
      *from = ptr;
      return ERROR_END_BACKSLASH;
      }
    ch = *ptr++;
    }

  /* These characters are written with a preceding backslash. */

  switch (ch)
    {
    case CHAR_LEFT_SQUARE_BRACKET:
    case CHAR_RIGHT_SQUARE_BRACKET:
    case CHAR_BACKSLASH:
    case CHAR_MINUS:
    convert_glob_bash_write(out, CHAR_BACKSLASH);
    break;

    default:
    break;
    }

  convert_glob_bash_write(out, ch);
  }

/* The pattern ended before the closing ']' was found. */

*from = ptr;
return ERROR_MISSING_SQUARE_BRACKET;
}
/* Prints a wildcard into the output.
Arguments:
context the bash glob context
out output context
separator glob separator
after_sep whether the wildcard is right after a separator
*/
static void
@ -711,7 +913,7 @@ int len = 2;
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
if (separator == CHAR_BACKSLASH)
if (separator < 128 && strchr(pcre2_escaped_literals, separator) != NULL)
{
out->out_str[2] = CHAR_BACKSLASH;
len = 3;
@ -749,8 +951,8 @@ convert_glob_bash(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength,
pcre2_output_context out;
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
int result;
PCRE2_UCHAR c;
int result;
/* Initialize default for error offset as end of input. */
out.output = use_buffer;
@ -800,6 +1002,14 @@ while (pattern < pattern_end)
continue;
}
if (c == CHAR_LEFT_SQUARE_BRACKET)
{
result = convert_glob_bash_parse_range(&pattern, pattern_end,
&out, ccontext->glob_separator);
if (result != 0) break;
continue;
}
if (c == CHAR_BACKSLASH)
{
if (pattern >= pattern_end)

16
testdata/testinput24 vendored
View File

@ -228,6 +228,22 @@
/?a?\/?b?/
/[a\\b\c][]][-][\]\-]/
/[^a\\b\c][!]][!-][^\]\-]/
/[[:alpha:][:xdigit:][:word:]]/
/[[:alpha:/
/[[:alpha:]/
/[[:alphaa:]]/
/[[:xdigi:]]/
/[[:xdigit::]]/
#pattern convert=unset
#pattern convert=posix_extended

24
testdata/testoutput24 vendored
View File

@ -359,6 +359,30 @@ No match
/?a?\/?b?/
(?s)\A[^/]a[^/]/[^/]b[^/]\z
/[a\\b\c][]][-][\]\-]/
(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z
/[^a\\b\c][!]][!-][^\]\-]/
(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z
/[[:alpha:][:xdigit:][:word:]]/
(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z
/[[:alpha:/
** Pattern conversion error at offset 9: missing terminating ] for character class
/[[:alpha:]/
** Pattern conversion error at offset 10: missing terminating ] for character class
/[[:alphaa:]]/
** Pattern conversion error at offset 11: unknown POSIX class name
/[[:xdigi:]]/
** Pattern conversion error at offset 10: unknown POSIX class name
/[[:xdigit::]]/
** Pattern conversion error at offset 10: missing terminating ] for character class
#pattern convert=unset
#pattern convert=posix_extended