Rework character range parsing in glob conversion.

This commit is contained in:
Zoltán Herczeg 2017-05-24 10:14:43 +00:00
parent 638c74f12b
commit 9826db624e
3 changed files with 215 additions and 134 deletions

View File

@ -423,6 +423,47 @@ out->output_size = output_size;
}
/* Emits the glob separator character into the output, optionally
preceded by a backslash.

Arguments:
  out            output context
  separator      glob separator
  with_escape    TRUE when a backslash has to be written before the separator
*/

static void
convert_glob_print_separator(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* The escaping backslash, when requested, goes first. */
if (with_escape)
  {
  convert_glob_write(out, CHAR_BACKSLASH);
  }

convert_glob_write(out, separator);
}
/* Emits a wildcard into the output. The wildcard is written as a negated
character class holding only the separator: "[^", then the (possibly
backslash-escaped) separator, then "]".

Arguments:
  out            output context
  separator      glob separator
  with_escape    TRUE when a backslash has to be written before the separator
*/

static void
convert_glob_print_wildcard(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* Opening of the negated class: the two characters are staged in the
context's out_str buffer and written with a single call. */
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
convert_glob_write_str(out, 2);

/* The only member of the class is the separator itself. */
convert_glob_print_separator(out, separator, with_escape);

/* Close the class. */
convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
}
/* Parse a posix class.
Arguments:
@ -519,77 +560,89 @@ Returns: 0 => success
static int
convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
pcre2_output_context *out, PCRE2_UCHAR separator, BOOL with_escape)
pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
{
BOOL is_negative = FALSE;
BOOL separator_seen = FALSE;
BOOL has_prev_c;
PCRE2_SPTR pattern = *from;
PCRE2_UCHAR c;
PCRE2_SPTR char_start = NULL;
uint32_t c, prev_c;
int result, len;
(void)utf; /* Avoid compiler warning. */
if (pattern >= pattern_end)
{
*from = pattern;
return ERROR_MISSING_SQUARE_BRACKET;
}
c = *pattern;
if (c == CHAR_EXCLAMATION_MARK
|| c == CHAR_CIRCUMFLEX_ACCENT)
if (*pattern == CHAR_EXCLAMATION_MARK
|| *pattern == CHAR_CIRCUMFLEX_ACCENT)
{
pattern++;
if (pattern >= pattern_end)
{
*from = pattern;
return ERROR_MISSING_SQUARE_BRACKET;
}
is_negative = TRUE;
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
len = 2;
}
else
{
out->out_str[0] = CHAR_LEFT_PARENTHESIS;
out->out_str[1] = CHAR_QUESTION_MARK;
out->out_str[2] = CHAR_EXCLAMATION_MARK;
len = 3;
}
if (!no_wildsep)
{
if (with_escape)
{
out->out_str[len] = CHAR_BACKSLASH;
len++;
}
out->out_str[len] = (uint8_t) separator;
}
convert_glob_write_str(out, len + 1);
if (c == CHAR_EXCLAMATION_MARK
|| c == CHAR_CIRCUMFLEX_ACCENT)
{
pattern++;
if (pattern >= pattern_end)
{
*from = pattern;
return ERROR_MISSING_SQUARE_BRACKET;
}
c = *pattern;
}
else
{
out->out_str[0] = CHAR_RIGHT_PARENTHESIS;
out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET;
convert_glob_write_str(out, 2);
}
convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET)
has_prev_c = FALSE;
prev_c = 0;
if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
{
convert_glob_write(out, CHAR_BACKSLASH);
convert_glob_write(out, c);
convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
has_prev_c = TRUE;
prev_c = CHAR_RIGHT_SQUARE_BRACKET;
pattern++;
}
while (pattern < pattern_end)
{
c = *pattern++;
char_start = pattern;
GETCHARINCTEST(c, pattern);
if (c == CHAR_RIGHT_SQUARE_BRACKET)
{
convert_glob_write(out, c);
if (!is_negative && !no_wildsep && separator_seen)
{
out->out_str[0] = CHAR_LEFT_PARENTHESIS;
out->out_str[1] = CHAR_QUESTION_MARK;
out->out_str[2] = CHAR_LESS_THAN_SIGN;
out->out_str[3] = CHAR_EXCLAMATION_MARK;
convert_glob_write_str(out, 4);
convert_glob_print_separator(out, separator, with_escape);
convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
}
*from = pattern;
return 0;
}
@ -605,32 +658,64 @@ while (pattern < pattern_end)
pattern = *from;
/* A dash after a character class is a normal character. */
if (pattern >= pattern_end || *pattern != CHAR_MINUS)
has_prev_c = FALSE;
prev_c = 0;
separator_seen = TRUE;
continue;
c = CHAR_MINUS;
pattern++;
}
else if (c == CHAR_MINUS)
else if (c == CHAR_MINUS && has_prev_c &&
*pattern != CHAR_RIGHT_SQUARE_BRACKET)
{
convert_glob_write(out, CHAR_MINUS);
c = *pattern++;
if (c == CHAR_BACKSLASH)
{
char_start = pattern;
GETCHARINCTEST(c, pattern);
if (pattern >= pattern_end) break;
c = *pattern++;
if (escape != 0 && c == escape)
{
char_start = pattern;
GETCHARINCTEST(c, pattern);
}
else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
{
*from = pattern;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
if (prev_c > c)
{
*from = pattern;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
if (prev_c < separator && separator < c) separator_seen = TRUE;
has_prev_c = FALSE;
prev_c = 0;
}
else
{
if (escape != 0 && c == escape)
{
char_start = pattern;
GETCHARINCTEST(c, pattern);
if (pattern >= pattern_end) break;
}
has_prev_c = TRUE;
prev_c = c;
}
else if (c == CHAR_BACKSLASH)
c = *pattern++;
if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
c == CHAR_BACKSLASH || c == CHAR_MINUS)
convert_glob_write(out, CHAR_BACKSLASH);
convert_glob_write(out, c);
if (c == separator) separator_seen = TRUE;
do convert_glob_write(out, *char_start++); while (char_start < pattern);
}
*from = pattern;
@ -638,47 +723,6 @@ return ERROR_MISSING_SQUARE_BRACKET;
}
/* Emits the glob separator character into the output, optionally
preceded by a backslash.

Arguments:
  out            output context
  separator      glob separator
  with_escape    TRUE when a backslash has to be written before the separator
*/

static void
convert_glob_print_separator(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* The escaping backslash, when requested, goes first. */
if (with_escape)
  {
  convert_glob_write(out, CHAR_BACKSLASH);
  }

convert_glob_write(out, separator);
}
/* Emits a wildcard into the output. The wildcard is written as a negated
character class holding only the separator: "[^", then the (possibly
backslash-escaped) separator, then "]".

Arguments:
  out            output context
  separator      glob separator
  with_escape    TRUE when a backslash has to be written before the separator
*/

static void
convert_glob_print_wildcard(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* Opening of the negated class: the two characters are staged in the
context's out_str buffer and written with a single call. */
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
convert_glob_write_str(out, 2);

/* The only member of the class is the separator itself. */
convert_glob_print_separator(out, separator, with_escape);

/* Close the class. */
convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
}
/* Prints a (*COMMIT) into the output.
Arguments:
@ -727,8 +771,8 @@ pcre2_output_context out;
PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength;
PCRE2_UCHAR separator = ccontext->glob_separator;
PCRE2_UCHAR escape = ccontext->glob_escape;
PCRE2_UCHAR c;
BOOL no_escape = ccontext->glob_escape == 0;
BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
BOOL in_atomic = FALSE;
@ -736,14 +780,16 @@ BOOL after_starstar = FALSE;
BOOL with_escape, is_start;
int result, len;
(void)utf; /* Avoid compiler warning */
(void)utf; /* Avoid compiler warning. */
if (separator >= 128)
#ifdef SUPPORT_UNICODE
if (utf && (separator >= 128 || escape >= 128))
{
/* Currently only ASCII separators are supported. */
/* Currently only ASCII characters are supported. */
*bufflenptr = 0;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
#endif
with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
@ -809,7 +855,7 @@ while (pattern < pattern_end)
break;
}
if (!no_escape && *pattern == ccontext->glob_escape)
if (escape != 0 && *pattern == escape)
{
pattern++;
if (pattern >= pattern_end)
@ -908,6 +954,8 @@ while (pattern < pattern_end)
out.out_str[0] = CHAR_ASTERISK;
out.out_str[1] = CHAR_QUESTION_MARK;
if (pattern >= pattern_end)
out.out_str[1] = CHAR_PLUS;
convert_glob_write_str(&out, 2);
continue;
}
@ -924,12 +972,12 @@ while (pattern < pattern_end)
if (c == CHAR_LEFT_SQUARE_BRACKET)
{
result = convert_glob_parse_range(&pattern, pattern_end,
&out, separator, with_escape);
&out, utf, separator, with_escape, escape, no_wildsep);
if (result != 0) break;
continue;
}
if (!no_escape && c == ccontext->glob_escape)
if (escape != 0 && c == escape)
{
if (pattern >= pattern_end)
{

12
testdata/testinput24 vendored
View File

@ -227,6 +227,18 @@
/[[:alpha:][:xdigit:][:word:]]/
"[/-/]"
/[-----]/
/[------]/
/[!------]/
/[[:alpha:]-a]/
/[a-[:alpha:]]/
/[[:alpha:/
/[[:alpha:]/

83
testdata/testoutput24 vendored
View File

@ -22,10 +22,10 @@
# Can't have separator in a class
"[ab/cd]"
(?s)\A(?!/)[ab/cd]\z
(?s)\A[ab/cd](?<!/)\z
"[,-/]"
(?s)\A(?!/)[,-/]\z
(?s)\A[,-/](?<!/)\z
/[ab/
** Pattern conversion error at offset 3: missing terminating ] for character class
@ -41,7 +41,7 @@
# Now some actual tests
/a?b[]xy]*c/
(?s)\Aa[^/]b(?!/)[\]xy](*COMMIT)[^/]*?c\z
(?s)\Aa[^/]b[]xy](*COMMIT)[^/]*?c\z
azb]1234c
0: azb]1234c
@ -70,14 +70,14 @@ No match
No match
/*/
(?s)\A[^/]*?\z
(?s)\A[^/]*+\z
foo
0: foo
\
0:
/f*/
(?s)\Af(*COMMIT)[^/]*?\z
(?s)\Af(*COMMIT)[^/]*+\z
foo
0: foo
f
@ -92,7 +92,7 @@ No match
No match
/*foo*/
(?s)\A[^/]*?foo(*COMMIT)[^/]*?\z
(?s)\A[^/]*?foo(*COMMIT)[^/]*+\z
foo
0: foo
food
@ -101,7 +101,7 @@ No match
0: aprilfool
/*ob*a*r*/
(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*?\z
(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*+\z
foobar
0: foobar
@ -127,38 +127,41 @@ No match
0: f\oo
/*[al]?/
(?s)\A[^/]*?(?!/)[al][^/]\z
(?s)\A[^/]*?[al][^/]\z
ball
0: ball
/[ten]/
(?s)\A(?!/)[ten]\z
(?s)\A[ten]\z
\= Expect no match
ten
No match
/t[a-g]n/
(?s)\At(?!/)[a-g]n\z
(?s)\At[a-g]n\z
ten
0: ten
/a[]]b/
(?s)\Aa(?!/)[\]]b\z
(?s)\Aa[]]b\z
a]b
0: a]b
/a[]a-]b/
** Pattern conversion error at offset 7: missing terminating ] for character class
(?s)\Aa[]a\-]b\z
/a[]-]b/
** Pattern conversion error at offset 6: missing terminating ] for character class
(?s)\Aa[]\-]b\z
a-b
0: a-b
a]b
0: a]b
\= Expect no match
aab
No match
/a[]a-z]b/
(?s)\Aa(?!/)[\]a-z]b\z
(?s)\Aa[]a-z]b\z
aab
0: aab
@ -176,12 +179,12 @@ No match
No match
'[[:alpha:]][[:digit:]][[:upper:]]'
(?s)\A(?!/)[[:alpha:]](?!/)[[:digit:]](?!/)[[:upper:]]\z
(?s)\A[[:alpha:]](?<!/)[[:digit:]](?<!/)[[:upper:]](?<!/)\z
a1B
0: a1B
'[[:digit:][:upper:][:space:]]'
(?s)\A(?!/)[[:digit:][:upper:][:space:]]\z
(?s)\A[[:digit:][:upper:][:space:]](?<!/)\z
A
0: A
1
@ -195,7 +198,7 @@ No match
No match
'[a-c[:digit:]x-z]'
(?s)\A(?!/)[a-c[:digit:]x-z]\z
(?s)\A[a-c[:digit:]x-z](?<!/)\z
5
0: 5
b
@ -221,7 +224,7 @@ No match
No match
/A[+-0]B/
(?s)\AA(?!/)[+-0]B\z
(?s)\AA[+-0](?<!/)B\z
A+B
0: A+B
A.B
@ -249,7 +252,7 @@ No match
0: .xyz
"[,-0]x?z"
(?s)\A(?!/)[,-0]x[^/]z\z
(?s)\A[,-0](?<!/)x[^/]z\z
,xyz
0: ,xyz
\= Expect no match
@ -259,12 +262,12 @@ No match
0: .xyz
".x*"
(?s)\A\.x(*COMMIT)[^/]*?\z
(?s)\A\.x(*COMMIT)[^/]*+\z
.xabc
0: .xabc
/a[--0]z/
(?s)\Aa(?!/)[\--0]z\z
(?s)\Aa[\--0](?<!/)z\z
a-z
0: a-z
a.z
@ -278,7 +281,7 @@ No match
No match
/<[a-c-d]>/
(?s)\A<(?!/)[a-c-d]>\z
(?s)\A<[a-c\-d]>\z
<a>
0: <a>
<b>
@ -291,7 +294,7 @@ No match
0: <->
/a[[:digit:].]z/
(?s)\Aa(?!/)[[:digit:].]z\z
(?s)\Aa[[:digit:].](?<!/)z\z
a1z
0: a1z
a.z
@ -334,19 +337,37 @@ No match
(?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z
/*a*\/*b*/
(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z
(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*+\z
/?a?\/?b?/
(?s)\A[^/]a[^/]/[^/]b[^/]\z
/[a\\b\c][]][-][\]\-]/
(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z
(?s)\A[a\\bc][]][\-][\]\-]\z
/[^a\\b\c][!]][!-][^\]\-]/
(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z
(?s)\A[^/a\\bc][^/]][^/\-][^/\]\-]\z
/[[:alpha:][:xdigit:][:word:]]/
(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z
(?s)\A[[:alpha:][:xdigit:][:word:]](?<!/)\z
"[/-/]"
(?s)\A[/-/](?<!/)\z
/[-----]/
(?s)\A[\--\-\-\-]\z
/[------]/
(?s)\A[\--\-\--\-]\z
/[!------]/
(?s)\A[^/\--\-\--\-]\z
/[[:alpha:]-a]/
(?s)\A[[:alpha:]\-a](?<!/)\z
/[a-[:alpha:]]/
** Pattern conversion error at offset 4: invalid syntax
/[[:alpha:/
** Pattern conversion error at offset 9: missing terminating ] for character class
@ -386,14 +407,14 @@ No match
0: /xax/
/**\/*a*/
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?\z)
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*+\z)
xx/xx/xx/xax
0: /xax
xx/xx/xx/xax/xx
No match
/**\/*a*\/**\/*b*/
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*?\z)
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*+\z)
xx/xx/xx/xax/xx/xb
0: /xax/xx/xb
xx/xx/xx/xax/xx/x
@ -402,10 +423,10 @@ No match
#pattern convert=glob:glob_no_starstar
/***/
(?s)\A[^/]*?\z
(?s)\A[^/]*+\z
/**a**/
(?s)\A[^/]*?a(*COMMIT)[^/]*?\z
(?s)\A[^/]*?a(*COMMIT)[^/]*+\z
#pattern convert=unset
#pattern convert=glob:glob_no_wild_separator