Rework character range parsing in glob conversion.

This commit is contained in:
Zoltán Herczeg 2017-05-24 10:14:43 +00:00
parent 638c74f12b
commit 9826db624e
3 changed files with 215 additions and 134 deletions

View File

@ -423,6 +423,47 @@ out->output_size = output_size;
} }
/* Writes the glob separator to the output, optionally escaped.

When with_escape is set, a backslash is written first so that the
separator that follows is emitted in escaped form.

Arguments:
  out            output context
  separator      glob separator
  with_escape    backslash is needed before separator
*/

static void
convert_glob_print_separator(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
if (with_escape)
  {
  convert_glob_write(out, CHAR_BACKSLASH);
  }

convert_glob_write(out, separator);
}
/* Writes a wildcard to the output as a negated class holding only the
separator: '[', '^', the (optionally escaped) separator, then ']'.

Arguments:
  out            output context
  separator      glob separator
  with_escape    backslash is needed before separator
*/

static void
convert_glob_print_wildcard(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* Stage the two opening characters in the context's out_str buffer and
hand them over with an explicit length of 2. */

out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
convert_glob_write_str(out, 2);

/* The only member of the class is the separator itself. */

convert_glob_print_separator(out, separator, with_escape);

/* Close the class. */

convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
}
/* Parse a posix class. /* Parse a posix class.
Arguments: Arguments:
@ -519,77 +560,89 @@ Returns: 0 => success
static int static int
convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end,
pcre2_output_context *out, PCRE2_UCHAR separator, BOOL with_escape) pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator,
BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep)
{ {
BOOL is_negative = FALSE;
BOOL separator_seen = FALSE;
BOOL has_prev_c;
PCRE2_SPTR pattern = *from; PCRE2_SPTR pattern = *from;
PCRE2_UCHAR c; PCRE2_SPTR char_start = NULL;
uint32_t c, prev_c;
int result, len; int result, len;
(void)utf; /* Avoid compiler warning. */
if (pattern >= pattern_end) if (pattern >= pattern_end)
{ {
*from = pattern; *from = pattern;
return ERROR_MISSING_SQUARE_BRACKET; return ERROR_MISSING_SQUARE_BRACKET;
} }
c = *pattern; if (*pattern == CHAR_EXCLAMATION_MARK
|| *pattern == CHAR_CIRCUMFLEX_ACCENT)
if (c == CHAR_EXCLAMATION_MARK
|| c == CHAR_CIRCUMFLEX_ACCENT)
{
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
len = 2;
}
else
{
out->out_str[0] = CHAR_LEFT_PARENTHESIS;
out->out_str[1] = CHAR_QUESTION_MARK;
out->out_str[2] = CHAR_EXCLAMATION_MARK;
len = 3;
}
if (with_escape)
{
out->out_str[len] = CHAR_BACKSLASH;
len++;
}
out->out_str[len] = (uint8_t) separator;
convert_glob_write_str(out, len + 1);
if (c == CHAR_EXCLAMATION_MARK
|| c == CHAR_CIRCUMFLEX_ACCENT)
{ {
pattern++; pattern++;
if (pattern >= pattern_end) if (pattern >= pattern_end)
{ {
*from = pattern; *from = pattern;
return ERROR_MISSING_SQUARE_BRACKET; return ERROR_MISSING_SQUARE_BRACKET;
} }
c = *pattern;
} is_negative = TRUE;
else
out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
len = 2;
if (!no_wildsep)
{ {
out->out_str[0] = CHAR_RIGHT_PARENTHESIS; if (with_escape)
out->out_str[1] = CHAR_LEFT_SQUARE_BRACKET; {
convert_glob_write_str(out, 2); out->out_str[len] = CHAR_BACKSLASH;
len++;
}
out->out_str[len] = (uint8_t) separator;
} }
if (c == CHAR_MINUS || c == CHAR_RIGHT_SQUARE_BRACKET) convert_glob_write_str(out, len + 1);
}
else
convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET);
has_prev_c = FALSE;
prev_c = 0;
if (*pattern == CHAR_RIGHT_SQUARE_BRACKET)
{ {
convert_glob_write(out, CHAR_BACKSLASH); convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
convert_glob_write(out, c); has_prev_c = TRUE;
prev_c = CHAR_RIGHT_SQUARE_BRACKET;
pattern++; pattern++;
} }
while (pattern < pattern_end) while (pattern < pattern_end)
{ {
c = *pattern++; char_start = pattern;
GETCHARINCTEST(c, pattern);
if (c == CHAR_RIGHT_SQUARE_BRACKET) if (c == CHAR_RIGHT_SQUARE_BRACKET)
{ {
convert_glob_write(out, c); convert_glob_write(out, c);
if (!is_negative && !no_wildsep && separator_seen)
{
out->out_str[0] = CHAR_LEFT_PARENTHESIS;
out->out_str[1] = CHAR_QUESTION_MARK;
out->out_str[2] = CHAR_LESS_THAN_SIGN;
out->out_str[3] = CHAR_EXCLAMATION_MARK;
convert_glob_write_str(out, 4);
convert_glob_print_separator(out, separator, with_escape);
convert_glob_write(out, CHAR_RIGHT_PARENTHESIS);
}
*from = pattern; *from = pattern;
return 0; return 0;
} }
@ -605,32 +658,64 @@ while (pattern < pattern_end)
pattern = *from; pattern = *from;
/* A dash after a character class is a normal character. */ has_prev_c = FALSE;
if (pattern >= pattern_end || *pattern != CHAR_MINUS) prev_c = 0;
separator_seen = TRUE;
continue; continue;
c = CHAR_MINUS;
pattern++;
} }
else if (c == CHAR_MINUS) else if (c == CHAR_MINUS && has_prev_c &&
*pattern != CHAR_RIGHT_SQUARE_BRACKET)
{ {
convert_glob_write(out, CHAR_MINUS); convert_glob_write(out, CHAR_MINUS);
c = *pattern++;
if (c == CHAR_BACKSLASH) char_start = pattern;
{ GETCHARINCTEST(c, pattern);
if (pattern >= pattern_end) break; if (pattern >= pattern_end) break;
c = *pattern++;
if (escape != 0 && c == escape)
{
char_start = pattern;
GETCHARINCTEST(c, pattern);
} }
else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON)
{
*from = pattern;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
if (prev_c > c)
{
*from = pattern;
return PCRE2_ERROR_CONVERT_SYNTAX;
}
if (prev_c < separator && separator < c) separator_seen = TRUE;
has_prev_c = FALSE;
prev_c = 0;
}
else
{
if (escape != 0 && c == escape)
{
char_start = pattern;
GETCHARINCTEST(c, pattern);
if (pattern >= pattern_end) break;
}
has_prev_c = TRUE;
prev_c = c;
} }
else if (c == CHAR_BACKSLASH)
c = *pattern++;
if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET ||
c == CHAR_BACKSLASH || c == CHAR_MINUS) c == CHAR_BACKSLASH || c == CHAR_MINUS)
convert_glob_write(out, CHAR_BACKSLASH); convert_glob_write(out, CHAR_BACKSLASH);
convert_glob_write(out, c); if (c == separator) separator_seen = TRUE;
do convert_glob_write(out, *char_start++); while (char_start < pattern);
} }
*from = pattern; *from = pattern;
@ -638,47 +723,6 @@ return ERROR_MISSING_SQUARE_BRACKET;
} }
/* Writes the glob separator to the output, optionally escaped.

When with_escape is set, a backslash is written first so that the
separator that follows is emitted in escaped form.

Arguments:
  out            output context
  separator      glob separator
  with_escape    backslash is needed before separator
*/

static void
convert_glob_print_separator(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
if (with_escape)
  {
  convert_glob_write(out, CHAR_BACKSLASH);
  }

convert_glob_write(out, separator);
}
/* Writes a wildcard to the output as a negated class holding only the
separator: '[', '^', the (optionally escaped) separator, then ']'.

Arguments:
  out            output context
  separator      glob separator
  with_escape    backslash is needed before separator
*/

static void
convert_glob_print_wildcard(pcre2_output_context *out,
  PCRE2_UCHAR separator, BOOL with_escape)
{
/* Stage the two opening characters in the context's out_str buffer and
hand them over with an explicit length of 2. */

out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET;
out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT;
convert_glob_write_str(out, 2);

/* The only member of the class is the separator itself. */

convert_glob_print_separator(out, separator, with_escape);

/* Close the class. */

convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET);
}
/* Prints a (*COMMIT) into the output. /* Prints a (*COMMIT) into the output.
Arguments: Arguments:
@ -727,8 +771,8 @@ pcre2_output_context out;
PCRE2_SPTR pattern_start = pattern; PCRE2_SPTR pattern_start = pattern;
PCRE2_SPTR pattern_end = pattern + plength; PCRE2_SPTR pattern_end = pattern + plength;
PCRE2_UCHAR separator = ccontext->glob_separator; PCRE2_UCHAR separator = ccontext->glob_separator;
PCRE2_UCHAR escape = ccontext->glob_escape;
PCRE2_UCHAR c; PCRE2_UCHAR c;
BOOL no_escape = ccontext->glob_escape == 0;
BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0; BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0;
BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0; BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0;
BOOL in_atomic = FALSE; BOOL in_atomic = FALSE;
@ -736,14 +780,16 @@ BOOL after_starstar = FALSE;
BOOL with_escape, is_start; BOOL with_escape, is_start;
int result, len; int result, len;
(void)utf; /* Avoid compiler warning */ (void)utf; /* Avoid compiler warning. */
if (separator >= 128) #ifdef SUPPORT_UNICODE
if (utf && (separator >= 128 || escape >= 128))
{ {
/* Currently only ASCII separators are supported. */ /* Currently only ASCII characters are supported. */
*bufflenptr = 0; *bufflenptr = 0;
return PCRE2_ERROR_CONVERT_SYNTAX; return PCRE2_ERROR_CONVERT_SYNTAX;
} }
#endif
with_escape = strchr(pcre2_escaped_literals, separator) != NULL; with_escape = strchr(pcre2_escaped_literals, separator) != NULL;
@ -809,7 +855,7 @@ while (pattern < pattern_end)
break; break;
} }
if (!no_escape && *pattern == ccontext->glob_escape) if (escape != 0 && *pattern == escape)
{ {
pattern++; pattern++;
if (pattern >= pattern_end) if (pattern >= pattern_end)
@ -908,6 +954,8 @@ while (pattern < pattern_end)
out.out_str[0] = CHAR_ASTERISK; out.out_str[0] = CHAR_ASTERISK;
out.out_str[1] = CHAR_QUESTION_MARK; out.out_str[1] = CHAR_QUESTION_MARK;
if (pattern >= pattern_end)
out.out_str[1] = CHAR_PLUS;
convert_glob_write_str(&out, 2); convert_glob_write_str(&out, 2);
continue; continue;
} }
@ -924,12 +972,12 @@ while (pattern < pattern_end)
if (c == CHAR_LEFT_SQUARE_BRACKET) if (c == CHAR_LEFT_SQUARE_BRACKET)
{ {
result = convert_glob_parse_range(&pattern, pattern_end, result = convert_glob_parse_range(&pattern, pattern_end,
&out, separator, with_escape); &out, utf, separator, with_escape, escape, no_wildsep);
if (result != 0) break; if (result != 0) break;
continue; continue;
} }
if (!no_escape && c == ccontext->glob_escape) if (escape != 0 && c == escape)
{ {
if (pattern >= pattern_end) if (pattern >= pattern_end)
{ {

12
testdata/testinput24 vendored
View File

@ -227,6 +227,18 @@
/[[:alpha:][:xdigit:][:word:]]/ /[[:alpha:][:xdigit:][:word:]]/
"[/-/]"
/[-----]/
/[------]/
/[!------]/
/[[:alpha:]-a]/
/[a-[:alpha:]]/
/[[:alpha:/ /[[:alpha:/
/[[:alpha:]/ /[[:alpha:]/

83
testdata/testoutput24 vendored
View File

@ -22,10 +22,10 @@
# Can't have separator in a class # Can't have separator in a class
"[ab/cd]" "[ab/cd]"
(?s)\A(?!/)[ab/cd]\z (?s)\A[ab/cd](?<!/)\z
"[,-/]" "[,-/]"
(?s)\A(?!/)[,-/]\z (?s)\A[,-/](?<!/)\z
/[ab/ /[ab/
** Pattern conversion error at offset 3: missing terminating ] for character class ** Pattern conversion error at offset 3: missing terminating ] for character class
@ -41,7 +41,7 @@
# Now some actual tests # Now some actual tests
/a?b[]xy]*c/ /a?b[]xy]*c/
(?s)\Aa[^/]b(?!/)[\]xy](*COMMIT)[^/]*?c\z (?s)\Aa[^/]b[]xy](*COMMIT)[^/]*?c\z
azb]1234c azb]1234c
0: azb]1234c 0: azb]1234c
@ -70,14 +70,14 @@ No match
No match No match
/*/ /*/
(?s)\A[^/]*?\z (?s)\A[^/]*+\z
foo foo
0: foo 0: foo
\ \
0: 0:
/f*/ /f*/
(?s)\Af(*COMMIT)[^/]*?\z (?s)\Af(*COMMIT)[^/]*+\z
foo foo
0: foo 0: foo
f f
@ -92,7 +92,7 @@ No match
No match No match
/*foo*/ /*foo*/
(?s)\A[^/]*?foo(*COMMIT)[^/]*?\z (?s)\A[^/]*?foo(*COMMIT)[^/]*+\z
foo foo
0: foo 0: foo
food food
@ -101,7 +101,7 @@ No match
0: aprilfool 0: aprilfool
/*ob*a*r*/ /*ob*a*r*/
(?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*?\z (?s)\A[^/]*?ob(*COMMIT)[^/]*?a(*COMMIT)[^/]*?r(*COMMIT)[^/]*+\z
foobar foobar
0: foobar 0: foobar
@ -127,38 +127,41 @@ No match
0: f\oo 0: f\oo
/*[al]?/ /*[al]?/
(?s)\A[^/]*?(?!/)[al][^/]\z (?s)\A[^/]*?[al][^/]\z
ball ball
0: ball 0: ball
/[ten]/ /[ten]/
(?s)\A(?!/)[ten]\z (?s)\A[ten]\z
\= Expect no match \= Expect no match
ten ten
No match No match
/t[a-g]n/ /t[a-g]n/
(?s)\At(?!/)[a-g]n\z (?s)\At[a-g]n\z
ten ten
0: ten 0: ten
/a[]]b/ /a[]]b/
(?s)\Aa(?!/)[\]]b\z (?s)\Aa[]]b\z
a]b a]b
0: a]b 0: a]b
/a[]a-]b/ /a[]a-]b/
** Pattern conversion error at offset 7: missing terminating ] for character class (?s)\Aa[]a\-]b\z
/a[]-]b/ /a[]-]b/
** Pattern conversion error at offset 6: missing terminating ] for character class (?s)\Aa[]\-]b\z
a-b a-b
0: a-b
a]b a]b
0: a]b
\= Expect no match \= Expect no match
aab aab
No match
/a[]a-z]b/ /a[]a-z]b/
(?s)\Aa(?!/)[\]a-z]b\z (?s)\Aa[]a-z]b\z
aab aab
0: aab 0: aab
@ -176,12 +179,12 @@ No match
No match No match
'[[:alpha:]][[:digit:]][[:upper:]]' '[[:alpha:]][[:digit:]][[:upper:]]'
(?s)\A(?!/)[[:alpha:]](?!/)[[:digit:]](?!/)[[:upper:]]\z (?s)\A[[:alpha:]](?<!/)[[:digit:]](?<!/)[[:upper:]](?<!/)\z
a1B a1B
0: a1B 0: a1B
'[[:digit:][:upper:][:space:]]' '[[:digit:][:upper:][:space:]]'
(?s)\A(?!/)[[:digit:][:upper:][:space:]]\z (?s)\A[[:digit:][:upper:][:space:]](?<!/)\z
A A
0: A 0: A
1 1
@ -195,7 +198,7 @@ No match
No match No match
'[a-c[:digit:]x-z]' '[a-c[:digit:]x-z]'
(?s)\A(?!/)[a-c[:digit:]x-z]\z (?s)\A[a-c[:digit:]x-z](?<!/)\z
5 5
0: 5 0: 5
b b
@ -221,7 +224,7 @@ No match
No match No match
/A[+-0]B/ /A[+-0]B/
(?s)\AA(?!/)[+-0]B\z (?s)\AA[+-0](?<!/)B\z
A+B A+B
0: A+B 0: A+B
A.B A.B
@ -249,7 +252,7 @@ No match
0: .xyz 0: .xyz
"[,-0]x?z" "[,-0]x?z"
(?s)\A(?!/)[,-0]x[^/]z\z (?s)\A[,-0](?<!/)x[^/]z\z
,xyz ,xyz
0: ,xyz 0: ,xyz
\= Expect no match \= Expect no match
@ -259,12 +262,12 @@ No match
0: .xyz 0: .xyz
".x*" ".x*"
(?s)\A\.x(*COMMIT)[^/]*?\z (?s)\A\.x(*COMMIT)[^/]*+\z
.xabc .xabc
0: .xabc 0: .xabc
/a[--0]z/ /a[--0]z/
(?s)\Aa(?!/)[\--0]z\z (?s)\Aa[\--0](?<!/)z\z
a-z a-z
0: a-z 0: a-z
a.z a.z
@ -278,7 +281,7 @@ No match
No match No match
/<[a-c-d]>/ /<[a-c-d]>/
(?s)\A<(?!/)[a-c-d]>\z (?s)\A<[a-c\-d]>\z
<a> <a>
0: <a> 0: <a>
<b> <b>
@ -291,7 +294,7 @@ No match
0: <-> 0: <->
/a[[:digit:].]z/ /a[[:digit:].]z/
(?s)\Aa(?!/)[[:digit:].]z\z (?s)\Aa[[:digit:].](?<!/)z\z
a1z a1z
0: a1z 0: a1z
a.z a.z
@ -334,19 +337,37 @@ No match
(?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z (?s)\A\\\{\}\?\*\+\[\]\(\)\|\.\^\$\z
/*a*\/*b*/ /*a*\/*b*/
(?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*?\z (?s)\A[^/]*?a(*COMMIT)[^/]*?/(*COMMIT)[^/]*?b(*COMMIT)[^/]*+\z
/?a?\/?b?/ /?a?\/?b?/
(?s)\A[^/]a[^/]/[^/]b[^/]\z (?s)\A[^/]a[^/]/[^/]b[^/]\z
/[a\\b\c][]][-][\]\-]/ /[a\\b\c][]][-][\]\-]/
(?s)\A(?!/)[a\\bc](?!/)[\]](?!/)[\-](?!/)[\]\-]\z (?s)\A[a\\bc][]][\-][\]\-]\z
/[^a\\b\c][!]][!-][^\]\-]/ /[^a\\b\c][!]][!-][^\]\-]/
(?s)\A[^/a\\bc][^/\]][^/\-][^/\]\-]\z (?s)\A[^/a\\bc][^/]][^/\-][^/\]\-]\z
/[[:alpha:][:xdigit:][:word:]]/ /[[:alpha:][:xdigit:][:word:]]/
(?s)\A(?!/)[[:alpha:][:xdigit:][:word:]]\z (?s)\A[[:alpha:][:xdigit:][:word:]](?<!/)\z
"[/-/]"
(?s)\A[/-/](?<!/)\z
/[-----]/
(?s)\A[\--\-\-\-]\z
/[------]/
(?s)\A[\--\-\--\-]\z
/[!------]/
(?s)\A[^/\--\-\--\-]\z
/[[:alpha:]-a]/
(?s)\A[[:alpha:]\-a](?<!/)\z
/[a-[:alpha:]]/
** Pattern conversion error at offset 4: invalid syntax
/[[:alpha:/ /[[:alpha:/
** Pattern conversion error at offset 9: missing terminating ] for character class ** Pattern conversion error at offset 9: missing terminating ] for character class
@ -386,14 +407,14 @@ No match
0: /xax/ 0: /xax/
/**\/*a*/ /**\/*a*/
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?\z) (?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*+\z)
xx/xx/xx/xax xx/xx/xx/xax
0: /xax 0: /xax
xx/xx/xx/xax/xx xx/xx/xx/xax/xx
No match No match
/**\/*a*\/**\/*b*/ /**\/*a*\/**\/*b*/
(?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*?\z) (?s)(?:\A|/)(?>[^/]*?a)(?>[^/]*?/)(*COMMIT)(?:.*?/)??(?>[^/]*?b)(?>[^/]*+\z)
xx/xx/xx/xax/xx/xb xx/xx/xx/xax/xx/xb
0: /xax/xx/xb 0: /xax/xx/xb
xx/xx/xx/xax/xx/x xx/xx/xx/xax/xx/x
@ -402,10 +423,10 @@ No match
#pattern convert=glob:glob_no_starstar #pattern convert=glob:glob_no_starstar
/***/ /***/
(?s)\A[^/]*?\z (?s)\A[^/]*+\z
/**a**/ /**a**/
(?s)\A[^/]*?a(*COMMIT)[^/]*?\z (?s)\A[^/]*?a(*COMMIT)[^/]*+\z
#pattern convert=unset #pattern convert=unset
#pattern convert=glob:glob_no_wild_separator #pattern convert=glob:glob_no_wild_separator