Fix mis-parsing of a conditional group with callout but a question mark where
the assertion should start.
This commit is contained in:
parent
482b6a1f0a
commit
6c48775955
|
@ -110,6 +110,10 @@ are noted here for the record.
|
||||||
|
|
||||||
(p) A buffer overflow could occur while sorting the names in the group name
|
(p) A buffer overflow could occur while sorting the names in the group name
|
||||||
list (depending on the order in which the names were seen).
|
list (depending on the order in which the names were seen).
|
||||||
|
|
||||||
|
(q) A conditional group that started with a callout was not doing the right
|
||||||
|
check for a following assertion, leading to compiling bad code. Example:
|
||||||
|
/(?(C'XX))?!XX/
|
||||||
|
|
||||||
4. Back references are now permitted in lookbehind assertions when there are
|
4. Back references are now permitted in lookbehind assertions when there are
|
||||||
no duplicated group numbers (that is, (?| has not been used), and, if the
|
no duplicated group numbers (that is, (?| has not been used), and, if the
|
||||||
|
|
|
@ -2325,7 +2325,7 @@ while (ptr < ptrend)
|
||||||
parsed_pattern = manage_callouts(thisptr, &previous_callout, options,
|
parsed_pattern = manage_callouts(thisptr, &previous_callout, options,
|
||||||
parsed_pattern, cb);
|
parsed_pattern, cb);
|
||||||
PARSED_LITERAL(c, parsed_pattern);
|
PARSED_LITERAL(c, parsed_pattern);
|
||||||
meta_quantifier = 0;
|
meta_quantifier = 0;
|
||||||
}
|
}
|
||||||
continue; /* Next character */
|
continue; /* Next character */
|
||||||
}
|
}
|
||||||
|
@ -2475,13 +2475,16 @@ while (ptr < ptrend)
|
||||||
/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
|
/* If expect_cond_assert is 2, we have just passed (?( and are expecting an
|
||||||
assertion, possibly preceded by a callout. If the value is 1, we have just
|
assertion, possibly preceded by a callout. If the value is 1, we have just
|
||||||
had the callout and expect an assertion. There must be at least 3 more
|
had the callout and expect an assertion. There must be at least 3 more
|
||||||
characters in all cases. We know that the current character is an opening
|
characters in all cases. When expect_cond_assert is 2, we know that the
|
||||||
parenthesis, as otherwise we wouldn't be here. Note that expect_cond_assert
|
current character is an opening parenthesis, as otherwise we wouldn't be
|
||||||
may be negative, since all callouts just decrement it. */
|
here. However, when it is 1, we need to check, and it's easiest just to check
|
||||||
|
always. Note that expect_cond_assert may be negative, since all callouts just
|
||||||
|
decrement it. */
|
||||||
|
|
||||||
if (expect_cond_assert > 0)
|
if (expect_cond_assert > 0)
|
||||||
{
|
{
|
||||||
BOOL ok = ptrend - ptr >= 3 && ptr[0] == CHAR_QUESTION_MARK;
|
BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
|
||||||
|
ptr[0] == CHAR_QUESTION_MARK;
|
||||||
if (ok) switch(ptr[1])
|
if (ok) switch(ptr[1])
|
||||||
{
|
{
|
||||||
case CHAR_C:
|
case CHAR_C:
|
||||||
|
@ -4527,7 +4530,7 @@ Arguments:
|
||||||
|
|
||||||
Returns: 0 There's been an error, *errorcodeptr is non-zero
|
Returns: 0 There's been an error, *errorcodeptr is non-zero
|
||||||
+1 Success, this branch must match at least one character
|
+1 Success, this branch must match at least one character
|
||||||
-1 Success, this branch may match an empty string
|
-1 Success, this branch may match an empty string
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -4538,7 +4541,7 @@ compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||||
{
|
{
|
||||||
int bravalue = 0;
|
int bravalue = 0;
|
||||||
int okreturn = -1;
|
int okreturn = -1;
|
||||||
int group_return = 0;
|
int group_return = 0;
|
||||||
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
|
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
|
||||||
uint32_t greedy_default, greedy_non_default;
|
uint32_t greedy_default, greedy_non_default;
|
||||||
uint32_t repeat_type, op_type;
|
uint32_t repeat_type, op_type;
|
||||||
|
@ -4622,7 +4625,7 @@ for (;; pptr++)
|
||||||
BOOL should_flip_negation;
|
BOOL should_flip_negation;
|
||||||
BOOL match_all_or_no_wide_chars;
|
BOOL match_all_or_no_wide_chars;
|
||||||
BOOL possessive_quantifier;
|
BOOL possessive_quantifier;
|
||||||
BOOL note_group_empty;
|
BOOL note_group_empty;
|
||||||
int class_has_8bitchar;
|
int class_has_8bitchar;
|
||||||
int i;
|
int i;
|
||||||
uint32_t mclength;
|
uint32_t mclength;
|
||||||
|
@ -4687,15 +4690,15 @@ for (;; pptr++)
|
||||||
Checking for the legality of quantifiers happens in parse_regex(), except for
|
Checking for the legality of quantifiers happens in parse_regex(), except for
|
||||||
a quantifier after an assertion that is a condition. */
|
a quantifier after an assertion that is a condition. */
|
||||||
|
|
||||||
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
|
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
|
||||||
{
|
{
|
||||||
previous = code;
|
previous = code;
|
||||||
if (matched_char) okreturn = 1;
|
if (matched_char) okreturn = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
previous_matched_char = matched_char;
|
previous_matched_char = matched_char;
|
||||||
matched_char = FALSE;
|
matched_char = FALSE;
|
||||||
note_group_empty = FALSE;
|
note_group_empty = FALSE;
|
||||||
skipunits = 0; /* Default value for most subgroups */
|
skipunits = 0; /* Default value for most subgroups */
|
||||||
|
|
||||||
switch(meta)
|
switch(meta)
|
||||||
|
@ -4737,7 +4740,7 @@ for (;; pptr++)
|
||||||
repeats. The value of reqcu doesn't change either. */
|
repeats. The value of reqcu doesn't change either. */
|
||||||
|
|
||||||
case META_DOT:
|
case META_DOT:
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
||||||
zerofirstcu = firstcu;
|
zerofirstcu = firstcu;
|
||||||
zerofirstcuflags = firstcuflags;
|
zerofirstcuflags = firstcuflags;
|
||||||
|
@ -4755,7 +4758,7 @@ for (;; pptr++)
|
||||||
|
|
||||||
case META_CLASS_EMPTY:
|
case META_CLASS_EMPTY:
|
||||||
case META_CLASS_EMPTY_NOT:
|
case META_CLASS_EMPTY_NOT:
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
|
*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
|
||||||
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
||||||
zerofirstcu = firstcu;
|
zerofirstcu = firstcu;
|
||||||
|
@ -4778,7 +4781,7 @@ for (;; pptr++)
|
||||||
|
|
||||||
case META_CLASS_NOT:
|
case META_CLASS_NOT:
|
||||||
case META_CLASS:
|
case META_CLASS:
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
negate_class = meta == META_CLASS_NOT;
|
negate_class = meta == META_CLASS_NOT;
|
||||||
|
|
||||||
/* We can optimize the case of a single character in a class by generating
|
/* We can optimize the case of a single character in a class by generating
|
||||||
|
@ -5502,7 +5505,7 @@ for (;; pptr++)
|
||||||
goto GROUP_PROCESS_NOTE_EMPTY;
|
goto GROUP_PROCESS_NOTE_EMPTY;
|
||||||
|
|
||||||
/* The DEFINE condition is always false. It's internal groups may never
|
/* The DEFINE condition is always false. It's internal groups may never
|
||||||
be called, so matched_char must remain false, hence the jump to
|
be called, so matched_char must remain false, hence the jump to
|
||||||
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
|
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
|
||||||
|
|
||||||
case META_COND_DEFINE:
|
case META_COND_DEFINE:
|
||||||
|
@ -5603,12 +5606,12 @@ for (;; pptr++)
|
||||||
|
|
||||||
/* Process nested bracketed regex. The nesting depth is maintained for the
|
/* Process nested bracketed regex. The nesting depth is maintained for the
|
||||||
benefit of the stackguard function. The test for too deep nesting is now
|
benefit of the stackguard function. The test for too deep nesting is now
|
||||||
done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
|
done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
|
||||||
others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
|
others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
|
||||||
note of whether or not they may match an empty string. */
|
note of whether or not they may match an empty string. */
|
||||||
|
|
||||||
GROUP_PROCESS_NOTE_EMPTY:
|
GROUP_PROCESS_NOTE_EMPTY:
|
||||||
note_group_empty = TRUE;
|
note_group_empty = TRUE;
|
||||||
|
|
||||||
GROUP_PROCESS:
|
GROUP_PROCESS:
|
||||||
cb->parens_depth += 1;
|
cb->parens_depth += 1;
|
||||||
|
@ -5619,7 +5622,7 @@ for (;; pptr++)
|
||||||
templastcapture = cb->lastcapture; /* Save value before group */
|
templastcapture = cb->lastcapture; /* Save value before group */
|
||||||
length_prevgroup = 0; /* Initialize for pre-compile phase */
|
length_prevgroup = 0; /* Initialize for pre-compile phase */
|
||||||
|
|
||||||
if ((group_return =
|
if ((group_return =
|
||||||
compile_regex(
|
compile_regex(
|
||||||
options, /* The option state */
|
options, /* The option state */
|
||||||
&tempcode, /* Where to put code (updated) */
|
&tempcode, /* Where to put code (updated) */
|
||||||
|
@ -5636,14 +5639,14 @@ for (;; pptr++)
|
||||||
&length_prevgroup /* Pre-compile phase */
|
&length_prevgroup /* Pre-compile phase */
|
||||||
)) == 0)
|
)) == 0)
|
||||||
return 0; /* Error */
|
return 0; /* Error */
|
||||||
|
|
||||||
cb->parens_depth -= 1;
|
cb->parens_depth -= 1;
|
||||||
|
|
||||||
/* If that was a non-conditional significant group (not an assertion, not a
|
/* If that was a non-conditional significant group (not an assertion, not a
|
||||||
DEFINE) that matches at least one character, then the current item matches
|
DEFINE) that matches at least one character, then the current item matches
|
||||||
a character. Conditionals are handled below. */
|
a character. Conditionals are handled below. */
|
||||||
|
|
||||||
if (note_group_empty && bravalue != OP_COND && group_return > 0)
|
if (note_group_empty && bravalue != OP_COND && group_return > 0)
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
|
|
||||||
/* If that was an atomic group and there are no capturing groups within it,
|
/* If that was an atomic group and there are no capturing groups within it,
|
||||||
|
@ -5676,7 +5679,7 @@ for (;; pptr++)
|
||||||
tc += GET(tc,1);
|
tc += GET(tc,1);
|
||||||
}
|
}
|
||||||
while (*tc != OP_KET);
|
while (*tc != OP_KET);
|
||||||
|
|
||||||
/* A DEFINE group is never obeyed inline (the "condition" is always
|
/* A DEFINE group is never obeyed inline (the "condition" is always
|
||||||
false). It must have only one branch. Having checked this, change the
|
false). It must have only one branch. Having checked this, change the
|
||||||
opcode to OP_FALSE. */
|
opcode to OP_FALSE. */
|
||||||
|
@ -5695,7 +5698,7 @@ for (;; pptr++)
|
||||||
|
|
||||||
/* A "normal" conditional group. If there is just one branch, we must not
|
/* A "normal" conditional group. If there is just one branch, we must not
|
||||||
make use of its firstcu or reqcu, because this is equivalent to an
|
make use of its firstcu or reqcu, because this is equivalent to an
|
||||||
empty second branch. Also, it may match an empty string. If there are two
|
empty second branch. Also, it may match an empty string. If there are two
|
||||||
branches, this item must match a character if the group must. */
|
branches, this item must match a character if the group must. */
|
||||||
|
|
||||||
else
|
else
|
||||||
|
@ -6018,7 +6021,7 @@ for (;; pptr++)
|
||||||
repeat_max = 1;
|
repeat_max = 1;
|
||||||
|
|
||||||
REPEAT:
|
REPEAT:
|
||||||
if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
|
if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
|
||||||
|
|
||||||
/* Remember whether this is a variable length repeat, and default to
|
/* Remember whether this is a variable length repeat, and default to
|
||||||
single-char opcodes. */
|
single-char opcodes. */
|
||||||
|
@ -6087,7 +6090,7 @@ for (;; pptr++)
|
||||||
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
|
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
|
||||||
code += 2 + 2 * LINK_SIZE;
|
code += 2 + 2 * LINK_SIZE;
|
||||||
length_prevgroup = 3 + 3*LINK_SIZE;
|
length_prevgroup = 3 + 3*LINK_SIZE;
|
||||||
group_return = -1; /* Set "may match empty string" */
|
group_return = -1; /* Set "may match empty string" */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Now handle repetition for the different types of item. */
|
/* Now handle repetition for the different types of item. */
|
||||||
|
@ -6444,9 +6447,9 @@ for (;; pptr++)
|
||||||
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* In the compile phase, adjust the opcode if the group can match
|
/* In the compile phase, adjust the opcode if the group can match
|
||||||
an empty string. For a conditional group with only one branch, the
|
an empty string. For a conditional group with only one branch, the
|
||||||
value of group_return will not show "could be empty", so we must
|
value of group_return will not show "could be empty", so we must
|
||||||
check that separately. */
|
check that separately. */
|
||||||
|
|
||||||
if (lengthptr == NULL)
|
if (lengthptr == NULL)
|
||||||
|
@ -6877,10 +6880,10 @@ for (;; pptr++)
|
||||||
character if it hasn't already been set. */
|
character if it hasn't already been set. */
|
||||||
|
|
||||||
if (meta_arg > ESC_b && meta_arg < ESC_Z)
|
if (meta_arg > ESC_b && meta_arg < ESC_Z)
|
||||||
{
|
{
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set values to reset to if this is followed by a zero repeat. */
|
/* Set values to reset to if this is followed by a zero repeat. */
|
||||||
|
|
||||||
|
@ -6946,7 +6949,7 @@ for (;; pptr++)
|
||||||
NORMAL_CHAR:
|
NORMAL_CHAR:
|
||||||
meta = *pptr; /* Get the full 32 bits */
|
meta = *pptr; /* Get the full 32 bits */
|
||||||
NORMAL_CHAR_SET: /* Character is already in meta */
|
NORMAL_CHAR_SET: /* Character is already in meta */
|
||||||
matched_char = TRUE;
|
matched_char = TRUE;
|
||||||
|
|
||||||
/* For caseless UTF mode, check whether this character has more than one
|
/* For caseless UTF mode, check whether this character has more than one
|
||||||
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
|
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
|
||||||
|
@ -7071,7 +7074,7 @@ Arguments:
|
||||||
Returns: 0 There has been an error
|
Returns: 0 There has been an error
|
||||||
+1 Success, this group must match at least one character
|
+1 Success, this group must match at least one character
|
||||||
-1 Success, this group may match an empty string
|
-1 Success, this group may match an empty string
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static int
|
static int
|
||||||
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
|
||||||
|
@ -7156,7 +7159,7 @@ code += 1 + LINK_SIZE + skipunits;
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
int branch_return;
|
int branch_return;
|
||||||
|
|
||||||
/* Insert OP_REVERSE if this is as lookbehind assertion. */
|
/* Insert OP_REVERSE if this is as lookbehind assertion. */
|
||||||
|
|
||||||
if (lookbehind && lookbehindlength > 0)
|
if (lookbehind && lookbehindlength > 0)
|
||||||
|
@ -7169,14 +7172,14 @@ for (;;)
|
||||||
/* Now compile the branch; in the pre-compile phase its length gets added
|
/* Now compile the branch; in the pre-compile phase its length gets added
|
||||||
into the length. */
|
into the length. */
|
||||||
|
|
||||||
if ((branch_return =
|
if ((branch_return =
|
||||||
compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
|
compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
|
||||||
&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
|
&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
|
||||||
cb, (lengthptr == NULL)? NULL : &length)) == 0)
|
cb, (lengthptr == NULL)? NULL : &length)) == 0)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* If a branch can match an empty string, so can the whole group. */
|
/* If a branch can match an empty string, so can the whole group. */
|
||||||
|
|
||||||
if (branch_return < 0) okreturn = -1;
|
if (branch_return < 0) okreturn = -1;
|
||||||
|
|
||||||
/* In the real compile phase, there is some post-processing to be done. */
|
/* In the real compile phase, there is some post-processing to be done. */
|
||||||
|
@ -7308,7 +7311,7 @@ for (;;)
|
||||||
}
|
}
|
||||||
*lengthptr += length;
|
*lengthptr += length;
|
||||||
}
|
}
|
||||||
// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
|
// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
|
||||||
return okreturn;
|
return okreturn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9127,7 +9130,7 @@ of the function here. */
|
||||||
pptr = cb.parsed_pattern;
|
pptr = cb.parsed_pattern;
|
||||||
code = (PCRE2_UCHAR *)codestart;
|
code = (PCRE2_UCHAR *)codestart;
|
||||||
*code = OP_BRA;
|
*code = OP_BRA;
|
||||||
regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
|
regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
|
||||||
&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
|
&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
|
||||||
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
|
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
|
||||||
re->top_bracket = cb.bracount;
|
re->top_bracket = cb.bracount;
|
||||||
|
|
|
@ -4944,4 +4944,10 @@ a)"xI
|
||||||
|
|
||||||
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
|
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
|
||||||
|
|
||||||
|
# This one used to compile rubbish instead of a compile error, and then
|
||||||
|
# behave unpredictably at match time.
|
||||||
|
|
||||||
|
/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
|
||||||
|
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -15424,6 +15424,13 @@ Subject length lower bound = 0
|
||||||
|
|
||||||
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
|
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
|
||||||
|
|
||||||
|
# This one used to compile rubbish instead of a compile error, and then
|
||||||
|
# behave unpredictably at match time.
|
||||||
|
|
||||||
|
/.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X/
|
||||||
|
Failed: error 128 at offset 63: assertion expected after (?( or (?(?C)
|
||||||
|
.+(?(?C'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'))?!XXXX.=X
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
|
||||||
Error -62: bad serialized data
|
Error -62: bad serialized data
|
||||||
|
|
Loading…
Reference in New Issue