Fix empty \Q\E between an item and a quantifier in auto-callout mode.

This commit is contained in:
Philip.Hazel 2015-11-30 17:31:16 +00:00
parent 1b38451847
commit 12fc152074
4 changed files with 72 additions and 24 deletions

View File

@ -349,6 +349,10 @@ was set when the pmatch argument was NULL. It now returns REG_INVARG.
104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep. 104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep.
105. An empty \Q\E sequence between an item and its quantifier caused
pcre2_compile() to misbehave when auto callouts were enabled. This bug
was found by the LLVM fuzzer.
Version 10.20 30-June-2015 Version 10.20 30-June-2015
-------------------------- --------------------------

View File

@ -3947,8 +3947,16 @@ for (;; ptr++)
last_code = code; last_code = code;
} }
/* If in \Q...\E, check for the end; if not, we have a literal. If not in /* Before doing anything else we must handle all the special items that do
\Q...\E, an isolated \E is ignored. */ nothing, and which may come between an item and its quantifier. Otherwise,
when auto-callouts are enabled, a callout gets incorrectly inserted before
the quantifier is recognized. After recognizing a "do nothing" item, restart
the loop in case another one follows. */
/* If c is not NULL we are not at the end of the pattern. If it is NULL, we
may still be in the pattern with a NULL data item. In these cases, if we are
in \Q...\E, check for the \E that ends the literal string; if not, we have a
literal character. If not in \Q...\E, an isolated \E is ignored. */
if (c != CHAR_NULL || ptr < cb->end_pattern) if (c != CHAR_NULL || ptr < cb->end_pattern)
{ {
@ -3958,7 +3966,7 @@ for (;; ptr++)
ptr++; ptr++;
continue; continue;
} }
else if (inescq) else if (inescq) /* Literal character */
{ {
if (previous_callout != NULL) if (previous_callout != NULL)
{ {
@ -3973,17 +3981,27 @@ for (;; ptr++)
} }
goto NORMAL_CHAR; goto NORMAL_CHAR;
} }
/* Check for the start of a \Q...\E sequence. We must do this here rather
than later in case it is immediately followed by \E, which turns it into a
"do nothing" sequence. */
if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
{
inescq = TRUE;
ptr++;
continue;
}
} }
/* In extended mode, skip white space and comments. We need a loop in order /* In extended mode, skip white space and #-comments that end at newline. */
to check for more white space and more comments after a comment. */
if ((options & PCRE2_EXTENDED) != 0) if ((options & PCRE2_EXTENDED) != 0)
{ {
for (;;) PCRE2_SPTR wscptr = ptr;
while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
if (c == CHAR_NUMBER_SIGN)
{ {
while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
if (c != CHAR_NUMBER_SIGN) break;
ptr++; ptr++;
while (*ptr != CHAR_NULL) while (*ptr != CHAR_NULL)
{ {
@ -3997,13 +4015,19 @@ for (;; ptr++)
if (utf) FORWARDCHAR(ptr); if (utf) FORWARDCHAR(ptr);
#endif #endif
} }
c = *ptr; /* Either NULL or the char after a newline */ }
/* If we skipped any characters, restart the loop. Otherwise, we didn't see
a comment. */
if (ptr > wscptr)
{
ptr--;
continue;
} }
} }
/* Skip over (?# comments. We need to do this here because we want to know if /* Skip over (?# comments. */
the next thing is a quantifier, and these comments may come between an item
and its quantifier. */
if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK && if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
ptr[2] == CHAR_NUMBER_SIGN) ptr[2] == CHAR_NUMBER_SIGN)
@ -4018,7 +4042,8 @@ for (;; ptr++)
continue; continue;
} }
/* See if the next thing is a quantifier. */ /* End of processing "do nothing" items. See if the next thing is a
quantifier. */
is_quantifier = is_quantifier =
c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
@ -7133,7 +7158,10 @@ for (;; ptr++)
are negative the reference number. Only back references and those types are negative the reference number. Only back references and those types
that consume a character may be repeated. We can test for values between that consume a character may be repeated. We can test for values between
ESC_b and ESC_Z for the latter; this may have to change if any new ones are ESC_b and ESC_Z for the latter; this may have to change if any new ones are
ever created. */ ever created.
Note: \Q and \E are handled at the start of the character-processing loop,
not here. */
case CHAR_BACKSLASH: case CHAR_BACKSLASH:
tempptr = ptr; tempptr = ptr;
@ -7145,16 +7173,6 @@ for (;; ptr++)
c = ec; c = ec;
else else
{ {
if (escape == ESC_Q) /* Handle start of quoted string */
{
if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
ptr += 2; /* avoid empty string */
else inescq = TRUE;
continue;
}
if (escape == ESC_E) continue; /* Perl ignores an orphan \E */
/* For metasequences that actually match a character, we disable the /* For metasequences that actually match a character, we disable the
setting of a first character if it hasn't already been set. */ setting of a first character if it hasn't already been set. */

3
testdata/testinput2 vendored
View File

@ -4699,4 +4699,7 @@ a)"xI
/(A*)\E+/B,auto_callout /(A*)\E+/B,auto_callout
/()\Q\E*]/B,auto_callout
a[bc]d
# End of testinput2 # End of testinput2

23
testdata/testoutput2 vendored
View File

@ -14956,4 +14956,27 @@ Subject length lower bound = 0
End End
------------------------------------------------------------------ ------------------------------------------------------------------
/()\Q\E*]/B,auto_callout
------------------------------------------------------------------
Bra
Callout 255 0 7
Brazero
SCBra 1
Callout 255 1 0
KetRmax
Callout 255 7 1
]
Callout 255 8 0
Ket
End
------------------------------------------------------------------
a[bc]d
--->a[bc]d
+0 ^ ()\Q\E*
+1 ^ )
+7 ^ ]
+8 ^^
0: ]
1:
# End of testinput2 # End of testinput2