diff --git a/ChangeLog b/ChangeLog index b19ffc3..7c5b9c4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -349,6 +349,10 @@ was set when the pmatch argument was NULL. It now returns REG_INVARG. 104. Allow for up to 32-bit numbers in the ordin() function in pcre2grep. +105. An empty \Q\E sequence between an item and its quantifier caused +pcre2_compile() to misbehave when auto callouts were enabled. This bug +was found by the LLVM fuzzer. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index db674d4..1699835 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3947,8 +3947,16 @@ for (;; ptr++) last_code = code; } - /* If in \Q...\E, check for the end; if not, we have a literal. If not in - \Q...\E, an isolated \E is ignored. */ + /* Before doing anything else we must handle all the special items that do + nothing, and which may come between an item and its quantifier. Otherwise, + when auto-callouts are enabled, a callout gets incorrectly inserted before + the quantifier is recognized. After recognizing a "do nothing" item, restart + the loop in case another one follows. */ + + /* If c is not NULL we are not at the end of the pattern. If it is NULL, we + may still be in the pattern with a NULL data item. In these cases, if we are + in \Q...\E, check for the \E that ends the literal string; if not, we have a + literal character. If not in \Q...\E, an isolated \E is ignored. */ if (c != CHAR_NULL || ptr < cb->end_pattern) { @@ -3958,7 +3966,7 @@ for (;; ptr++) ptr++; continue; } - else if (inescq) + else if (inescq) /* Literal character */ { if (previous_callout != NULL) { @@ -3973,17 +3981,27 @@ for (;; ptr++) } goto NORMAL_CHAR; } + + /* Check for the start of a \Q...\E sequence. We must do this here rather + than later in case it is immediately followed by \E, which turns it into a + "do nothing" sequence. 
*/ + + if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q) + { + inescq = TRUE; + ptr++; + continue; + } } - /* In extended mode, skip white space and comments. We need a loop in order - to check for more white space and more comments after a comment. */ + /* In extended mode, skip white space and #-comments that end at newline. */ if ((options & PCRE2_EXTENDED) != 0) { - for (;;) + PCRE2_SPTR wscptr = ptr; + while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr); + if (c == CHAR_NUMBER_SIGN) { - while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr); - if (c != CHAR_NUMBER_SIGN) break; ptr++; while (*ptr != CHAR_NULL) { @@ -3997,13 +4015,19 @@ for (;; ptr++) if (utf) FORWARDCHAR(ptr); #endif } - c = *ptr; /* Either NULL or the char after a newline */ + } + + /* If we skipped any characters, restart the loop. Otherwise, we didn't see + a comment. */ + + if (ptr > wscptr) + { + ptr--; + continue; } } - /* Skip over (?# comments. We need to do this here because we want to know if - the next thing is a quantifier, and these comments may come between an item - and its quantifier. */ + /* Skip over (?# comments. */ if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_NUMBER_SIGN) @@ -4018,7 +4042,8 @@ for (;; ptr++) continue; } - /* See if the next thing is a quantifier. */ + /* End of processing "do nothing" items. See if the next thing is a + quantifier. */ is_quantifier = c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || @@ -7133,7 +7158,10 @@ for (;; ptr++) are negative the reference number. Only back references and those types that consume a character may be repeated. We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are - ever created. */ + ever created. + + Note: \Q and \E are handled at the start of the character-processing loop, + not here. 
*/ case CHAR_BACKSLASH: tempptr = ptr; @@ -7145,16 +7173,6 @@ for (;; ptr++) c = ec; else { - if (escape == ESC_Q) /* Handle start of quoted string */ - { - if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) - ptr += 2; /* avoid empty string */ - else inescq = TRUE; - continue; - } - - if (escape == ESC_E) continue; /* Perl ignores an orphan \E */ - /* For metasequences that actually match a character, we disable the setting of a first character if it hasn't already been set. */ diff --git a/testdata/testinput2 b/testdata/testinput2 index e7e0598..19240d2 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4699,4 +4699,7 @@ a)"xI /(A*)\E+/B,auto_callout +/()\Q\E*]/B,auto_callout + a[bc]d + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 0cc40b7..a3aced7 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14956,4 +14956,27 @@ Subject length lower bound = 0 End ------------------------------------------------------------------ +/()\Q\E*]/B,auto_callout +------------------------------------------------------------------ + Bra + Callout 255 0 7 + Brazero + SCBra 1 + Callout 255 1 0 + KetRmax + Callout 255 7 1 + ] + Callout 255 8 0 + Ket + End +------------------------------------------------------------------ + a[bc]d +--->a[bc]d + +0 ^ ()\Q\E* + +1 ^ ) + +7 ^ ] + +8 ^^ + 0: ] + 1: + # End of testinput2