diff --git a/ChangeLog b/ChangeLog index fd56823..3bad63e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -46,6 +46,10 @@ for example, /(?(R))*+/, was incorrectly compiled. 12. The Unicode tables have been updated to Unicode 8.0.0 (thanks to Christian Persch). +13. An empty comment (?#) in a pattern was incorrectly processed and could +provoke a buffer overflow. This bug was discovered by Karl Skomski with the +LLVM fuzzer. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 6faf649..3cd501c 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2997,6 +2997,7 @@ int namelen; int i; BOOL inescq = FALSE; BOOL isdupname; +BOOL skiptoket = FALSE; BOOL utf = (options & PCRE2_UTF) != 0; BOOL negate_class; PCRE2_SPTR name; @@ -3009,6 +3010,16 @@ nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size); for (; ptr < cb->end_pattern; ptr++) { c = *ptr; + + /* Parenthesized groups set skiptoket when all following characters up to the + next closing parenthesis must be ignored. The parenthesis itself must be + processed (to end the nested parenthesized item). */ + + if (skiptoket) + { + if (c != CHAR_RIGHT_PARENTHESIS) continue; + skiptoket = FALSE; + } /* Skip over literals */ @@ -3177,9 +3188,14 @@ for (; ptr < cb->end_pattern; ptr++) { default: ptr += 2; - if (ptr[0] == CHAR_R || /* (?R) */ - IS_DIGIT(ptr[0]) || /* (?n) */ - (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1]))) break; /* (?-n) */ + if (ptr[0] == CHAR_R || /* (?R) */ + ptr[0] == CHAR_NUMBER_SIGN || /* (?#) */ + IS_DIGIT(ptr[0]) || /* (?n) */ + (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1]))) /* (?-n) */ + { + skiptoket = TRUE; + break; + } /* Handle (?| and (?imsxJU: which are the only other valid forms. Both need a new block on the nest stack. */ @@ -3304,16 +3320,6 @@ for (; ptr < cb->end_pattern; ptr++) while (ptr[0] != delimiter); break; - case CHAR_NUMBER_SIGN: - ptr += 3; - while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; - if (*ptr != CHAR_RIGHT_PARENTHESIS) - { - errorcode = ERR18; - goto FAILED; - } - break; - case CHAR_LEFT_PARENTHESIS: nest_depth++; /* Fall through */ diff --git a/testdata/testinput2 b/testdata/testinput2 index bcccaa2..09958a9 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4342,4 +4342,8 @@ a random value. /Ix /(?(R))*+/B abcd +/((?x)(?#))#(?'/ + +/((?x)(?#))#(?'abc')/I + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index f6762c9..ddb9ff0 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14514,4 +14514,14 @@ Failed: error 124 at offset 10: unrecognized character after (?< abcd 0: +/((?x)(?#))#(?'/ +Failed: error 124 at offset 14: unrecognized character after (?< + +/((?x)(?#))#(?'abc')/I +Capturing subpattern count = 2 +Named capturing subpatterns: + abc 2 +First code unit = '#' +Subject length lower bound = 1 + # End of testinput2