From ea03932668961437d60b38914ad5ae3cb950ef88 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 28 Feb 2015 11:31:51 +0000 Subject: [PATCH] Fix "internal error" bug caused by patterns like "((?2){0,1999}())?". --- ChangeLog | 8 +++++ src/pcre2_compile.c | 72 ++++++++++++++++++++++++-------------------- testdata/testinput2 | 4 +++ testdata/testoutput2 | 4 +++ 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/ChangeLog b/ChangeLog index 89fc7a2..dac9710 100644 --- a/ChangeLog +++ b/ChangeLog @@ -97,6 +97,14 @@ only on Windows. 21. "make distclean" was not removing config.h, a file that is created for use with CMake. +22. A pattern such as "((?2){0,1999}())?", which has a group containing a +forward reference repeated a large (but limited) number of times within a +repeated outer group that has a zero minimum quantifier, caused incorrect code +to be compiled, leading to the error "internal error: previously-checked +referenced subpattern not found" when an incorrect memory address was read. +This bug was reported as "heap overflow", discovered by Kai Lu of Fortinet's +FortiGuard Labs. + Version 10.00 05-January-2015 ----------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index a79b2b8..1ebd415 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2586,18 +2586,18 @@ the current group is on this list, it adjusts the offset in the list, not the value in the reference (which is a group number). Arguments: - group points to the start of the group - adjust the amount by which the group is to be moved - utf TRUE in UTF mode - cb compile data - save_hwm the hwm forward reference pointer at the start of the group + group points to the start of the group + adjust the amount by which the group is to be moved + utf TRUE in UTF mode + cb compile data + save_hwm_offset the hwm forward reference offset at the start of the group Returns: nothing */ static void adjust_recurse(PCRE2_UCHAR *group, int adjust, BOOL utf, compile_block *cb, - PCRE2_UCHAR *save_hwm) + size_t save_hwm_offset) { PCRE2_UCHAR *ptr = group; @@ -2609,7 +2609,8 @@ while ((ptr = (PCRE2_UCHAR *)find_recurse(ptr, utf)) != NULL) /* See if this recursion is on the forward reference list. If so, adjust the reference. */ - for (hc = save_hwm; hc < cb->hwm; hc += LINK_SIZE) + for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm; + hc += LINK_SIZE) { offset = (int)GET(hc, 0); if (cb->start_code + offset == ptr + 1) @@ -3093,7 +3094,7 @@ PCRE2_SPTR tempptr; PCRE2_SPTR nestptr = NULL; PCRE2_UCHAR *previous = NULL; PCRE2_UCHAR *previous_callout = NULL; -PCRE2_UCHAR *save_hwm = NULL; +size_t save_hwm_offset = 0; uint8_t classbits[32]; /* We can fish out the UTF setting once and for all into a BOOL, but we must @@ -4565,7 +4566,7 @@ for (;; ptr++) if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; - adjust_recurse(previous, 1, utf, cb, save_hwm); + adjust_recurse(previous, 1, utf, cb, save_hwm_offset); memmove(previous + 1, previous, CU2BYTES(len)); code++; if (repeat_max == 0) @@ -4589,7 +4590,7 @@ for (;; ptr++) { int offset; *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, save_hwm); + adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, save_hwm_offset); memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; @@ -4652,26 +4653,25 @@ for (;; ptr++) for (i = 1; i < repeat_min; i++) { PCRE2_UCHAR *hc; - PCRE2_UCHAR *this_hwm = cb->hwm; + size_t this_hwm_offset = cb->hwm - cb->start_workspace; memcpy(code, previous, CU2BYTES(len)); while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - save_hwm_offset)) { - size_t save_offset = save_hwm - cb->start_workspace; - size_t this_offset = this_hwm - cb->start_workspace; *errorcodeptr = expand_workspace(cb); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (PCRE2_UCHAR *)cb->start_workspace + save_offset; - this_hwm = (PCRE2_UCHAR *)cb->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; + hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cb->hwm, 0, GET(hc, 0) + len); cb->hwm += LINK_SIZE; } - save_hwm = this_hwm; + save_hwm_offset = this_hwm_offset; code += len; } } @@ -4716,7 +4716,7 @@ for (;; ptr++) else for (i = repeat_max - 1; i >= 0; i--) { PCRE2_UCHAR *hc; - PCRE2_UCHAR *this_hwm = cb->hwm; + size_t this_hwm_offset = cb->hwm - cb->start_workspace; *code++ = OP_BRAZERO + repeat_type; @@ -4738,22 +4738,21 @@ for (;; ptr++) copying them. */ while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm)) + WORK_SIZE_SAFETY_MARGIN - + (this_hwm_offset - save_hwm_offset)) { - size_t save_offset = save_hwm - cb->start_workspace; - size_t this_offset = this_hwm - cb->start_workspace; *errorcodeptr = expand_workspace(cb); if (*errorcodeptr != 0) goto FAILED; - save_hwm = (PCRE2_UCHAR *)cb->start_workspace + save_offset; - this_hwm = (PCRE2_UCHAR *)cb->start_workspace + this_offset; } - for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) + for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; + hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; + hc += LINK_SIZE) { PUT(cb->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); cb->hwm += LINK_SIZE; } - save_hwm = this_hwm; + save_hwm_offset = this_hwm_offset; code += len; } @@ -4849,7 +4848,7 @@ for (;; ptr++) { int nlen = (int)(code - bracode); *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, save_hwm); + adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, save_hwm_offset); memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; @@ -4984,7 +4983,7 @@ for (;; ptr++) else { *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, save_hwm); + adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, save_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -5009,13 +5008,17 @@ for (;; ptr++) /* ===================================================================*/ /* Start of nested parenthesized sub-expression, or comment or lookahead or lookbehind or option setting or condition or all the other extended - parenthesis forms. */ + parenthesis forms. We must save the current high-water-mark for the + forward reference list so that we know where they start for this group. + However, because the list may be extended when there are very many forward + references (usually the result of a replicated inner group), we must use + an offset rather than an absolute address. */ case CHAR_LEFT_PARENTHESIS: newoptions = options; skipbytes = 0; bravalue = OP_CBRA; - save_hwm = cb->hwm; + save_hwm_offset = cb->hwm - cb->start_workspace; reset_bracount = FALSE; /* First deal with various "verbs" that can be introduced by '*'. */ @@ -5972,7 +5975,8 @@ for (;; ptr++) /* Fudge the value of "called" so that when it is inserted as an offset below, what it actually inserted is the reference number - of the group. Then remember the forward reference. */ + of the group. Then remember the forward reference, expanding the + working space where the list is kept if necessary. */ called = cb->start_code + recno; if (cb->hwm >= cb->start_workspace + cb->workspace_size - @@ -6395,7 +6399,9 @@ for (;; ptr++) PCRE2_SPTR p; uint32_t cf; - save_hwm = cb->hwm; /* Normally this is set when '(' is read */ + /* Normally save_hwm_offset is set when '(' is read */ + + save_hwm_offset = cb->hwm - cb->start_workspace; terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; @@ -6933,7 +6939,7 @@ for (;;) { *code = OP_END; adjust_recurse(start_bracket, 1 + LINK_SIZE, - (options & PCRE2_UTF) != 0, cb, cb->hwm); + (options & PCRE2_UTF) != 0, cb, cb->hwm - cb->start_workspace); memmove(start_bracket + 1 + LINK_SIZE, start_bracket, CU2BYTES(code - start_bracket)); *start_bracket = OP_ONCE; diff --git a/testdata/testinput2 b/testdata/testinput2 index 497e03b..58b47d0 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4171,5 +4171,9 @@ a random value. /Ix '^(?:a)*+(\w)' g g\=ovector=1 + +# This pattern showed up a compile-time bug + +"((?2){0,1999}())?" # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 071caf4..e695c1a 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -13949,5 +13949,9 @@ Matched, but too many substrings g\=ovector=1 Matched, but too many substrings 0: g + +# This pattern showed up a compile-time bug + +"((?2){0,1999}())?" # End of testinput2