diff --git a/ChangeLog b/ChangeLog index 0169af1..f908155 100644 --- a/ChangeLog +++ b/ChangeLog @@ -24,11 +24,17 @@ released code, but are noted here for the record. a match, because the external block was being set from non-existent internal ovector fields. Fixes oss-fuzz issue 781. + (b) A pattern with very many capturing parentheses (when the internal frame + size was greater than the initial frame vector on the stack) caused a + crash. A vector on the heap is now set up at the start of matching if the + vector on the stack is not big enough to handle at least 10 frames. + Fixes oss-fuzz issue 783. + 2. Hardened pcre2test so as to reduce the number of bugs reported by fuzzers: (a) Check for malloc failures when getting memory for the ovector (POSIX) or the match data block (non-POSIX). - + 3. In the 32-bit library in non-UTF mode, an attempt to find a Unicode property for a character with a code point greater than 0x10ffff (the Unicode maximum) caused a crash. diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 734ee80..1f194d0 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -816,9 +816,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode); ovector[0] = Fstart_match - mb->start_subject; ovector[1] = Feptr - mb->start_subject; - + /* Set i to the smaller of the sizes of the external and frame ovectors. */ - + i = 2 * ((top_bracket + 1 > oveccount)? oveccount : top_bracket + 1); memcpy(ovector + 2, Fovector, (i - 2) * sizeof(PCRE2_SIZE)); while (--i >= Foffset_top + 2) ovector[i] = PCRE2_UNSET; @@ -5231,7 +5231,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* The variable Flength will be added to Fecode when the condition is false, to get to the second branch. Setting it to the offset to the ALT or KET, then incrementing Fecode achieves this effect. However, if the second - branch is non-existent, we must point to the KET so that the end of the + branch is non-existent, we must point to the KET so that the end of the group is correctly processed. We now have Fecode pointing to the condition or callout. */ @@ -5478,8 +5478,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* If we are at the end of an assertion that is a condition, return a match, discarding any intermediate backtracking points. Copy back the - captures into the frame before N so that they are set on return. Doing - this for all assertions, both positive and negative, seems to match what + captures into the frame before N so that they are set on return. Doing + this for all assertions, both positive and negative, seems to match what Perl does. */ if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) @@ -5545,7 +5545,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_SCBRA: case OP_SCBRAPOS: number = GET2(bracode, 1+LINK_SIZE); - + /* Handle a recursively called group. We reinstate the previous set of captures and then carry on. */ @@ -6197,45 +6197,6 @@ mb->name_count = re->name_count; mb->name_entry_size = re->name_entry_size; mb->start_code = mb->name_table + re->name_count * re->name_entry_size; -/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE -vector at the end, whose size depends on the number of capturing parentheses in -the pattern. It is not used at all if there are no capturing parentheses. - - frame_size is the total size of each frame - mb->frame_vector_size is the total usable size of the vector (rounded down - to a whole number of frames) - -The last of these may be changed if the frame vector has to be expanded. We -therefore put it into the match block so that it is correct when calling -match() more than once for non-anchored patterns. */ - -frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE)); -mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size); - -/* Set up the initial frame set. Write to the ovector within the first frame to -mark every capture unset and to avoid uninitialized memory read errors when it -is copied to a new frame. */ - -memset((char *)(mb->stack_frames) + offsetof(heapframe,ovector), 0xff, - re->top_bracket * 2 * sizeof(PCRE2_SIZE)); -mb->match_frames = mb->stack_frames; -mb->match_frames_top = - (heapframe *)((char *)mb->match_frames + mb->frame_vector_size); - -/* Limits set in the pattern override the match context only if they are -smaller. */ - -mb->match_limit = (mcontext->match_limit < re->limit_match)? - mcontext->match_limit : re->limit_match; -mb->match_limit_recursion = (mcontext->recursion_limit < re->limit_recursion)? - mcontext->recursion_limit : re->limit_recursion; - -/* Pointers to the individual character tables */ - -mb->lcc = re->tables + lcc_offset; -mb->fcc = re->tables + fcc_offset; -mb->ctypes = re->tables + ctypes_offset; - /* Process the \R and newline settings. */ mb->bsr_convention = re->bsr_convention; @@ -6269,6 +6230,60 @@ switch(re->newline_convention) default: return PCRE2_ERROR_INTERNAL; } +/* The backtracking frames have fixed data at the front, and a PCRE2_SIZE +vector at the end, whose size depends on the number of capturing parentheses in +the pattern. It is not used at all if there are no capturing parentheses. + + frame_size is the total size of each frame + mb->frame_vector_size is the total usable size of the vector (rounded down + to a whole number of frames) + +The last of these is changed within the match() function if the frame vector +has to be expanded. We therefore put it into the match block so that it is +correct when calling match() more than once for non-anchored patterns. */ + +frame_size = sizeof(heapframe) + ((re->top_bracket - 1) * 2 * sizeof(PCRE2_SIZE)); + +/* If a pattern has very many capturing parentheses, the frame size may be very +large. Ensure that there are at least 10 available frames by getting an initial +vector on the heap if necessary. */ + +if (frame_size <= START_FRAMES_SIZE/10) + { + mb->match_frames = mb->stack_frames; /* Initial frame vector on the stack */ + mb->frame_vector_size = ((START_FRAMES_SIZE/frame_size) * frame_size); + } +else + { + mb->frame_vector_size = frame_size * 10; + mb->match_frames = mb->memctl.malloc(mb->frame_vector_size, + mb->memctl.memory_data); + if (mb->match_frames == NULL) return PCRE2_ERROR_NOMEMORY; + } + +mb->match_frames_top = + (heapframe *)((char *)mb->match_frames + mb->frame_vector_size); + +/* Write to the ovector within the first frame to mark every capture unset and +to avoid uninitialized memory read errors when it is copied to a new frame. */ + +memset((char *)(mb->match_frames) + offsetof(heapframe, ovector), 0xff, + re->top_bracket * 2 * sizeof(PCRE2_SIZE)); + +/* Limits set in the pattern override the match context only if they are +smaller. */ + +mb->match_limit = (mcontext->match_limit < re->limit_match)? + mcontext->match_limit : re->limit_match; +mb->match_limit_recursion = (mcontext->recursion_limit < re->limit_recursion)? + mcontext->recursion_limit : re->limit_recursion; + +/* Pointers to the individual character tables */ + +mb->lcc = re->tables + lcc_offset; +mb->fcc = re->tables + fcc_offset; +mb->ctypes = re->tables + ctypes_offset; + /* Set up the first code unit to match, if available. The first_codeunit value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first code unit may diff --git a/testdata/testinput2 b/testdata/testinput2 index 014a504..3d4f3c7 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5009,4 +5009,10 @@ a)"xI '(?:a(*:aa))b|ac' mark ac +/(R?){65}/ + (R?){65} + +/\[(a)]{60}/expand + aaaa + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index a23dbb6..7292b1d 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -15559,6 +15559,15 @@ Subject length lower bound = 11 ac 0: ac +/(R?){65}/ + (R?){65} + 0: + 1: + +/\[(a)]{60}/expand + aaaa +No match + # End of testinput2 Error -63: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data