From e3d62b0ff8c5c8ed6ddf8fcca634a0e37710ba62 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sun, 9 Aug 2015 16:29:35 +0000 Subject: [PATCH] Re-write recursion handling to fix another compiler bug and make it all less error-prone. --- ChangeLog | 8 + src/pcre2_compile.c | 533 ++++++++++------------------------------ src/pcre2_error.c | 2 +- src/pcre2_intmodedep.h | 9 +- testdata/testinput14 | 43 ++++ testdata/testinput16 | 43 ++++ testdata/testinput2 | 22 +- testdata/testinput5 | 4 +- testdata/testinput8 | 11 +- testdata/testoutput14 | 92 +++++++ testdata/testoutput16 | 98 ++++++++ testdata/testoutput2 | 39 +-- testdata/testoutput5 | 5 +- testdata/testoutput8-16 | 158 +++++++++++- testdata/testoutput8-32 | 158 +++++++++++- testdata/testoutput8-8 | 158 +++++++++++- 16 files changed, 912 insertions(+), 471 deletions(-) diff --git a/ChangeLog b/ChangeLog index 06f8259..b6dbe9c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -111,6 +111,14 @@ binary file, it incorrectly wrote output to stdout. 29. The JIT compiler did not restore the control verb head in case of *THEN control verbs. This issue was found by Karl Skomski with a custom LLVM fuzzer. +30. The way recursive references such as (?3) are compiled has been re-written +because the old way was the cause of many issues. Now, conversion of the group +number into a pattern offset does not happen until the pattern has been +completely compiled. This does mean that detection of all infinitely looping +recursions is postponed till match time. In the past, some easy ones were +detected at compile time. This re-writing was done in response to yet another +bug found by the LLVM fuzzer. + Version 10.20 30-June-2015 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 7076633..abd33d7 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -101,21 +101,9 @@ pre-compile phase when determining how much memory is required. The regex is partly compiled into this space, but the compiled parts are discarded as soon as they can be, so that hopefully there will never be an overrun. The code does, however, check for an overrun. The largest amount I've seen used is 218, -so this number is very generous. - -The same workspace is used during the second, actual compile phase for -remembering forward references to groups so that they can be filled in at the -end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE -is 4 there is plenty of room for most patterns. However, the memory can get -filled up by repetitions of forward references, for example patterns like -/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so -that the workspace is expanded in this situation. The value below is therefore -a minimum, and we put a maximum on it for safety. The minimum is now also -defined in terms of LINK_SIZE so that the size increase kicks in at the same -number of forward references in all cases. */ +so this number is very generous. */ #define COMPILE_WORK_SIZE (2048*LINK_SIZE) -#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) /* The overrun tests check for a slightly smaller size so that they detect the overrun before it actually does run off the end of the data block. */ @@ -1215,18 +1203,23 @@ for (;;) *************************************************/ /* This function scans through a branch of a compiled pattern to see whether it -can match the empty string. It is called from could_be_empty() below and from -compile_branch() when checking for an unlimited repeat of a group that can -match nothing. Note that first_significant_code() skips over backward and -negative forward assertions when its final argument is TRUE. If we hit an -unclosed bracket, we return "empty" - this means we've struck an inner bracket -whose current branch will already have been scanned. +can match the empty string. It is called at the end of compiling to check the +entire pattern, and from compile_branch() when checking for an unlimited repeat +of a group that can match nothing. In the latter case it is called only when +doing the real compile, not during the pre-compile that measures the size of +the compiled pattern. + +Note that first_significant_code() skips over backward and negative forward +assertions when its final argument is TRUE. If we hit an unclosed bracket, we +return "empty" - this means we've struck an inner bracket whose current branch +will already have been scanned. Arguments: code points to start of search endcode points to where to stop utf TRUE if in UTF mode cb compile data + atend TRUE if being called to check an entire pattern recurses chain of recurse_check to catch mutual recursion Returns: TRUE if what is matched could be empty @@ -1234,7 +1227,7 @@ Returns: TRUE if what is matched could be empty static BOOL could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf, - compile_block *cb, recurse_check *recurses) + compile_block *cb, BOOL atend, recurse_check *recurses) { register PCRE2_UCHAR c; recurse_check this_recurse; @@ -1257,35 +1250,27 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); continue; } - /* For a recursion/subroutine call, if its end has been reached, which - implies a backward reference subroutine call, we can scan it. If it's a - forward reference subroutine call, we can't. To detect forward reference - we have to scan up the list that is kept in the workspace. This function is - called only when doing the real compile, not during the pre-compile that - measures the size of the compiled pattern. */ + /* For a recursion/subroutine call we can scan the recursion when this + function is called at the end, to check a complete pattern. Before then, + recursions just have the group number as their argument and in any case may + be forward references. In that situation, we return TRUE, just in case. It + means that unlimited repeats of groups that contain recursions are always + treated as "could be empty" - which just adds a bit more processing time + because of the runtime check. */ if (c == OP_RECURSE) { - PCRE2_SPTR scode = cb->start_code + GET(code, 1); - PCRE2_SPTR endgroup = scode; + PCRE2_SPTR scode, endgroup; BOOL empty_branch; - /* Test for forward reference or uncompleted reference. This is disabled - when called to scan a completed pattern by setting cb->start_workspace to - NULL. */ + if (!atend) return TRUE; + scode = cb->start_code + GET(code, 1); + endgroup = scode; - if (cb->start_workspace != NULL) - { - PCRE2_SPTR tcode; - for (tcode = cb->start_workspace; tcode < cb->hwm; tcode += LINK_SIZE) - if ((int)GET(tcode, 0) == (int)(code + 1 - cb->start_code)) return TRUE; - if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ - } - - /* If the reference is to a completed group, we need to detect whether this - is a recursive call, as otherwise there will be an infinite loop. If it is - a recursion, just skip over it. Simple recursions are easily detected. For - mutual recursions we keep a chain on the stack. */ + /* We need to detect whether this is a recursive call, as otherwise there + will be an infinite loop. If it is a recursion, just skip over it. Simple + recursions are easily detected. For mutual recursions we keep a chain on + the stack. */ do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); if (code >= scode && code <= endgroup) continue; /* Simple recursion */ @@ -1297,8 +1282,8 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); if (r != NULL) continue; /* Mutual recursion */ } - /* Completed reference; scan the referenced group, remembering it on the - stack chain to detect mutual recursions. */ + /* Scan the referenced group, remembering it on the stack chain to detect + mutual recursions. */ empty_branch = FALSE; this_recurse.prev = recurses; @@ -1306,7 +1291,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); do { - if (could_be_empty_branch(scode, endcode, utf, cb, &this_recurse)) + if (could_be_empty_branch(scode, endcode, utf, cb, atend, &this_recurse)) { empty_branch = TRUE; break; @@ -1363,7 +1348,7 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); do { if (!empty_branch && could_be_empty_branch(code, endcode, utf, cb, - recurses)) empty_branch = TRUE; + atend, recurses)) empty_branch = TRUE; code += GET(code, 1); } while (*code == OP_ALT); @@ -1584,77 +1569,6 @@ return TRUE; -/************************************************* -* Scan compiled regex for non-emptiness * -*************************************************/ - -/* This function is called to check for left recursive calls. We want to check -the current branch of the current pattern to see if it could match the empty -string. If it could, we must look outwards for branches at other levels, -stopping when we pass beyond the bracket which is the subject of the recursion. -This function is called only during the real compile, not during the -pre-compile. - -Arguments: - code points to start of the recursion - endcode points to where to stop (current RECURSE item) - bcptr points to the chain of current (unclosed) branch starts - utf TRUE if in UTF mode - cb compile data - -Returns: TRUE if what is matched could be empty -*/ - -static BOOL -could_be_empty(PCRE2_SPTR code, PCRE2_SPTR endcode, branch_chain *bcptr, - BOOL utf, compile_block *cb) -{ -while (bcptr != NULL && bcptr->current_branch >= code) - { - if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cb, NULL)) - return FALSE; - bcptr = bcptr->outer; - } -return TRUE; -} - - - -/************************************************* -* Expand the workspace * -*************************************************/ - -/* This function is called during the second compiling phase, if the number of -forward references fills the existing workspace, which is originally a block on -the stack. A larger block is obtained from the heap unless the ultimate limit -has been reached or the increase will be rather small. - -Argument: pointer to the compile data block -Returns: 0 if all went well, else an error number -*/ - -static int -expand_workspace(compile_block *cb) -{ -PCRE2_UCHAR *newspace; -int newsize = cb->workspace_size * 2; -if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX; -if (cb->workspace_size >= COMPILE_WORK_SIZE_MAX || - newsize - cb->workspace_size < WORK_SIZE_SAFETY_MARGIN) - return ERR72; -newspace = cb->cx->memctl.malloc(CU2BYTES(newsize), cb->cx->memctl.memory_data); -if (newspace == NULL) return ERR21; -memcpy(newspace, cb->start_workspace, cb->workspace_size * sizeof(PCRE2_UCHAR)); -cb->hwm = (PCRE2_UCHAR *)newspace + (cb->hwm - cb->start_workspace); -if (cb->workspace_size > COMPILE_WORK_SIZE) - cb->cx->memctl.free((void *)cb->start_workspace, cb->cx->memctl.memory_data); -cb->start_workspace = newspace; -cb->workspace_size = newsize; -return 0; -} - - - /************************************************* * Check for counted repeat * *************************************************/ @@ -2480,82 +2394,6 @@ for (;;) -/************************************************* -* Adjust OP_RECURSE items in repeated group * -*************************************************/ - -/* OP_RECURSE items contain an offset from the start of the regex to the group -that is referenced. This means that groups can be replicated for fixed -repetition simply by copying (because the recursion is allowed to refer to -earlier groups that are outside the current group). However, when a group is -optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is -inserted before it, after it has been compiled. This means that any OP_RECURSE -items within it that refer to the group itself or any contained groups have to -have their offsets adjusted. That is one of the jobs of this function. Before -it is called, the partially compiled regex must be temporarily terminated with -OP_END. - -This function has been extended to cope with forward references for recursions -and subroutine calls. It must check the list of such references for the -group we are dealing with. If it finds that one of the recursions in the -current group is on this list, it does not adjust the value in the reference -(which is a group number). After the group has been scanned, all the offsets in -the forward reference list for the group are adjusted. - -Arguments: - group points to the start of the group - adjust the amount by which the group is to be moved - utf TRUE in UTF mode - cb compile data - save_hwm_offset the hwm forward reference offset at the start of the group - -Returns: nothing -*/ - -static void -adjust_recurse(PCRE2_UCHAR *group, int adjust, BOOL utf, compile_block *cb, - size_t save_hwm_offset) -{ -uint32_t offset; -PCRE2_UCHAR *hc; -PCRE2_UCHAR *ptr = group; - -/* Scan the group for recursions. For each one found, check the forward -reference list. */ - -while ((ptr = (PCRE2_UCHAR *)find_recurse(ptr, utf)) != NULL) - { - for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm; - hc += LINK_SIZE) - { - offset = (int)GET(hc, 0); - if (cb->start_code + offset == ptr + 1) break; - } - - /* If we have not found this recursion on the forward reference list, adjust - the recursion's offset if it's after the start of this group. */ - - if (hc >= cb->hwm) - { - offset = (int)GET(ptr, 1); - if (cb->start_code + offset >= group) PUT(ptr, 1, offset + adjust); - } - - ptr += 1 + LINK_SIZE; - } - -/* Now adjust all forward reference offsets for the group. */ - -for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm; - hc += LINK_SIZE) - { - offset = (int)GET(hc, 0); - PUT(hc, 0, offset + adjust); - } -} - - - /************************************************* * Check for POSIX class syntax * *************************************************/ @@ -3215,7 +3053,7 @@ for (; ptr < cb->end_pattern; ptr++) top_nest->reset_group = cb->bracount; top_nest->max_group = cb->bracount; top_nest->flags |= NSF_RESET; - cb->external_flags |= PCRE2_DUPCAPUSED; + cb->external_flags |= PCRE2_DUPCAPUSED; break; } @@ -3598,7 +3436,6 @@ int32_t req_caseopt, reqvary, tempreqvary; int after_manual_callout = 0; int escape; size_t length_prevgroup = 0; -size_t item_hwm_offset = 0; register uint32_t c; register PCRE2_UCHAR *code = *codeptr; PCRE2_UCHAR *last_code = code; @@ -3755,16 +3592,6 @@ for (;; ptr++) last_code = code; } - /* In the real compile phase, just check the workspace used by the forward - reference list. */ - - else if (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) - { - *errorcodeptr = ERR52; - goto FAILED; - } - /* If in \Q...\E, check for the end; if not, we have a literal */ if (inescq && (c != CHAR_NULL || ptr < cb->end_pattern)) @@ -3906,7 +3733,6 @@ for (;; ptr++) zeroreqcu = reqcu; zeroreqcuflags = reqcuflags; previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; @@ -3955,7 +3781,6 @@ for (;; ptr++) /* Handle a real character class. */ previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ @@ -4830,16 +4655,6 @@ for (;; ptr++) PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); code += 2 + 2 * LINK_SIZE; length_prevgroup = 3 + 3*LINK_SIZE; - - /* When actually compiling, we need to check whether this was a forward - reference, and if so, adjust the offset. */ - - if (lengthptr == NULL && cb->hwm >= cb->start_workspace + LINK_SIZE) - { - int offset = GET(cb->hwm, -LINK_SIZE); - if (offset == previous + 1 - cb->start_code) - PUT(cb->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE); - } } /* Now handle repetition for the different types of item. */ @@ -5082,7 +4897,6 @@ for (;; ptr++) { register int i; int len = (int)(code - previous); - size_t base_hwm_offset = item_hwm_offset; PCRE2_UCHAR *bralink = NULL; PCRE2_UCHAR *brazeroptr = NULL; @@ -5130,16 +4944,10 @@ for (;; ptr++) selectively. If the maximum is 1 or unlimited, we just have to stick in the BRAZERO - and do no more at this point. However, we do need to adjust any - OP_RECURSE calls inside the group that refer to the group itself or any - internal or forward referenced group, because the offset is from the - start of the whole regex. Temporarily terminate the pattern while doing - this. */ + and do no more at this point. */ if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { - *code = OP_END; - adjust_recurse(previous, 1, utf, cb, item_hwm_offset); memmove(previous + 1, previous, CU2BYTES(len)); code++; if (repeat_max == 0) @@ -5156,14 +4964,11 @@ for (;; ptr++) The first one has to be handled carefully because it's the original copy, which has to be moved up. The remainder can be handled by code that is common with the non-zero minimum case below. We have to - adjust the value or repeat_max, since one less copy is required. Once - again, we may have to adjust any OP_RECURSE calls inside the group. */ + adjust the value or repeat_max, since one less copy is required. */ else { int offset; - *code = OP_END; - adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, item_hwm_offset); memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len)); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; @@ -5211,9 +5016,7 @@ for (;; ptr++) } /* This is compiling for real. If there is a set first byte for - the group, and we have not yet set a "required byte", set it. Make - sure there is enough workspace for copying forward references before - doing the copy. */ + the group, and we have not yet set a "required byte", set it. */ else { @@ -5222,29 +5025,9 @@ for (;; ptr++) reqcu = firstcu; reqcuflags = firstcuflags; } - for (i = 1; i < repeat_min; i++) { - PCRE2_UCHAR *hc; - size_t this_hwm_offset = cb->hwm - cb->start_workspace; memcpy(code, previous, CU2BYTES(len)); - - while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - - (this_hwm_offset - base_hwm_offset)) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - - for (hc = (PCRE2_UCHAR *)cb->start_workspace + base_hwm_offset; - hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; - hc += LINK_SIZE) - { - PUT(cb->hwm, 0, GET(hc, 0) + len); - cb->hwm += LINK_SIZE; - } - base_hwm_offset = this_hwm_offset; code += len; } } @@ -5288,9 +5071,6 @@ for (;; ptr++) else for (i = repeat_max - 1; i >= 0; i--) { - PCRE2_UCHAR *hc; - size_t this_hwm_offset = cb->hwm - cb->start_workspace; - *code++ = OP_BRAZERO + repeat_type; /* All but the final copy start a new nesting, maintaining the @@ -5306,26 +5086,6 @@ for (;; ptr++) } memcpy(code, previous, CU2BYTES(len)); - - /* Ensure there is enough workspace for forward references before - copying them. */ - - while (cb->hwm > cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN - - (this_hwm_offset - base_hwm_offset)) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - - for (hc = (PCRE2_UCHAR *)cb->start_workspace + base_hwm_offset; - hc < (PCRE2_UCHAR *)cb->start_workspace + this_hwm_offset; - hc += LINK_SIZE) - { - PUT(cb->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); - cb->hwm += LINK_SIZE; - } - base_hwm_offset = this_hwm_offset; code += len; } @@ -5391,14 +5151,15 @@ for (;; ptr++) else { - /* In the compile phase, check for empty string matching. */ + /* In the compile phase, check whether the group could match an empty + string. */ if (lengthptr == NULL) { PCRE2_UCHAR *scode = bracode; do { - if (could_be_empty_branch(scode, ketcode, utf, cb, NULL)) + if (could_be_empty_branch(scode, ketcode, utf, cb, FALSE, NULL)) { *bracode += OP_SBRA - OP_BRA; break; @@ -5420,14 +5181,11 @@ for (;; ptr++) { /* For COND brackets, we wrap the whole thing in a possessively repeated non-capturing bracket, because we have not invented POS - versions of the COND opcodes. Because we are moving code along, we - must ensure that any pending recursive references are updated. */ + versions of the COND opcodes. */ if (*bracode == OP_COND || *bracode == OP_SCOND) { int nlen = (int)(code - bracode); - *code = OP_END; - adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, item_hwm_offset); memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen)); code += 1 + LINK_SIZE; nlen += 1 + LINK_SIZE; @@ -5556,13 +5314,10 @@ for (;; ptr++) *tempcode = opcode_possessify[repcode]; /* For opcode without a special possessified version, wrap the item in - ONCE brackets. Because we are moving code along, we must ensure that - any pending recursive references are updated. */ + ONCE brackets. */ else { - *code = OP_END; - adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, item_hwm_offset); memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len)); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; @@ -5958,8 +5713,8 @@ for (;; ptr++) { while (IS_DIGIT(*ptr)) ptr++; *errorcodeptr = ERR61; - goto FAILED; - } + goto FAILED; + } recno = recno * 10 + (int)(*ptr - CHAR_0); ptr++; } @@ -6101,8 +5856,8 @@ for (;; ptr++) if (recno > INT_MAX / 10 - 1) /* Integer overflow */ { *errorcodeptr = ERR61; - goto FAILED; - } + goto FAILED; + } recno = recno * 10 + name[i] - CHAR_0; } if (recno == 0) recno = RREF_ANY; @@ -6479,7 +6234,6 @@ for (;; ptr++) if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF; PUT2INC(code, 0, index); PUT2INC(code, 0, count); @@ -6502,7 +6256,6 @@ for (;; ptr++) case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: { - PCRE2_SPTR called; terminator = CHAR_RIGHT_PARENTHESIS; /* Come here from the \g<...> and \g'...' code (Oniguruma @@ -6570,77 +6323,30 @@ for (;; ptr++) } recno += cb->bracount; } + + if ((uint32_t)recno > cb->final_bracount) + { + *errorcodeptr = ERR15; + goto FAILED; + } - /* Come here from code above that handles a named recursion */ + /* Come here from code above that handles a named recursion. + We insert the number of the called group after OP_RECURSE. At the + end of compiling the pattern is scanned and these numbers are + replaced by offsets within the pattern. It is done like this to avoid + problems with forward references and adjusting offsets when groups + are duplicated and moved (as discovered in previous implementations). + Note that a recursion does not have a set first character (relevant + if it is repeated, because it will then be wrapped with ONCE + brackets). */ HANDLE_RECURSION: previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; - called = cb->start_code; - - /* When we are actually compiling, find the bracket that is being - referenced. Temporarily end the regex in case it doesn't exist before - this point. If we end up with a forward reference, first check that - the bracket does occur later so we can give the error (and position) - now. Then remember this forward reference in the workspace so it can - be filled in at the end. */ - - if (lengthptr == NULL) - { - *code = OP_END; - if (recno != 0) - called = PRIV(find_bracket)(cb->start_code, utf, recno); - - /* Forward reference */ - - if (called == NULL) - { - if ((uint32_t)recno > cb->final_bracount) - { - *errorcodeptr = ERR15; - goto FAILED; - } - - /* Fudge the value of "called" so that when it is inserted as an - offset below, what it actually inserted is the reference number - of the group. Then remember the forward reference, expanding the - working space where the list is kept if necessary. */ - - called = cb->start_code + recno; - if (cb->hwm >= cb->start_workspace + cb->workspace_size - - WORK_SIZE_SAFETY_MARGIN) - { - *errorcodeptr = expand_workspace(cb); - if (*errorcodeptr != 0) goto FAILED; - } - PUTINC(cb->hwm, 0, (int)(code + 1 - cb->start_code)); - } - - /* If not a forward reference, and the subpattern is still open, - this is a recursive call. We check to see if this is a left - recursion that could loop for ever, and diagnose that case. We - must not, however, do this check if we are in a conditional - subpattern because the condition might be testing for recursion in - a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid. - Forever loops are also detected at runtime, so those that occur in - conditional subpatterns will be picked up then. */ - - else if (GET(called, 1) == 0 && cond_depth <= 0 && - could_be_empty(called, code, bcptr, utf, cb)) - { - *errorcodeptr = ERR40; - goto FAILED; - } - } - - /* Insert the recursion/subroutine item. It does not have a set first - character (relevant if it is repeated, because it will then be - wrapped with ONCE brackets). */ - *code = OP_RECURSE; - PUT(code, 1, (int)(called - cb->start_code)); + PUT(code, 1, recno); code += 1 + LINK_SIZE; groupsetfirstcu = FALSE; + cb->had_recurse = TRUE; } /* Can't determine a first byte now */ @@ -6780,7 +6486,6 @@ for (;; ptr++) else { previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; } *code = bravalue; @@ -7098,7 +6803,6 @@ for (;; ptr++) HANDLE_REFERENCE: if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE; previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF; PUT2INC(code, 0, recno); cb->backref_map |= (recno < 32)? (1u << recno) : 1; @@ -7128,7 +6832,6 @@ for (;; ptr++) if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb)) goto FAILED; previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; *code++ = ptype; *code++ = pdata; @@ -7177,7 +6880,6 @@ for (;; ptr++) { previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; - item_hwm_offset = cb->hwm - cb->start_workspace; *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; } } @@ -7212,7 +6914,6 @@ for (;; ptr++) ONE_CHAR: previous = code; - item_hwm_offset = cb->hwm - cb->start_workspace; /* For caseless UTF mode, check whether this character has more than one other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ @@ -7354,7 +7055,6 @@ int32_t firstcuflags, reqcuflags; uint32_t branchfirstcu, branchreqcu; int32_t branchfirstcuflags, branchreqcuflags; size_t length; -size_t save_hwm_offset; unsigned int orig_bracount; unsigned int max_bracount; branch_chain bc; @@ -7376,8 +7076,6 @@ bc.current_branch = code; firstcu = reqcu = 0; firstcuflags = reqcuflags = REQ_UNSET; -save_hwm_offset = cb->hwm - cb->start_workspace; /* hwm at start of group */ - /* Accumulate the length for use in the pre-compile phase. Start with the length of the BRA and KET and any extra code units that are required at the beginning. We accumulate in a local variable to save frequent testing of @@ -7573,18 +7271,13 @@ for (;;) code += 1 + LINK_SIZE; /* If it was a capturing subpattern, check to see if it contained any - recursive back references. If so, we must wrap it in atomic brackets. - Because we are moving code along, we must ensure that any pending recursive - or forward subroutine references are updated. In any event, remove the - block from the chain. */ + recursive back references. If so, we must wrap it in atomic brackets. In + any event, remove the block from the chain. */ if (capnumber > 0) { if (cb->open_caps->flag) { - *code = OP_END; - adjust_recurse(start_bracket, 1 + LINK_SIZE, - (options & PCRE2_UTF) != 0, cb, save_hwm_offset); memmove(start_bracket + 1 + LINK_SIZE, start_bracket, CU2BYTES(code - start_bracket)); *start_bracket = OP_ONCE; @@ -8176,7 +7869,7 @@ cb.dupnames = FALSE; cb.end_pattern = pattern + patlen; cb.external_flags = 0; cb.external_options = options; -cb.hwm = cworkspace; +cb.had_recurse = FALSE; cb.iscondassert = FALSE; cb.max_lookbehind = 0; cb.name_entry_size = 0; @@ -8445,8 +8138,7 @@ codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) + the name/number translation table and of the code are passed around in the compile data block. The start/end pattern and initial options are already set from the pre-compile phase, as is the name_entry_size field. Reset the bracket -count and the names_found field. Also reset the hwm field; this time it's used -for remembering forward references to subpatterns. */ +count and the names_found field. */ cb.parens_depth = 0; cb.assert_depth = 0; @@ -8454,7 +8146,6 @@ cb.bracount = 0; cb.max_lookbehind = 0; cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); cb.start_code = codestart; -cb.hwm = (PCRE2_UCHAR *)(cb.start_workspace); cb.iscondassert = FALSE; cb.req_varyopt = 0; cb.had_accept = FALSE; @@ -8512,6 +8203,65 @@ if (usedlength > length) errorcode = ERR23; else #endif } +/* Scan the pattern for recursion/subroutine calls and convert the group +numbers into offsets. Maintain a small cache so that repeated groups containing +recursions are efficiently handled. */ + +#define RSCAN_CACHE_SIZE 8 + +if (errorcode == 0 && cb.had_recurse) + { + PCRE2_UCHAR *rcode; + PCRE2_SPTR rgroup; + int ccount = 0; + int start = RSCAN_CACHE_SIZE; + recurse_cache rc[RSCAN_CACHE_SIZE]; + + for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf); + rcode != NULL; + rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf)) + { + int i, p, recno; + + recno = (int)GET(rcode, 1); + if (recno == 0) rgroup = codestart; else + { + PCRE2_SPTR search_from = codestart; + rgroup = NULL; + for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) + { + if (recno == rc[p].recno) + { + rgroup = rc[p].group; + break; + } + + /* Group n+1 must always start to the right of group n, so we can save + search time below when the new group number is greater than any of the + previously found groups. */ + + if (recno > rc[p].recno) search_from = rc[p].group; + } + + if (rgroup == NULL) + { + rgroup = PRIV(find_bracket)(search_from, utf, recno); + if (rgroup == NULL) + { + errorcode = ERR53; + break; + } + if (--start < 0) start = RSCAN_CACHE_SIZE - 1; + rc[start].recno = recno; + rc[start].group = rgroup; + if (ccount < RSCAN_CACHE_SIZE) ccount++; + } + } + + PUT(rcode, 1, rgroup - codestart); + } + } + /* In rare debugging situations we sometimes need to look at the compiled code at this stage. */ @@ -8520,37 +8270,6 @@ pcre2_printint(re, stderr, TRUE); fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength); #endif -/* Fill in any forward references that are required. There may be repeated -references; optimize for them, as searching a large regex takes time. The -test of errorcode inside the loop means that nothing is done if it is already -non-zero. */ - -if (cb.hwm > cb.start_workspace) - { - int prev_recno = -1; - PCRE2_SPTR groupptr = NULL; - while (errorcode == 0 && cb.hwm > cb.start_workspace) - { - int offset, recno; - cb.hwm -= LINK_SIZE; - offset = GET(cb.hwm, 0); - recno = GET(codestart, offset); - if (recno != prev_recno) - { - groupptr = PRIV(find_bracket)(codestart, utf, recno); - prev_recno = recno; - } - if (groupptr == NULL) errorcode = ERR53; - else PUT(((PCRE2_UCHAR *)codestart), offset, (int)(groupptr - codestart)); - } - } - -/* If the workspace had to be expanded, free the new memory. */ - -if (cb.workspace_size > COMPILE_WORK_SIZE) - ccontext->memctl.free((void *)cb.start_workspace, - ccontext->memctl.memory_data); - /* After a successful compile, give an error if there's back reference to a non-existent capturing subpattern. Then, unless disabled, check whether any single character iterators can be auto-possessified. The function overwrites @@ -8716,7 +8435,7 @@ can be provided to applications. */ do { - if (could_be_empty_branch(codestart, code, utf, &cb, NULL)) + if (could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL)) { re->flags |= PCRE2_MATCH_EMPTY; break; diff --git a/src/pcre2_error.c b/src/pcre2_error.c index fff3166..5d1c040 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -112,7 +112,7 @@ static const char compile_error_texts[] = "number after (?C is greater than 255\0" "closing parenthesis for (?C expected\0" /* 40 */ - "recursion could loop indefinitely\0" + "SPARE ERROR\0" "unrecognized character after (?P\0" "syntax error in subpattern name (missing terminator)\0" "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0" diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index f20f71e..704c375 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -647,6 +647,13 @@ typedef struct recurse_check { PCRE2_SPTR group; } recurse_check; +/* Structure for building a cache when filling in recursion offsets. */ + +typedef struct recurse_cache { + PCRE2_SPTR group; + int recno; +} recurse_cache; + /* Structure for maintaining a chain of pointers to the currently incomplete branches, for testing for left recursion while compiling. */ @@ -678,7 +685,6 @@ typedef struct compile_block { PCRE2_SPTR start_code; /* The start of the compiled code */ PCRE2_SPTR start_pattern; /* The start of the pattern */ PCRE2_SPTR end_pattern; /* The end of the pattern */ - PCRE2_UCHAR *hwm; /* High watermark of workspace */ PCRE2_UCHAR *name_table; /* The name/number table */ size_t workspace_size; /* Size of workspace */ uint16_t names_found; /* Number of entries so far */ @@ -701,6 +707,7 @@ typedef struct compile_block { int req_varyopt; /* "After variable item" flag for reqbyte */ BOOL had_accept; /* (*ACCEPT) encountered */ BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ + BOOL had_recurse; /* Had a recursion or subroutine call */ BOOL check_lookbehind; /* Lookbehinds need later checking */ BOOL dupnames; /* Duplicate names exist */ BOOL iscondassert; /* Next assert is a condition */ diff --git a/testdata/testinput14 b/testdata/testinput14 index 69ca2b6..b4fc203 100644 --- a/testdata/testinput14 +++ b/testdata/testinput14 @@ -108,5 +108,48 @@ /abc(?=abcde)(?=ab)/allusedtext abcabcdefg + +# These tests provoke recursion loops, which give a different error message +# when JIT is used. + +/(?R)/I + abcd + +/(a|(?R))/I + abcd + defg + +/(ab|(bc|(de|(?R))))/I + abcd + fghi + +/(ab|(bc|(de|(?1))))/I + abcd + fghi + +/x(ab|(bc|(de|(?1)x)x)x)/I + xab123 + xfghi + +/(?!\w)(?R)/ + abcd + =abc + +/(?=\w)(?R)/ + =abc + abcd + +/(?)/ -/(?!\w)(?R)/ - -/(?=\w)(?R)/ - -/(?(?P0|)|(?P>L2)(?P>L1))/ + abcd + 0abc /abc(*MARK:)pqr/ @@ -3306,8 +3290,6 @@ a random value. /Ix /[:a[:abc]b:]/B -/(a+|(?R)b)/ - /^(a(*:A)(d|e(*:B))z|aeq)/auto_callout adz aez diff --git a/testdata/testinput5 b/testdata/testinput5 index 4ddc47d..413ab1a 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1634,9 +1634,7 @@ 123ábc123 /(?<=abc)(|def)/g,utf,replace=<$0> - 123abcáyzabcdef789abcሴqr - -/[^\xff]((?1))/utf,debug + 123abcáyzabcdef789abcሴqr /[A-`]/iB,utf abcdefghijklmno diff --git a/testdata/testinput8 b/testdata/testinput8 index 545e751..9962356 100644 --- a/testdata/testinput8 +++ b/testdata/testinput8 @@ -2,7 +2,8 @@ # shown when the link size is 2. This is just a doublecheck test to ensure the # sizes don't go horribly wrong when something is changed. The pattern contents # are all themselves checked in other tests. Unicode, including property -# support, is required for these tests. +# support, is required for these tests. There are separate output files for +# each code unit size. #pattern fullbincode,memory @@ -156,4 +157,12 @@ ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))) /parens_nest_limit=1000,-fullbincode +/(?(1)(?1)){8,}+()/debug + abcd + +/(?(1)|a(?1)b){2,}+()/debug + abcde + +/((?1)(?2)(?3)(?4)(?5)(?6)(?7)(?8)(?9)(?9)(?8)(?7)(?6)(?5)(?4)(?3)(?2)(?1)(?0)){2,}()()()()()()()()()/debug + # End of testinput8 diff --git a/testdata/testoutput14 b/testdata/testoutput14 index cdfd6f7..5f8d423 100644 --- a/testdata/testoutput14 +++ b/testdata/testoutput14 @@ -238,5 +238,97 @@ Failed: error -52: nested recursion at the same subject position abcabcdefg 0: abcabcde >>>>> + +# These tests provoke recursion loops, which give a different error message +# when JIT is used. + +/(?R)/I +Capturing subpattern count = 0 +May match empty string +Subject length lower bound = 0 + abcd +Failed: error -52: nested recursion at the same subject position + +/(a|(?R))/I +Capturing subpattern count = 1 +May match empty string +Subject length lower bound = 1 + abcd + 0: a + 1: a + defg +Failed: error -52: nested recursion at the same subject position + +/(ab|(bc|(de|(?R))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 + abcd + 0: ab + 1: ab + fghi +Failed: error -52: nested recursion at the same subject position + +/(ab|(bc|(de|(?1))))/I +Capturing subpattern count = 3 +May match empty string +Subject length lower bound = 2 + abcd + 0: ab + 1: ab + fghi +Failed: error -52: nested recursion at the same subject position + +/x(ab|(bc|(de|(?1)x)x)x)/I +Capturing subpattern count = 3 +First code unit = 'x' +Subject length lower bound = 3 + xab123 + 0: xab + 1: ab + xfghi +Failed: error -52: nested recursion at the same subject position + +/(?!\w)(?R)/ + abcd +Failed: error -52: nested recursion at the same subject position + =abc +Failed: error -52: nested recursion at the same subject position + +/(?=\w)(?R)/ + =abc +Failed: error -52: nested recursion at the same subject position + abcd +Failed: error -52: nested recursion at the same subject position + +/(?)/ Failed: error 162 at offset 4: subpattern name expected -/(?!\w)(?R)/ -Failed: error 140 at offset 9: recursion could loop indefinitely - -/(?=\w)(?R)/ -Failed: error 140 at offset 9: recursion could loop indefinitely - -/(?(?P0|)|(?P>L2)(?P>L1))/ -Failed: error 140 at offset 31: recursion could loop indefinitely + abcd + 0: + 1: + 2: + 0abc + 0: 0 + 1: 0 + 2: 0 /abc(*MARK:)pqr/ Failed: error 166 at offset 10: (*MARK) must have an argument @@ -11104,9 +11084,6 @@ Matched, but too many substrings End ------------------------------------------------------------------ -/(a+|(?R)b)/ -Failed: error 140 at offset 7: recursion could loop indefinitely - /^(a(*:A)(d|e(*:B))z|aeq)/auto_callout adz --->adz diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 3ee04a6..7422221 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4003,12 +4003,9 @@ Subject length lower bound = 1 1: 123X\x{1234}Z123 /(?<=abc)(|def)/g,utf,replace=<$0> - 123abcáyzabcdef789abcሴqr + 123abcáyzabcdef789abcሴqr 4: 123abc<>\x{e1}yzabc<>789abc<>\x{1234}qr -/[^\xff]((?1))/utf,debug -Failed: error 140 at offset 11: recursion could loop indefinitely - /[A-`]/iB,utf ------------------------------------------------------------------ Bra diff --git a/testdata/testoutput8-16 b/testdata/testoutput8-16 index 0f9fa6a..40343d8 100644 --- a/testdata/testoutput8-16 +++ b/testdata/testoutput8-16 @@ -2,7 +2,8 @@ # shown when the link size is 2. This is just a doublecheck test to ensure the # sizes don't go horribly wrong when something is changed. The pattern contents # are all themselves checked in other tests. Unicode, including property -# support, is required for these tests. +# support, is required for these tests. There are separate output files for +# each code unit size. #pattern fullbincode,memory @@ -849,4 +850,159 @@ Memory allocation (code space): 14 /parens_nest_limit=1000,-fullbincode Failed: error 184 at offset 1540: (?| and/or (?J: or (?x: parentheses are too deeply nested +/(?(1)(?1)){8,}+()/debug +------------------------------------------------------------------ + 0 79 Bra + 2 70 Once + 4 6 Cond + 6 1 Cond ref + 8 74 Recurse + 10 6 Ket + 12 6 Cond + 14 1 Cond ref + 16 74 Recurse + 18 6 Ket + 20 6 Cond + 22 1 Cond ref + 24 74 Recurse + 26 6 Ket + 28 6 Cond + 30 1 Cond ref + 32 74 Recurse + 34 6 Ket + 36 6 Cond + 38 1 Cond ref + 40 74 Recurse + 42 6 Ket + 44 6 Cond + 46 1 Cond ref + 48 74 Recurse + 50 6 Ket + 52 6 Cond + 54 1 Cond ref + 56 74 Recurse + 58 6 Ket + 60 10 SBraPos + 62 6 SCond + 64 1 Cond ref + 66 74 Recurse + 68 6 Ket + 70 10 KetRpos + 72 70 Ket + 74 3 CBra 1 + 77 3 Ket + 79 79 Ket + 81 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcd + 0: + 1: + +/(?(1)|a(?1)b){2,}+()/debug +------------------------------------------------------------------ + 0 43 Bra + 2 34 Once + 4 4 Cond + 6 1 Cond ref + 8 8 Alt + 10 a + 12 38 Recurse + 14 b + 16 12 Ket + 18 16 SBraPos + 20 4 SCond + 22 1 Cond ref + 24 8 Alt + 26 a + 28 38 Recurse + 30 b + 32 12 Ket + 34 16 KetRpos + 36 34 Ket + 38 3 CBra 1 + 41 3 Ket + 43 43 Ket + 45 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcde +No match + +/((?1)(?2)(?3)(?4)(?5)(?6)(?7)(?8)(?9)(?9)(?8)(?7)(?6)(?5)(?4)(?3)(?2)(?1)(?0)){2,}()()()()()()()()()/debug +------------------------------------------------------------------ + 0 133 Bra + 2 41 CBra 1 + 5 2 Recurse + 7 88 Recurse + 9 93 Recurse + 11 98 Recurse + 13 103 Recurse + 15 108 Recurse + 17 113 Recurse + 19 118 Recurse + 21 123 Recurse + 23 123 Recurse + 25 118 Recurse + 27 113 Recurse + 29 108 Recurse + 31 103 Recurse + 33 98 Recurse + 35 93 Recurse + 37 88 Recurse + 39 2 Recurse + 41 0 Recurse + 43 41 Ket + 45 41 SCBra 1 + 48 2 Recurse + 50 88 Recurse + 52 93 Recurse + 54 98 Recurse + 56 103 Recurse + 58 108 Recurse + 60 113 Recurse + 62 118 Recurse + 64 123 Recurse + 66 123 Recurse + 68 118 Recurse + 70 113 Recurse + 72 108 Recurse + 74 103 Recurse + 76 98 Recurse + 78 93 Recurse + 80 88 Recurse + 82 2 Recurse + 84 0 Recurse + 86 41 KetRmax + 88 3 CBra 2 + 91 3 Ket + 93 3 CBra 3 + 96 3 Ket + 98 3 CBra 4 +101 3 Ket +103 3 CBra 5 +106 3 Ket +108 3 CBra 6 +111 3 Ket +113 3 CBra 7 +116 3 Ket +118 3 CBra 8 +121 3 Ket +123 3 CBra 9 +126 3 Ket +128 3 CBra 10 +131 3 Ket +133 133 Ket +135 End +------------------------------------------------------------------ +Capturing subpattern count = 10 +May match empty string +Subject length lower bound = 0 + # End of testinput8 diff --git a/testdata/testoutput8-32 b/testdata/testoutput8-32 index 1b843a5..7b61dc0 100644 --- a/testdata/testoutput8-32 +++ b/testdata/testoutput8-32 @@ -2,7 +2,8 @@ # shown when the link size is 2. This is just a doublecheck test to ensure the # sizes don't go horribly wrong when something is changed. The pattern contents # are all themselves checked in other tests. Unicode, including property -# support, is required for these tests. +# support, is required for these tests. There are separate output files for +# each code unit size. #pattern fullbincode,memory @@ -848,4 +849,159 @@ Memory allocation (code space): 28 ))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))) /parens_nest_limit=1000,-fullbincode +/(?(1)(?1)){8,}+()/debug +------------------------------------------------------------------ + 0 79 Bra + 2 70 Once + 4 6 Cond + 6 1 Cond ref + 8 74 Recurse + 10 6 Ket + 12 6 Cond + 14 1 Cond ref + 16 74 Recurse + 18 6 Ket + 20 6 Cond + 22 1 Cond ref + 24 74 Recurse + 26 6 Ket + 28 6 Cond + 30 1 Cond ref + 32 74 Recurse + 34 6 Ket + 36 6 Cond + 38 1 Cond ref + 40 74 Recurse + 42 6 Ket + 44 6 Cond + 46 1 Cond ref + 48 74 Recurse + 50 6 Ket + 52 6 Cond + 54 1 Cond ref + 56 74 Recurse + 58 6 Ket + 60 10 SBraPos + 62 6 SCond + 64 1 Cond ref + 66 74 Recurse + 68 6 Ket + 70 10 KetRpos + 72 70 Ket + 74 3 CBra 1 + 77 3 Ket + 79 79 Ket + 81 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcd + 0: + 1: + +/(?(1)|a(?1)b){2,}+()/debug +------------------------------------------------------------------ + 0 43 Bra + 2 34 Once + 4 4 Cond + 6 1 Cond ref + 8 8 Alt + 10 a + 12 38 Recurse + 14 b + 16 12 Ket + 18 16 SBraPos + 20 4 SCond + 22 1 Cond ref + 24 8 Alt + 26 a + 28 38 Recurse + 30 b + 32 12 Ket + 34 16 KetRpos + 36 34 Ket + 38 3 CBra 1 + 41 3 Ket + 43 43 Ket + 45 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcde +No match + +/((?1)(?2)(?3)(?4)(?5)(?6)(?7)(?8)(?9)(?9)(?8)(?7)(?6)(?5)(?4)(?3)(?2)(?1)(?0)){2,}()()()()()()()()()/debug +------------------------------------------------------------------ + 0 133 Bra + 2 41 CBra 1 + 5 2 Recurse + 7 88 Recurse + 9 93 Recurse + 11 98 Recurse + 13 103 Recurse + 15 108 Recurse + 17 113 Recurse + 19 118 Recurse + 21 123 Recurse + 23 123 Recurse + 25 118 Recurse + 27 113 Recurse + 29 108 Recurse + 31 103 Recurse + 33 98 Recurse + 35 93 Recurse + 37 88 Recurse + 39 2 Recurse + 41 0 Recurse + 43 41 Ket + 45 41 SCBra 1 + 48 2 Recurse + 50 88 Recurse + 52 93 Recurse + 54 98 Recurse + 56 103 Recurse + 58 108 Recurse + 60 113 Recurse + 62 118 Recurse + 64 123 Recurse + 66 123 Recurse + 68 118 Recurse + 70 113 Recurse + 72 108 Recurse + 74 103 Recurse + 76 98 Recurse + 78 93 Recurse + 80 88 Recurse + 82 2 Recurse + 84 0 Recurse + 86 41 KetRmax + 88 3 CBra 2 + 91 3 Ket + 93 3 CBra 3 + 96 3 Ket + 98 3 CBra 4 +101 3 Ket +103 3 CBra 5 +106 3 Ket +108 3 CBra 6 +111 3 Ket +113 3 CBra 7 +116 3 Ket +118 3 CBra 8 +121 3 Ket +123 3 CBra 9 +126 3 Ket +128 3 CBra 10 +131 3 Ket +133 133 Ket +135 End +------------------------------------------------------------------ +Capturing subpattern count = 10 +May match empty string +Subject length lower bound = 0 + # End of testinput8 diff --git a/testdata/testoutput8-8 b/testdata/testoutput8-8 index a12cc71..cae6889 100644 --- a/testdata/testoutput8-8 +++ b/testdata/testoutput8-8 @@ -2,7 +2,8 @@ # shown when the link size is 2. This is just a doublecheck test to ensure the # sizes don't go horribly wrong when something is changed. The pattern contents # are all themselves checked in other tests. Unicode, including property -# support, is required for these tests. +# support, is required for these tests. There are separate output files for +# each code unit size. #pattern fullbincode,memory @@ -849,4 +850,159 @@ Memory allocation (code space): 10 /parens_nest_limit=1000,-fullbincode Failed: error 184 at offset 1540: (?| and/or (?J: or (?x: parentheses are too deeply nested +/(?(1)(?1)){8,}+()/debug +------------------------------------------------------------------ + 0 119 Bra + 3 105 Once + 6 9 Cond + 9 1 Cond ref + 12 111 Recurse + 15 9 Ket + 18 9 Cond + 21 1 Cond ref + 24 111 Recurse + 27 9 Ket + 30 9 Cond + 33 1 Cond ref + 36 111 Recurse + 39 9 Ket + 42 9 Cond + 45 1 Cond ref + 48 111 Recurse + 51 9 Ket + 54 9 Cond + 57 1 Cond ref + 60 111 Recurse + 63 9 Ket + 66 9 Cond + 69 1 Cond ref + 72 111 Recurse + 75 9 Ket + 78 9 Cond + 81 1 Cond ref + 84 111 Recurse + 87 9 Ket + 90 15 SBraPos + 93 9 SCond + 96 1 Cond ref + 99 111 Recurse +102 9 Ket +105 15 KetRpos +108 105 Ket +111 5 CBra 1 +116 5 Ket +119 119 Ket +122 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcd + 0: + 1: + +/(?(1)|a(?1)b){2,}+()/debug +------------------------------------------------------------------ + 0 61 Bra + 3 47 Once + 6 6 Cond + 9 1 Cond ref + 12 10 Alt + 15 a + 17 53 Recurse + 20 b + 22 16 Ket + 25 22 SBraPos + 28 6 SCond + 31 1 Cond ref + 34 10 Alt + 37 a + 39 53 Recurse + 42 b + 44 16 Ket + 47 22 KetRpos + 50 47 Ket + 53 5 CBra 1 + 58 5 Ket + 61 61 Ket + 64 End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Max back reference = 1 +May match empty string +Subject length lower bound = 0 + abcde +No match + +/((?1)(?2)(?3)(?4)(?5)(?6)(?7)(?8)(?9)(?9)(?8)(?7)(?6)(?5)(?4)(?3)(?2)(?1)(?0)){2,}()()()()()()()()()/debug +------------------------------------------------------------------ + 0 205 Bra + 3 62 CBra 1 + 8 3 Recurse + 11 133 Recurse + 14 141 Recurse + 17 149 Recurse + 20 157 Recurse + 23 165 Recurse + 26 173 Recurse + 29 181 Recurse + 32 189 Recurse + 35 189 Recurse + 38 181 Recurse + 41 173 Recurse + 44 165 Recurse + 47 157 Recurse + 50 149 Recurse + 53 141 Recurse + 56 133 Recurse + 59 3 Recurse + 62 0 Recurse + 65 62 Ket + 68 62 SCBra 1 + 73 3 Recurse + 76 133 Recurse + 79 141 Recurse + 82 149 Recurse + 85 157 Recurse + 88 165 Recurse + 91 173 Recurse + 94 181 Recurse + 97 189 Recurse +100 189 Recurse +103 181 Recurse +106 173 Recurse +109 165 Recurse +112 157 Recurse +115 149 Recurse +118 141 Recurse +121 133 Recurse +124 3 Recurse +127 0 Recurse +130 62 KetRmax +133 5 CBra 2 +138 5 Ket +141 5 CBra 3 +146 5 Ket +149 5 CBra 4 +154 5 Ket +157 5 CBra 5 +162 5 Ket +165 5 CBra 6 +170 5 Ket +173 5 CBra 7 +178 5 Ket +181 5 CBra 8 +186 5 Ket +189 5 CBra 9 +194 5 Ket +197 5 CBra 10 +202 5 Ket +205 205 Ket +208 End +------------------------------------------------------------------ +Capturing subpattern count = 10 +May match empty string +Subject length lower bound = 0 + # End of testinput8