More refactoring: keep track of empty branches during compiling, replacing a

post-compile scan.
This commit is contained in:
Philip.Hazel 2016-12-23 17:09:37 +00:00
parent 1f87b60f01
commit d15aab4d17
4 changed files with 145 additions and 540 deletions

View File

@ -237,6 +237,13 @@ be the result.
the internal recursive calls that are used for lookrounds and recursions within
the pattern.
37. More refactoring has got rid of the internal could_be_empty_branch()
function (around 400 lines of code, including comments) by keeping track of
could-be-emptiness as the pattern is compiled instead of scanning compiled
groups. (This would have been much harder before the refactoring of #3 above.)
This lifts a restriction on the number of branches in a group (more than about
1100 would give "pattern is too complicated").
Version 10.22 29-July-2016
--------------------------

View File

@ -120,7 +120,7 @@ static unsigned int
add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
const uint32_t *, unsigned int);
static BOOL
static int
compile_regex(uint32_t, PCRE2_UCHAR **, uint32_t **, int *, uint32_t,
uint32_t *, int32_t *, uint32_t *, int32_t *, branch_chain *,
compile_block *, PCRE2_SIZE *);
@ -372,10 +372,8 @@ compiler is clever with identical subexpressions. */
/* These flags are used in the groupinfo vector. */
#define GI_SET_COULD_BE_EMPTY 0x80000000u
#define GI_COULD_BE_EMPTY 0x40000000u
#define GI_NOT_FIXED_LENGTH 0x20000000u
#define GI_SET_FIXED_LENGTH 0x10000000u
#define GI_SET_FIXED_LENGTH 0x80000000u
#define GI_NOT_FIXED_LENGTH 0x40000000u
#define GI_FIXED_LENGTH_MASK 0x0000ffffu
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
@ -4134,427 +4132,6 @@ for (;;)
/*************************************************
* Scan compiled branch for non-emptiness *
*************************************************/
/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string. It is called at the end of compiling to check the
entire pattern, and from compile_branch() when checking for an unlimited repeat
of a group that can match nothing. In the latter case it is called only when
doing the real compile, not during the pre-compile that measures the size of
the compiled pattern.
Note that first_significant_code() skips over backward and negative forward
assertions when its final argument is TRUE. If we hit an unclosed bracket, we
return "empty" - this means we've struck an inner bracket whose current branch
will already have been scanned.
Arguments:
code points to start of search
endcode points to where to stop
utf TRUE if in UTF mode
cb compile data
atend TRUE if being called to check an entire pattern
recurses chain of recurse_check to catch mutual recursion
countptr pointer to count to catch over-complicated pattern
Returns: 0 if what is matched cannot be empty
1 if what is matched could be empty
-1 if the pattern is too complicated
*/
#define CBE_NOTEMPTY 0
#define CBE_EMPTY 1
#define CBE_TOOCOMPLICATED (-1)
static int
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
{
uint32_t group = 0;
uint32_t groupinfo = 0;
PCRE2_UCHAR c;
recurse_check this_recurse;
/* If what we are checking has already been set as "could be empty", we know
the answer. */
if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */
if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
(*code == OP_CBRA || *code == OP_CBRAPOS))
{
group = GET2(code, 1 + LINK_SIZE);
groupinfo = cb->groupinfo[group];
if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
/* A large and/or complex regex can take too long to process. We have to assume
it can match an empty string. This can happen more often when (?| groups are
present in the pattern and the caching is disabled. Setting the cap at 1100
allows the test for more than 1023 capturing patterns to work. */
if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
/* Scan the opcodes for this branch. */
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
code < endcode;
code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
{
PCRE2_SPTR ccode;
c = *code;
/* Skip over forward assertions; the other assertions are skipped by
first_significant_code() with a TRUE final argument. */
if (c == OP_ASSERT)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* For a recursion/subroutine call we can scan the recursion when this
function is called at the end, to check a complete pattern. Before then,
recursions just have the group number as their argument and in any case may
be forward references. In that situation, we return CBE_EMPTY, just in case.
It means that unlimited repeats of groups that contain recursions are always
treated as "could be empty" - which just adds a bit more processing time
because of the runtime check. */
if (c == OP_RECURSE)
{
PCRE2_SPTR scode, endgroup;
BOOL empty_branch;
if (!atend) goto ISTRUE;
scode = cb->start_code + GET(code, 1);
endgroup = scode;
/* We need to detect whether this is a recursive call, as otherwise there
will be an infinite loop. If it is a recursion, just skip over it. Simple
recursions are easily detected. For mutual recursions we keep a chain on
the stack. */
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == scode) break;
if (r != NULL) continue; /* Mutual recursion */
}
/* Scan the referenced group, remembering it on the stack chain to detect
mutual recursions. */
empty_branch = FALSE;
this_recurse.prev = recurses;
this_recurse.group = scode;
do
{
int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
&this_recurse, countptr);
if (rc < 0) return rc;
if (rc > 0)
{
empty_branch = TRUE;
break;
}
scode += GET(scode, 1);
}
while (*scode == OP_ALT);
if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
continue;
}
/* Groups with zero repeats can of course be empty; skip them. */
if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
c == OP_BRAPOSZERO)
{
code += PRIV(OP_lengths)[c];
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* A nested group that is already marked as "could be empty" can just be
skipped. */
if (c == OP_SBRA || c == OP_SBRAPOS ||
c == OP_SCBRA || c == OP_SCBRAPOS)
{
do code += GET(code, 1); while (*code == OP_ALT);
c = *code;
continue;
}
/* For other groups, scan the branches. */
if (c == OP_BRA || c == OP_BRAPOS ||
c == OP_CBRA || c == OP_CBRAPOS ||
c == OP_ONCE || c == OP_ONCE_NC ||
c == OP_COND || c == OP_SCOND)
{
BOOL empty_branch;
if (GET(code, 1) == 0) goto ISTRUE; /* Hit unclosed bracket */
/* If a conditional group has only one branch, there is a second, implied,
empty branch, so just skip over the conditional, because it could be empty.
Otherwise, scan the individual branches of the group. */
if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
code += GET(code, 1);
else
{
empty_branch = FALSE;
do
{
if (!empty_branch)
{
int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
recurses, countptr);
if (rc < 0) return rc;
if (rc > 0) empty_branch = TRUE;
}
code += GET(code, 1);
}
while (*code == OP_ALT);
if (!empty_branch) goto ISFALSE; /* All branches are non-empty */
}
c = *code;
continue;
}
/* Handle the other opcodes */
switch (c)
{
/* Check for quantifiers after a class. XCLASS is used for classes that
cannot be represented just by a bit map. This includes negated single
high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
actual length is stored in the compiled code, so we must update "code"
here. */
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
case OP_XCLASS:
ccode = code += GET(code, 1);
goto CHECK_CLASS_REPEAT;
#endif
case OP_CLASS:
case OP_NCLASS:
ccode = code + PRIV(OP_lengths)[OP_CLASS];
#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
CHECK_CLASS_REPEAT:
#endif
switch (*ccode)
{
case OP_CRSTAR: /* These could be empty; continue */
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
case OP_CRPOSSTAR:
case OP_CRPOSQUERY:
break;
default: /* Non-repeat => class must match */
case OP_CRPLUS: /* These repeats aren't empty */
case OP_CRMINPLUS:
case OP_CRPOSPLUS:
goto ISFALSE;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
if (GET2(ccode, 1) > 0) goto ISFALSE; /* Minimum > 0 */
break;
}
break;
/* Opcodes that must match a character */
case OP_ANY:
case OP_ALLANY:
case OP_ANYBYTE:
case OP_PROP:
case OP_NOTPROP:
case OP_ANYNL:
case OP_NOT_HSPACE:
case OP_HSPACE:
case OP_NOT_VSPACE:
case OP_VSPACE:
case OP_EXTUNI:
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
case OP_WHITESPACE:
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_CHAR:
case OP_CHARI:
case OP_NOT:
case OP_NOTI:
case OP_PLUS:
case OP_PLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_NOTPLUS:
case OP_NOTPLUSI:
case OP_NOTMINPLUS:
case OP_NOTMINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_NOTPOSPLUS:
case OP_NOTPOSPLUSI:
case OP_EXACT:
case OP_EXACTI:
case OP_NOTEXACT:
case OP_NOTEXACTI:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
case OP_TYPEEXACT:
goto ISFALSE;
/* These are going to continue, as they may be empty, but we have to
fudge the length for the \p and \P cases. */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPOSSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
case OP_TYPEPOSQUERY:
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
break;
/* Same for these */
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
case OP_TYPEPOSUPTO:
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
code += 2;
break;
/* End of branch */
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_KETRPOS:
case OP_ALT:
goto ISTRUE;
/* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
versions may be followed by a multi-code-unit character. */
#ifdef MAYBE_UTF_MULTI
case OP_STAR:
case OP_STARI:
case OP_NOTSTAR:
case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
case OP_NOTMINSTAR:
case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
case OP_NOTPOSSTAR:
case OP_NOTPOSSTARI:
case OP_QUERY:
case OP_QUERYI:
case OP_NOTQUERY:
case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
case OP_NOTMINQUERY:
case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
case OP_NOTPOSQUERY:
case OP_NOTPOSQUERYI:
if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
break;
case OP_UPTO:
case OP_UPTOI:
case OP_NOTUPTO:
case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
case OP_NOTMINUPTO:
case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
case OP_NOTPOSUPTO:
case OP_NOTPOSUPTOI:
if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
break;
#endif /* MAYBE_UTF_MULTI */
/* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
string. */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
case OP_THEN_ARG:
code += code[1];
break;
/* None of the remaining opcodes are required to match a character. */
default:
break;
}
}
ISTRUE:
groupinfo |= GI_COULD_BE_EMPTY;
ISFALSE:
if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
#ifdef SUPPORT_UNICODE
/*************************************************
* Get othercase range *
@ -4948,17 +4525,20 @@ Arguments:
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
Returns: TRUE on success
FALSE, with *errorcodeptr set non-zero on error
Returns: 0 There's been an error, *errorcodeptr is non-zero
+1 Success, this branch must match at least one character
-1 Success, this branch may match an empty string
*/
static BOOL
static int
compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
int *errorcodeptr, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
compile_block *cb, PCRE2_SIZE *lengthptr)
{
int bravalue = 0;
int okreturn = -1;
int group_return = 0;
uint32_t repeat_min = 0, repeat_max = 0; /* To please picky compilers */
uint32_t greedy_default, greedy_non_default;
uint32_t repeat_type, op_type;
@ -4980,6 +4560,8 @@ PCRE2_UCHAR *tempcode;
PCRE2_UCHAR *previous = NULL;
PCRE2_UCHAR op_previous;
BOOL groupsetfirstcu = FALSE;
BOOL matched_char = FALSE;
BOOL previous_matched_char = FALSE;
const uint8_t *cbits = cb->cbits;
uint8_t classbits[32];
@ -5040,6 +4622,7 @@ for (;; pptr++)
BOOL should_flip_negation;
BOOL match_all_or_no_wide_chars;
BOOL possessive_quantifier;
BOOL note_group_empty;
int class_has_8bitchar;
int i;
uint32_t mclength;
@ -5067,7 +4650,7 @@ for (;; pptr++)
{
*errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
ERR52 : ERR86;
return FALSE;
return 0;
}
/* There is at least one situation where code goes backwards: this is the
@ -5087,7 +4670,7 @@ for (;; pptr++)
if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
{
*errorcodeptr = ERR20; /* Integer overflow */
return FALSE;
return 0;
}
*lengthptr += (PCRE2_SIZE)(code - orig_code);
code = orig_code;
@ -5104,8 +4687,15 @@ for (;; pptr++)
Checking for the legality of quantifiers happens in parse_regex(), except for
a quantifier after an assertion that is a condition. */
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY) previous = code;
if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
{
previous = code;
if (matched_char) okreturn = 1;
}
previous_matched_char = matched_char;
matched_char = FALSE;
note_group_empty = FALSE;
skipunits = 0; /* Default value for most subgroups */
switch(meta)
@ -5122,7 +4712,7 @@ for (;; pptr++)
*reqcuflagsptr = reqcuflags;
*codeptr = code;
*pptrptr = pptr;
return TRUE;
return okreturn;
/* ===================================================================*/
@ -5147,6 +4737,7 @@ for (;; pptr++)
repeats. The value of reqcu doesn't change either. */
case META_DOT:
matched_char = TRUE;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
zerofirstcu = firstcu;
zerofirstcuflags = firstcuflags;
@ -5164,6 +4755,7 @@ for (;; pptr++)
case META_CLASS_EMPTY:
case META_CLASS_EMPTY_NOT:
matched_char = TRUE;
*code++ = (meta == META_CLASS_EMPTY_NOT)? OP_ALLANY : OP_FAIL;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
zerofirstcu = firstcu;
@ -5186,6 +4778,7 @@ for (;; pptr++)
case META_CLASS_NOT:
case META_CLASS:
matched_char = TRUE;
negate_class = meta == META_CLASS_NOT;
/* We can optimize the case of a single character in a class by generating
@ -5406,7 +4999,7 @@ for (;; pptr++)
"in character class", meta);
#endif
*errorcodeptr = ERR89; /* Internal error - unrecognized. */
return FALSE;
return 0;
}
escape = META_DATA(meta);
@ -5840,7 +5433,7 @@ for (;; pptr++)
PUT2(code, 2+LINK_SIZE, ng->number);
if (ng->number > cb->top_backref) cb->top_backref = ng->number;
skipunits = 1+IMM2_SIZE;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
}
break; /* Found a duplicated name */
}
@ -5862,7 +5455,7 @@ for (;; pptr++)
{
*errorcodeptr = ERR61;
cb->erroroffset = offset + i;
return FALSE;
return 0;
}
}
}
@ -5871,7 +5464,7 @@ for (;; pptr++)
{
*errorcodeptr = ERR15;
cb->erroroffset = offset;
return FALSE;
return 0;
}
/* (?Rdigits) treated as a recursion reference by number. A value of
@ -5882,7 +5475,7 @@ for (;; pptr++)
code[1+LINK_SIZE] = OP_RREF;
PUT2(code, 2+LINK_SIZE, groupnumber);
skipunits = 1+IMM2_SIZE;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
}
/* A duplicated name was found. Note that if an R<digits> name is found
@ -5896,7 +5489,7 @@ for (;; pptr++)
count = 0; /* Values for first pass (avoids compiler warning) */
index = 0;
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
&count, errorcodeptr, cb)) return FALSE;
&count, errorcodeptr, cb)) return 0;
/* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
insert appropriate data values. */
@ -5906,9 +5499,11 @@ for (;; pptr++)
PUT2(code, 2+LINK_SIZE, index);
PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
}
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
/* The DEFINE condition is always false. */
/* The DEFINE condition is always false. It's internal groups may never
be called, so matched_char must remain false, hence the jump to
GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
case META_COND_DEFINE:
bravalue = OP_COND;
@ -5927,14 +5522,14 @@ for (;; pptr++)
{
*errorcodeptr = ERR15;
cb->erroroffset = offset;
return FALSE;
return 0;
}
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
offset -= 2; /* Point at initial ( for too many branches error */
code[1+LINK_SIZE] = OP_CREF;
skipunits = 1+IMM2_SIZE;
PUT2(code, 2+LINK_SIZE, groupnumber);
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
/* Test for the PCRE2 version. */
@ -5949,13 +5544,13 @@ for (;; pptr++)
OP_TRUE : OP_FALSE;
skipunits = 1;
pptr += 3;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
/* The condition is an assertion, possibly preceded by a callout. */
case META_COND_ASSERT:
bravalue = OP_COND;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
/* ===================================================================*/
@ -6000,7 +5595,7 @@ for (;; pptr++)
case META_ATOMIC:
bravalue = OP_ONCE;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
case META_NOCAPTURE:
bravalue = OP_BRA;
@ -6008,7 +5603,12 @@ for (;; pptr++)
/* Process nested bracketed regex. The nesting depth is maintained for the
benefit of the stackguard function. The test for too deep nesting is now
done in parse_regex(). */
done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
note of whether or not they may match an empty string. */
GROUP_PROCESS_NOTE_EMPTY:
note_group_empty = TRUE;
GROUP_PROCESS:
cb->parens_depth += 1;
@ -6019,7 +5619,8 @@ for (;; pptr++)
templastcapture = cb->lastcapture; /* Save value before group */
length_prevgroup = 0; /* Initialize for pre-compile phase */
if (!compile_regex(
if ((group_return =
compile_regex(
options, /* The option state */
&tempcode, /* Where to put code (updated) */
&pptr, /* Input pointer (updated) */
@ -6033,11 +5634,19 @@ for (;; pptr++)
cb, /* Compile data block */
(lengthptr == NULL)? NULL : /* Actual compile phase */
&length_prevgroup /* Pre-compile phase */
))
return FALSE;
)) == 0)
return 0; /* Error */
cb->parens_depth -= 1;
/* If this was an atomic group and there are no capturing groups within it,
/* If that was a non-conditional significant group (not an assertion, not a
DEFINE) that matches at least one character, then the current item matches
a character. Conditionals are handled below. */
if (note_group_empty && bravalue != OP_COND && group_return > 0)
matched_char = TRUE;
/* If that was an atomic group and there are no capturing groups within it,
generate OP_ONCE_NC instead of OP_ONCE. */
if (bravalue == OP_ONCE && cb->lastcapture <= templastcapture)
@ -6078,7 +5687,7 @@ for (;; pptr++)
{
cb->erroroffset = offset;
*errorcodeptr = ERR54;
return FALSE;
return 0;
}
code[LINK_SIZE+1] = OP_FALSE;
bravalue = OP_DEFINE; /* A flag to suppress char handling below */
@ -6086,7 +5695,8 @@ for (;; pptr++)
/* A "normal" conditional group. If there is just one branch, we must not
make use of its firstcu or reqcu, because this is equivalent to an
empty second branch. */
empty second branch. Also, it may match an empty string. If there are two
branches, this item must match a character if the group must. */
else
{
@ -6094,9 +5704,10 @@ for (;; pptr++)
{
cb->erroroffset = offset;
*errorcodeptr = ERR27;
return FALSE;
return 0;
}
if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
else if (group_return > 0) matched_char = TRUE;
}
}
@ -6110,7 +5721,7 @@ for (;; pptr++)
if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
{
*errorcodeptr = ERR20;
return FALSE;
return 0;
}
*lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
code++; /* This already contains bravalue */
@ -6269,7 +5880,7 @@ for (;; pptr++)
{
*errorcodeptr = ERR15;
cb->erroroffset = offset;
return FALSE;
return 0;
}
/* If a back reference name is not duplicated, we can handle it as
@ -6288,7 +5899,7 @@ for (;; pptr++)
count = 0; /* Values for first pass (avoids compiler warning) */
index = 0;
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
&count, errorcodeptr, cb)) return FALSE;
&count, errorcodeptr, cb)) return 0;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
@ -6407,6 +6018,7 @@ for (;; pptr++)
repeat_max = 1;
REPEAT:
if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
/* Remember whether this is a variable length repeat, and default to
single-char opcodes. */
@ -6475,6 +6087,7 @@ for (;; pptr++)
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
code += 2 + 2 * LINK_SIZE;
length_prevgroup = 3 + 3*LINK_SIZE;
group_return = -1; /* Set "may match empty string" */
}
/* Now handle repetition for the different types of item. */
@ -6689,7 +6302,7 @@ for (;; pptr++)
OFLOW_MAX - *lengthptr < delta)
{
*errorcodeptr = ERR20;
return FALSE;
return 0;
}
*lengthptr += delta;
}
@ -6742,7 +6355,7 @@ for (;; pptr++)
OFLOW_MAX - *lengthptr < delta)
{
*errorcodeptr = ERR20;
return FALSE;
return 0;
}
*lengthptr += delta;
}
@ -6831,34 +6444,14 @@ for (;; pptr++)
else
{
/* In the compile phase, check whether the group could match an
empty string. */
/* In the compile phase, adjust the opcode if the group can match
an empty string. For a conditional group with only one branch, the
value of group_return will not show "could be empty", so we must
check that separately. */
if (lengthptr == NULL)
{
PCRE2_UCHAR *scode = bracode;
do
{
int count = 0;
int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
NULL, &count);
if (rc < 0)
{
*errorcodeptr = ERR86;
return FALSE;
}
if (rc > 0)
{
*bracode += OP_SBRA - OP_BRA;
break;
}
scode += GET(scode, 1);
}
while (*scode == OP_ALT);
/* A conditional group with only one branch has an implicit empty
alternative branch. */
if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
*bracode = OP_SCOND;
}
@ -6917,7 +6510,7 @@ for (;; pptr++)
if (op_previous >= OP_EODN) /* Not a character type - internal error */
{
*errorcodeptr = ERR10;
return FALSE;
return 0;
}
else
{
@ -7188,7 +6781,7 @@ for (;; pptr++)
{
cb->erroroffset = offset;
*errorcodeptr = ERR15; /* Non-existent subpattern */
return FALSE;
return 0;
}
/* Come here from named backref handling when the reference is to a
@ -7241,7 +6834,7 @@ for (;; pptr++)
{
cb->erroroffset = offset;
*errorcodeptr = ERR15; /* Non-existent subpattern */
return FALSE;
return 0;
}
HANDLE_NUMERICAL_RECURSION:
*code = OP_RECURSE;
@ -7261,7 +6854,7 @@ for (;; pptr++)
skipunits = IMM2_SIZE;
PUT2(code, 1+LINK_SIZE, meta_arg);
cb->lastcapture = meta_arg;
goto GROUP_PROCESS;
goto GROUP_PROCESS_NOTE_EMPTY;
/* ===============================================================*/
@ -7279,12 +6872,15 @@ for (;; pptr++)
case META_ESCAPE:
/* We can test for escape sequences that consume a character because their
values lie between ESC_b and ESC_Z for the latter; this may have to change
if any new ones are ever created. For these sequences, we disable the
setting of a first character if it hasn't already been set. */
values lie between ESC_b and ESC_Z; this may have to change if any new ones
are ever created. For these sequences, we disable the setting of a first
character if it hasn't already been set. */
if (firstcuflags == REQ_UNSET && meta_arg > ESC_b && meta_arg < ESC_Z)
firstcuflags = REQ_NONE;
if (meta_arg > ESC_b && meta_arg < ESC_Z)
{
matched_char = TRUE;
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
}
/* Set values to reset to if this is followed by a zero repeat. */
@ -7341,7 +6937,7 @@ for (;; pptr++)
fprintf(stderr, "** Unrecognized parsed pattern item 0x%.8x\n", *pptr);
#endif
*errorcodeptr = ERR89; /* Internal error - unrecognized. */
return FALSE;
return 0;
}
/* Handle a literal character. We come here by goto in the case of a
@ -7350,6 +6946,7 @@ for (;; pptr++)
NORMAL_CHAR:
meta = *pptr; /* Get the full 32 bits */
NORMAL_CHAR_SET: /* Character is already in meta */
matched_char = TRUE;
/* For caseless UTF mode, check whether this character has more than one
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
@ -7471,10 +7068,12 @@ Arguments:
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
Returns: TRUE on success
Returns: 0 There has been an error
+1 Success, this group must match at least one character
-1 Success, this group may match an empty string
*/
static BOOL
static int
compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, uint32_t **pptrptr,
int *errorcodeptr, uint32_t skipunits, uint32_t *firstcuptr,
int32_t *firstcuflagsptr, uint32_t *reqcuptr,int32_t *reqcuflagsptr,
@ -7486,6 +7085,7 @@ PCRE2_UCHAR *start_bracket = code;
BOOL lookbehind;
open_capitem capitem;
int capnumber = 0;
int okreturn = 1;
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
uint32_t lookbehindlength;
@ -7501,7 +7101,7 @@ if (cb->cx->stack_guard != NULL &&
cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
{
*errorcodeptr= ERR33;
return FALSE;
return 0;
}
/* Miscellaneous initialization */
@ -7555,6 +7155,8 @@ code += 1 + LINK_SIZE + skipunits;
for (;;)
{
int branch_return;
/* Insert OP_REVERSE if this is as lookbehind assertion. */
if (lookbehind && lookbehindlength > 0)
@ -7567,10 +7169,15 @@ for (;;)
/* Now compile the branch; in the pre-compile phase its length gets added
into the length. */
if (!compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
if ((branch_return =
compile_branch(&options, &code, &pptr, errorcodeptr, &branchfirstcu,
&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
cb, (lengthptr == NULL)? NULL : &length))
return FALSE;
cb, (lengthptr == NULL)? NULL : &length)) == 0)
return 0;
/* If a branch can match an empty string, so can the whole group. */
if (branch_return < 0) okreturn = -1;
/* In the real compile phase, there is some post-processing to be done. */
@ -7697,11 +7304,12 @@ for (;;)
if (OFLOW_MAX - *lengthptr < length)
{
*errorcodeptr = ERR20;
return FALSE;
return 0;
}
*lengthptr += length;
}
return TRUE;
// if (lengthptr == NULL) fprintf(stderr, "~~group returns %d\n", okreturn);
return okreturn;
}
/* Another branch follows. In the pre-compile phase, we can move the code
@ -9041,6 +8649,7 @@ uint32_t limit_recursion = UINT32_MAX;
int newline = 0; /* Unset; can be set by the pattern */
int bsr = 0; /* Unset; can be set by the pattern */
int errorcode = 0; /* Initialize to avoid compiler warn */
int regexrc; /* Return from compile */
uint32_t i; /* Local loop counter */
@ -9518,9 +9127,9 @@ of the function here. */
pptr = cb.parsed_pattern;
code = (PCRE2_UCHAR *)codestart;
*code = OP_BRA;
(void)compile_regex(re->overall_options, &code, &pptr, &errorcode, 0, &firstcu,
&firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
regexrc = compile_regex(re->overall_options, &code, &pptr, &errorcode, 0,
&firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
re->top_bracket = cb.bracount;
re->top_backref = cb.top_backref;
re->max_lookbehind = cb.max_lookbehind;
@ -9716,27 +9325,6 @@ if (reqcuflags >= 0 &&
}
}
/* Check for a pattern than can match an empty string, so that this information
can be provided to applications. */
do
{
int count = 0;
int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
if (rc < 0)
{
errorcode = ERR86;
goto HAD_CB_ERROR;
}
if (rc > 0)
{
re->flags |= PCRE2_MATCH_EMPTY;
break;
}
codestart += GET(codestart, 1);
}
while (*codestart == OP_ALT);
/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
to set up information such as a bitmap of starting code units and a minimum
matching length. */

11
testdata/testinput2 vendored
View File

@ -4651,7 +4651,7 @@ B)x/alt_verbnames,mark
/abcdef/hex,max_pattern_length=3
# These two patterns used to take a long time to compile
# These patterns used to take a long time to compile
"(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
@ -4664,9 +4664,6 @@ B)x/alt_verbnames,mark
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
a)"xI
# When (?| is used and groups of the same number may be different,
# we have to rely on a count to catch overly complicated patterns.
"(?|()|())(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
@ -4941,4 +4938,10 @@ a)"xI
"()X|((((((((()))))))((((())))))\2())((((((\2\2)))\2)(\22((((\2\2)2))\2)))(2\ZZZ)+:)Z^|91ZiZZnter(ZZ |91Z(ZZ ZZ(\r2Z( or#(\Z2(Z\Z(\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)2))\2Z+:)Z|91Z(ZZ ZZ(\r2Z( or#(\Z2(Z\Z((Z*(\2(Z\':))\0)i|||||||||||||||loZ\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)2))\2Z)))int \)\0nte!rnal errpr\2\\21r(2\ZZZ)+:)Z!|91Z(ZZ ZZ(\r2Z( or#(\Z2(Z\Z(\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)2))\2Z)))int \)\0(2\ZZZ)+:)Z^|91ZiZZnter(ZZ |91Z(ZZ ZZ(\r2Z( or#(\Z2(Z\Z(\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)2))\2Z)))int \)\0(2\ZZZ)+:)Z^)))int \)\0(2\ZZZ)+:)Z^|91ZiZZnter(ZZernZal ZZ(\r2Z( or#(\Z2(Z\Z(\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)2))\2Z)))int \))\ZZ(\r2Z( or#(\Z2(Z\Z(\2\2)2))\2Z)Z(\22Z((\Z2(Z\Z(\2\2)))\2))))((((((\2\2))))))"I
# This checks that new code for handling groups that may match an empty string
# works on a very large number of alternatives. This pattern used to provoke a
# complaint that it was too complicated.
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
# End of testinput2

19
testdata/testoutput2 vendored
View File

@ -10741,7 +10741,8 @@ Matched, but too many substrings
/(?(DEFINE)(a(?2)|b)(b(?1)|a))(?:(?1)|(?2))/I
Capturing subpattern count = 2
Subject length lower bound = 1
May match empty string
Subject length lower bound = 0
/(a(?2)|b)(b(?1)|a)(?:(?1)|(?2))/I
Capturing subpattern count = 2
@ -14759,7 +14760,7 @@ Failed: error 188 at offset 0: pattern string is longer than the limit set by th
/abcdef/hex,max_pattern_length=3
# These two patterns used to take a long time to compile
# These patterns used to take a long time to compile
"(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
@ -14782,14 +14783,14 @@ May match empty string
Options: extended
Subject length lower bound = 0
# When (?| is used and groups of the same number may be different,
# we have to rely on a count to catch overly complicated patterns.
"(?|()|())(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))"xI
Failed: error 186 at offset 148: regular expression is too complicated
Capturing subpattern count = 13
May match empty string
Options: extended
Subject length lower bound = 0
"(?|()|())(?<=a()
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
@ -15417,6 +15418,12 @@ Max back reference = 22
Contains explicit CR or LF match
Subject length lower bound = 0
# This checks that new code for handling groups that may match an empty string
# works on a very large number of alternatives. This pattern used to provoke a
# complaint that it was too complicated.
/(?:\[A|B|C|D|E|F|G|H|I|J|]{200}Z)/expand
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data