Don't use group caching when (?| is involved; instead use a counter to cap too

much computation.
This commit is contained in:
Philip.Hazel 2015-11-10 14:33:28 +00:00
parent a710d3b37b
commit e7727c36be
4 changed files with 210 additions and 34 deletions

View File

@ -270,7 +270,11 @@ size of patterns that they are prepared to handle.
79. Adding group information caching improves the speed of compiling when
checking whether a group has a fixed length and/or could match an empty string,
especially when recursion or subroutine calls are involved.
especially when recursion or subroutine calls are involved. However, this
cannot be used when (?| is present in the pattern because the same number may
be used for groups of different sizes. To catch runaway patterns in this
situation, counts have been introduced to the functions that scan for empty
branches or compute fixed lengths.
80. Allow for the possibility of the size of the nest_save structure not being
a factor of the size of the compiling workspace (it currently is).

View File

@ -616,6 +616,7 @@ static int fixed_length_errors[] =
ERR25, /* Lookbehind is not fixed length */
ERR36, /* \C in lookbehind is not allowed */
ERR87, /* Lookbehind is too long */
ERR86, /* Pattern too complicated */
ERR70 /* Internal error: unknown opcode encountered */
};
@ -828,11 +829,12 @@ Lookbehind lengths are held in 16-bit fields and the maximum value is defined
as LOOKBEHIND_MAX.
Arguments:
code points to the start of the pattern (the bracket)
utf TRUE in UTF mode
atend TRUE if called when the pattern is complete
cb the "compile data" structure
code points to the start of the pattern (the bracket)
utf TRUE in UTF mode
atend TRUE if called when the pattern is complete
cb the "compile data" structure
recurses chain of recurse_check to catch mutual recursion
countptr pointer to counter, to catch over-complexity
Returns: if non-negative, the fixed length,
or -1 if an OP_RECURSE item was encountered and atend is FALSE
@ -842,15 +844,16 @@ Returns: if non-negative, the fixed length,
or -5 if an unknown opcode was encountered (internal error)
*/
#define FFL_LATER (-1)
#define FFL_NOTFIXED (-2)
#define FFL_BACKSLASHC (-3)
#define FFL_TOOLONG (-4)
#define FFL_UNKNOWNOP (-5)
#define FFL_LATER (-1)
#define FFL_NOTFIXED (-2)
#define FFL_BACKSLASHC (-3)
#define FFL_TOOLONG (-4)
#define FFL_TOOCOMPLICATED (-5)
#define FFL_UNKNOWNOP (-6)
static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
recurse_check *recurses)
recurse_check *recurses, int *countptr)
{
int length = -1;
uint32_t group = 0;
@ -859,7 +862,9 @@ recurse_check this_recurse;
register int branchlength = 0;
register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;
/* If this is a capturing group, we may have the answer cached. */
/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */
if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
*code == OP_SCBRAPOS)
@ -867,11 +872,19 @@ if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
group = GET2(cc, 0);
cc += IMM2_SIZE;
groupinfo = cb->groupinfo[group];
if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
return groupinfo & GI_FIXED_LENGTH_MASK;
if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
{
if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
return groupinfo & GI_FIXED_LENGTH_MASK;
}
}
/* A large and/or complex regex can take too long to process. This can happen
more often when (?| groups are present in the pattern. */
if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
@ -895,7 +908,7 @@ for (;;)
case OP_ONCE:
case OP_ONCE_NC:
case OP_COND:
d = find_fixedlength(cc, utf, atend, cb, recurses);
d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@ -945,7 +958,7 @@ for (;;)
}
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_fixedlength(cs, utf, atend, cb, &this_recurse);
d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
if (d < 0) return d;
branchlength += d;
cc += 1 + LINK_SIZE;
@ -1298,13 +1311,21 @@ Arguments:
cb compile data
atend TRUE if being called to check an entire pattern
recurses chain of recurse_check to catch mutual recursion
countptr pointer to count to catch over-complicated pattern
Returns: TRUE if what is matched could be empty
Returns: 0 if what is matched cannot be empty
1 if what is matched could be empty
-1 if the pattern is too complicated
*/
static BOOL
#define CBE_NOTEMPTY 0
#define CBE_EMPTY 1
#define CBE_TOOCOMPLICATED (-1)
static int
could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
compile_block *cb, BOOL atend, recurse_check *recurses)
compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
{
uint32_t group = 0;
uint32_t groupinfo = 0;
@ -1314,18 +1335,30 @@ recurse_check this_recurse;
/* If what we are checking has already been set as "could be empty", we know
the answer. */
if (*code >= OP_SBRA && *code <= OP_SCOND) return TRUE;
if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
/* If this is a capturing group, we may have the answer cached. */
/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */
if (*code == OP_CBRA || *code == OP_CBRAPOS)
if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
(*code == OP_CBRA || *code == OP_CBRAPOS))
{
group = GET2(code, 1 + LINK_SIZE);
groupinfo = cb->groupinfo[group];
if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
return (groupinfo & GI_COULD_BE_EMPTY) != 0;
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
/* A large and/or complex regex can take too long to process. We have to assume
it can match an empty string. This can happen more often when (?| groups are
present in the pattern and the caching is disabled. Setting the cap at 1100
allows the test for more than 1023 capturing patterns to work. */
if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
/* Scan the opcodes for this branch. */
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
code < endcode;
code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
@ -1347,8 +1380,8 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
/* For a recursion/subroutine call we can scan the recursion when this
function is called at the end, to check a complete pattern. Before then,
recursions just have the group number as their argument and in any case may
be forward references. In that situation, we return TRUE, just in case. It
means that unlimited repeats of groups that contain recursions are always
be forward references. In that situation, we return CBE_EMPTY, just in case.
It means that unlimited repeats of groups that contain recursions are always
treated as "could be empty" - which just adds a bit more processing time
because of the runtime check. */
@ -1385,7 +1418,10 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
do
{
if (could_be_empty_branch(scode, endcode, utf, cb, atend, &this_recurse))
int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
&this_recurse, countptr);
if (rc < 0) return rc;
if (rc > 0)
{
empty_branch = TRUE;
break;
@ -1441,8 +1477,13 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
empty_branch = FALSE;
do
{
if (!empty_branch && could_be_empty_branch(code, endcode, utf, cb,
atend, recurses)) empty_branch = TRUE;
if (!empty_branch)
{
int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
recurses, countptr);
if (rc < 0) return rc;
if (rc > 0) empty_branch = TRUE;
}
code += GET(code, 1);
}
while (*code == OP_ALT);
@ -1663,7 +1704,7 @@ groupinfo |= GI_COULD_BE_EMPTY;
ISFALSE:
if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
return (groupinfo & GI_COULD_BE_EMPTY) != 0;
return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
}
@ -5480,7 +5521,15 @@ for (;; ptr++)
PCRE2_UCHAR *scode = bracode;
do
{
if (could_be_empty_branch(scode, ketcode, utf, cb, FALSE, NULL))
int count = 0;
int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
NULL, &count);
if (rc < 0)
{
*errorcodeptr = ERR86;
goto FAILED;
}
if (rc > 0)
{
*bracode += OP_SBRA - OP_BRA;
break;
@ -7594,9 +7643,10 @@ for (;;)
if (lookbehind)
{
int fixed_length;
int count = 0;
*code = OP_END;
fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
FALSE, cb, NULL);
FALSE, cb, NULL, &count);
if (fixed_length == FFL_LATER)
{
cb->check_lookbehind = TRUE;
@ -8733,10 +8783,11 @@ if (errorcode == 0 && cb.check_lookbehind)
if (GET(cc, 1) == 0)
{
int fixed_length;
int count = 0;
PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
int end_op = *be;
*be = OP_END;
fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL);
fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
*be = end_op;
if (fixed_length < 0)
{
@ -8860,7 +8911,14 @@ can be provided to applications. */
do
{
if (could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL))
int count = 0;
int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
if (rc < 0)
{
errorcode = ERR86;
goto HAD_ERROR;
}
if (rc > 0)
{
re->flags |= PCRE2_MATCH_EMPTY;
break;

24
testdata/testinput2 vendored
View File

@ -4626,6 +4626,20 @@ B)x/alt_verbnames,mark
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
a)"xI
# When (?| is used and groups of the same number may be different,
# we have to rely on a count to catch overly complicated patterns.
"(?|()|())(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))"xI
"(?|()|())(?<=a()
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
a)"xI
# Test the use of malloc for caching group information when there are more
# groups than fit into the on-stack workspace.
@ -4635,4 +4649,14 @@ a)"xI
/(A{65000})\1{65000}/I
# Test group scans when numbers are not unique
/(?|()+|(a)+)/BI
/(?|(a)+|()+)/BI
/(?|()|(a))/BI
/(?|(a)|())/BI
# End of testinput2

90
testdata/testoutput2 vendored
View File

@ -14734,6 +14734,22 @@ May match empty string
Options: extended
Subject length lower bound = 0
# When (?| is used and groups of the same number may be different,
# we have to rely on a count to catch overly complicated patterns.
"(?|()|())(.*)
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))"xI
Failed: error 186 at offset 148: regular expression is too complicated
"(?|()|())(?<=a()
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
((?-2)(?-2))((?-2)(?-2))((?-2)(?-2))
a)"xI
Failed: error 186 at offset 154: regular expression is too complicated
# Test the use of malloc for caching group information when there are more
# groups than fit into the on-stack workspace.
@ -14752,4 +14768,78 @@ First code unit = 'A'
Last code unit = 'A'
Subject length lower bound = 65535
# Test group scans when numbers are not unique
/(?|()+|(a)+)/BI
------------------------------------------------------------------
Bra
Bra
SCBra 1
KetRmax
Alt
CBra 1
a
KetRmax
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/(?|(a)+|()+)/BI
------------------------------------------------------------------
Bra
Bra
CBra 1
a
KetRmax
Alt
SCBra 1
KetRmax
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/(?|()|(a))/BI
------------------------------------------------------------------
Bra
Bra
CBra 1
Ket
Alt
CBra 1
a
Ket
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
/(?|(a)|())/BI
------------------------------------------------------------------
Bra
Bra
CBra 1
a
Ket
Alt
CBra 1
Ket
Ket
Ket
End
------------------------------------------------------------------
Capturing subpattern count = 1
May match empty string
Subject length lower bound = 0
# End of testinput2