Cache group minima to speed up studying of pathological patterns. Fixes

oss-fuzz #557.
This commit is contained in:
Philip.Hazel 2017-02-10 16:33:15 +00:00
parent 7ab91fdfb2
commit 3fb194afa4
4 changed files with 138 additions and 49 deletions

View File

@ -343,6 +343,15 @@ modules.
stack usage for the benefit of clang with -fsanitize-address, which uses huge
stack frames. Example pattern: /X?(R||){3335}/. Fixes oss-fuzz issue 553.
54. A pattern with very many explicit back references to a group that is a long
way from the start of the pattern could take a long time to compile because
searching for the referenced group in order to find the minimum length was
being done repeatedly. Now up to 128 group minimum lengths are cached and the
attempt to find a minimum length is abandoned if there is a back reference to a
group whose number is greater than 128. (In that case, the pattern is so
complicated that this optimization probably isn't worth it.) This fixes
oss-fuzz issue 557.
Version 10.22 29-July-2016
--------------------------

View File

@ -50,6 +50,10 @@ collecting data (e.g. minimum matching length). */
#include "pcre2_internal.h"
/* The maximum remembered capturing brackets minimum. */
#define MAX_CACHE_BACKREF 128
/* Set a bit in the starting code unit bit map. */
#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
@ -71,6 +75,12 @@ length is 16-bits long (on the grounds that anything longer than that is
pathological), so we give up when we reach that amount. This also means that
integer overflow for really crazy patterns cannot happen.
Backreference minimum lengths are cached to speed up multiple references. This
function is called only when the highest back reference in the pattern is less
than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
caching vector. The zeroth element contains the number of the highest set
value.
Arguments:
re compiled pattern block
code pointer to start of group (the bracket)
@ -78,6 +88,7 @@ Arguments:
utf UTF flag
recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity)
backref_cache vector for caching back references.
Returns: the minimum length
-1 \C in UTF-8 mode
@ -90,7 +101,8 @@ Returns: the minimum length
static int
find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
int *backref_cache)
{
int length = -1;
int prev_cap_recno = -1;
@ -166,7 +178,8 @@ for (;;)
case OP_BRAPOS:
case OP_SBRAPOS:
PROCESS_NON_CAPTURE:
d = find_minlength(re, cc, startcode, utf, recurses, countptr);
d = find_minlength(re, cc, startcode, utf, recurses, countptr,
backref_cache);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@ -186,7 +199,8 @@ for (;;)
if (recno != prev_cap_recno)
{
prev_cap_recno = recno;
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
backref_cache);
if (prev_cap_d < 0) return prev_cap_d;
}
branchlength += prev_cap_d;
@ -456,39 +470,52 @@ for (;;)
d = INT_MAX;
/* Scan all groups with the same name */
/* Scan all groups with the same name; find the shortest. */
while (count-- > 0)
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
break;
}
int dd, i;
recno = GET2(slot, 0);
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
dd = backref_cache[recno];
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
dd = 0;
had_recurse = TRUE;
break;
}
else
{
int dd;
this_recurse.prev = recurses;
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
if (dd < 0) return dd;
if (dd < d) d = dd;
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev)
if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
dd = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr, backref_cache);
if (dd < 0) return dd;
}
}
backref_cache[recno] = dd;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
backref_cache[0] = recno;
}
if (dd < d) d = dd;
if (d <= 0) break; /* No point looking at any more */
slot += re->name_entry_size;
}
}
@ -502,35 +529,48 @@ for (;;)
case OP_REF:
case OP_REFI:
if (dupcapused) return -1;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
recno = GET2(cc, 1);
if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
d = backref_cache[recno];
else
{
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
int i;
if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
{
d = 0;
had_recurse = TRUE;
}
else
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
if (d < 0) return d;
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
backref_cache);
if (d < 0) return d;
}
}
}
else d = 0;
backref_cache[recno] = d;
for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
backref_cache[0] = recno;
}
else d = 0;
cc += 1 + IMM2_SIZE;
/* Handle repeated back references */
@ -603,7 +643,7 @@ for (;;)
this_recurse.prev = recurses;
this_recurse.group = cs;
prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
countptr);
countptr, backref_cache);
if (prev_recurse_d < 0) return prev_recurse_d;
prev_recurse_recno = recno;
branchlength += prev_recurse_d;
@ -1548,12 +1588,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
}
/* Find the minimum length of subject string. If it can match an empty string,
the minimum length is already known. */
/* Find the minimum length of subject string. If the pattern can match an empty
string, the minimum length is already known. If there are more back references
than the size of the vector we are going to cache them in, do nothing. A
pattern that complicated will probably take a long time to analyze and may in
any case turn out to be too complicated. Note that back reference minima are
held as 16-bit numbers. */
if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
re->top_backref <= MAX_CACHE_BACKREF)
{
min = find_minlength(re, code, code, utf, NULL, &count);
int backref_cache[MAX_CACHE_BACKREF+1];
backref_cache[0] = 0; /* Highest one that is set */
min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
switch(min)
{
case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */

6
testdata/testinput2 vendored
View File

@ -4963,4 +4963,10 @@ a)"xI
/()(\g+65533)/
/Á\x00\x00\x00š(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00˙˙\x00š(\1{50779}?)J\w2/I
/(a)(b)\2\1\1\1\1/I
/(?<a>a)(?<b>b)\g{b}\g{a}\g{a}\g{a}\g{a}(?<a>xx)(?<b>zz)/I,dupnames
# End of testinput2

31
testdata/testoutput2 vendored
View File

@ -2122,7 +2122,7 @@ Failed: error 115 at offset 5: reference to non-existent subpattern
Capturing subpattern count = 271
Max back reference = 270
Starting code units: 0 1 2 3 4 5 6 7 8 9
Subject length lower bound = 272
Subject length lower bound = 0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC\=ovector=300
0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC
1: 1
@ -15416,7 +15416,7 @@ Failed: error -33: bad offset value
Capturing subpattern count = 108
Max back reference = 22
Contains explicit CR or LF match
Subject length lower bound = 0
Subject length lower bound = 1
# This checks that new code for handling groups that may match an empty string
# works on a very large number of alternatives. This pattern used to provoke a
@ -15452,6 +15452,33 @@ Failed: error 161 at offset 11: group number is too big
/()(\g+65533)/
Failed: error 115 at offset 10: reference to non-existent subpattern
/Á\x00\x00\x00š(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00ÿÿ\x00š(\1{50779}?)J\w2/I
Capturing subpattern count = 2
Max back reference = 2
First code unit = \xc1
Last code unit = '2'
Subject length lower bound = 65535
/(a)(b)\2\1\1\1\1/I
Capturing subpattern count = 2
Max back reference = 2
First code unit = 'a'
Last code unit = 'b'
Subject length lower bound = 7
/(?<a>a)(?<b>b)\g{b}\g{a}\g{a}\g{a}\g{a}(?<a>xx)(?<b>zz)/I,dupnames
Capturing subpattern count = 4
Max back reference = 4
Named capturing subpatterns:
a 1
a 3
b 2
b 4
Options: dupnames
First code unit = 'a'
Last code unit = 'z'
Subject length lower bound = 11
# End of testinput2
Error -63: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data