diff --git a/ChangeLog b/ChangeLog index c90011b..9f81a33 100644 --- a/ChangeLog +++ b/ChangeLog @@ -343,6 +343,15 @@ modules. stack usage for the benefit of clang with -fsanitize-address, which uses huge stack frames. Example pattern: /X?(R||){3335}/. Fixes oss-fuzz issue 553. +54. A pattern with very many explicit back references to a group that is a long +way from the start of the pattern could take a long time to compile because +searching for the referenced group in order to find the minimum length was +being done repeatedly. Now up to 128 group minimum lengths are cached and the +attempt to find a minimum length is abandoned if there is a back reference to a +group whose number is greater than 128. (In that case, the pattern is so +complicated that this optimization probably isn't worth it.) This fixes +oss-fuzz issue 557. + Version 10.22 29-July-2016 -------------------------- diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 4c08bc5..89715ab 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -50,6 +50,10 @@ collecting data (e.g. minimum matching length). */ #include "pcre2_internal.h" +/* The maximum remembered capturing brackets minimum. */ + +#define MAX_CACHE_BACKREF 128 + /* Set a bit in the starting code unit bit map. */ #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7)) @@ -71,6 +75,12 @@ length is 16-bits long (on the grounds that anything longer than that is pathological), so we give up when we reach that amount. This also means that integer overflow for really crazy patterns cannot happen. +Backreference minimum lengths are cached to speed up multiple references. This +function is called only when the highest back reference in the pattern is less +than or equal to MAX_CACHE_BACKREF, which is one less than the size of the +caching vector. The zeroth element contains the number of the highest set +value. + Arguments: re compiled pattern block code pointer to start of group (the bracket) @@ -78,6 +88,7 @@ Arguments: utf UTF flag recurses chain of recurse_check to catch mutual recursion countptr pointer to call count (to catch over complexity) + backref_cache vector for caching back references. Returns: the minimum length -1 \C in UTF-8 mode @@ -90,7 +101,8 @@ Returns: the minimum length static int find_minlength(const pcre2_real_code *re, PCRE2_SPTR code, - PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr) + PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr, + int *backref_cache) { int length = -1; int prev_cap_recno = -1; @@ -166,7 +178,8 @@ for (;;) case OP_BRAPOS: case OP_SBRAPOS: PROCESS_NON_CAPTURE: - d = find_minlength(re, cc, startcode, utf, recurses, countptr); + d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); @@ -186,7 +199,8 @@ for (;;) if (recno != prev_cap_recno) { prev_cap_recno = recno; - prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr); + prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr, + backref_cache); if (prev_cap_d < 0) return prev_cap_d; } branchlength += prev_cap_d; @@ -456,39 +470,52 @@ for (;;) d = INT_MAX; - /* Scan all groups with the same name */ + /* Scan all groups with the same name; find the shortest. */ while (count-- > 0) { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0)); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ - { - d = 0; - had_recurse = TRUE; - break; - } + int dd, i; + recno = GET2(slot, 0); + + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + dd = backref_cache[recno]; else { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ { - d = 0; + dd = 0; had_recurse = TRUE; - break; } else { - int dd; - this_recurse.prev = recurses; - this_recurse.group = cs; - dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); - if (dd < 0) return dd; - if (dd < d) d = dd; + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) + if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + dd = 0; + had_recurse = TRUE; + } + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + dd = find_minlength(re, cs, startcode, utf, &this_recurse, + countptr, backref_cache); + if (dd < 0) return dd; + } } + + backref_cache[recno] = dd; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; } + + if (dd < d) d = dd; + if (d <= 0) break; /* No point looking at any more */ slot += re->name_entry_size; } } @@ -502,35 +529,48 @@ for (;;) case OP_REF: case OP_REFI: if (dupcapused) return -1; - if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + recno = GET2(cc, 1); + if (recno <= backref_cache[0] && backref_cache[recno] >= 0) + d = backref_cache[recno]; + else { - ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); - if (cs == NULL) return -2; - do ce += GET(ce, 1); while (*ce == OP_ALT); - if (cc > cs && cc < ce) /* Simple recursion */ + int i; + if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { - d = 0; - had_recurse = TRUE; - } - else - { - recurse_check *r = recurses; - for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; - if (r != NULL) /* Mutual recursion */ + ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); + if (cs == NULL) return -2; + do ce += GET(ce, 1); while (*ce == OP_ALT); + if (cc > cs && cc < ce) /* Simple recursion */ { d = 0; had_recurse = TRUE; } else { - this_recurse.prev = recurses; - this_recurse.group = cs; - d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr); - if (d < 0) return d; + recurse_check *r = recurses; + for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; + if (r != NULL) /* Mutual recursion */ + { + d = 0; + had_recurse = TRUE; + } + else + { + this_recurse.prev = recurses; + this_recurse.group = cs; + d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr, + backref_cache); + if (d < 0) return d; + } } } + else d = 0; + + backref_cache[recno] = d; + for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1; + backref_cache[0] = recno; } - else d = 0; + cc += 1 + IMM2_SIZE; /* Handle repeated back references */ @@ -603,7 +643,7 @@ for (;;) this_recurse.prev = recurses; this_recurse.group = cs; prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse, - countptr); + countptr, backref_cache); if (prev_recurse_d < 0) return prev_recurse_d; prev_recurse_recno = recno; branchlength += prev_recurse_d; @@ -1548,12 +1588,19 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0 && if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET; } -/* Find the minimum length of subject string. If it can match an empty string, -the minimum length is already known. */ +/* Find the minimum length of subject string. If the pattern can match an empty +string, the minimum length is already known. If there are more back references +than the size of the vector we are going to cache them in, do nothing. A +pattern that complicated will probably take a long time to analyze and may in +any case turn out to be too complicated. Note that back reference minima are +held as 16-bit numbers. */ -if ((re->flags & PCRE2_MATCH_EMPTY) == 0) +if ((re->flags & PCRE2_MATCH_EMPTY) == 0 && + re->top_backref <= MAX_CACHE_BACKREF) { - min = find_minlength(re, code, code, utf, NULL, &count); + int backref_cache[MAX_CACHE_BACKREF+1]; + backref_cache[0] = 0; /* Highest one that is set */ + min = find_minlength(re, code, code, utf, NULL, &count, backref_cache); switch(min) { case -1: /* \C in UTF mode or (*ACCEPT) or over-complex regex */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 5f7dc6e..a700d5a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4963,4 +4963,10 @@ a)"xI /()(\g+65533)/ +/Á\x00\x00\x00š(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00ÿÿ\x00š(\1{50779}?)J\w2/I + +/(a)(b)\2\1\1\1\1/I + +/(?a)(?b)\g{b}\g{a}\g{a}\g{a}\g{a}(?xx)(?zz)/I,dupnames + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 9ad31d4..ce8c667 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -2122,7 +2122,7 @@ Failed: error 115 at offset 5: reference to non-existent subpattern Capturing subpattern count = 271 Max back reference = 270 Starting code units: 0 1 2 3 4 5 6 7 8 9 -Subject length lower bound = 272 +Subject length lower bound = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC\=ovector=300 0: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 ABC ABC 1: 1 @@ -15416,7 +15416,7 @@ Failed: error -33: bad offset value Capturing subpattern count = 108 Max back reference = 22 Contains explicit CR or LF match -Subject length lower bound = 0 +Subject length lower bound = 1 # This checks that new code for handling groups that may match an empty string # works on a very large number of alternatives. This pattern used to provoke a @@ -15452,6 +15452,33 @@ Failed: error 161 at offset 11: group number is too big /()(\g+65533)/ Failed: error 115 at offset 10: reference to non-existent subpattern +/Á\x00\x00\x00š(\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\2*\x00k\d+\x00‎\x00\x00\x00\x00\x00\2*\x00\x00\1*.){36}int^\x00\x00ÿÿ\x00š(\1{50779}?)J\w2/I +Capturing subpattern count = 2 +Max back reference = 2 +First code unit = \xc1 +Last code unit = '2' +Subject length lower bound = 65535 + +/(a)(b)\2\1\1\1\1/I +Capturing subpattern count = 2 +Max back reference = 2 +First code unit = 'a' +Last code unit = 'b' +Subject length lower bound = 7 + +/(?a)(?b)\g{b}\g{a}\g{a}\g{a}\g{a}(?xx)(?zz)/I,dupnames +Capturing subpattern count = 4 +Max back reference = 4 +Named capturing subpatterns: + a 1 + a 3 + b 2 + b 4 +Options: dupnames +First code unit = 'a' +Last code unit = 'z' +Subject length lower bound = 11 + # End of testinput2 Error -63: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data