Re-do previous patch and fix new forward-reference-with-quantification bugs.

This commit is contained in:
Philip.Hazel 2015-05-06 16:51:25 +00:00
parent c420d11041
commit e653c5f142
11 changed files with 178 additions and 40 deletions

View File

@ -106,6 +106,11 @@ subroutine call, for example /.((?2)(?R)\1)()/, pcre2_compile() failed to
compile correct code, leading to undefined behaviour or an internally detected
error. This bug was discovered by the LLVM fuzzer.
27. Quantification of certain items (e.g. atomic back references) could cause
incorrect code to be compiled when recursive forward references were involved.
An example is the pattern /(?1)()((((((\1++))\x85)+)|))/. This bug was
discovered by the LLVM fuzzer.
Version 10.10 06-March-2015
---------------------------

View File

@ -49,6 +49,17 @@ POSSIBILITY OF SUCH DAMAGE.
#include "pcre2_internal.h"
/* In rare error cases debugging might require calling pcre2_printint(). This
block is normally compiled out. Changing the 0 in the condition below to 1
includes the printing code in this file and defines CALL_PRINTINT, which
enables the pcre2_printint() call that is guarded by an ifdef on CALL_PRINTINT
later in this file. */
#if 0
/* PRINTABLE is defined ahead of the include, presumably because
pcre2_printint.c uses it when displaying characters (confirm in that file).
The range treated as printable is 64..254 for EBCDIC and 32..126 otherwise. */
#ifdef EBCDIC
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
#else
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
#endif
#include "pcre2_printint.c"
/* Marker tested later to decide whether the debugging print call is made. */
#define CALL_PRINTINT
#endif
/* There are a few things that vary with different code unit sizes. Handle them
by defining macros in order to minimize #if usage. */
@ -1899,11 +1910,11 @@ else
*errorcodeptr = ERR61;
break;
}
/* \1 to \9 are always back references. \8x and \9x are too, unless there
are an awful lot of previous captures; \1x to \7x are octal escapes if
there are not that many previous captures. */
/* \1 to \9 are always back references. \8x and \9x are too, unless there
are an awful lot of previous captures; \1x to \7x are octal escapes if
there are not that many previous captures. */
if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
{
escape = -s; /* Indicates a back reference */
@ -1912,7 +1923,7 @@ else
ptr = oldptr; /* Put the pointer back and fall through */
}
/* Handle a digit following \ when the number is not a back reference, or
/* Handle a digit following \ when the number is not a back reference, or
we are within a character class. If the first digit is 8 or 9, Perl used to
generate a binary zero byte and then treat the digit as a following
literal. At least by Perl 5.18 this changed so as not to insert the binary
@ -2609,7 +2620,7 @@ This function has been extended to cope with forward references for recursions
and subroutine calls. It must check the list of such references for the
group we are dealing with. If it finds that one of the recursions in the
current group is on this list, it does not adjust the value in the reference
(which is a group number). After the group has been scanned, all the offsets in
(which is a group number). After the group has been scanned, all the offsets in
the forward reference list for the group are adjusted.
Arguments:
@ -2630,7 +2641,7 @@ uint32_t offset;
PCRE2_UCHAR *hc;
PCRE2_UCHAR *ptr = group;
/* Scan the group for recursions. For each one found, check the forward
/* Scan the group for recursions. For each one found, check the forward
reference list. */
while ((ptr = (PCRE2_UCHAR *)find_recurse(ptr, utf)) != NULL)
@ -2653,7 +2664,7 @@ while ((ptr = (PCRE2_UCHAR *)find_recurse(ptr, utf)) != NULL)
ptr += 1 + LINK_SIZE;
}
/* Now adjust all forward reference offsets for the group. */
for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm;
@ -2661,7 +2672,7 @@ for (hc = (PCRE2_UCHAR *)cb->start_workspace + save_hwm_offset; hc < cb->hwm;
{
offset = (int)GET(hc, 0);
PUT(hc, 0, offset + adjust);
}
}
}
@ -3088,7 +3099,6 @@ Arguments:
reqcuflagsptr place to put the last required code unit flags, or a negative number
bcptr points to current branch chain
cond_depth conditional nesting depth
save_hwm_offset high water mark for the start of the group
cb contains pointers to tables etc.
lengthptr NULL during the real compile phase
points to length accumulator during pre-compile phase
@ -3103,7 +3113,6 @@ compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr,
uint32_t *firstcuptr, int32_t *firstcuflagsptr,
uint32_t *reqcuptr, int32_t *reqcuflagsptr,
branch_chain *bcptr, int cond_depth,
size_t save_hwm_offset,
compile_block *cb, size_t *lengthptr)
{
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
@ -3119,6 +3128,7 @@ int32_t req_caseopt, reqvary, tempreqvary;
int after_manual_callout = 0;
int escape;
size_t length_prevgroup = 0;
size_t item_hwm_offset = 0;
register uint32_t c;
register PCRE2_UCHAR *code = *codeptr;
PCRE2_UCHAR *last_code = code;
@ -3425,6 +3435,7 @@ for (;; ptr++)
zeroreqcu = reqcu;
zeroreqcuflags = reqcuflags;
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
*code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break;
@ -3471,6 +3482,7 @@ for (;; ptr++)
/* Handle a real character class. */
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
/* PCRE supports POSIX class stuff inside a class. Perl gives an error if
they are encountered at the top level, so we'll do that too. */
@ -4540,7 +4552,7 @@ for (;; ptr++)
{
register int i;
int len = (int)(code - previous);
size_t base_hwm_offset = save_hwm_offset;
size_t base_hwm_offset = item_hwm_offset;
PCRE2_UCHAR *bralink = NULL;
PCRE2_UCHAR *brazeroptr = NULL;
@ -4597,7 +4609,7 @@ for (;; ptr++)
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{
*code = OP_END;
adjust_recurse(previous, 1, utf, cb, save_hwm_offset);
adjust_recurse(previous, 1, utf, cb, item_hwm_offset);
memmove(previous + 1, previous, CU2BYTES(len));
code++;
if (repeat_max == 0)
@ -4621,7 +4633,7 @@ for (;; ptr++)
{
int offset;
*code = OP_END;
adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, save_hwm_offset);
adjust_recurse(previous, 2 + LINK_SIZE, utf, cb, item_hwm_offset);
memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
code += 2 + LINK_SIZE;
*previous++ = OP_BRAZERO + repeat_type;
@ -4879,7 +4891,7 @@ for (;; ptr++)
{
int nlen = (int)(code - bracode);
*code = OP_END;
adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, save_hwm_offset);
adjust_recurse(bracode, 1 + LINK_SIZE, utf, cb, item_hwm_offset);
memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
code += 1 + LINK_SIZE;
nlen += 1 + LINK_SIZE;
@ -5014,7 +5026,7 @@ for (;; ptr++)
else
{
*code = OP_END;
adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, save_hwm_offset);
adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cb, item_hwm_offset);
memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
code += 1 + LINK_SIZE;
len += 1 + LINK_SIZE;
@ -5190,7 +5202,6 @@ for (;; ptr++)
newoptions = options;
skipunits = 0;
bravalue = OP_CBRA;
save_hwm_offset = cb->hwm - cb->start_workspace;
reset_bracount = FALSE;
/* Deal with the extended parentheses; all are introduced by '?', and the
@ -6010,6 +6021,7 @@ for (;; ptr++)
{
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
PUT2INC(code, 0, index);
PUT2INC(code, 0, count);
@ -6123,6 +6135,7 @@ for (;; ptr++)
HANDLE_RECURSION:
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
called = cb->start_code;
/* When we are actually compiling, find the bracket that is being
@ -6324,7 +6337,11 @@ for (;; ptr++)
previous = NULL;
cb->iscondassert = FALSE;
}
else previous = code;
else
{
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
}
*code = bravalue;
tempcode = code;
@ -6574,9 +6591,6 @@ for (;; ptr++)
PCRE2_SPTR p;
uint32_t cf;
/* Normally save_hwm_offset is set when '(' is read */
save_hwm_offset = cb->hwm - cb->start_workspace;
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
@ -6644,6 +6658,7 @@ for (;; ptr++)
HANDLE_REFERENCE:
if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
*code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
PUT2INC(code, 0, recno);
cb->backref_map |= (recno < 32)? (1 << recno) : 1;
@ -6673,6 +6688,7 @@ for (;; ptr++)
if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
goto FAILED;
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
*code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
*code++ = ptype;
*code++ = pdata;
@ -6721,6 +6737,7 @@ for (;; ptr++)
{
previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
item_hwm_offset = cb->hwm - cb->start_workspace;
*code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
}
}
@ -6755,6 +6772,7 @@ for (;; ptr++)
ONE_CHAR:
previous = code;
item_hwm_offset = cb->hwm - cb->start_workspace;
/* For caseless UTF mode, check whether this character has more than one
other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
@ -6980,7 +6998,7 @@ for (;;)
if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu,
&branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
cond_depth, save_hwm_offset, cb, (lengthptr == NULL)? NULL : &length))
cond_depth, cb, (lengthptr == NULL)? NULL : &length))
{
*ptrptr = ptr;
return FALSE;
@ -7992,6 +8010,8 @@ if (cb.names_found > 0)
error, errorcode will be set non-zero, so we don't need to look at the result
of the function here. */
/* fprintf(stderr, "+++\n\nPASS TWO\n"); */
ptr = pattern + skipatstart;
code = (PCRE2_UCHAR *)codestart;
*code = OP_BRA;
@ -8026,6 +8046,13 @@ if (usedlength > length) errorcode = ERR23; else
#endif
}
/* In rare debugging situations we sometimes need to look at the compiled code
at this stage. */
#ifdef CALL_PRINTINT
pcre2_printint(re, stderr, TRUE);
#endif
/* Fill in any forward references that are required. There may be repeated
references; optimize for them, as searching a large regex takes time. The
test of errorcode inside the loop means that nothing is done if it is already
@ -8041,6 +8068,9 @@ if (cb.hwm > cb.start_workspace)
cb.hwm -= LINK_SIZE;
offset = GET(cb.hwm, 0);
recno = GET(codestart, offset);
/* fprintf(stderr, "+++offset=%d recno=%d\n", offset, recno); */
if (recno != prev_recno)
{
groupptr = PRIV(find_bracket)(codestart, utf, recno);

View File

@ -43,7 +43,8 @@ POSSIBILITY OF SUCH DAMAGE.
internal form of a compiled regular expression, along with some supporting
local functions. This source file is #included in pcre2test.c at each supported
code unit width, with PCRE2_SUFFIX set appropriately, just like the functions
that comprise the library. */
that comprise the library. It can also optionally be included in
pcre2_compile.c for detailed debugging in error situations. */
/* Tables of operator names. The same 8-bit table is used for all code unit
@ -138,9 +139,9 @@ if ((c & 0xc0) != 0xc0)
else
{
int i;
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */
int a = PRIV(utf8_table4)[c & 0x3f]; /* Number of additional bytes */
int s = 6*a;
c = (c & utf8_table3[a]) << s;
c = (c & PRIV(utf8_table3)[a]) << s;
for (i = 1; i <= a; i++)
{
if ((ptr[i] & 0xc0) != 0x80)
@ -223,12 +224,11 @@ get_ucpname(unsigned int ptype, unsigned int pvalue)
{
#ifdef SUPPORT_UNICODE
int i;
for (i = utt_size - 1; i >= 0; i--)
for (i = PRIV(utt_size) - 1; i >= 0; i--)
{
if (ptype == utt[i].type && pvalue == utt[i].value) break;
if (ptype == PRIV(utt)[i].type && pvalue == PRIV(utt)[i].value) break;
}
return (i >= 0)? utt_names + utt[i].name_offset : "??";
return (i >= 0)? PRIV(utt_names) + PRIV(utt)[i].name_offset : "??";
#else /* No UTF support */
(void)ptype;
(void)pvalue;
@ -266,7 +266,7 @@ if (code[1] != PT_CLIST)
else
{
const char *not = (*code == OP_PROP)? "" : "not ";
const uint32_t *p = ucd_caseless_sets + code[2];
const uint32_t *p = PRIV(ucd_caseless_sets) + code[2];
fprintf (f, "%s%sclist", before, not);
while (*p < NOTACHAR) fprintf(f, " %04x", *p++);
fprintf(f, "%s", after);
@ -286,7 +286,7 @@ bytecode can be written that do not depend on the value of LINK_SIZE.
Arguments:
re a compiled pattern
f the file to write to
print_lenghts show various lengths
print_lengths show various lengths
Returns: nothing
*/
@ -305,7 +305,7 @@ for(;;)
{
PCRE2_SPTR ccode;
uint32_t c;
int i;
int i;
const char *flag = " ";
unsigned int extra = 0;
@ -600,17 +600,17 @@ for(;;)
break;
case OP_CALLOUT_STR:
c = code[1 + 4*LINK_SIZE];
c = code[1 + 4*LINK_SIZE];
fprintf(f, " %s %c", OP_names[*code], c);
extra = GET(code, 1 + 2*LINK_SIZE);
print_custring(f, code + 2 + 4*LINK_SIZE);
for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
if (c == PRIV(callout_start_delims)[i])
{
c = PRIV(callout_end_delims)[i];
{
c = PRIV(callout_end_delims)[i];
break;
}
fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1),
}
fprintf(f, "%c %d %d %d", c, GET(code, 1 + 3*LINK_SIZE), GET(code, 1),
GET(code, 1 + LINK_SIZE));
break;

3
testdata/testinput1 vendored
View File

@ -5721,4 +5721,7 @@ name)/mark
/A[\8]B[\9]C/
A8B9C
/(?1)()((((((\1++))\x85)+)|))/
\x85\x85
# End of testinput1

4
testdata/testinput2 vendored
View File

@ -4294,6 +4294,8 @@ a random value. /Ix
/.((?3)(?R)()(?2)|\1|$)()/B
/(?1)()((((((\1++))\x85)+)|))/
/(\9*+(?2);\3++()2|)++{/
/\V\x85\9*+((?2)\3++()2)*:2/
# End of testinput2

2
testdata/testinput8 vendored
View File

@ -146,4 +146,6 @@
/.((?3)(?R)()(?2)|\1|$)()/
/(?1)()((((((\1++))\x85)+)|))/
# End of testinput8

11
testdata/testoutput1 vendored
View File

@ -9447,4 +9447,15 @@ No match
A8B9C
0: A8B9C
/(?1)()((((((\1++))\x85)+)|))/
\x85\x85
0: \x85\x85
1:
2: \x85\x85
3: \x85\x85
4: \x85\x85
5: \x85
6:
7:
# End of testinput1

View File

@ -14391,6 +14391,10 @@ Failed: error 115 at offset 7: reference to non-existent subpattern
End
------------------------------------------------------------------
/(?1)()((((((\1++))\x85)+)|))/
/(\9*+(?2);\3++()2|)++{/
Failed: error 115 at offset 22: reference to non-existent subpattern
/\V\x85\9*+((?2)\3++()2)*:2/
Failed: error 115 at offset 26: reference to non-existent subpattern
# End of testinput2

View File

@ -813,4 +813,31 @@ Memory allocation (code space): 14
37 End
------------------------------------------------------------------
/(?1)()((((((\1++))\x85)+)|))/
------------------------------------------------------------------
0 50 Bra
2 4 Recurse
4 3 CBra 1
7 3 Ket
9 39 CBra 2
12 32 CBra 3
15 27 CBra 4
18 22 CBra 5
21 15 CBra 6
24 10 CBra 7
27 5 Once
29 \1+
32 5 Ket
34 10 Ket
36 15 Ket
38 \x{85}
40 22 KetRmax
42 27 Ket
44 2 Alt
46 34 Ket
48 39 Ket
50 50 Ket
52 End
------------------------------------------------------------------
# End of testinput8

View File

@ -813,4 +813,31 @@ Memory allocation (code space): 28
37 End
------------------------------------------------------------------
/(?1)()((((((\1++))\x85)+)|))/
------------------------------------------------------------------
0 50 Bra
2 4 Recurse
4 3 CBra 1
7 3 Ket
9 39 CBra 2
12 32 CBra 3
15 27 CBra 4
18 22 CBra 5
21 15 CBra 6
24 10 CBra 7
27 5 Once
29 \1+
32 5 Ket
34 10 Ket
36 15 Ket
38 \x{85}
40 22 KetRmax
42 27 Ket
44 2 Alt
46 34 Ket
48 39 Ket
50 50 Ket
52 End
------------------------------------------------------------------
# End of testinput8

View File

@ -813,4 +813,31 @@ Memory allocation (code space): 10
56 End
------------------------------------------------------------------
/(?1)()((((((\1++))\x85)+)|))/
------------------------------------------------------------------
0 77 Bra
3 6 Recurse
6 5 CBra 1
11 5 Ket
14 60 CBra 2
19 49 CBra 3
24 41 CBra 4
29 33 CBra 5
34 23 CBra 6
39 15 CBra 7
44 7 Once
47 \1+
51 7 Ket
54 15 Ket
57 23 Ket
60 \x{85}
62 33 KetRmax
65 41 Ket
68 3 Alt
71 52 Ket
74 60 Ket
77 77 Ket
80 End
------------------------------------------------------------------
# End of testinput8