Fix backtracking bug for \C\X* in UTF mode.

This commit is contained in:
Philip.Hazel 2015-04-08 16:33:58 +00:00
parent 7105d249f6
commit aa8ee3ded5
4 changed files with 29 additions and 7 deletions

View File

@ -73,6 +73,12 @@ lookbehind assertion. This bug was discovered by the LLVM fuzzer.
18. There was a similar problem to 17 in pcre2test for global matches, though
the code there did catch the loop.
19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*),
and a subsequent item in the pattern caused a non-match, backtracking over the
repeated \X did not stop, but carried on past the start of the subject, causing
reference to random memory and/or a segfault. This bug was discovered by the
LLVM fuzzer.
Version 10.10 06-March-2015
---------------------------

View File

@ -1333,14 +1333,14 @@ for (;;)
if (*ecode == OP_CALLOUT) if (*ecode == OP_CALLOUT)
{ {
cb.callout_number = ecode[1 + 2*LINK_SIZE]; cb.callout_number = ecode[1 + 2*LINK_SIZE];
cb.callout_string_offset = 0; cb.callout_string_offset = 0;
cb.callout_string = NULL; cb.callout_string = NULL;
cb.callout_string_length = 0; cb.callout_string_length = 0;
} }
else else
{ {
cb.callout_number = 0; cb.callout_number = 0;
cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
cb.callout_string_length = cb.callout_string_length =
callout_length - (1 + 4*LINK_SIZE) - 2; callout_length - (1 + 4*LINK_SIZE) - 2;
@ -1408,7 +1408,7 @@ for (;;)
break; break;
case OP_FALSE: case OP_FALSE:
case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */
break; break;
case OP_TRUE: case OP_TRUE:
@ -1760,14 +1760,14 @@ for (;;)
if (*ecode == OP_CALLOUT) if (*ecode == OP_CALLOUT)
{ {
cb.callout_number = ecode[1 + 2*LINK_SIZE]; cb.callout_number = ecode[1 + 2*LINK_SIZE];
cb.callout_string_offset = 0; cb.callout_string_offset = 0;
cb.callout_string = NULL; cb.callout_string = NULL;
cb.callout_string_length = 0; cb.callout_string_length = 0;
} }
else else
{ {
cb.callout_number = 0; cb.callout_number = 0;
cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE);
cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1;
cb.callout_string_length = cb.callout_string_length =
callout_length - (1 + 4*LINK_SIZE) - 2; callout_length - (1 + 4*LINK_SIZE) - 2;
@ -5723,12 +5723,17 @@ for (;;)
if (possessive) continue; /* No backtracking */ if (possessive) continue; /* No backtracking */
/* We use <= pp rather than == pp to detect the start of the run while
backtracking because the use of \C in UTF mode can cause BACKCHAR to
move back past pp. This is just palliative; the use of \C in UTF mode
is fraught with danger. */
for(;;) for(;;)
{ {
int lgb, rgb; int lgb, rgb;
PCRE2_SPTR fptr; PCRE2_SPTR fptr;
if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45); RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@ -5746,7 +5751,7 @@ for (;;)
for (;;) for (;;)
{ {
if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
fptr = eptr - 1; fptr = eptr - 1;
if (!utf) c = *fptr; else if (!utf) c = *fptr; else
{ {

5
testdata/testinput4 vendored
View File

@ -2221,4 +2221,9 @@
"[\S\V\H]"utf "[\S\V\H]"utf
/\C\X*TӅ;
{0,6}\v+ F
/utf
Ӆ\x0a
# End of testinput4 # End of testinput4

View File

@ -3741,4 +3741,10 @@ No match
"[\S\V\H]"utf "[\S\V\H]"utf
/\C\X*TӅ;
{0,6}\v+ F
/utf
Ӆ\x0a
No match
# End of testinput4 # End of testinput4