From aa8ee3ded572baa70568b57cb94ab0dec0b3e7e9 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Wed, 8 Apr 2015 16:33:58 +0000 Subject: [PATCH] Fix backtracking bug for \C\X* in UTF mode. --- ChangeLog | 6 ++++++ src/pcre2_match.c | 19 ++++++++++++------- testdata/testinput4 | 5 +++++ testdata/testoutput4 | 6 ++++++ 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index be173f7..ef41e9b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -73,6 +73,12 @@ lookbehind assertion. This bug was discovered by the LLVM fuzzer. 18. There was a similar problem to 17 in pcre2test for global matches, though the code there did catch the loop. +19. If a greedy quantified \X was preceded by \C in UTF mode (e.g. \C\X*), +and a subsequent item in the pattern caused a non-match, backtracking over the +repeated \X did not stop, but carried on past the start of the subject, causing +reference to random memory and/or a segfault. This bug was discovered by the +LLVM fuzzer. + Version 10.10 06-March-2015 --------------------------- diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 231a8ff..3ff0b63 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -1333,14 +1333,14 @@ for (;;) if (*ecode == OP_CALLOUT) { cb.callout_number = ecode[1 + 2*LINK_SIZE]; - cb.callout_string_offset = 0; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); + cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string_length = callout_length - (1 + 4*LINK_SIZE) - 2; @@ -1408,7 +1408,7 @@ for (;;) break; case OP_FALSE: - case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ + case OP_FAIL: /* The assertion (?!) becomes OP_FAIL */ break; case OP_TRUE: @@ -1760,14 +1760,14 @@ for (;;) if (*ecode == OP_CALLOUT) { cb.callout_number = ecode[1 + 2*LINK_SIZE]; - cb.callout_string_offset = 0; + cb.callout_string_offset = 0; cb.callout_string = NULL; cb.callout_string_length = 0; } else { cb.callout_number = 0; - cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); + cb.callout_string_offset = GET(ecode, 1 + 3*LINK_SIZE); cb.callout_string = ecode + (1 + 4*LINK_SIZE) + 1; cb.callout_string_length = callout_length - (1 + 4*LINK_SIZE) - 2; @@ -5723,12 +5723,17 @@ for (;;) if (possessive) continue; /* No backtracking */ + /* We use <= pp rather than == pp to detect the start of the run while + backtracking because the use of \C in UTF mode can cause BACKCHAR to + move back past pp. This is just palliative; the use of \C in UTF mode + is fraught with danger. */ + for(;;) { int lgb, rgb; PCRE2_SPTR fptr; - if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ RMATCH(eptr, ecode, offset_top, mb, eptrb, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -5746,7 +5751,7 @@ for (;;) for (;;) { - if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */ + if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */ fptr = eptr - 1; if (!utf) c = *fptr; else { diff --git a/testdata/testinput4 b/testdata/testinput4 index c50169d..8e51fea 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -2221,4 +2221,9 @@ "[\S\V\H]"utf +/\C\X*TӅ; +{0,6}\v+ F +/utf + Ӆ\x0a + # End of testinput4 diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 8364515..39924bf 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -3741,4 +3741,10 @@ No match "[\S\V\H]"utf +/\C\X*TӅ; +{0,6}\v+ F +/utf + Ӆ\x0a +No match + # End of testinput4