From 7105d249f65e77a94266e53686c755ea7ce5a4e6 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Mon, 6 Apr 2015 12:16:36 +0000 Subject: [PATCH] Fix handling of global matching in pcre2test when a lookbehind assertion contains \K. --- ChangeLog | 3 ++ src/pcre2test.c | 73 ++++++++++++++++++++++++++++++++------------ testdata/testinput2 | 8 +++++ testdata/testinput5 | 6 ++++ testdata/testoutput2 | 28 +++++++++++++++++ testdata/testoutput5 | 26 ++++++++++++++++ 6 files changed, 125 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 1bbea12..be173f7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -70,6 +70,9 @@ lookbehind assertion. This bug was discovered by the LLVM fuzzer. 17. The use of \K in a positive lookbehind assertion in a non-anchored pattern (e.g. /(?<=\Ka)/) could make pcre2grep loop. +18. There was a similar problem to 17 in pcre2test for global matches, though +the code there did catch the loop. + Version 10.10 06-March-2015 --------------------------- diff --git a/src/pcre2test.c b/src/pcre2test.c index f64be14..22d3681 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -3557,14 +3557,14 @@ unit widths are that the pointers to the subject, the most recent MARK, and a callout argument string point to strings of the appropriate width. Casts can be used to deal with this. -Argument: +Argument: cb pointer to enumerate block callout_data user data -Returns: 0 +Returns: 0 */ -static int callout_callback(pcre2_callout_enumerate_block_8 *cb, +static int callout_callback(pcre2_callout_enumerate_block_8 *cb, void *callout_data) { uint32_t i; @@ -3587,13 +3587,13 @@ if (cb->callout_string != NULL) } fprintf(outfile, "%c ", delimiter); } -else fprintf(outfile, "%d ", cb->callout_number); +else fprintf(outfile, "%d ", cb->callout_number); fprintf(outfile, "%.*s\n", (int)((cb->next_item_length == 0)? 1 : cb->next_item_length), pbuffer8 + cb->pattern_position); - -return 0; + +return 0; } @@ -3879,10 +3879,10 @@ if ((pat_patctl.control & CTL_CALLOUT_INFO) != 0) int len; fprintf(outfile, "Callout enumerate failed: error %d: ", errorcode); if (errorcode < 0) - { + { PCRE2_GET_ERROR_MESSAGE(len, errorcode, pbuffer); PCHARSV(CASTVAR(void *, pbuffer), 0, len, FALSE, outfile); - } + } fprintf(outfile, "\n"); return PR_SKIP; } @@ -5684,20 +5684,20 @@ else for (gmatched = 0;; gmatched++) ovector = FLD(match_data, ovector); - /* After the first time round a global loop, save the current ovector[0,1] so - that we can check that they do change each time. Otherwise a matching bug - that returns the same string causes an infinite loop. It has happened! */ + /* After the first time round a global loop, for a normal global (/g) + iteration, save the current ovector[0,1] so that we can check that they do + change each time. Otherwise a matching bug that returns the same string + causes an infinite loop. It has happened! */ - if (gmatched > 0) + if (gmatched > 0 && (dat_datctl.control & CTL_GLOBAL) != 0) { ovecsave[0] = ovector[0]; ovecsave[1] = ovector[1]; } - /* Set the variables on the first iteration, just to stop a compiler warning - when ovecsave[] is referenced below. */ + /* For altglobal (or first time round the loop), set an "unset" value. */ - else ovecsave[0] = ovecsave[1] = 0; + else ovecsave[0] = ovecsave[1] = PCRE2_UNSET; /* Fill the ovector with junk to detect elements that do not get set when they should be. */ @@ -6169,13 +6169,48 @@ else for (gmatched = 0;; gmatched++) if (end_offset == ulen) break; /* End of subject */ g_notempty = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } - else g_notempty = 0; - /* For /g, update the start offset, leaving the rest alone */ + /* However, even after matching a non-empty string, there is still one + tricky case. If a pattern contains \K within a lookbehind assertion at the + start, the end of the matched string can be at the offset where the match + started. In the case of a normal /g iteration without special action, this + leads to a loop that keeps on returning the same substring. The loop would + be caught above, but we really want to move on to the next match. */ - if ((dat_datctl.control & CTL_GLOBAL) != 0) dat_datctl.offset = end_offset; + else + { + g_notempty = 0; /* Set for a "normal" repeat */ + if ((dat_datctl.control & CTL_GLOBAL) != 0) + { + PCRE2_SIZE startchar; + PCRE2_GET_STARTCHAR(startchar, match_data); + if (end_offset <= startchar) + { + if (startchar >= ulen) break; /* End of subject */ + end_offset = startchar + 1; + if (utf && test_mode != PCRE32_MODE) + { + if (test_mode == PCRE8_MODE) + { + for (; end_offset < ulen; end_offset++) + if ((((PCRE2_SPTR8)pp)[end_offset] & 0xc0) != 0x80) break; + } + else /* 16-bit mode */ + { + for (; end_offset < ulen; end_offset++) + if ((((PCRE2_SPTR16)pp)[end_offset] & 0xfc00) != 0xdc00) break; + } + } + } + } + } - /* For /G, update the pointer and length */ + /* For /g (global), update the start offset, leaving the rest alone. */ + + if ((dat_datctl.control & CTL_GLOBAL) != 0) + dat_datctl.offset = end_offset; + + /* For altglobal, just update the pointer and length. */ else { diff --git a/testdata/testinput2 b/testdata/testinput2 index ee83ba2..0caf88a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4255,4 +4255,12 @@ a random value. /Ix ";(?<=()((?3))((?2)))" +# Perl loops on this (PCRE2 used to!) + +/(?<=\Ka)/g,aftertext + aaaaa + +/(?<=\Ka)/altglobal,aftertext + aaaaa + # End of testinput2 diff --git a/testdata/testinput5 b/testdata/testinput5 index 9c3771b..e548969 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1641,4 +1641,10 @@ /[A-`]/iB,utf abcdefghijklmno +/(?<=\K\x{17f})/g,utf,aftertext + \x{17f}\x{17f}\x{17f}\x{17f}\x{17f} + +/(?<=\K\x{17f})/altglobal,utf,aftertext + \x{17f}\x{17f}\x{17f}\x{17f}\x{17f} + # End of testinput5 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index f8a103e..fbb0a0d 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14260,4 +14260,32 @@ Failed: error 115 at offset 15: reference to non-existent subpattern ";(?<=()((?3))((?2)))" Failed: error 125 at offset 20: lookbehind assertion is not fixed length +# Perl loops on this (PCRE2 used to!) + +/(?<=\Ka)/g,aftertext + aaaaa + 0: a + 0+ aaaa + 0: a + 0+ aaa + 0: a + 0+ aa + 0: a + 0+ a + 0: a + 0+ + +/(?<=\Ka)/altglobal,aftertext + aaaaa + 0: a + 0+ aaaa + 0: a + 0+ aaa + 0: a + 0+ aa + 0: a + 0+ a + 0: a + 0+ + # End of testinput2 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 46e66c5..b364714 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -4019,4 +4019,30 @@ Failed: error 140 at offset 11: recursion could loop indefinitely abcdefghijklmno 0: a +/(?<=\K\x{17f})/g,utf,aftertext + \x{17f}\x{17f}\x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f} + 0: \x{17f} + 0+ + +/(?<=\K\x{17f})/altglobal,utf,aftertext + \x{17f}\x{17f}\x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f}\x{17f} + 0: \x{17f} + 0+ \x{17f} + 0: \x{17f} + 0+ + # End of testinput5