From a9f7c80fa3d4e493ea10519e8269d49a3d012d15 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 23 Dec 2017 17:15:51 +0000 Subject: [PATCH] Update pcre2demo to deal with various \K inside assertion anomalies. --- ChangeLog | 5 +++++ src/pcre2demo.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/ChangeLog b/ChangeLog index da4f2e3..cc82d2e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -95,6 +95,11 @@ not by JIT or pcre2_dfa_match(). Their settings are shown in pcre2test callouts if the callout_extra subject modifier is set. These bits are provided to help with tracking how a backtracking match is proceeding. +23. Updated the pcre2demo.c demonstration program, which was missing the extra +code for -g that handles the case when \K in an assertion causes the match to +end at the original start point. Also arranged for it to detect when \K causes +the end of a match to be before its start. + Version 10.30 14-August-2017 ---------------------------- diff --git a/src/pcre2demo.c b/src/pcre2demo.c index 8ae49f1..5d9b321 100644 --- a/src/pcre2demo.c +++ b/src/pcre2demo.c @@ -211,6 +211,21 @@ pcre2_match_data_create_from_pattern() above. */ if (rc == 0) printf("ovector was not big enough for all the captured substrings\n"); +/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion +to set the start of a match later than its end. In this demonstration program, +we just detect this case and give up. */ + +if (ovector[0] > ovector[1]) + { + printf("\\K was used in an assertion to set the match start after its end.\n" + "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\n"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* Show substrings stored in the output vector by number. Obviously, in a real application you might want to do things other than print them. 
*/ @@ -338,6 +353,29 @@ for (;;) options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } + /* If the previous match was not an empty string, there is one tricky case to + consider. If a pattern contains \K within a lookbehind assertion at the + start, the end of the matched string can be at the offset where the match + started. Without special action, this leads to a loop that keeps on matching + the same substring. We must detect this case and arrange to move the start on + by one character. The pcre2_get_startchar() function returns the offset of + the character at which the successful match actually started, which \K can + make different from ovector[0]. */ + + else + { + PCRE2_SIZE startchar = pcre2_get_startchar(match_data); + if (start_offset <= startchar) + { + if (startchar >= subject_length) break; /* Reached end of subject. */ + start_offset = startchar + 1; /* Advance by one character. */ + if (utf8) /* If UTF-8, it may be more */ + { /* than one code unit. */ + for (; start_offset < subject_length; start_offset++) + if ((subject[start_offset] & 0xc0) != 0x80) break; + } + } + } + /* Run the next matching operation */ rc = pcre2_match( @@ -402,6 +440,21 @@ for (;;) if (rc == 0) printf("ovector was not big enough for all the captured substrings\n"); + /* We must guard against patterns such as /(?=.\K)/ that use \K in an + assertion to set the start of a match later than its end. In this + demonstration program, we just detect this case and give up. */ + + if (ovector[0] > ovector[1]) + { + printf("\\K was used in an assertion to set the match start after its end.\n" + "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\n"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* As before, show substrings stored in the output vector by number, and then also any named substrings. */