From 07de1b1a9fa92a52010219a8b8f7e309a57fd518 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sun, 24 Dec 2017 10:27:13 +0000 Subject: [PATCH] Documentation update. --- doc/html/pcre2demo.html | 53 +++++++++++++++++++++++++++++++++++++++++ doc/pcre2demo.3 | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/doc/html/pcre2demo.html b/doc/html/pcre2demo.html index d64e16b..72754d3 100644 --- a/doc/html/pcre2demo.html +++ b/doc/html/pcre2demo.html @@ -228,6 +228,21 @@ pcre2_match_data_create_from_pattern() above. */ if (rc == 0) printf("ovector was not big enough for all the captured substrings\n"); +/* We must guard against patterns such as /(?=.\K)/ that use \K in an assertion +to set the start of a match later than its end. In this demonstration program, +we just detect this case and give up. */ + +if (ovector[0] > ovector[1]) + { + printf("\\K was used in an assertion to set the match start after its end.\n" + "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\n"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* Show substrings stored in the output vector by number. Obviously, in a real application you might want to do things other than print them. */ @@ -355,6 +370,29 @@ for (;;) options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } + /* If the previous match was not an empty string, there is one tricky case to + consider. If a pattern contains \K within a lookbehind assertion at the + start, the end of the matched string can be at the offset where the match + started. Without special action, this leads to a loop that keeps on matching + the same substring. We must detect this case and arrange to move the start on + by one character. The pcre2_get_startchar() function returns the starting + offset that was passed to pcre2_match(). */ + + else + { + PCRE2_SIZE startchar = pcre2_get_startchar(match_data); + if (start_offset <= startchar) + { + if (startchar >= subject_length) break; /* Reached end of subject. */ + start_offset = startchar + 1; /* Advance by one character. */ + if (utf8) /* If UTF-8, it may be more */ + { /* than one code unit. */ + for (; start_offset < subject_length; start_offset++) + if ((subject[start_offset] & 0xc0) != 0x80) break; + } + } + } + /* Run the next matching operation */ rc = pcre2_match( @@ -419,6 +457,21 @@ for (;;) if (rc == 0) printf("ovector was not big enough for all the captured substrings\n"); + /* We must guard against patterns such as /(?=.\K)/ that use \K in an + assertion to set the start of a match later than its end. In this + demonstration program, we just detect this case and give up. */ + + if (ovector[0] > ovector[1]) + { + printf("\\K was used in an assertion to set the match start after its end.\n" + "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\n"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* As before, show substrings stored in the output vector by number, and then also any named substrings. */ diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3 index c02dcd9..a9e58e2 100644 --- a/doc/pcre2demo.3 +++ b/doc/pcre2demo.3 @@ -228,6 +228,21 @@ pcre2_match_data_create_from_pattern() above. */ if (rc == 0) printf("ovector was not big enough for all the captured substrings\en"); +/* We must guard against patterns such as /(?=.\eK)/ that use \eK in an assertion +to set the start of a match later than its end. In this demonstration program, +we just detect this case and give up. */ + +if (ovector[0] > ovector[1]) + { + printf("\e\eK was used in an assertion to set the match start after its end.\en" + "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\en"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* Show substrings stored in the output vector by number. Obviously, in a real application you might want to do things other than print them. */ @@ -355,6 +370,29 @@ for (;;) options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; } + /* If the previous match was not an empty string, there is one tricky case to + consider. If a pattern contains \eK within a lookbehind assertion at the + start, the end of the matched string can be at the offset where the match + started. Without special action, this leads to a loop that keeps on matching + the same substring. We must detect this case and arrange to move the start on + by one character. The pcre2_get_startchar() function returns the starting + offset that was passed to pcre2_match(). */ + + else + { + PCRE2_SIZE startchar = pcre2_get_startchar(match_data); + if (start_offset <= startchar) + { + if (startchar >= subject_length) break; /* Reached end of subject. */ + start_offset = startchar + 1; /* Advance by one character. */ + if (utf8) /* If UTF-8, it may be more */ + { /* than one code unit. */ + for (; start_offset < subject_length; start_offset++) + if ((subject[start_offset] & 0xc0) != 0x80) break; + } + } + } + /* Run the next matching operation */ rc = pcre2_match( @@ -419,6 +457,21 @@ for (;;) if (rc == 0) printf("ovector was not big enough for all the captured substrings\en"); + /* We must guard against patterns such as /(?=.\eK)/ that use \eK in an + assertion to set the start of a match later than its end. In this + demonstration program, we just detect this case and give up. */ + + if (ovector[0] > ovector[1]) + { + printf("\e\eK was used in an assertion to set the match start after its end.\en" + "From end to start the match was: %.*s\en", (int)(ovector[0] - ovector[1]), + (char *)(subject + ovector[1])); + printf("Run abandoned\en"); + pcre2_match_data_free(match_data); + pcre2_code_free(re); + return 1; + } + /* As before, show substrings stored in the output vector by number, and then also any named substrings. */