From 3458a2e2cda4c46f2238aa67d46a2ff2635c8646 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Tue, 12 Dec 2017 16:23:01 +0000 Subject: [PATCH] Fix infelicity in not finding a first character inside a non-assertive group within a positive assertion. --- ChangeLog | 8 ++++++++ src/pcre2_compile.c | 12 ++++++------ testdata/testoutput2 | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7af58de..e74a6bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -70,6 +70,14 @@ pattern, apart from assertions, an incorrect first matching character could be recorded. For example, for the pattern /(?=(a))\1?b/, "b" was incorrectly set as the first character of a match. +18. Characters in a leading positive assertion are considered for recording a +first character of a match when the rest of the pattern does not provide one. +However, a character in a non-assertive group within a leading assertion such +as in the pattern /(?=(a))\1?b/ caused this process to fail. This was an +infelicity rather than an outright bug, because it did not affect the result of +a match, just its speed. (In fact, in this case, the starting 'a' was +subsequently picked up in the study.) + Version 10.30 14-August-2017 ---------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index ad17338..1e06040 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -8106,13 +8106,13 @@ REQ_NONE in the flags. Arguments: code points to start of compiled pattern flags points to the first code unit flags - inassert TRUE if in an assertion + inassert non-zero if in an assertion Returns: the fixed first code unit, or 0 with REQ_NONE in flags */ static uint32_t -find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert) +find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, uint32_t inassert) { uint32_t c = 0; int cflags = REQ_NONE; @@ -8139,7 +8139,7 @@ do { case OP_SCBRAPOS: case OP_ASSERT: case OP_ONCE: - d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT); + d = find_firstassertedcu(scode, &dflags, inassert + ((op==OP_ASSERT)?1:0)); if (dflags < 0) return 0; if (cflags < 0) { c = d; cflags = dflags; } @@ -8154,7 +8154,7 @@ do { case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: - if (!inassert) return 0; + if (inassert == 0) return 0; if (cflags < 0) { c = scode[1]; cflags = 0; } else if (c != scode[1]) return 0; break; @@ -8167,7 +8167,7 @@ do { case OP_PLUSI: case OP_MINPLUSI: case OP_POSPLUSI: - if (!inassert) return 0; + if (inassert == 0) return 0; if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; } else if (c != scode[1]) return 0; break; @@ -9674,7 +9674,7 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) actual literals that follow). */ if (firstcuflags < 0) - firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE); + firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); /* Save the data for a first code unit. */ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 31ccfbe..ee9cde9 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -16358,7 +16358,7 @@ Subject length lower bound = 1 "(?=(a))\1?b"I Capturing subpattern count = 1 Max back reference = 1 -Starting code units: a +First code unit = 'a' Last code unit = 'b' Subject length lower bound = 1 ab