From ead78198d1360b3bf7524824e650e60c0ae9f648 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sun, 16 Jun 2019 15:37:45 +0000 Subject: [PATCH] Improve minimum length finder in the presence of back references when there are multiple groups with the same number. --- ChangeLog | 8 +++++++- src/pcre2_study.c | 12 +++++------- testdata/testoutput2 | 6 +++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2eea672..b3debf4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -31,11 +31,17 @@ minimum is potentially useful. 9. Some changes to the way the minimum subject length is handled: * When PCRE2_NO_START_OPTIMIZE is set, no minimum length is computed; - pcre2test no longer shows a value (of zero). + pcre2test omits this item instead of showing a value of zero. * When no minimum length is set by the normal scan, but a first and/or last code unit is recorded, set the minimum to 1 or 2 as appropriate. + * When a pattern contains multiple groups with the same number, a back + reference cannot know which one to scan for a minimum length. This used to + cause the minimum length finder to give up with no result. Now it treats + such references as not adding to the minimum length (which it should have + done all along). + 10. A (*MARK) value inside a successful condition was not being returned by the interpretive matcher (it was returned by JIT). This bug has been mended. diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 62f373b..496c8dc 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -92,7 +92,6 @@ Returns: the minimum length -1 \C in UTF-8 mode or (*ACCEPT) or pattern too complicated - or back reference to duplicate name/number -2 internal error (missing capturing bracket) -3 internal error (opcode not listed) */ @@ -135,7 +134,7 @@ for (;;) int d, min, recno; PCRE2_UCHAR *cs, *ce; PCRE2_UCHAR op = *cc; - + if (branchlength >= UINT16_MAX) return UINT16_MAX; switch (op) @@ -452,12 +451,12 @@ for (;;) that case we must set the minimum length to zero. */ /* Duplicate named pattern back reference. We cannot reliably find a length - for this if duplicate numbers are present in the pattern. */ + for this if duplicate numbers are present in the pattern, so we set the + length to zero here also. */ case OP_DNREF: case OP_DNREFI: - if (dupcapused) return -1; - if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { int count = GET2(cc, 1+IMM2_SIZE); PCRE2_UCHAR *slot = @@ -524,14 +523,13 @@ for (;;) case OP_REF: case OP_REFI: - if (dupcapused) return -1; recno = GET2(cc, 1); if (recno <= backref_cache[0] && backref_cache[recno] >= 0) d = backref_cache[recno]; else { int i; - if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) + if (!dupcapused && (re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0) { ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno); if (cs == NULL) return -2; diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 74e10a5..9c1dc50 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14607,7 +14607,7 @@ Subject length lower bound = 65535 Capture group count = 1 Max back reference = 1 Starting code units: a b -Subject length lower bound = 0 +Subject length lower bound = 1 /(?|(aaa)|(b))(?1)/I Capture group count = 1 @@ -14625,7 +14625,7 @@ Max back reference = 1 Named capture groups: a 1 Starting code units: a b -Subject length lower bound = 0 +Subject length lower bound = 1 /(?|(?'a'aaa)|(?'a'b))(?'a'cccc)\k'a'/I,dupnames Capture group count = 2 @@ -14636,7 +14636,7 @@ Named capture groups: Options: dupnames Starting code units: a b Last code unit = 'c' -Subject length lower bound = 1 +Subject length lower bound = 5 /ab{3cd/ ab{3cd