diff --git a/ChangeLog b/ChangeLog index 3bb4764..55d56dd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -28,6 +28,14 @@ increase it substantially for non-anchored patterns. 8. Allow (*ACCEPT) to be quantified, because an ungreedy quantifier with a zero minimum is potentially useful. +9. Some changes to the way the minimum subject length is handled: + + * When PCRE2_NO_START_OPTIMIZE is set, no minimum length is computed; + pcre2test no longer shows a value (of zero). + + * When no minimum length is set by the normal scan, but a first and/or last + code unit is recorded, set the minimum to 1 or 2 as appropriate. + Version 10.33 16-April-2019 --------------------------- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 6fa81ac..f8954d7 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "30 May 2019" "PCRE2 10.34" +.TH PCRE2API 3 "11 June 2019" "PCRE2 10.34" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -2229,12 +2229,12 @@ segment. PCRE2_INFO_MINLENGTH .sp If a minimum length for matching subject strings was computed, its value is -returned. Otherwise the returned value is 0. The value is a number of -characters, which in UTF mode may be different from the number of code units. -The third argument should point to an \fBuint32_t\fP variable. The value is a -lower bound to the length of any matching string. There may not be any strings -of that length that do actually match, but every string that does match is at -least that long. +returned. Otherwise the returned value is 0. This value is not computed when +PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in +UTF mode may be different from the number of code units. The third argument +should point to an \fBuint32_t\fP variable. The value is a lower bound to the +length of any matching string. There may not be any strings of that length that +do actually match, but every string that does match is at least that long. .sp PCRE2_INFO_NAMECOUNT PCRE2_INFO_NAMEENTRYSIZE @@ -3848,6 +3848,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 May 2019 +Last updated: 11 June 2019 Copyright (c) 1997-2019 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index eab478d..7b87210 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "23 May 2019" "PCRE 10.34" +.TH PCRE2TEST 1 "11 June 2019" "PCRE 10.34" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -695,7 +695,9 @@ options, the line is omitted. "First code unit" is where any match must start; if there is more than one they are listed as "starting code units". "Last code unit" is the last literal code unit that must be present in any match. This is not necessarily the last character. These lines are omitted if no starting or -ending code units are recorded. +ending code units are recorded. The subject length line is omitted when +\fBno_start_optimize\fP is set because the minimum length is not calculated +when it can never be used. .P The \fBframesize\fP modifier shows the size, in bytes, of the storage frames used by \fBpcre2_match()\fP for handling backtracking. The size depends on the @@ -2060,6 +2062,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 May 2019 +Last updated: 11 June 2019 Copyright (c) 1997-2019 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index eb5f66d..c06327b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -10151,6 +10151,8 @@ unit. */ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) { + int minminlength = 0; /* For minimal minlength from first/required CU */ + /* If we do not have a first code unit, see if there is one that is asserted (these are not saved during the compile because they can cause conflicts with actual literals that follow). */ @@ -10158,12 +10160,14 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) if (firstcuflags < 0) firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); - /* Save the data for a first code unit. */ + /* Save the data for a first code unit. The existence of one means the + minimum length must be at least 1. */ if (firstcuflags >= 0) { re->first_codeunit = firstcu; re->flags |= PCRE2_FIRSTSET; + minminlength++; /* Handle caseless first code units. */ @@ -10197,39 +10201,65 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) is_startline(codestart, 0, &cb, 0, FALSE)) re->flags |= PCRE2_STARTLINE; - /* Handle the "required code unit", if one is set. In the case of an anchored - pattern, do this only if it follows a variable length item in the pattern. */ + /* Handle the "required code unit", if one is set. We can increment the + minimum minimum length only if we are sure this really is a different + character, because the count is in characters, not code units. */ - if (reqcuflags >= 0 && - ((re->overall_options & PCRE2_ANCHORED) == 0 || - (reqcuflags & REQ_VARY) != 0)) + if (reqcuflags >= 0) { - re->last_codeunit = reqcu; - re->flags |= PCRE2_LASTSET; - - /* Handle caseless required code units as for first code units (above). */ - - if ((reqcuflags & REQ_CASELESS) != 0) - { - if (reqcu < 128 || (!utf && reqcu < 255)) - { - if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; - } -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) - re->flags |= PCRE2_LASTCASELESS; +#if PCRE2_CODE_UNIT_WIDTH == 16 + if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ + firstcuflags < 0 || /* First not set */ + (firstcu & 0xf800) != 0xd800 || /* First not surrogate */ + (reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */ +#elif PCRE2_CODE_UNIT_WIDTH == 8 + if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */ + firstcuflags < 0 || /* First not set */ + (firstcu & 0x80) == 0 || /* First is ASCII */ + (reqcu & 0x80) == 0) /* Req is ASCII */ #endif + { + minminlength++; + } + + /* In the case of an anchored pattern, set up the value only if it follows + a variable length item in the pattern. */ + + if ((re->overall_options & PCRE2_ANCHORED) == 0 || + (reqcuflags & REQ_VARY) != 0) + { + re->last_codeunit = reqcu; + re->flags |= PCRE2_LASTSET; + + /* Handle caseless required code units as for first code units (above). */ + + if ((reqcuflags & REQ_CASELESS) != 0) + { + if (reqcu < 128 || (!utf && reqcu < 255)) + { + if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS; + } +#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 + else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) + re->flags |= PCRE2_LASTCASELESS; +#endif + } } } - /* Finally, study the compiled pattern to set up information such as a bitmap - of starting code units and a minimum matching length. */ + /* Study the compiled pattern to set up information such as a bitmap of + starting code units and a minimum matching length. */ if (PRIV(study)(re) != 0) { errorcode = ERR31; goto HAD_CB_ERROR; } + + /* If the minimum length set (or not set) by study() is less than the minimum + implied by required code units, override it. */ + + if (re->minlength < minminlength) re->minlength = minminlength; } /* End of start-of-match optimizations. */ /* Control ends up here in all cases. When running under valgrind, make a diff --git a/src/pcre2_study.c b/src/pcre2_study.c index e883c2e..62f373b 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -214,9 +214,7 @@ for (;;) /* Reached end of a branch; if it's a ket it is the end of a nested call. If it's ALT it is an alternation in a nested call. If it is END it's - the end of the outer call. All can be handled by the same code. If an - ACCEPT was previously encountered, use the length that was in force at that - time, and pass back the shortest ACCEPT length. */ + the end of the outer call. All can be handled by the same code. */ case OP_ALT: case OP_KET: diff --git a/src/pcre2test.c b/src/pcre2test.c index d7ad3cf..cc66b9e 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -4704,7 +4704,8 @@ if ((pat_patctl.control & CTL_INFO) != 0) } } - fprintf(outfile, "Subject length lower bound = %d\n", minlength); + if ((FLD(compiled_code, overall_options) & PCRE2_NO_START_OPTIMIZE) == 0) + fprintf(outfile, "Subject length lower bound = %d\n", minlength); if (pat_patctl.jit != 0 && (pat_patctl.control & CTL_JITVERIFY) != 0) { diff --git a/testdata/testoutput2 b/testdata/testoutput2 index c292dc8..2fd3c62 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1480,7 +1480,7 @@ Subject length lower bound = 2 Capture group count = 0 First code unit = 'a' Last code unit = 'a' -Subject length lower bound = 1 +Subject length lower bound = 2 /(?=a)a.*/I Capture group count = 0 @@ -3406,7 +3406,7 @@ Subject length lower bound = 2 Capture group count = 0 May match empty string First code unit = 'a' -Subject length lower bound = 0 +Subject length lower bound = 1 /(?=abc).xyz/Ii Capture group count = 0 @@ -3425,7 +3425,7 @@ Subject length lower bound = 4 Capture group count = 0 May match empty string First code unit = 'a' -Subject length lower bound = 0 +Subject length lower bound = 1 /(?=.)a/I Capture group count = 0 @@ -3436,7 +3436,7 @@ Subject length lower bound = 1 Capture group count = 1 First code unit = 'a' Last code unit = 'a' -Subject length lower bound = 1 +Subject length lower bound = 2 /((?=abcda)ab)/I Capture group count = 1 @@ -10780,7 +10780,7 @@ Capture group count = 0 Options: caseless First code unit = 'a' (caseless) Last code unit = 'a' (caseless) -Subject length lower bound = 1 +Subject length lower bound = 2 /(abc)\1+/ @@ -11254,7 +11254,7 @@ Subject length lower bound = 0 /z(*ACCEPT)a/I,aftertext Capture group count = 0 First code unit = 'z' -Subject length lower bound = 0 +Subject length lower bound = 1 baxzbx 0: z 0+ bx @@ -13572,7 +13572,6 @@ Subject length lower bound = 4 /abcd/I,no_start_optimize Capture group count = 0 Options: no_start_optimize -Subject length lower bound = 0 /(|ab)*?d/I Capture group count = 1 @@ -13588,7 +13587,6 @@ Subject length lower bound = 1 /(|ab)*?d/I,no_start_optimize Capture group count = 1 Options: no_start_optimize -Subject length lower bound = 0 abd 0: abd 1: ab @@ -14638,7 +14636,7 @@ Named capture groups: Options: dupnames Starting code units: a b Last code unit = 'c' -Subject length lower bound = 0 +Subject length lower bound = 1 /ab{3cd/ ab{3cd @@ -16436,7 +16434,7 @@ Capture group count = 1 Max back reference = 1 First code unit = 'a' Last code unit = 'b' -Subject length lower bound = 1 +Subject length lower bound = 2 ab 0: ab 1: a diff --git a/testdata/testoutput22-16 b/testdata/testoutput22-16 index 4b6008c..5421854 100644 --- a/testdata/testoutput22-16 +++ b/testdata/testoutput22-16 @@ -9,7 +9,7 @@ Contains \C Options: utf First code unit = 'a' Last code unit = 'e' -Subject length lower bound = 0 +Subject length lower bound = 2 abXde 0: abXde diff --git a/testdata/testoutput22-8 b/testdata/testoutput22-8 index 5dd167e..eab410e 100644 --- a/testdata/testoutput22-8 +++ b/testdata/testoutput22-8 @@ -9,7 +9,7 @@ Contains \C Options: utf First code unit = 'a' Last code unit = 'e' -Subject length lower bound = 0 +Subject length lower bound = 2 abXde 0: abXde diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 5d64d00..4249c5e 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -474,7 +474,6 @@ Subject length lower bound = 0 Capture group count = 0 Compile options: no_start_optimize utf Overall options: anchored no_start_optimize utf -Subject length lower bound = 0 /()()()()()()()()()() ()()()()()()()()()() diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 3ef10b4..5f6fa5d 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -6845,7 +6845,6 @@ No match /(abc|def|xyz)/I,no_start_optimize Capture group count = 1 Options: no_start_optimize -Subject length lower bound = 0 terhjk;abcdaadsfe 0: abc the quick xyz brown fox