Minor improvement to minimum length calculation.
This commit is contained in:
parent
f0c06ee212
commit
1f6b9097f4
|
@ -28,6 +28,14 @@ increase it substantially for non-anchored patterns.
|
|||
8. Allow (*ACCEPT) to be quantified, because an ungreedy quantifier with a zero
|
||||
minimum is potentially useful.
|
||||
|
||||
9. Some changes to the way the minimum subject length is handled:
|
||||
|
||||
* When PCRE2_NO_START_OPTIMIZE is set, no minimum length is computed;
|
||||
pcre2test no longer shows a value (of zero).
|
||||
|
||||
* When no minimum length is set by the normal scan, but a first and/or last
|
||||
code unit is recorded, set the minimum to 1 or 2 as appropriate.
|
||||
|
||||
|
||||
Version 10.33 16-April-2019
|
||||
---------------------------
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "30 May 2019" "PCRE2 10.34"
|
||||
.TH PCRE2API 3 "11 June 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -2229,12 +2229,12 @@ segment.
|
|||
PCRE2_INFO_MINLENGTH
|
||||
.sp
|
||||
If a minimum length for matching subject strings was computed, its value is
|
||||
returned. Otherwise the returned value is 0. The value is a number of
|
||||
characters, which in UTF mode may be different from the number of code units.
|
||||
The third argument should point to an \fBuint32_t\fP variable. The value is a
|
||||
lower bound to the length of any matching string. There may not be any strings
|
||||
of that length that do actually match, but every string that does match is at
|
||||
least that long.
|
||||
returned. Otherwise the returned value is 0. This value is not computed when
|
||||
PCRE2_NO_START_OPTIMIZE is set. The value is a number of characters, which in
|
||||
UTF mode may be different from the number of code units. The third argument
|
||||
should point to an \fBuint32_t\fP variable. The value is a lower bound to the
|
||||
length of any matching string. There may not be any strings of that length that
|
||||
do actually match, but every string that does match is at least that long.
|
||||
.sp
|
||||
PCRE2_INFO_NAMECOUNT
|
||||
PCRE2_INFO_NAMEENTRYSIZE
|
||||
|
@ -3848,6 +3848,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 30 May 2019
|
||||
Last updated: 11 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "23 May 2019" "PCRE 10.34"
|
||||
.TH PCRE2TEST 1 "11 June 2019" "PCRE2 10.34"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -695,7 +695,9 @@ options, the line is omitted. "First code unit" is where any match must start;
|
|||
if there is more than one they are listed as "starting code units". "Last code
|
||||
unit" is the last literal code unit that must be present in any match. This is
|
||||
not necessarily the last character. These lines are omitted if no starting or
|
||||
ending code units are recorded.
|
||||
ending code units are recorded. The subject length line is omitted when
|
||||
\fBno_start_optimize\fP is set because the minimum length is not calculated
|
||||
when it can never be used.
|
||||
.P
|
||||
The \fBframesize\fP modifier shows the size, in bytes, of the storage frames
|
||||
used by \fBpcre2_match()\fP for handling backtracking. The size depends on the
|
||||
|
@ -2060,6 +2062,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 23 May 2019
|
||||
Last updated: 11 June 2019
|
||||
Copyright (c) 1997-2019 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -10151,6 +10151,8 @@ unit. */
|
|||
|
||||
if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
||||
{
|
||||
int minminlength = 0; /* For minimal minlength from first/required CU */
|
||||
|
||||
/* If we do not have a first code unit, see if there is one that is asserted
|
||||
(these are not saved during the compile because they can cause conflicts with
|
||||
actual literals that follow). */
|
||||
|
@ -10158,12 +10160,14 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
if (firstcuflags < 0)
|
||||
firstcu = find_firstassertedcu(codestart, &firstcuflags, 0);
|
||||
|
||||
/* Save the data for a first code unit. */
|
||||
/* Save the data for a first code unit. The existence of one means the
|
||||
minimum length must be at least 1. */
|
||||
|
||||
if (firstcuflags >= 0)
|
||||
{
|
||||
re->first_codeunit = firstcu;
|
||||
re->flags |= PCRE2_FIRSTSET;
|
||||
minminlength++;
|
||||
|
||||
/* Handle caseless first code units. */
|
||||
|
||||
|
@ -10197,12 +10201,32 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
is_startline(codestart, 0, &cb, 0, FALSE))
|
||||
re->flags |= PCRE2_STARTLINE;
|
||||
|
||||
/* Handle the "required code unit", if one is set. In the case of an anchored
|
||||
pattern, do this only if it follows a variable length item in the pattern. */
|
||||
/* Handle the "required code unit", if one is set. We can increment the
|
||||
minimal minimum length (minminlength) only if we are sure this really is a different
|
||||
character, because the count is in characters, not code units. */
|
||||
|
||||
if (reqcuflags >= 0 &&
|
||||
((re->overall_options & PCRE2_ANCHORED) == 0 ||
|
||||
(reqcuflags & REQ_VARY) != 0))
|
||||
if (reqcuflags >= 0)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 16
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
(firstcu & 0xf800) != 0xd800 || /* First not surrogate */
|
||||
(reqcu & 0xfc00) != 0xdc00) /* Req not low surrogate */
|
||||
#elif PCRE2_CODE_UNIT_WIDTH == 8
|
||||
if ((re->overall_options & PCRE2_UTF) == 0 || /* Not UTF */
|
||||
firstcuflags < 0 || /* First not set */
|
||||
(firstcu & 0x80) == 0 || /* First is ASCII */
|
||||
(reqcu & 0x80) == 0) /* Req is ASCII */
|
||||
#endif
|
||||
{
|
||||
minminlength++;
|
||||
}
|
||||
|
||||
/* In the case of an anchored pattern, set up the value only if it follows
|
||||
a variable length item in the pattern. */
|
||||
|
||||
if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
|
||||
(reqcuflags & REQ_VARY) != 0)
|
||||
{
|
||||
re->last_codeunit = reqcu;
|
||||
re->flags |= PCRE2_LASTSET;
|
||||
|
@ -10221,15 +10245,21 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
|
|||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Finally, study the compiled pattern to set up information such as a bitmap
|
||||
of starting code units and a minimum matching length. */
|
||||
/* Study the compiled pattern to set up information such as a bitmap of
|
||||
starting code units and a minimum matching length. */
|
||||
|
||||
if (PRIV(study)(re) != 0)
|
||||
{
|
||||
errorcode = ERR31;
|
||||
goto HAD_CB_ERROR;
|
||||
}
|
||||
|
||||
/* If the minimum length set (or not set) by study() is less than the minimum
|
||||
implied by required code units, override it. */
|
||||
|
||||
if (re->minlength < minminlength) re->minlength = minminlength;
|
||||
} /* End of start-of-match optimizations. */
|
||||
|
||||
/* Control ends up here in all cases. When running under valgrind, make a
|
||||
|
|
|
@ -214,9 +214,7 @@ for (;;)
|
|||
|
||||
/* Reached end of a branch; if it's a ket it is the end of a nested
|
||||
call. If it's ALT it is an alternation in a nested call. If it is END it's
|
||||
the end of the outer call. All can be handled by the same code. If an
|
||||
ACCEPT was previously encountered, use the length that was in force at that
|
||||
time, and pass back the shortest ACCEPT length. */
|
||||
the end of the outer call. All can be handled by the same code. */
|
||||
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
|
|
|
@ -4704,6 +4704,7 @@ if ((pat_patctl.control & CTL_INFO) != 0)
|
|||
}
|
||||
}
|
||||
|
||||
if ((FLD(compiled_code, overall_options) & PCRE2_NO_START_OPTIMIZE) == 0)
|
||||
fprintf(outfile, "Subject length lower bound = %d\n", minlength);
|
||||
|
||||
if (pat_patctl.jit != 0 && (pat_patctl.control & CTL_JITVERIFY) != 0)
|
||||
|
|
|
@ -1480,7 +1480,7 @@ Subject length lower bound = 2
|
|||
Capture group count = 0
|
||||
First code unit = 'a'
|
||||
Last code unit = 'a'
|
||||
Subject length lower bound = 1
|
||||
Subject length lower bound = 2
|
||||
|
||||
/(?=a)a.*/I
|
||||
Capture group count = 0
|
||||
|
@ -3406,7 +3406,7 @@ Subject length lower bound = 2
|
|||
Capture group count = 0
|
||||
May match empty string
|
||||
First code unit = 'a'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 1
|
||||
|
||||
/(?=abc).xyz/Ii
|
||||
Capture group count = 0
|
||||
|
@ -3425,7 +3425,7 @@ Subject length lower bound = 4
|
|||
Capture group count = 0
|
||||
May match empty string
|
||||
First code unit = 'a'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 1
|
||||
|
||||
/(?=.)a/I
|
||||
Capture group count = 0
|
||||
|
@ -3436,7 +3436,7 @@ Subject length lower bound = 1
|
|||
Capture group count = 1
|
||||
First code unit = 'a'
|
||||
Last code unit = 'a'
|
||||
Subject length lower bound = 1
|
||||
Subject length lower bound = 2
|
||||
|
||||
/((?=abcda)ab)/I
|
||||
Capture group count = 1
|
||||
|
@ -10780,7 +10780,7 @@ Capture group count = 0
|
|||
Options: caseless
|
||||
First code unit = 'a' (caseless)
|
||||
Last code unit = 'a' (caseless)
|
||||
Subject length lower bound = 1
|
||||
Subject length lower bound = 2
|
||||
|
||||
/(abc)\1+/
|
||||
|
||||
|
@ -11254,7 +11254,7 @@ Subject length lower bound = 0
|
|||
/z(*ACCEPT)a/I,aftertext
|
||||
Capture group count = 0
|
||||
First code unit = 'z'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 1
|
||||
baxzbx
|
||||
0: z
|
||||
0+ bx
|
||||
|
@ -13572,7 +13572,6 @@ Subject length lower bound = 4
|
|||
/abcd/I,no_start_optimize
|
||||
Capture group count = 0
|
||||
Options: no_start_optimize
|
||||
Subject length lower bound = 0
|
||||
|
||||
/(|ab)*?d/I
|
||||
Capture group count = 1
|
||||
|
@ -13588,7 +13587,6 @@ Subject length lower bound = 1
|
|||
/(|ab)*?d/I,no_start_optimize
|
||||
Capture group count = 1
|
||||
Options: no_start_optimize
|
||||
Subject length lower bound = 0
|
||||
abd
|
||||
0: abd
|
||||
1: ab
|
||||
|
@ -14638,7 +14636,7 @@ Named capture groups:
|
|||
Options: dupnames
|
||||
Starting code units: a b
|
||||
Last code unit = 'c'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 1
|
||||
|
||||
/ab{3cd/
|
||||
ab{3cd
|
||||
|
@ -16436,7 +16434,7 @@ Capture group count = 1
|
|||
Max back reference = 1
|
||||
First code unit = 'a'
|
||||
Last code unit = 'b'
|
||||
Subject length lower bound = 1
|
||||
Subject length lower bound = 2
|
||||
ab
|
||||
0: ab
|
||||
1: a
|
||||
|
|
|
@ -9,7 +9,7 @@ Contains \C
|
|||
Options: utf
|
||||
First code unit = 'a'
|
||||
Last code unit = 'e'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 2
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ Contains \C
|
|||
Options: utf
|
||||
First code unit = 'a'
|
||||
Last code unit = 'e'
|
||||
Subject length lower bound = 0
|
||||
Subject length lower bound = 2
|
||||
abXde
|
||||
0: abXde
|
||||
|
||||
|
|
|
@ -474,7 +474,6 @@ Subject length lower bound = 0
|
|||
Capture group count = 0
|
||||
Compile options: no_start_optimize utf
|
||||
Overall options: anchored no_start_optimize utf
|
||||
Subject length lower bound = 0
|
||||
|
||||
/()()()()()()()()()()
|
||||
()()()()()()()()()()
|
||||
|
|
|
@ -6845,7 +6845,6 @@ No match
|
|||
/(abc|def|xyz)/I,no_start_optimize
|
||||
Capture group count = 1
|
||||
Options: no_start_optimize
|
||||
Subject length lower bound = 0
|
||||
terhjk;abcdaadsfe
|
||||
0: abc
|
||||
the quick xyz brown fox
|
||||
|
|
Loading…
Reference in New Issue