Add "allvector" to pcre2test.

This commit is contained in:
Philip.Hazel 2018-09-15 17:10:39 +00:00
parent 142c667bbc
commit 3fce7c75e9
6 changed files with 193 additions and 41 deletions

View File

@ -2,9 +2,17 @@ Change Log for PCRE2
--------------------
Version 10.32-RC1 10-September-2018
Version 10.33-RC1 15-September-2018
-----------------------------------
1. Added "allvector" to pcre2test to make it easy to check the part of the
ovector that shouldn't be changed, in particular after substitute and failed or
partial matches.
Version 10.32 10-September-2018
-------------------------------
1. When matching using the REG_STARTEND feature of the POSIX API with a
non-zero starting offset, unset capturing groups with lower numbers than a
group that did capture something were not being correctly returned as "unset"

View File

@ -9,9 +9,9 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might
dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre2_major, [10])
m4_define(pcre2_minor, [32])
m4_define(pcre2_prerelease, [])
m4_define(pcre2_date, [2018-09-10])
m4_define(pcre2_minor, [33])
m4_define(pcre2_prerelease, [-RC1])
m4_define(pcre2_date, [2018-09-14])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "21 July 2018" "PCRE 10.32"
.TH PCRE2TEST 1 "15 September 2018" "PCRE 10.33"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@ -1003,6 +1003,7 @@ process.
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
allvector show the entire ovector
allusedtext show all consulted text
altglobal alternative global matching
/g global global matching
@ -1154,6 +1155,7 @@ pattern.
aftertext show text after match
allaftertext show text after captures
allcaptures show all captures
allvector show the entire ovector
allusedtext show all consulted text (non-JIT only)
altglobal alternative global matching
callout_capture show captures at callout time
@ -1248,7 +1250,25 @@ captured parentheses be output after a match. By default, only those up to the
highest one actually used in the match are output (corresponding to the return
code from \fBpcre2_match()\fP). Groups that did not take part in the match
are output as "<unset>". This modifier is not relevant for DFA matching (which
does no capturing); it is ignored, with a warning message, if present.
does no capturing) and does not apply when \fBreplace\fP is specified; it is
ignored, with a warning message, if present.
.
.
.SS "Showing the entire ovector, for all outcomes"
.rs
.sp
The \fBallvector\fP modifier requests that the entire ovector be shown,
whatever the outcome of the match. Compare \fBallcaptures\fP, which shows only
up to the maximum number of capture groups for the pattern, and then only for a
successful complete non-DFA match. This modifier, which acts after any match
result, and also for DFA matching, provides a means of checking that there are
no unexpected modifications to ovector fields. Before each match attempt, the
ovector is filled with a special value, and if this is found in both elements
of a capturing pair, "<unchanged>" is output. After a successful match, this
applies to all groups after the maximum capture group for the pattern. In other
cases it applies to the entire ovector. After a partial match, the first two
elements are the only ones that should be set. After a DFA match, the amount of
ovector that is used depends on the number of matches that were found.
.
.
.SS "Testing callouts"
@ -1982,6 +2002,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 21 July 2018
Last updated: 15 September 2018
Copyright (c) 1997-2018 University of Cambridge.
.fi

View File

@ -491,6 +491,7 @@ so many of them that they are split into two fields. */
#define CTL2_SUBJECT_LITERAL 0x00000010u
#define CTL2_CALLOUT_NO_WHERE 0x00000020u
#define CTL2_CALLOUT_EXTRA 0x00000040u
#define CTL2_ALLVECTOR 0x00000080u
#define CTL2_NL_SET 0x40000000u /* Informational */
#define CTL2_BSR_SET 0x80000000u /* Informational */
@ -513,7 +514,8 @@ different things in the two cases. */
#define CTL2_ALLPD (CTL2_SUBSTITUTE_EXTENDED|\
CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\
CTL2_SUBSTITUTE_UNKNOWN_UNSET|\
CTL2_SUBSTITUTE_UNSET_EMPTY)
CTL2_SUBSTITUTE_UNSET_EMPTY|\
CTL2_ALLVECTOR)
/* Structures for holding modifier information for patterns and subject strings
(data). Fields containing modifiers that can be set either for a pattern or a
@ -592,6 +594,7 @@ static modstruct modlist[] = {
{ "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) },
{ "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) },
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
{ "allvector", MOD_PND, MOD_CTL, CTL2_ALLVECTOR, PO(control2) },
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
{ "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) },
@ -888,6 +891,7 @@ static uint32_t forbid_utf = 0;
static uint32_t maxlookbehind;
static uint32_t max_oveccount;
static uint32_t callout_count;
static uint32_t maxcapcount;
static uint16_t local_newline_default = 0;
@ -4018,12 +4022,13 @@ Returns: nothing
static void
show_controls(uint32_t controls, uint32_t controls2, const char *before)
{
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
before,
((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "",
((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "",
((controls & CTL_ALLCAPTURES) != 0)? " allcaptures" : "",
((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "",
((controls2 & CTL2_ALLVECTOR) != 0)? " allvector" : "",
((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "",
((controls & CTL_BINCODE) != 0)? " bincode" : "",
((controls2 & CTL2_BSR_SET) != 0)? " bsr" : "",
@ -5717,6 +5722,11 @@ if (forbid_utf != 0)
if (pattern_info(PCRE2_INFO_MAXLOOKBEHIND, &maxlookbehind, FALSE) != 0)
return PR_ABEND;
/* Remember the number of captures. */
if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0)
return PR_ABEND;
/* If an explicit newline modifier was given, set the information flag in the
pattern so that it is preserved over push/pop. */
@ -6317,6 +6327,42 @@ return TRUE;
/*************************************************
* Show an entire ovector *
*************************************************/
/* This function is called when the "allvector" modifier is set: at the end of
substitution handling, after a partial match, and after a match failure. It is
a means of checking the contents of the entire ovector, to ensure no
modification of fields that should be unchanged.

Each pair is output on its own line, preceded by its pair number. A pair whose
two elements are both PCRE2_UNSET is shown as "<unset>"; a pair whose two
elements both still hold JUNK_OFFSET (the value the ovector is filled with
before matching) is shown as "<unchanged>"; any other pair is shown as its two
offsets in decimal.

Arguments:
  ovector      points to the ovector
  oveccount    number of pairs

Returns:       nothing
*/

static void
show_ovector(PCRE2_SIZE *ovector, uint32_t oveccount)
{
uint32_t pair;

for (pair = 0; pair < oveccount; pair++)
  {
  PCRE2_SIZE start = ovector[2*(PCRE2_SIZE)pair];
  PCRE2_SIZE end = ovector[2*(PCRE2_SIZE)pair + 1];

  /* The pair number is unsigned, so use an unsigned conversion. */

  fprintf(outfile, "%2u: ", (unsigned int)pair);

  if (start == PCRE2_UNSET && end == PCRE2_UNSET)
    fprintf(outfile, "<unset>\n");
  else if (start == JUNK_OFFSET && end == JUNK_OFFSET)
    fprintf(outfile, "<unchanged>\n");
  else
    {
    /* The offsets are cast to unsigned long int, so the conversion must be
    %lu; %ld would be a type mismatch and would show large values as
    negative numbers. */

    fprintf(outfile, "%lu %lu\n", (unsigned long int)start,
      (unsigned long int)end);
    }
  }
}
/*************************************************
* Process a data line *
*************************************************/
@ -6342,7 +6388,10 @@ size_t needlen;
void *use_dat_context;
BOOL utf;
BOOL subject_literal;
PCRE2_SIZE *ovector;
PCRE2_SIZE ovecsave[3];
uint32_t oveccount;
#ifdef SUPPORT_PCRE2_8
uint8_t *q8 = NULL;
@ -6722,11 +6771,23 @@ for (k = 0; k < sizeof(exclusive_dat_controls)/sizeof(uint32_t); k++)
}
}
if (pat_patctl.replacement[0] != 0 &&
(dat_datctl.control & CTL_NULLCONTEXT) != 0)
if (pat_patctl.replacement[0] != 0)
{
fprintf(outfile, "** Replacement text is not supported with null_context.\n");
return PR_OK;
if ((dat_datctl.control & CTL_NULLCONTEXT) != 0)
{
fprintf(outfile, "** Replacement text is not supported with null_context.\n");
return PR_OK;
}
if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
fprintf(outfile, "** Ignored with replacement text: allcaptures\n");
}
/* Warn for modifiers that are ignored for DFA. */
if ((dat_datctl.control & CTL_DFA) != 0)
{
if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
fprintf(outfile, "** Ignored after DFA matching: allcaptures\n");
}
/* We now have the subject in dbuffer, with len containing the byte length, and
@ -6955,6 +7016,9 @@ if (CASTVAR(void *, match_data) == NULL)
return PR_OK;
}
ovector = FLD(match_data, ovector);
PCRE2_GET_OVECTOR_COUNT(oveccount, match_data);
/* Replacement processing is ignored for DFA matching. */
if (dat_datctl.replacement[0] != 0 && (dat_datctl.control & CTL_DFA) != 0)
@ -6974,7 +7038,7 @@ if (dat_datctl.replacement[0] != 0)
uint8_t rbuffer[REPLACE_BUFFSIZE];
uint8_t nbuffer[REPLACE_BUFFSIZE];
uint32_t xoptions;
PCRE2_SIZE rlen, nsize, erroroffset;
PCRE2_SIZE j, rlen, nsize, erroroffset;
BOOL badutf = FALSE;
#ifdef SUPPORT_PCRE2_8
@ -6987,6 +7051,11 @@ if (dat_datctl.replacement[0] != 0)
uint32_t *r32 = NULL;
#endif
/* Fill the ovector with junk to detect elements that do not get set
when they should be (relevant only when "allvector" is specified). */
for (j = 0; j < 2*oveccount; j++) ovector[j] = JUNK_OFFSET;
if (timeitm)
fprintf(outfile, "** Timing is not supported with replace: ignored\n");
@ -7112,6 +7181,12 @@ if (dat_datctl.replacement[0] != 0)
fprintf(outfile, "\n");
show_memory = FALSE;
/* Show final ovector contents if requested. */
if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
show_ovector(ovector, oveccount);
return PR_OK;
} /* End of substitution handling */
@ -7125,14 +7200,11 @@ for (gmatched = 0;; gmatched++)
{
PCRE2_SIZE j;
int capcount;
PCRE2_SIZE *ovector;
ovector = FLD(match_data, ovector);
/* Fill the ovector with junk to detect elements that do not get set
when they should be. */
for (j = 0; j < 2*dat_datctl.oveccount; j++) ovector[j] = JUNK_OFFSET;
for (j = 0; j < 2*oveccount; j++) ovector[j] = JUNK_OFFSET;
/* When matching is via pcre2_match(), we will detect the use of JIT via the
stack callback function. */
@ -7280,12 +7352,8 @@ for (gmatched = 0;; gmatched++)
if (capcount >= 0)
{
int i;
uint32_t oveccount;
/* This is a check against a lunatic return value. */
PCRE2_GET_OVECTOR_COUNT(oveccount, match_data);
if (capcount > (int)oveccount)
if (capcount > (int)oveccount) /* Check for lunatic return value */
{
fprintf(outfile,
"** PCRE2 error: returned count %d is too big for ovector count %d\n",
@ -7325,24 +7393,18 @@ for (gmatched = 0;; gmatched++)
/* "allcaptures" requests showing of all captures in the pattern, to check
unset ones at the end. It may be set on the pattern or the data. Implement
by setting capcount to the maximum. This is not relevant for DFA matching,
so ignore it. */
so ignore it (warning given above). */
if ((dat_datctl.control & CTL_ALLCAPTURES) != 0)
if ((dat_datctl.control & (CTL_ALLCAPTURES|CTL_DFA)) == CTL_ALLCAPTURES)
{
uint32_t maxcapcount;
if ((dat_datctl.control & CTL_DFA) != 0)
{
fprintf(outfile, "** Ignored after DFA matching: allcaptures\n");
}
else
{
if (pattern_info(PCRE2_INFO_CAPTURECOUNT, &maxcapcount, FALSE) < 0)
return PR_SKIP;
capcount = maxcapcount + 1; /* Allow for full match */
if (capcount > (int)oveccount) capcount = oveccount;
}
capcount = maxcapcount + 1; /* Allow for full match */
if (capcount > (int)oveccount) capcount = oveccount;
}
/* "allvector" requests showing the entire ovector. */
if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0) capcount = oveccount;
/* Output the captured substrings. Note that, for the matched string,
the use of \K in an assertion can make the start later than the end. */
@ -7364,19 +7426,26 @@ for (gmatched = 0;; gmatched++)
/* Check for an unset group */
if (start == PCRE2_UNSET)
if (start == PCRE2_UNSET && end == PCRE2_UNSET)
{
fprintf(outfile, "<unset>\n");
continue;
}
/* Check for silly offsets, in particular, values that have not been
set when they should have been. */
set when they should have been. However, if we are past the end of the
captures for this pattern ("allvector" causes this), or if we are DFA
matching, it isn't an error if the entry is unchanged. */
if (start > ulen || end > ulen)
{
fprintf(outfile, "ERROR: bad value(s) for offset(s): 0x%lx 0x%lx\n",
(unsigned long int)start, (unsigned long int)end);
if (((dat_datctl.control & CTL_DFA) != 0 ||
i >= (int)(2*maxcapcount + 2)) &&
start == JUNK_OFFSET && end == JUNK_OFFSET)
fprintf(outfile, "<unchanged>\n");
else
fprintf(outfile, "ERROR: bad value(s) for offset(s): 0x%lx 0x%lx\n",
(unsigned long int)start, (unsigned long int)end);
continue;
}
@ -7517,10 +7586,19 @@ for (gmatched = 0;; gmatched++)
fprintf(outfile, "\n");
}
if (ulen != ovector[1])
fprintf(outfile, "** ovector[1] is not equal to the subject length: "
"%ld != %ld\n", (unsigned long int)ovector[1], (unsigned long int)ulen);
/* Process copy/get strings */
if (!copy_and_get(utf, 1)) return PR_ABEND;
/* "allvector" outputs the entire vector */
if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
show_ovector(ovector, oveccount);
break; /* Out of the /g loop */
} /* End of handling partial match */
@ -7590,6 +7668,11 @@ for (gmatched = 0;; gmatched++)
if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used)
fprintf(outfile, " (JIT)");
fprintf(outfile, "\n");
/* "allvector" outputs the entire vector */
if ((dat_datctl.control2 & CTL2_ALLVECTOR) != 0)
show_ovector(ovector, oveccount);
}
break;

9
testdata/testinput2 vendored
View File

@ -5505,4 +5505,13 @@ a)"xI
bbc
xbc
/a(b)c|xyz/g,allvector,replace=<$0>
abcdefabcpqr\=ovector=4
abxyz\=ovector=4
abcdefxyz\=ovector=4
/a(b)c|xyz/allvector
abcdef\=ovector=4
abxyz\=ovector=4
# End of testinput2

32
testdata/testoutput2 vendored
View File

@ -16763,6 +16763,38 @@ Subject length lower bound = 1
0: b
0+ c
/a(b)c|xyz/g,allvector,replace=<$0>
abcdefabcpqr\=ovector=4
2: <abc>def<abc>pqr
0: 6 9
1: 7 8
2: <unchanged>
3: <unchanged>
abxyz\=ovector=4
1: ab<xyz>
0: 2 5
1: <unset>
2: <unchanged>
3: <unchanged>
abcdefxyz\=ovector=4
2: <abc>def<xyz>
0: 6 9
1: <unset>
2: <unchanged>
3: <unchanged>
/a(b)c|xyz/allvector
abcdef\=ovector=4
0: abc
1: b
2: <unchanged>
3: <unchanged>
abxyz\=ovector=4
0: xyz
1: <unset>
2: <unchanged>
3: <unchanged>
# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data