Fix matching offsets from regexec() in the POSIX wrapper when called with

REG_STARTEND and a starting offset greater than zero.
This commit is contained in:
Philip.Hazel 2017-06-03 16:42:58 +00:00
parent 8e4b992682
commit e4c86e2ced
7 changed files with 81 additions and 17 deletions

View File

@ -179,6 +179,9 @@ deeply. (Compare item 10.23/36.) This should fix oss-fuzz #1761.
37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL.
38. Fix returned offsets from regexec() when REG_STARTEND is used with a
starting offset greater than zero.
Version 10.23 14-February-2017
------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2POSIX 3 "31 January 2016" "PCRE2 10.22"
.TH PCRE2POSIX 3 "03 June 2017" "PCRE2 10.30"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "SYNOPSIS"
@ -204,15 +204,21 @@ function.
.sp
REG_STARTEND
.sp
The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
(there need not actually be a NUL at that location), regardless of the value of
\fInmatch\fP. This is a BSD extension, compatible with but not specified by
IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
mutually exclusive; the error REG_INVARG is returned.
When this option is set, the string is considered to start at \fIstring\fP +
\fIpmatch[0].rm_so\fP and to have a terminating NUL located at \fIstring\fP +
\fIpmatch[0].rm_eo\fP (there need not actually be a NUL at that location),
regardless of the value of \fInmatch\fP. However, the offsets of the matched
string and any captured substrings are still given relative to the start of
\fIstring\fP. (Before PCRE2 release 10.30 these were given relative to
\fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other
implementations.)
.P
This is a BSD extension, compatible with but not specified by IEEE Standard
1003.2 (POSIX.2), and should be used with caution in software intended to be
portable to other systems. Note that a non-zero \fIrm_so\fP does not imply
REG_NOTBOL; REG_STARTEND affects only the location of the string, not how it is
matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are mutually
exclusive; if both are done, the error REG_INVARG is returned.
.P
If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
@ -271,6 +277,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 31 January 2016
Copyright (c) 1997-2016 University of Cambridge.
Last updated: 03 June 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "01 June 2017" "PCRE 10.30"
.TH PCRE2TEST 1 "03 June 2017" "PCRE2 10.30"
.SH NAME
pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS
@ -1046,6 +1046,20 @@ wrapper API to be used, the only option-setting modifiers that have any effect
are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP.
The other modifiers are ignored, with a warning message.
.P
There is one additional modifier that can be used with the POSIX wrapper. It is
ignored (with a warning) if used for non-POSIX matching.
.sp
posix_startend=<n>[:<m>]
.sp
This causes the subject string to be passed to \fBregexec()\fP using the
REG_STARTEND option, which uses offsets to restrict which part of the string is
searched. If only one number is given, it is used as the start offset, and the
end offset is set to the end of the subject string. For more details of
REG_STARTEND, see the
.\" HREF
\fBpcre2posix\fP
.\"
documentation.
.
.
.SS "Setting match controls"
@ -1793,6 +1807,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 01 June 2017
Last updated: 03 June 2017
Copyright (c) 1997-2017 University of Cambridge.
.fi

View File

@ -338,8 +338,8 @@ if (rc >= 0)
if ((size_t)rc > nmatch) rc = (int)nmatch;
for (i = 0; i < (size_t)rc; i++)
{
pmatch[i].rm_so = ovector[i*2];
pmatch[i].rm_eo = ovector[i*2+1];
pmatch[i].rm_so = ovector[i*2] + so;
pmatch[i].rm_eo = ovector[i*2+1] + so;
}
for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
return 0;

View File

@ -186,7 +186,7 @@ void vms_setsymbol( char *, char *, int );
#endif
#endif
#define CFORE_UNSET UINT32_MAX /* Unset value for cfail/cerror fields */
#define CFORE_UNSET UINT32_MAX /* Unset value for startend/cfail/cerror fields */
#define CONVERT_UNSET UINT32_MAX /* Unset value for convert_type field */
#define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */
#define DEFAULT_OVECCOUNT 15 /* Default ovector count */
@ -538,6 +538,7 @@ typedef struct datctl { /* Structure for data line modifiers. */
uint32_t control; /* Must be in same position as patctl */
uint32_t control2; /* Must be in same position as patctl */
uint8_t replacement[REPLACE_MODSIZE]; /* So must this */
uint32_t startend[2];
uint32_t cerror[2];
uint32_t cfail[2];
int32_t callout_data;
@ -662,6 +663,7 @@ static modstruct modlist[] = {
{ "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) },
{ "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) },
{ "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) },
{ "posix_startend", MOD_DAT, MOD_IN2, 0, DO(startend) },
{ "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) },
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
@ -6660,6 +6662,14 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
}
}
if (dat_datctl.startend[0] != CFORE_UNSET)
{
pmatch[0].rm_so = dat_datctl.startend[0];
pmatch[0].rm_eo = (dat_datctl.startend[1] != 0)?
dat_datctl.startend[1] : len;
eflags |= REG_STARTEND;
}
if ((dat_datctl.options & PCRE2_NOTBOL) != 0) eflags |= REG_NOTBOL;
if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL;
if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY;
@ -6713,6 +6723,9 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
/* Handle matching via the native interface. Check for consistency of
modifiers. */
if (dat_datctl.startend[0] != CFORE_UNSET)
fprintf(outfile, "** \\=posix_startend ignored for non-POSIX matching\n");
/* ALLUSEDTEXT is not supported with JIT, but JIT is not used with DFA
matching, even if the JIT compiler was used. */
@ -7903,6 +7916,7 @@ memset(&def_datctl, 0, sizeof(datctl));
def_datctl.oveccount = DEFAULT_OVECCOUNT;
def_datctl.copy_numbers[0] = -1;
def_datctl.get_numbers[0] = -1;
def_datctl.startend[0] = def_datctl.startend[1] = CFORE_UNSET;
def_datctl.cerror[0] = def_datctl.cerror[1] = CFORE_UNSET;
def_datctl.cfail[0] = def_datctl.cfail[1] = CFORE_UNSET;

10
testdata/testinput18 vendored
View File

@ -113,4 +113,14 @@
/(?=(a\K))/
a
/^d(e)$/posix
acdef\=posix_startend=2:4
acde\=posix_startend=2
\= Expect no match
acdef
acdef\=posix_startend=2
/^a\x{00}b$/posix
a\x{00}b\=posix_startend=0:3
# End of testdata/testinput18

17
testdata/testoutput18 vendored
View File

@ -174,4 +174,21 @@ Start of matched string is beyond its end - displaying from end to start.
0: a
1: a
/^d(e)$/posix
acdef\=posix_startend=2:4
0: de
1: e
acde\=posix_startend=2
0: de
1: e
\= Expect no match
acdef
No match: POSIX code 17: match failed
acdef\=posix_startend=2
No match: POSIX code 17: match failed
/^a\x{00}b$/posix
a\x{00}b\=posix_startend=0:3
0: a\x00b
# End of testdata/testinput18