Fix matching offsets from regexec() in the POSIX wrapper when called with
REG_STARTEND and a starting offset greater than zero.
This commit is contained in:
Philip.Hazel 2017-06-03 16:42:58 +00:00
parent 8e4b992682
commit e4c86e2ced
7 changed files with 81 additions and 17 deletions

View File

@ -179,6 +179,9 @@ deeply. (Compare item 10.23/36.) This should fix oss-fuzz #1761.
37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. 37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL.
38. Fix returned offsets from regexec() when REG_STARTEND is used with a
starting offset greater than zero.
Version 10.23 14-February-2017 Version 10.23 14-February-2017
------------------------------ ------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2POSIX 3 "31 January 2016" "PCRE2 10.22" .TH PCRE2POSIX 3 "03 June 2017" "PCRE2 10.30"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "SYNOPSIS" .SH "SYNOPSIS"
@ -204,15 +204,21 @@ function.
.sp .sp
REG_STARTEND REG_STARTEND
.sp .sp
-The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and
-to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP
-(there need not actually be a NUL at that location), regardless of the value of
-\fInmatch\fP. This is a BSD extension, compatible with but not specified by
-IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software
-intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does
-not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not
-how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are
-mutually exclusive; the error REG_INVARG is returned.
+When this option is set, the string is considered to start at \fIstring\fP +
+\fIpmatch[0].rm_so\fP and to have a terminating NUL located at \fIstring\fP +
+\fIpmatch[0].rm_eo\fP (there need not actually be a NUL at that location),
+regardless of the value of \fInmatch\fP. However, the offsets of the matched
+string and any captured substrings are still given relative to the start of
+\fIstring\fP. (Before PCRE2 release 10.30 these were given relative to
+\fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other
+implementations.)
+.P
+This is a BSD extension, compatible with but not specified by IEEE Standard
+1003.2 (POSIX.2), and should be used with caution in software intended to be
+portable to other systems. Note that a non-zero \fIrm_so\fP does not imply
+REG_NOTBOL; REG_STARTEND affects only the location of the string, not how it is
+matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are mutually
+exclusive; the error REG_INVARG is returned.
.P .P
If the pattern was compiled with the REG_NOSUB flag, no data about any matched If the pattern was compiled with the REG_NOSUB flag, no data about any matched
strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of
@ -271,6 +277,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 31 January 2016 Last updated: 03 June 2017
Copyright (c) 1997-2016 University of Cambridge. Copyright (c) 1997-2017 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2TEST 1 "01 June 2017" "PCRE 10.30" .TH PCRE2TEST 1 "03 June 2017" "PCRE 10.30"
.SH NAME .SH NAME
pcre2test - a program for testing Perl-compatible regular expressions. pcre2test - a program for testing Perl-compatible regular expressions.
.SH SYNOPSIS .SH SYNOPSIS
@ -1046,6 +1046,20 @@ wrapper API to be used, the only option-setting modifiers that have any effect
are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL,
REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP.
The other modifiers are ignored, with a warning message. The other modifiers are ignored, with a warning message.
.P
There is one additional modifier that can be used with the POSIX wrapper. It is
ignored (with a warning) if used for non-POSIX matching.
.sp
posix_startend=<n>[:<m>]
.sp
This causes the subject string to be passed to \fBregexec()\fP using the
REG_STARTEND option, which uses offsets to restrict which part of the string is
searched. If only one number is given, the end offset is passed as the end of
the subject string. For more detail of REG_STARTEND, see the
.\" HREF
\fBpcre2posix\fP
.\"
documentation.
. .
. .
.SS "Setting match controls" .SS "Setting match controls"
@ -1793,6 +1807,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 01 June 2017 Last updated: 03 June 2017
Copyright (c) 1997-2017 University of Cambridge. Copyright (c) 1997-2017 University of Cambridge.
.fi .fi

View File

@ -338,8 +338,8 @@ if (rc >= 0)
if ((size_t)rc > nmatch) rc = (int)nmatch; if ((size_t)rc > nmatch) rc = (int)nmatch;
for (i = 0; i < (size_t)rc; i++) for (i = 0; i < (size_t)rc; i++)
{ {
pmatch[i].rm_so = ovector[i*2]; pmatch[i].rm_so = ovector[i*2] + so;
pmatch[i].rm_eo = ovector[i*2+1]; pmatch[i].rm_eo = ovector[i*2+1] + so;
} }
for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
return 0; return 0;

View File

@ -186,7 +186,7 @@ void vms_setsymbol( char *, char *, int );
#endif #endif
#endif #endif
#define CFORE_UNSET UINT32_MAX /* Unset value for cfail/cerror fields */ #define CFORE_UNSET UINT32_MAX /* Unset value for startend/cfail/cerror fields */
#define CONVERT_UNSET UINT32_MAX /* Unset value for convert_type field */ #define CONVERT_UNSET UINT32_MAX /* Unset value for convert_type field */
#define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */ #define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */
#define DEFAULT_OVECCOUNT 15 /* Default ovector count */ #define DEFAULT_OVECCOUNT 15 /* Default ovector count */
@ -538,6 +538,7 @@ typedef struct datctl { /* Structure for data line modifiers. */
uint32_t control; /* Must be in same position as patctl */ uint32_t control; /* Must be in same position as patctl */
uint32_t control2; /* Must be in same position as patctl */ uint32_t control2; /* Must be in same position as patctl */
uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */
uint32_t startend[2];
uint32_t cerror[2]; uint32_t cerror[2];
uint32_t cfail[2]; uint32_t cfail[2];
int32_t callout_data; int32_t callout_data;
@ -662,6 +663,7 @@ static modstruct modlist[] = {
{ "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) },
{ "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) },
{ "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) },
{ "posix_startend", MOD_DAT, MOD_IN2, 0, DO(startend) },
{ "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) },
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) }, { "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
@ -6660,6 +6662,14 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
} }
} }
if (dat_datctl.startend[0] != CFORE_UNSET)
{
pmatch[0].rm_so = dat_datctl.startend[0];
pmatch[0].rm_eo = (dat_datctl.startend[1] != 0)?
dat_datctl.startend[1] : len;
eflags |= REG_STARTEND;
}
if ((dat_datctl.options & PCRE2_NOTBOL) != 0) eflags |= REG_NOTBOL; if ((dat_datctl.options & PCRE2_NOTBOL) != 0) eflags |= REG_NOTBOL;
if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL; if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL;
if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY; if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY;
@ -6713,6 +6723,9 @@ if ((pat_patctl.control & CTL_POSIX) != 0)
/* Handle matching via the native interface. Check for consistency of /* Handle matching via the native interface. Check for consistency of
modifiers. */ modifiers. */
if (dat_datctl.startend[0] != CFORE_UNSET)
fprintf(outfile, "** \\=posix_startend ignored for non-POSIX matching\n");
/* ALLUSEDTEXT is not supported with JIT, but JIT is not used with DFA /* ALLUSEDTEXT is not supported with JIT, but JIT is not used with DFA
matching, even if the JIT compiler was used. */ matching, even if the JIT compiler was used. */
@ -7903,6 +7916,7 @@ memset(&def_datctl, 0, sizeof(datctl));
def_datctl.oveccount = DEFAULT_OVECCOUNT; def_datctl.oveccount = DEFAULT_OVECCOUNT;
def_datctl.copy_numbers[0] = -1; def_datctl.copy_numbers[0] = -1;
def_datctl.get_numbers[0] = -1; def_datctl.get_numbers[0] = -1;
def_datctl.startend[0] = def_datctl.startend[1] = CFORE_UNSET;
def_datctl.cerror[0] = def_datctl.cerror[1] = CFORE_UNSET; def_datctl.cerror[0] = def_datctl.cerror[1] = CFORE_UNSET;
def_datctl.cfail[0] = def_datctl.cfail[1] = CFORE_UNSET; def_datctl.cfail[0] = def_datctl.cfail[1] = CFORE_UNSET;

10
testdata/testinput18 vendored
View File

@ -113,4 +113,14 @@
/(?=(a\K))/ /(?=(a\K))/
a a
/^d(e)$/posix
acdef\=posix_startend=2:4
acde\=posix_startend=2
\= Expect no match
acdef
acdef\=posix_startend=2
/^a\x{00}b$/posix
a\x{00}b\=posix_startend=0:3
# End of testdata/testinput18 # End of testdata/testinput18

17
testdata/testoutput18 vendored
View File

@ -174,4 +174,21 @@ Start of matched string is beyond its end - displaying from end to start.
0: a 0: a
1: a 1: a
/^d(e)$/posix
acdef\=posix_startend=2:4
0: de
1: e
acde\=posix_startend=2
0: de
1: e
\= Expect no match
acdef
No match: POSIX code 17: match failed
acdef\=posix_startend=2
No match: POSIX code 17: match failed
/^a\x{00}b$/posix
a\x{00}b\=posix_startend=0:3
0: a\x00b
# End of testdata/testinput18 # End of testdata/testinput18