diff --git a/ChangeLog b/ChangeLog index 6e9a2c3..25ced58 100644 --- a/ChangeLog +++ b/ChangeLog @@ -179,6 +179,9 @@ deeply. (Compare item 10.23/36.) This should fix oss-fuzz #1761. 37. Implement PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL. +38. Fix returned offsets from regexec() when REG_STARTEND is used with a +starting offset greater than zero. + Version 10.23 14-February-2017 ------------------------------ diff --git a/doc/pcre2posix.3 b/doc/pcre2posix.3 index 70a86d8..b37046b 100644 --- a/doc/pcre2posix.3 +++ b/doc/pcre2posix.3 @@ -1,4 +1,4 @@ -.TH PCRE2POSIX 3 "31 January 2016" "PCRE2 10.22" +.TH PCRE2POSIX 3 "03 June 2017" "PCRE2 10.30" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SYNOPSIS" @@ -204,15 +204,21 @@ function. .sp REG_STARTEND .sp -The string is considered to start at \fIstring\fP + \fIpmatch[0].rm_so\fP and -to have a terminating NUL located at \fIstring\fP + \fIpmatch[0].rm_eo\fP -(there need not actually be a NUL at that location), regardless of the value of -\fInmatch\fP. This is a BSD extension, compatible with but not specified by -IEEE Standard 1003.2 (POSIX.2), and should be used with caution in software -intended to be portable to other systems. Note that a non-zero \fIrm_so\fP does -not imply REG_NOTBOL; REG_STARTEND affects only the location of the string, not -how it is matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are -mutually exclusive; the error REG_INVARG is returned. +When this option is set, the string is considered to start at \fIstring\fP + +\fIpmatch[0].rm_so\fP and to have a terminating NUL located at \fIstring\fP + +\fIpmatch[0].rm_eo\fP (there need not actually be a NUL at that location), +regardless of the value of \fInmatch\fP. However, the offsets of the matched +string and any captured substrings are still given relative to the start of +\fIstring\fP. 
(Before PCRE2 release 10.30 these were given relative to +\fIstring\fP + \fIpmatch[0].rm_so\fP, but this differs from other +implementations.) +.P +This is a BSD extension, compatible with but not specified by IEEE Standard +1003.2 (POSIX.2), and should be used with caution in software intended to be +portable to other systems. Note that a non-zero \fIrm_so\fP does not imply +REG_NOTBOL; REG_STARTEND affects only the location of the string, not how it is +matched. Setting REG_STARTEND and passing \fIpmatch\fP as NULL are mutually +exclusive; the error REG_INVARG is returned. .P If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of @@ -271,6 +277,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 31 January 2016 -Copyright (c) 1997-2016 University of Cambridge. +Last updated: 03 June 2017 +Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 26d395f..abd42d0 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "01 June 2017" "PCRE 10.30" +.TH PCRE2TEST 1 "03 June 2017" "PCRE 10.30" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -1046,6 +1046,20 @@ wrapper API to be used, the only option-setting modifiers that have any effect are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. The other modifiers are ignored, with a warning message. +.P +There is one additional modifier that can be used with the POSIX wrapper. It is +ignored (with a warning) if used for non-POSIX matching. +.sp + posix_startend=<n>[:<m>] +.sp +This causes the subject string to be passed to \fBregexec()\fP using the +REG_STARTEND option, which uses offsets to restrict which part of the string is +searched.
If only one number is given, the end offset is passed as the end of +the subject string. For more detail of REG_STARTEND, see the +.\" HREF +\fBpcre2posix\fP +.\" +documentation. . . .SS "Setting match controls" @@ -1793,6 +1807,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 01 June 2017 +Last updated: 03 June 2017 Copyright (c) 1997-2017 University of Cambridge. .fi diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 4ecc701..8be969a 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -338,8 +338,8 @@ if (rc >= 0) if ((size_t)rc > nmatch) rc = (int)nmatch; for (i = 0; i < (size_t)rc; i++) { - pmatch[i].rm_so = ovector[i*2]; - pmatch[i].rm_eo = ovector[i*2+1]; + pmatch[i].rm_so = ovector[i*2] + so; + pmatch[i].rm_eo = ovector[i*2+1] + so; } for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; return 0; diff --git a/src/pcre2test.c b/src/pcre2test.c index 8eafadf..5a2b86f 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -186,7 +186,7 @@ void vms_setsymbol( char *, char *, int ); #endif #endif -#define CFORE_UNSET UINT32_MAX /* Unset value for cfail/cerror fields */ +#define CFORE_UNSET UINT32_MAX /* Unset value for startend/cfail/cerror fields */ #define CONVERT_UNSET UINT32_MAX /* Unset value for convert_type field */ #define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */ #define DEFAULT_OVECCOUNT 15 /* Default ovector count */ @@ -538,6 +538,7 @@ typedef struct datctl { /* Structure for data line modifiers. 
*/ uint32_t control; /* Must be in same position as patctl */ uint32_t control2; /* Must be in same position as patctl */ uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ + uint32_t startend[2]; uint32_t cerror[2]; uint32_t cfail[2]; int32_t callout_data; @@ -662,6 +663,7 @@ static modstruct modlist[] = { { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, + { "posix_startend", MOD_DAT, MOD_IN2, 0, DO(startend) }, { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, { "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) }, @@ -6660,6 +6662,14 @@ if ((pat_patctl.control & CTL_POSIX) != 0) } } + if (dat_datctl.startend[0] != CFORE_UNSET) + { + pmatch[0].rm_so = dat_datctl.startend[0]; + pmatch[0].rm_eo = (dat_datctl.startend[1] != 0)? + dat_datctl.startend[1] : len; + eflags |= REG_STARTEND; + } + if ((dat_datctl.options & PCRE2_NOTBOL) != 0) eflags |= REG_NOTBOL; if ((dat_datctl.options & PCRE2_NOTEOL) != 0) eflags |= REG_NOTEOL; if ((dat_datctl.options & PCRE2_NOTEMPTY) != 0) eflags |= REG_NOTEMPTY; @@ -6713,6 +6723,9 @@ if ((pat_patctl.control & CTL_POSIX) != 0) /* Handle matching via the native interface. Check for consistency of modifiers. */ +if (dat_datctl.startend[0] != CFORE_UNSET) + fprintf(outfile, "** \\=posix_startend ignored for non-POSIX matching\n"); + /* ALLUSEDTEXT is not supported with JIT, but JIT is not used with DFA matching, even if the JIT compiler was used. 
*/ @@ -7903,6 +7916,7 @@ memset(&def_datctl, 0, sizeof(datctl)); def_datctl.oveccount = DEFAULT_OVECCOUNT; def_datctl.copy_numbers[0] = -1; def_datctl.get_numbers[0] = -1; +def_datctl.startend[0] = def_datctl.startend[1] = CFORE_UNSET; def_datctl.cerror[0] = def_datctl.cerror[1] = CFORE_UNSET; def_datctl.cfail[0] = def_datctl.cfail[1] = CFORE_UNSET; diff --git a/testdata/testinput18 b/testdata/testinput18 index 7fc9b12..ececc06 100644 --- a/testdata/testinput18 +++ b/testdata/testinput18 @@ -113,4 +113,14 @@ /(?=(a\K))/ a +/^d(e)$/posix + acdef\=posix_startend=2:4 + acde\=posix_startend=2 +\= Expect no match + acdef + acdef\=posix_startend=2 + +/^a\x{00}b$/posix + a\x{00}b\=posix_startend=0:3 + # End of testdata/testinput18 diff --git a/testdata/testoutput18 b/testdata/testoutput18 index 6f68ca1..96386da 100644 --- a/testdata/testoutput18 +++ b/testdata/testoutput18 @@ -174,4 +174,21 @@ Start of matched string is beyond its end - displaying from end to start. 0: a 1: a +/^d(e)$/posix + acdef\=posix_startend=2:4 + 0: de + 1: e + acde\=posix_startend=2 + 0: de + 1: e +\= Expect no match + acdef +No match: POSIX code 17: match failed + acdef\=posix_startend=2 +No match: POSIX code 17: match failed + +/^a\x{00}b$/posix + a\x{00}b\=posix_startend=0:3 + 0: a\x00b + # End of testdata/testinput18