From 6c1c817438f271c770ff94c31ef29839cc6087ad Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sun, 31 Jan 2016 19:14:15 +0000 Subject: [PATCH] Don't set PCRE2_NO_AUTO_CAPTURE when REG_NOSUB is passed to regcomp(). --- ChangeLog | 12 ++++++++++++ doc/pcre2api.3 | 10 ++++++---- doc/pcre2posix.3 | 33 +++++++++++++++++---------------- doc/pcre2test.1 | 30 ++++++++++++++++++------------ src/pcre2posix.c | 11 +++++------ src/pcre2posix.h | 1 + src/pcre2test.c | 29 ++++++++++++++++------------- testdata/testinput18 | 7 +++++-- testdata/testoutput18 | 8 ++++++-- 9 files changed, 86 insertions(+), 55 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8462887..91ea3b3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -22,6 +22,18 @@ that it matches only unknown objects. 5. Updated the maintenance script maint/ManyConfigTests to make it easier to select individual groups of tests. +6. When the POSIX wrapper function regcomp() is called, the REG_NOSUB option +used to set PCRE2_NO_AUTO_CAPTURE when calling pcre2_compile(). However, this +disables the use of back references (and subroutine calls), which are supported +by other implementations of regcomp() with REG_NOSUB. Therefore, REG_NOSUB no +longer causes PCRE2_NO_AUTO_CAPTURE to be set, though it still ignores nmatch +and pmatch when regexec() is called. + +7. Because of 6 above, pcre2test has been modified with a new modifier called +posix_nosub, to call regcomp() with REG_NOSUB. Previously the no_auto_capture +modifier had this effect. That option is now ignored when the POSIX API is in +use.
+ Version 10.21 12-January-2016 ----------------------------- diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index b29f7b0..b34fd0b 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "16 December 2015" "PCRE2 10.21" +.TH PCRE2API 3 "31 January 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1255,7 +1255,9 @@ If this option is set, it disables the use of numbered capturing parentheses in the pattern. Any opening parenthesis that is not followed by ? behaves as if it were followed by ?: but named parentheses can still be used for capturing (and they acquire numbers in the usual way). There is no equivalent of this option -in Perl. +in Perl. Note that, if this option is set, references to capturing groups (back +references or recursion/subroutine calls) may only refer to named groups, +though the reference can be by name or by number. .sp PCRE2_NO_AUTO_POSSESS .sp @@ -3166,6 +3168,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 16 December 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 31 January 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/doc/pcre2posix.3 b/doc/pcre2posix.3 index 833e96c..1c5516b 100644 --- a/doc/pcre2posix.3 +++ b/doc/pcre2posix.3 @@ -1,4 +1,4 @@ -.TH PCRE2POSIX 3 "29 November 2015" "PCRE2 10.21" +.TH PCRE2POSIX 3 "31 January 2016" "PCRE2 10.22" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "SYNOPSIS" @@ -28,7 +28,7 @@ expression 8-bit library. See the \fBpcre2api\fP .\" documentation for a description of PCRE2's native API, which contains much -additional functionality. There is no POSIX-style wrapper for PCRE2's 16-bit +additional functionality. There are no POSIX-style wrappers for PCRE2's 16-bit and 32-bit libraries. .P The functions described here are just wrapper functions that ultimately call @@ -44,9 +44,9 @@ value zero. 
This has no effect, but since programs that are written to the POSIX interface often use it, this makes it easier to slot in PCRE2 as a replacement library. Other POSIX options are not even defined. .P -There are also some other options that are not defined by POSIX. These have -been added at the request of users who want to make use of certain -PCRE2-specific features via the POSIX calling interface. +There are also some options that are not defined by POSIX. These have been +added at the request of users who want to make use of certain PCRE2-specific +features via the POSIX calling interface. .P When PCRE2 is called via these functions, it is only the API that is POSIX-like in style. The syntax and semantics of the regular expressions themselves are @@ -95,11 +95,11 @@ defined POSIX behaviour for REG_NEWLINE (see the following section). .sp REG_NOSUB .sp -The PCRE2_NO_AUTO_CAPTURE option is set when the regular expression is passed -for compilation to the native function. In addition, when a pattern that is -compiled with this flag is passed to \fBregexec()\fP for matching, the -\fInmatch\fP and \fIpmatch\fP arguments are ignored, and no captured strings -are returned. +When a pattern that is compiled with this flag is passed to \fBregexec()\fP for +matching, the \fInmatch\fP and \fIpmatch\fP arguments are ignored, and no +captured strings are returned. Versions of the PCRE2 library prior to 10.22 used +to set the PCRE2_NO_AUTO_CAPTURE compile option, but this no longer happens +because it disables the use of back references. .sp REG_UCP .sp @@ -216,12 +216,13 @@ mutually exclusive; the error REG_INVARG is returned. .P If the pattern was compiled with the REG_NOSUB flag, no data about any matched strings is returned. The \fInmatch\fP and \fIpmatch\fP arguments of -\fBregexec()\fP are ignored. +\fBregexec()\fP are ignored (except possibly as input for REG_STARTEND).
.P -If the value of \fInmatch\fP is zero, or if the value \fIpmatch\fP is NULL, -no data about any matched strings is returned. +The value of \fInmatch\fP may be zero, and the value of \fIpmatch\fP may be NULL +(unless REG_STARTEND is set); in both these cases no data about any matched +strings is returned. .P -Otherwise,the portion of the string that was matched, and also any captured +Otherwise, the portion of the string that was matched, and also any captured substrings, are returned via the \fIpmatch\fP argument, which points to an array of \fInmatch\fP structures of type \fIregmatch_t\fP, containing the members \fIrm_so\fP and \fIrm_eo\fP. These contain the byte offset to the first @@ -270,6 +271,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 29 November 2015 -Copyright (c) 1997-2015 University of Cambridge. +Last updated: 31 January 2016 +Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index ce1bc08..47be7f6 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "29 January 2016" "PCRE 10.22" +.TH PCRE2TEST 1 "31 January 2016" "PCRE2 10.22" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -535,6 +535,7 @@ about the pattern: null_context compile with a NULL context parens_nest_limit= set maximum parentheses depth posix use the POSIX API + posix_nosub use the POSIX API with REG_NOSUB push push compiled pattern onto the stack stackguard= test the stackguard feature tables=[0|1|2] select internal tables @@ -791,18 +792,19 @@ variable can hold (essentially unlimited). .SS "Using the POSIX wrapper API" .rs .sp -The \fB/posix\fP modifier causes \fBpcre2test\fP to call PCRE2 via the POSIX -wrapper API rather than its native API. This supports only the 8-bit library.
-Note that it does not imply POSIX matching semantics; for more detail see the +The \fB/posix\fP and \fBposix_nosub\fP modifiers cause \fBpcre2test\fP to call +PCRE2 via the POSIX wrapper API rather than its native API. When +\fBposix_nosub\fP is used, the POSIX option REG_NOSUB is passed to +\fBregcomp()\fP. The POSIX wrapper supports only the 8-bit library. Note that +it does not imply POSIX matching semantics; for more detail see the .\" HREF \fBpcre2posix\fP .\" -documentation. When the POSIX API is being used, the following pattern -modifiers set options for the \fBregcomp()\fP function: +documentation. The following pattern modifiers set options for the +\fBregcomp()\fP function: .sp caseless REG_ICASE multiline REG_NEWLINE - no_auto_capture REG_NOSUB dotall REG_DOTALL ) ungreedy REG_UNGREEDY ) These options are not part of ucp REG_UCP ) the POSIX standard @@ -818,7 +820,8 @@ buffer is too small for the error message. If this modifier has not been set, a large buffer is used. .P The \fBaftertext\fP and \fBallaftertext\fP subject modifiers work as described -below. All other modifiers cause an error. +below. All other modifiers are either ignored, with a warning message, or cause +an error. . . .SS "Testing the stack guard feature" @@ -937,7 +940,7 @@ If the \fB/posix\fP modifier was present on the pattern, causing the POSIX wrapper API to be used, the only option-setting modifiers that have any effect are \fBnotbol\fP, \fBnotempty\fP, and \fBnoteol\fP, causing REG_NOTBOL, REG_NOTEMPTY, and REG_NOTEOL, respectively, to be passed to \fBregexec()\fP. -Any other modifiers cause an error. +The other modifiers are ignored, with a warning message. . . .SS "Setting match controls" @@ -981,7 +984,10 @@ pattern. substitute_unset_empty use PCRE2_SUBSTITUTE_UNSET_EMPTY zero_terminate pass the subject as zero-terminated .sp -The effects of these modifiers are described in the following sections. +The effects of these modifiers are described in the following sections. 
When +matching via the POSIX wrapper API, the \fBaftertext\fP, \fBallaftertext\fP, +and \fBovector\fP subject modifiers work as described below. All other +modifiers are either ignored, with a warning message, or cause an error. . . .SS "Showing more text" @@ -1606,7 +1612,7 @@ modifier list containing only control modifiers .\" that act after a pattern has been compiled. In particular, \fBhex\fP, -\fBposix\fP, and \fBpush\fP are not allowed, nor are any +\fBposix\fP, \fBposix_nosub\fP, and \fBpush\fP are not allowed, nor are any .\" HTML .\" option-setting modifiers. @@ -1651,6 +1657,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 29 January 2016 +Last updated: 31 January 2016 Copyright (c) 1997-2016 University of Cambridge. .fi diff --git a/src/pcre2posix.c b/src/pcre2posix.c index 1d6e5b7..ce49876 100644 --- a/src/pcre2posix.c +++ b/src/pcre2posix.c @@ -205,11 +205,11 @@ int re_nsub = 0; if ((cflags & REG_ICASE) != 0) options |= PCRE2_CASELESS; if ((cflags & REG_NEWLINE) != 0) options |= PCRE2_MULTILINE; if ((cflags & REG_DOTALL) != 0) options |= PCRE2_DOTALL; -if ((cflags & REG_NOSUB) != 0) options |= PCRE2_NO_AUTO_CAPTURE; if ((cflags & REG_UTF) != 0) options |= PCRE2_UTF; if ((cflags & REG_UCP) != 0) options |= PCRE2_UCP; if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY; +preg->cflags = cflags; preg->re_pcre2_code = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, options, &errorcode, &erroffset, NULL); preg->re_erroffset = erroffset; @@ -234,7 +234,6 @@ if (preg->re_pcre2_code == NULL) (void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code, PCRE2_INFO_CAPTURECOUNT, &re_nsub); preg->re_nsub = (size_t)re_nsub; -if ((options & PCRE2_NO_AUTO_CAPTURE) != 0) re_nsub = -1; preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL); if (preg->re_match_data == NULL) @@ -272,11 +271,11 @@ if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE2_NOTEMPTY; ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile 
*/ -/* When no string data is being returned, or no vector has been passed in which -to put it, ensure that nmatch is zero. */ +/* When REG_NOSUB was specified, or if no vector has been passed in which to +put captured strings, ensure that nmatch is zero. This will stop any attempt to +write to pmatch. */ -if ((((pcre2_real_code *)(preg->re_pcre2_code))->compile_options & - PCRE2_NO_AUTO_CAPTURE) != 0 || pmatch == NULL) nmatch = 0; +if ((preg->cflags & REG_NOSUB) != 0 || pmatch == NULL) nmatch = 0; /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. The man page from OS X says "REG_STARTEND affects only the location of the diff --git a/src/pcre2posix.h b/src/pcre2posix.h index 44a2fd8..7b7af6c 100644 --- a/src/pcre2posix.h +++ b/src/pcre2posix.h @@ -98,6 +98,7 @@ typedef struct { void *re_match_data; size_t re_nsub; size_t re_erroffset; + int cflags; } regex_t; /* The structure in which a captured offset is returned. */ diff --git a/src/pcre2test.c b/src/pcre2test.c index 51f1298..f679327 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -425,10 +425,10 @@ so many of them that they are split into two fields. 
*/ #define CTL_MEMORY 0x00100000u #define CTL_NULLCONTEXT 0x00200000u #define CTL_POSIX 0x00400000u -#define CTL_PUSH 0x00800000u -#define CTL_STARTCHAR 0x01000000u -#define CTL_ZERO_TERMINATE 0x02000000u -/* Spare 0x04000000u */ +#define CTL_POSIX_NOSUB 0x00800000u +#define CTL_PUSH 0x01000000u +#define CTL_STARTCHAR 0x02000000u +#define CTL_ZERO_TERMINATE 0x04000000u /* Spare 0x08000000u */ /* Spare 0x10000000u */ /* Spare 0x20000000u */ @@ -600,6 +600,7 @@ static modstruct modlist[] = { { "partial_soft", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, + { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, { "recursion_limit", MOD_CTM, MOD_INT, 0, MO(recursion_limit) }, @@ -625,11 +626,11 @@ static modstruct modlist[] = { /* Controls and options that are supported for use with the POSIX interface. */ #define POSIX_SUPPORTED_COMPILE_OPTIONS ( \ - PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \ - PCRE2_UCP|PCRE2_UTF|PCRE2_UNGREEDY) + PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_MULTILINE|PCRE2_UCP|PCRE2_UTF| \ + PCRE2_UNGREEDY) #define POSIX_SUPPORTED_COMPILE_CONTROLS ( \ - CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_EXPAND|CTL_POSIX) + CTL_AFTERTEXT|CTL_ALLAFTERTEXT|CTL_EXPAND|CTL_POSIX|CTL_POSIX_NOSUB) #define POSIX_SUPPORTED_COMPILE_CONTROLS2 (0) @@ -654,10 +655,11 @@ static modstruct modlist[] = { /* Controls that are forbidden with #pop. */ -#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_PUSH) +#define NOTPOP_CONTROLS (CTL_HEXPAT|CTL_POSIX|CTL_POSIX_NOSUB|CTL_PUSH) /* Pattern controls that are mutually exclusive. At present these are all in -the first control word. */ +the first control word. Note that CTL_POSIX_NOSUB is always accompanied by +CTL_POSIX, so it doesn't need its own entries. 
*/ static uint32_t exclusive_pat_controls[] = { CTL_POSIX | CTL_HEXPAT, @@ -811,7 +813,7 @@ static void *patstack[PATSTACKSIZE]; static int patstacknext = 0; #ifdef SUPPORT_PCRE2_8 -static regex_t preg = { NULL, NULL, 0, 0 }; +static regex_t preg = { NULL, NULL, 0, 0, 0 }; #endif static int *dfa_workspace = NULL; @@ -3580,7 +3582,7 @@ Returns: nothing static void show_controls(uint32_t controls, uint32_t controls2, const char *before) { -fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "", ((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "", @@ -3607,6 +3609,7 @@ fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s ((controls & CTL_NL_SET) != 0)? " newline" : "", ((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "", ((controls & CTL_POSIX) != 0)? " posix" : "", + ((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "", ((controls & CTL_PUSH) != 0)? " push" : "", ((controls & CTL_STARTCHAR) != 0)? " startchar" : "", ((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "", @@ -4702,11 +4705,11 @@ if ((pat_patctl.control & CTL_POSIX) != 0) up a match_data block to be used for all matches. 
*/ if (utf) cflags |= REG_UTF; + if ((pat_patctl.control & CTL_POSIX_NOSUB) != 0) cflags |= REG_NOSUB; if ((pat_patctl.options & PCRE2_UCP) != 0) cflags |= REG_UCP; if ((pat_patctl.options & PCRE2_CASELESS) != 0) cflags |= REG_ICASE; if ((pat_patctl.options & PCRE2_MULTILINE) != 0) cflags |= REG_NEWLINE; if ((pat_patctl.options & PCRE2_DOTALL) != 0) cflags |= REG_DOTALL; - if ((pat_patctl.options & PCRE2_NO_AUTO_CAPTURE) != 0) cflags |= REG_NOSUB; if ((pat_patctl.options & PCRE2_UNGREEDY) != 0) cflags |= REG_UNGREEDY; rc = regcomp(&preg, (char *)pbuffer8, cflags); @@ -5829,7 +5832,7 @@ if ((pat_patctl.control & CTL_POSIX) != 0) (void)regerror(rc, &preg, (char *)pbuffer8, pbuffer8_size); fprintf(outfile, "No match: POSIX code %d: %s\n", rc, pbuffer8); } - else if ((pat_patctl.options & PCRE2_NO_AUTO_CAPTURE) != 0) + else if ((pat_patctl.control & CTL_POSIX_NOSUB) != 0) fprintf(outfile, "Matched with REG_NOSUB\n"); else if (dat_datctl.oveccount == 0) fprintf(outfile, "Matched without capture\n"); diff --git a/testdata/testinput18 b/testdata/testinput18 index c75b842..0fe856d 100644 --- a/testdata/testinput18 +++ b/testdata/testinput18 @@ -68,12 +68,15 @@ ab=cd ab\ncd -/a(b)c/no_auto_capture +/a(b)c/posix_nosub abc -/a(?Pb)c/no_auto_capture +/a(?Pb)c/posix_nosub abc +/(a)\1/posix_nosub + zaay + /a?|b?/ abc \= Expect no match diff --git a/testdata/testoutput18 b/testdata/testoutput18 index 0a5ffff..40ee960 100644 --- a/testdata/testoutput18 +++ b/testdata/testoutput18 @@ -105,14 +105,18 @@ No match: POSIX code 17: match failed ab\ncd 0: ab\x0acd -/a(b)c/no_auto_capture +/a(b)c/posix_nosub abc Matched with REG_NOSUB -/a(?Pb)c/no_auto_capture +/a(?Pb)c/posix_nosub abc Matched with REG_NOSUB +/(a)\1/posix_nosub + zaay +Matched with REG_NOSUB + /a?|b?/ abc 0: a