diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 7a359b6..8372873 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "01 October 2014" "PCRE2 10.00" +.TH PCRE2API 3 "05 October 2014" "PCRE2 10.00" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1650,13 +1650,15 @@ particular, the match data block contains a vector of offsets into the subject string that define the matched part of the subject and any substrings that were capured. This is know as the \fIovector\fP. .P -Before calling \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP you must create a +Before calling \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP you must create a match data block by calling one of the creation functions above. For \fBpcre2_match_data_create()\fP, the first argument is the number of pairs of offsets in the \fIovector\fP. One pair of offsets is required to identify the string that matched the whole pattern, with another pair for each captured -substring. For example, a value of 4 creates enough space to record the -matched portion of the subject plus three captured substrings. +substring. For example, a value of 4 creates enough space to record the matched +portion of the subject plus three captured substrings. A minimum of 1 +pair is imposed by \fBpcre2_match_data_create()\fP, so it is always possible to +return the overall matched string. .P For \fBpcre2_match_data_create_from_pattern()\fP, the first argument is a pointer to a compiled pattern. In this case the ovector is created to be @@ -2015,13 +2017,13 @@ operation, it is the last portion of the string that it matched that is returned. .P If the ovector is too small to hold all the captured substring offsets, as much -as possible is filled in, and the function returns a value of zero. If neither -the actual string matched nor any captured substrings are of interest, -\fBpcre2_match()\fP may be called with a match data block whose ovector is of -zero length. 
However, if the pattern contains back references and the -\fIovector\fP is not big enough to remember the related substrings, PCRE2 has -to get additional memory for use during matching. Thus it is usually advisable -to set up a match data block containing an ovector of reasonable size. +as possible is filled in, and the function returns a value of zero. If captured +substrings are not of interest, \fBpcre2_match()\fP may be called with a match +data block whose ovector is of minimum length (that is, one pair). However, if +the pattern contains back references and the \fIovector\fP is not big enough to +remember the related substrings, PCRE2 has to get additional memory for use +during matching. Thus it is usually advisable to set up a match data block +containing an ovector of reasonable size. .P It is possible for capturing subpattern number \fIn+1\fP to match some part of the subject when subpattern \fIn\fP has not been used at all. For example, if @@ -2652,6 +2654,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 01 October 2014 +Last updated: 05 October 2014 Copyright (c) 1997-2014 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index dcd0930..05f2b8f 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "19 August 2014" "PCRE 10.00" +.TH PCRE2TEST 1 "05 October 2014" "PCRE 10.00" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -881,6 +881,12 @@ The \fBovector\fP modifier applies only to the subject line in which it appears, though of course it can also be used to set a default in a \fB#subject\fP command. It specifies the number of pairs of offsets that are available for storing matching information. The default is 15. +.P +At least one pair of offsets is always created by +\fBpcre2_match_data_create()\fP, for matching with PCRE2's native API, so a +value of 0 is the same as 1. 
However, a value of 0 is useful when testing the +POSIX API because it causes \fBregexec()\fP to be called with a NULL capture +vector. . . .SH "THE ALTERNATIVE MATCHING FUNCTION" @@ -1145,6 +1151,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 19 August 2014 +Last updated: 05 October 2014 Copyright (c) 1997-2014 University of Cambridge. .fi diff --git a/src/pcre2_match_data.c b/src/pcre2_match_data.c index 8fe1a5a..3db6cde 100644 --- a/src/pcre2_match_data.c +++ b/src/pcre2_match_data.c @@ -51,10 +51,14 @@ POSSIBILITY OF SUCH DAMAGE. * Create a match data block given ovector size * *************************************************/ +/* A minimum of 1 is imposed on the number of ovector triplets. */ + PCRE2_EXP_DEFN pcre2_match_data * PCRE2_CALL_CONVENTION pcre2_match_data_create(uint32_t oveccount, pcre2_general_context *gcontext) { -pcre2_match_data *yield = PRIV(memctl_malloc)( +pcre2_match_data *yield; +if (oveccount < 1) oveccount = 1; +yield = PRIV(memctl_malloc)( sizeof(pcre2_match_data) + 3*oveccount*sizeof(PCRE2_SIZE), (pcre2_memctl *)gcontext); yield->oveccount = oveccount; diff --git a/src/pcre2test.c b/src/pcre2test.c index 32837b8..8ece7d5 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -2531,7 +2531,7 @@ switch (m->which) case MOD_CTC: /* Compile context modifier */ if (ctx == CTX_DEFPAT) field = PTR(default_pat_context); else if (ctx == CTX_PAT) field = PTR(pat_context); - break; + break; case MOD_CTM: /* Match context modifier */ if (ctx == CTX_DEFDAT) field = PTR(default_dat_context); @@ -3705,8 +3705,8 @@ if (TEST(compiled_code, ==, NULL)) /* Call the JIT compiler if requested. */ if (pat_patctl.jit != 0) - { - PCRE2_JIT_COMPILE(compiled_code, pat_patctl.jit); + { + PCRE2_JIT_COMPILE(compiled_code, pat_patctl.jit); } /* Output code size and other information if requested. 
*/ @@ -4385,11 +4385,10 @@ if ((dat_datctl.control & (CTL_DFA|CTL_FINDLIMITS)) == (CTL_DFA|CTL_FINDLIMITS)) dat_datctl.control &= ~CTL_FINDLIMITS; } -if ((dat_datctl.control & CTL_ANYGLOB) != 0 && dat_datctl.oveccount < 1) - { - printf("** Global matching requires a non-zero ovector count: ignored\n"); - dat_datctl.control &= ~CTL_ANYGLOB; - } +/* As pcre2_match_data_create() imposes a minimum of 1 on the ovector count, we +must do so too. */ + +if (dat_datctl.oveccount < 1) dat_datctl.oveccount = 1; /* Enable display of malloc/free if wanted. */ @@ -4438,28 +4437,28 @@ else PCRE2_MATCH_DATA_FREE(match_data); PCRE2_MATCH_DATA_CREATE(match_data, max_oveccount, NULL); } - + /* Loop for global matching */ for (gmatched = 0;; gmatched++) { int capcount; PCRE2_SIZE *ovector; - PCRE2_SIZE ovecsave[2]; + PCRE2_SIZE ovecsave[2]; jit_was_used = FALSE; ovector = FLD(match_data, ovector); - + /* After the first time round a global loop, save the current ovector[0,1] so - that we can check that they do change each time. Otherwise a matching bug + that we can check that they do change each time. Otherwise a matching bug that returns the same string causes an infinite loop. It has happened! */ if (gmatched > 0) - { + { ovecsave[0] = ovector[0]; - ovecsave[1] = ovector[1]; - } - + ovecsave[1] = ovector[1]; + } + /* Do timing if required. */ if (timeitm > 0) @@ -4564,7 +4563,7 @@ for (gmatched = 0;; gmatched++) PCRE2_SIZE rightchar = FLD(match_data, rightchar); /* This is a check against a lunatic return value. */ - + if (capcount > (int)dat_datctl.oveccount) { fprintf(outfile, @@ -4577,20 +4576,20 @@ for (gmatched = 0;; gmatched++) dat_datctl.control &= ~CTL_ANYGLOB; /* Break g/G loop */ } } - - /* If this is not the first time round a global loop, check that the - returned string has changed. If not, there is a bug somewhere and we must + + /* If this is not the first time round a global loop, check that the + returned string has changed. 
If not, there is a bug somewhere and we must break the loop because it will go on for ever. We know that for a global - match there must be at least two elements in the ovector. This is checked + match there must be at least two elements in the ovector. This is checked above. */ - + if (gmatched > 0 && ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) { - fprintf(outfile, + fprintf(outfile, "** PCRE2 error: global repeat returned the same string as previous\n"); fprintf(outfile, "** Global loop abandoned\n"); dat_datctl.control &= ~CTL_ANYGLOB; /* Break g/G loop */ - } + } /* "allcaptures" requests showing of all captures in the pattern, to check unset ones at the end. It may be set on the pattern or the data. Implement @@ -4647,7 +4646,7 @@ for (gmatched = 0;; gmatched++) PCHARSV(pp, start, end - start, utf, outfile); } - if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) + if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) fprintf(outfile, " (JIT)"); fprintf(outfile, "\n"); @@ -4864,7 +4863,7 @@ for (gmatched = 0;; gmatched++) fprintf(outfile, ": "); PCHARSV(pp, leftchar, ulen - leftchar, utf, outfile); - if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) + if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) fprintf(outfile, " (JIT)"); fprintf(outfile, "\n"); break; /* Out of the /g loop */ @@ -4875,8 +4874,7 @@ for (gmatched = 0;; gmatched++) If that is the case, this is not necessarily the end. We want to advance the start offset, and continue. We won't be at the end of the string - that was checked before setting g_notempty. We achieve the effect by pretending that a - single character was matched. We know that match_data->oveccount is at least - 1 because that was checked above. + single character was matched. Complication arises in the case when the newline convention is "any", "crlf", or "anycrlf". 
If the previous match was at the end of a line terminated by @@ -4936,7 +4934,7 @@ for (gmatched = 0;; gmatched++) fprintf(outfile, ", mark = "); PCHARSV(CASTFLD(void *, match_data, mark), 0, -1, utf, outfile); } - if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) + if ((pat_patctl.control & CTL_JITVERIFY) != 0 && jit_was_used) fprintf(outfile, " (JIT)"); fprintf(outfile, "\n"); } diff --git a/testdata/testoutput2 b/testdata/testoutput2 index d9b457c..690139c 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -245,6 +245,7 @@ Subject length lower bound = 4 3: c abcb\=ovector=0 Matched, but too many substrings + 0: abcb abcb\=ovector=1 Matched, but too many substrings 0: abcb @@ -273,6 +274,7 @@ Subject length lower bound = 3 1: a abc\=ovector=0 Matched, but too many substrings + 0: abc abc\=ovector=1 Matched, but too many substrings 0: abc @@ -286,6 +288,7 @@ Matched, but too many substrings 3: b aba\=ovector=0 Matched, but too many substrings + 0: aba aba\=ovector=1 Matched, but too many substrings 0: aba @@ -7404,6 +7407,7 @@ Subject length lower bound = 3 No match aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4\=ovector=0 Matched, but too many substrings + 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa4 /^a.b/newline=lf a\rb @@ -10922,6 +10926,7 @@ Minimum recursion limit = 4 3: baz bazfooX\=ovector=0 Matched, but too many substrings + 0: fooX bazfooX\=ovector=1 Matched, but too many substrings 0: fooX @@ -11970,7 +11975,7 @@ Callout 2: last capture = 0 /(ab)x|ab/ ab\=ovector=0 -Matched, but too many substrings + 0: ab ab\=ovector=1 0: ab diff --git a/testdata/testoutput6 b/testdata/testoutput6 index e37369b..40e8a93 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7611,7 +7611,7 @@ Failed: error -37: invalid data in workspace for DFA restart /abcd/ abcd\=ovector=0 -Matched, but offsets vector is too small to show all matches + 0: abcd # These tests show up auto-possessification