diff --git a/ChangeLog b/ChangeLog index 308a81f..9ab6986 100644 --- a/ChangeLog +++ b/ChangeLog @@ -44,6 +44,17 @@ to be incorrectly calculated, leading to overwriting. segfault at compile time (while trying to find the minimum matching length). The infinite loop is now broken (with the minimum length unset, that is, zero). +9. If an assertion that was used as a condition was quantified with a minimum +of zero, matching went wrong. In particular, if the whole group had unlimited +repetition and could match an empty string, a segfault was likely. The pattern +(?(?=0)?)+ is an example that caused this. Perl allows assertions to be +quantified, but not if they are being used as conditions, so the above pattern +is faulted by Perl. PCRE2 has now been changed so that it also rejects such +patterns. + +10. The error message for an invalid quantifier has been changed from "nothing +to repeat" to "quantifier does not follow a repeatable item". + Version 10.00 05-January-2015 ----------------------------- diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 9e81dcf..5f9ba78 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "26 January 2015" "PCRE2 10.00" +.TH PCRE2PATTERN 3 "28 January 2015" "PCRE2 10.00" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -1742,8 +1742,8 @@ items: the \eR escape sequence an escape such as \ed or \epL that matches a single character a character class - a back reference (see next section) - a parenthesized subpattern (including assertions) + a back reference + a parenthesized subpattern (including most assertions) a subroutine call to a subpattern (recursive or otherwise) .sp The general repetition quantifier specifies a minimum and maximum number of @@ -2152,10 +2152,11 @@ numbering the capturing subpatterns in the whole pattern. However, substring capturing is carried out only for positive assertions. 
(Perl sometimes, but not always, does do capturing in negative assertions.) .P -For compatibility with Perl, assertion subpatterns may be repeated; though +For compatibility with Perl, most assertion subpatterns may be repeated; though it makes no sense to assert the same thing several times, the side effect of -capturing parentheses may occasionally be useful. In practice, there only three -cases: +capturing parentheses may occasionally be useful. However, an assertion that +forms the condition for a conditional subpattern may not be quantified. In +practice, for other assertions, there are only three cases: .sp (1) If the quantifier is {0}, the assertion is never obeyed during matching. However, it may contain internal capturing parenthesized groups that are called @@ -3301,6 +3302,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 26 January 2015 +Last updated: 28 January 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index ffd2a2b..27089b1 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -5178,14 +5178,19 @@ for (;; ptr++) } /* For conditions that are assertions, check the syntax, and then exit - the switch. This will take control down to where bracketed groups, - including assertions, are processed. */ + the switch. This will take control down to where bracketed groups + are processed. The assertion will be handled as part of the group, + but we need to identify this case because the conditional assertion may + not be quantified. */ if (tempptr[1] == CHAR_QUESTION_MARK && (tempptr[2] == CHAR_EQUALS_SIGN || tempptr[2] == CHAR_EXCLAMATION_MARK || tempptr[2] == CHAR_LESS_THAN_SIGN)) + { + cb->iscondassert = TRUE; break; + } /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all need to skip at least 1+IMM2_SIZE bytes at the start of the group. 
*/ @@ -6098,12 +6103,22 @@ for (;; ptr++) goto FAILED; } - /* Assertions used not to be repeatable, but this was changed for Perl - compatibility, so all kinds can now be repeated. We copy code into a + /* All assertions used not to be repeatable, but this was changed for Perl + compatibility. All kinds can now be repeated except for assertions that are + conditions (Perl also forbids these to be repeated). We copy code into a non-register variable (tempcode) in order to be able to pass its address - because some compilers complain otherwise. */ + because some compilers complain otherwise. At the start of a conditional + group whose condition is an assertion, cb->iscondassert is set. We unset it + here so as to allow assertions later in the group to be quantified. */ - previous = code; /* For handling repetition */ + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && + cb->iscondassert) + { + previous = NULL; + cb->iscondassert = FALSE; + } + else previous = code; + *code = bravalue; tempcode = code; tempreqvary = cb->req_varyopt; /* Save value before bracket */ @@ -6121,9 +6136,9 @@ for (;; ptr++) skipbytes, /* Skip over bracket number */ cond_depth + ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ - &subfirstcu, /* For possible first char */ + &subfirstcu, /* For possible first char */ &subfirstcuflags, - &subreqcu, /* For possible last char */ + &subreqcu, /* For possible last char */ &subreqcuflags, bcptr, /* Current branch chain */ cb, /* Compile data block */ @@ -7474,6 +7489,7 @@ cb.end_pattern = pattern + patlen; cb.external_flags = 0; cb.external_options = options; cb.hwm = cworkspace; +cb.iscondassert = FALSE; cb.max_lookbehind = 0; cb.name_entry_size = 0; cb.name_table = NULL; @@ -7725,6 +7741,7 @@ cb.max_lookbehind = 0; cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)); cb.start_code = codestart; cb.hwm = (PCRE2_UCHAR *)(cb.start_workspace); +cb.iscondassert = FALSE; cb.req_varyopt = 0; cb.had_accept = 
FALSE; cb.had_pruneorskip = FALSE; diff --git a/src/pcre2_error.c b/src/pcre2_error.c index c801be7..b257c60 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -74,7 +74,7 @@ static const char compile_error_texts[] = "missing terminating ] for character class\0" "invalid escape sequence in character class\0" "range out of order in character class\0" - "nothing to repeat\0" + "quantifier does not follow a repeatable item\0" /* 10 */ "internal error: unexpected repeat\0" "unrecognized character after (? or (?-\0" diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 4e57ed4..355938e 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -688,6 +688,7 @@ typedef struct compile_block { BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ BOOL check_lookbehind; /* Lookbehinds need later checking */ BOOL dupnames; /* Duplicate names exist */ + BOOL iscondassert; /* Next assert is a condition */ } compile_block; /* Structure for keeping the properties of the in-memory stack used diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 1c44503..da68756 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -75,7 +75,7 @@ implementing this. */ #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */ #define OVFLBIT 0x00010000 /* The bit that is set for overflow */ -/* Values for setting in mb->match_function_type to indicate two special types +/* Bits for setting in mb->match_function_type to indicate two special types of call to match(). We do it this way to save on using another stack variable, as stack usage is to be discouraged. */ @@ -487,7 +487,7 @@ data and the last captured value. 
*/ do { - if (cbegroup) mb->match_function_type = MATCH_CBEGROUP; + if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP; rrc = match(eptr, callpat + PRIV(OP_lengths)[*callpat], mstart, offset_top, mb, eptrb, rdepth + 1); memcpy(mb->ovector, new_recursive->ovec_save, @@ -771,9 +771,9 @@ if (mb->match_call_count++ >= mb->match_limit) RRETURN(PCRE2_ERROR_MATCHLIMIT); if (rdepth >= mb->match_limit_recursion) RRETURN(PCRE2_ERROR_RECURSIONLIMIT); /* At the start of a group with an unlimited repeat that may match an empty -string, the variable mb->match_function_type is set to MATCH_CBEGROUP. It is -done this way to save having to use another function argument, which would take -up space on the stack. See also MATCH_CONDASSERT below. +string, the variable mb->match_function_type contains the MATCH_CBEGROUP bit. +It is done this way to save having to use another function argument, which +would take up space on the stack. See also MATCH_CONDASSERT below. When MATCH_CBEGROUP is set, add the current subject pointer to the chain of such remembered pointers, to be checked when we hit the closing ket, in order @@ -782,12 +782,12 @@ other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must NOT be used with tail recursion, because the memory block that is used is on the stack, so a new one may be required for each match(). */ -if (mb->match_function_type == MATCH_CBEGROUP) +if ((mb->match_function_type & MATCH_CBEGROUP) != 0) { newptrb.epb_saved_eptr = eptr; newptrb.epb_prev = eptrb; eptrb = &newptrb; - mb->match_function_type = 0; + mb->match_function_type &= ~MATCH_CBEGROUP; } /* Now, at last, we can start processing the opcodes. 
*/ @@ -1016,7 +1016,7 @@ for (;;) for (;;) { - if (op >= OP_SBRA) mb->match_function_type = MATCH_CBEGROUP; + if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb, RM1); if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */ @@ -1091,7 +1091,7 @@ for (;;) for (;;) { if (op >= OP_SBRA || op == OP_ONCE) - mb->match_function_type = MATCH_CBEGROUP; + mb->match_function_type |= MATCH_CBEGROUP; /* If this is not a possibly empty group, and there are no (*THEN)s in the pattern, and this is the final alternative, optimize as described @@ -1181,7 +1181,7 @@ for (;;) for (;;) { mb->ovector[mb->offset_end - number] = eptr - mb->start_subject; - if (op >= OP_SBRA) mb->match_function_type = MATCH_CBEGROUP; + if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb, RM63); if (rrc == MATCH_KETRPOS) @@ -1255,7 +1255,7 @@ for (;;) for (;;) { - if (op >= OP_SBRA) mb->match_function_type = MATCH_CBEGROUP; + if (op >= OP_SBRA) mb->match_function_type |= MATCH_CBEGROUP; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, mb, eptrb, RM48); if (rrc == MATCH_KETRPOS) @@ -1404,11 +1404,11 @@ for (;;) break; /* The condition is an assertion. Call match() to evaluate it - setting - mb->match_function_type to MATCH_CONDASSERT causes it to stop at the end - of an assertion. */ + the MATCH_CONDASSERT bit in mb->match_function_type causes it to stop at + the end of an assertion. 
*/ default: - mb->match_function_type = MATCH_CONDASSERT; + mb->match_function_type |= MATCH_CONDASSERT; RMATCH(eptr, ecode, offset_top, mb, NULL, RM3); if (rrc == MATCH_MATCH) { @@ -1459,7 +1459,7 @@ for (;;) goto TAIL_RECURSE; } - mb->match_function_type = MATCH_CBEGROUP; + mb->match_function_type |= MATCH_CBEGROUP; RMATCH(eptr, ecode, offset_top, mb, eptrb, RM49); RRETURN(rrc); } @@ -1548,10 +1548,10 @@ for (;;) case OP_ASSERT: case OP_ASSERTBACK: save_mark = mb->mark; - if (mb->match_function_type == MATCH_CONDASSERT) + if ((mb->match_function_type & MATCH_CONDASSERT) != 0) { condassert = TRUE; - mb->match_function_type = 0; + mb->match_function_type &= ~MATCH_CONDASSERT; } else condassert = FALSE; @@ -1619,10 +1619,10 @@ for (;;) case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: save_mark = mb->mark; - if (mb->match_function_type == MATCH_CONDASSERT) + if ((mb->match_function_type & MATCH_CONDASSERT) != 0) { condassert = TRUE; - mb->match_function_type = 0; + mb->match_function_type &= ~MATCH_CONDASSERT; } else condassert = FALSE; @@ -1844,7 +1844,7 @@ for (;;) cbegroup = (*callpat >= OP_SBRA); do { - if (cbegroup) mb->match_function_type = MATCH_CBEGROUP; + if (cbegroup) mb->match_function_type |= MATCH_CBEGROUP; RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top, mb, eptrb, RM6); memcpy(mb->ovector, new_recursive.ovec_save, diff --git a/testdata/testinput2 b/testdata/testinput2 index ad6f0a3..40a2b4a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4078,10 +4078,10 @@ a random value. /Ix # End of substitute tests -"((?=(?(?=(?(?=(?(?=())))*)))))" +"((?=(?(?=(?(?=(?(?=()))))))))" a -"(?(?=)?==)(((((((((?=)))))))))" +"(?(?=)==)(((((((((?=)))))))))" a /(a)(b)|(c)/ @@ -4138,4 +4138,18 @@ a random value. /Ix /(?(?J)(?))(?-J)\k/ +# Quantifiers are not allowed on condition assertions, but are otherwise +# OK in conditions. 
+ +/(?(?=0)?)+/ + +/(?(?=0)(?=00)?00765)/ + 00765 + +/(?(?=0)(?=00)?00765|(?!3).56)/ + 00765 + 456 + ** Failers + 356 + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index da01b4e..0b4f7ae 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -147,7 +147,7 @@ Failed: error 107 at offset 2: invalid escape sequence in character class Failed: error 108 at offset 3: range out of order in character class /^*/ -Failed: error 109 at offset 1: nothing to repeat +Failed: error 109 at offset 1: quantifier does not follow a repeatable item /(abc/ Failed: error 114 at offset 4: missing closing parenthesis @@ -230,7 +230,7 @@ Subject length lower bound = 1 Failed: error 115 at offset 6: reference to non-existent subpattern /{4,5}abc/ -Failed: error 109 at offset 4: nothing to repeat +Failed: error 109 at offset 4: quantifier does not follow a repeatable item /(a)(b)(c)\2/I Capturing subpattern count = 3 @@ -883,10 +883,10 @@ Failed: error 106 at offset 4: missing terminating ] for character class Failed: error 106 at offset 2: missing terminating ] for character class /*a/ -Failed: error 109 at offset 0: nothing to repeat +Failed: error 109 at offset 0: quantifier does not follow a repeatable item /(*)b/ -Failed: error 109 at offset 1: nothing to repeat +Failed: error 109 at offset 1: quantifier does not follow a repeatable item /abc)/ Failed: error 122 at offset 3: unmatched closing parenthesis @@ -895,7 +895,7 @@ Failed: error 122 at offset 3: unmatched closing parenthesis Failed: error 114 at offset 4: missing closing parenthesis /a**/ -Failed: error 109 at offset 2: nothing to repeat +Failed: error 109 at offset 2: quantifier does not follow a repeatable item /)(/ Failed: error 122 at offset 0: unmatched closing parenthesis @@ -919,10 +919,10 @@ Failed: error 106 at offset 4: missing terminating ] for character class Failed: error 106 at offset 2: missing terminating ] for character class /*a/Ii -Failed: error 109 at offset 0: 
nothing to repeat +Failed: error 109 at offset 0: quantifier does not follow a repeatable item /(*)b/Ii -Failed: error 109 at offset 1: nothing to repeat +Failed: error 109 at offset 1: quantifier does not follow a repeatable item /abc)/Ii Failed: error 122 at offset 3: unmatched closing parenthesis @@ -931,7 +931,7 @@ Failed: error 122 at offset 3: unmatched closing parenthesis Failed: error 114 at offset 4: missing closing parenthesis /a**/Ii -Failed: error 109 at offset 2: nothing to repeat +Failed: error 109 at offset 2: quantifier does not follow a repeatable item /)(/Ii Failed: error 122 at offset 0: unmatched closing parenthesis @@ -3025,16 +3025,16 @@ Last code unit = 'c' Subject length lower bound = 3 /a+?+/I -Failed: error 109 at offset 3: nothing to repeat +Failed: error 109 at offset 3: quantifier does not follow a repeatable item /a{2,3}?+b/I -Failed: error 109 at offset 7: nothing to repeat +Failed: error 109 at offset 7: quantifier does not follow a repeatable item /(?U)a+?+/I -Failed: error 109 at offset 7: nothing to repeat +Failed: error 109 at offset 7: quantifier does not follow a repeatable item /a{2,3}?+b/I,ungreedy -Failed: error 109 at offset 7: nothing to repeat +Failed: error 109 at offset 7: quantifier does not follow a repeatable item /x(?U)a++b/IB ------------------------------------------------------------------ @@ -8816,7 +8816,7 @@ No match 0: a /a(*FAIL)+b/ -Failed: error 109 at offset 8: nothing to repeat +Failed: error 109 at offset 8: quantifier does not follow a repeatable item /(abc|pqr|123){0}[xyz]/I Capturing subpattern count = 1 @@ -13724,13 +13724,13 @@ Failed: error -34: bad option value # End of substitute tests -"((?=(?(?=(?(?=(?(?=())))*)))))" +"((?=(?(?=(?(?=(?(?=()))))))))" a 0: 1: 2: -"(?(?=)?==)(((((((((?=)))))))))" +"(?(?=)==)(((((((((?=)))))))))" a No match @@ -13897,4 +13897,24 @@ Matched, but too many substrings /(?(?J)(?))(?-J)\k/ +# Quantifiers are not allowed on condition assertions, but are otherwise +# OK 
in conditions. + +/(?(?=0)?)+/ +Failed: error 109 at offset 7: quantifier does not follow a repeatable item + +/(?(?=0)(?=00)?00765)/ + 00765 + 0: 00765 + +/(?(?=0)(?=00)?00765|(?!3).56)/ + 00765 + 0: 00765 + 456 + 0: 456 + ** Failers +No match + 356 +No match + # End of testinput2