diff --git a/ChangeLog b/ChangeLog index 80d2ec5..4507cec 100644 --- a/ChangeLog +++ b/ChangeLog @@ -11,6 +11,17 @@ Version 10.35 3. A JIT bug is fixed which allowed to read the fields of the compiled pattern before its existence is checked. +4. Back in the PCRE1 day, capturing groups that contained recursive back +references to themselves were made atomic (version 8.01, change 18) because +after the end of a repeated group, the captured substrings had their values from +the final repetition, not from an earlier repetition that might be the +destination of a backtrack. This feature was documented, and was carried over +into PCRE2. However, it has now been realized that the major refactoring that +was done for 10.30 has made this atomicizing unnecessary, and it is confusing +when users are unaware of it, making some patterns appear not to be working as +expected. Capture values of recursive back references in repeated groups are +now correctly backtracked, so this unnecessary restriction has been removed. + Version 10.34 21-November-2019 ------------------------------ diff --git a/configure.ac b/configure.ac index 30d4ddd..0a7cedb 100644 --- a/configure.ac +++ b/configure.ac @@ -9,9 +9,9 @@ dnl The PCRE2_PRERELEASE feature is for identifying release candidates. It might dnl be defined as -RC2, for example. For real releases, it should be empty. m4_define(pcre2_major, [10]) -m4_define(pcre2_minor, [34]) -m4_define(pcre2_prerelease, []) -m4_define(pcre2_date, [2019-11-21]) +m4_define(pcre2_minor, [35]) +m4_define(pcre2_prerelease, [-RC1]) +m4_define(pcre2_date, [2019-11-27]) # NOTE: The CMakeLists.txt file searches for the above variables in the first # 50 lines of this file. Please update that if the variables above are moved. 
diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index 0aa2191..f365306 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -2349,11 +2349,11 @@ using alternation, as in the example above, or by a quantifier with a minimum of zero.
-Backreferences of this type cause the group that they reference to be treated -as an +For versions of PCRE2 less than 10.35, backreferences of this type used to +cause the group that they reference to be treated as an atomic group. -Once the whole group has been matched, a subsequent matching failure cannot -cause backtracking into the middle of the group. +This restriction no longer applies, and backtracking into such groups can occur +as normal.
@@ -3833,7 +3833,7 @@ Cambridge, England.
-Last updated: 29 July 2019
+Last updated: 18 December 2019
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index 948b91a..ed4b4e3 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -180,8 +180,8 @@ REVISION
Last updated: 17 September 2018
Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2API(3) Library Functions Manual PCRE2API(3)
@@ -3724,8 +3724,8 @@ REVISION
Last updated: 02 September 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2BUILD(3) Library Functions Manual PCRE2BUILD(3)
@@ -4296,8 +4296,8 @@ REVISION
Last updated: 03 March 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2CALLOUT(3) Library Functions Manual PCRE2CALLOUT(3)
@@ -4726,8 +4726,8 @@ REVISION
Last updated: 03 February 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2COMPAT(3) Library Functions Manual PCRE2COMPAT(3)
@@ -4935,8 +4935,8 @@ REVISION
Last updated: 13 July 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2JIT(3) Library Functions Manual PCRE2JIT(3)
@@ -5360,8 +5360,8 @@ REVISION
Last updated: 23 May 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2LIMITS(3) Library Functions Manual PCRE2LIMITS(3)
@@ -5430,8 +5430,8 @@ REVISION
Last updated: 02 February 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2MATCHING(3) Library Functions Manual PCRE2MATCHING(3)
@@ -5654,8 +5654,8 @@ REVISION
Last updated: 23 May 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2PARTIAL(3) Library Functions Manual PCRE2PARTIAL(3)
@@ -6034,8 +6034,8 @@ REVISION
Last updated: 04 September 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2PATTERN(3) Library Functions Manual PCRE2PATTERN(3)
@@ -8078,10 +8078,10 @@ BACKREFERENCES
the backreference. This can be done using alternation, as in the exam-
ple above, or by a quantifier with a minimum of zero.
- Backreferences of this type cause the group that they reference to be
- treated as an atomic group. Once the whole group has been matched, a
- subsequent matching failure cannot cause backtracking into the middle
- of the group.
+ For versions of PCRE2 less than 10.35, backreferences of this type used
+ to cause the group that they reference to be treated as an atomic
+ group. This restriction no longer applies, and backtracking into such
+ groups can occur as normal.
ASSERTIONS
@@ -9463,11 +9463,11 @@ AUTHOR
REVISION
- Last updated: 29 July 2019
+ Last updated: 18 December 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2PERFORM(3) Library Functions Manual PCRE2PERFORM(3)
@@ -9701,8 +9701,8 @@ REVISION
Last updated: 03 February 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2POSIX(3) Library Functions Manual PCRE2POSIX(3)
@@ -10031,8 +10031,8 @@ REVISION
Last updated: 30 January 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2SAMPLE(3) Library Functions Manual PCRE2SAMPLE(3)
@@ -10310,8 +10310,8 @@ REVISION
Last updated: 27 June 2018
Copyright (c) 1997-2018 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2SYNTAX(3) Library Functions Manual PCRE2SYNTAX(3)
@@ -10823,8 +10823,8 @@ REVISION
Last updated: 29 July 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE2UNICODE(3) Library Functions Manual PCRE2UNICODE(3)
@@ -11256,5 +11256,5 @@ REVISION
Last updated: 24 May 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
index d5228f3..dbf7634 100644
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "29 July 2019" "PCRE2 10.34"
+.TH PCRE2PATTERN 3 "18 December 2019" "PCRE2 10.35"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -2346,14 +2346,14 @@ the first iteration does not need to match the backreference. This can be done
using alternation, as in the example above, or by a quantifier with a minimum
of zero.
.P
-Backreferences of this type cause the group that they reference to be treated
-as an
+For versions of PCRE2 less than 10.35, backreferences of this type used to
+cause the group that they reference to be treated as an
.\" HTML
.\"
atomic group.
.\"
-Once the whole group has been matched, a subsequent matching failure cannot
-cause backtracking into the middle of the group.
+This restriction no longer applies, and backtracking into such groups can occur
+as normal.
.
.
.\" HTML
@@ -3874,6 +3874,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 29 July 2019
+Last updated: 18 December 2019
Copyright (c) 1997-2019 University of Cambridge.
.fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index f2e6b6b..8ad4583 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -6671,23 +6671,11 @@ for (;; pptr++)
}
/* For a back reference, update the back reference map and the
- maximum back reference. Then, for each group, we must check to
- see if it is recursive, that is, it is inside the group that it
- references. A flag is set so that the group can be made atomic.
- */
+ maximum back reference. */
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
if (groupnumber > cb->top_backref)
cb->top_backref = groupnumber;
-
- for (oc = cb->open_caps; oc != NULL; oc = oc->next)
- {
- if (oc->number == groupnumber)
- {
- oc->flag = TRUE;
- break;
- }
- }
}
}
@@ -7682,19 +7670,6 @@ for (;; pptr++)
cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
-
- /* Check to see if this back reference is recursive, that it, it
- is inside the group that it references. A flag is set so that the
- group can be made atomic. */
-
- for (oc = cb->open_caps; oc != NULL; oc = oc->next)
- {
- if (oc->number == meta_arg)
- {
- oc->flag = TRUE;
- break;
- }
- }
break;
@@ -8035,7 +8010,6 @@ and skip over the pattern offset. */
lookbehind = *code == OP_ASSERTBACK ||
*code == OP_ASSERTBACK_NOT ||
*code == OP_ASSERTBACK_NA;
-
if (lookbehind)
{
lookbehindlength = META_DATA(pptr[-1]);
@@ -8053,7 +8027,6 @@ if (*code == OP_CBRA)
capnumber = GET2(code, 1 + LINK_SIZE);
capitem.number = capnumber;
capitem.next = cb->open_caps;
- capitem.flag = FALSE;
capitem.assert_depth = cb->assert_depth;
cb->open_caps = &capitem;
}
@@ -8182,26 +8155,9 @@ for (;;)
PUT(code, 1, (int)(code - start_bracket));
code += 1 + LINK_SIZE;
- /* If it was a capturing subpattern, check to see if it contained any
- recursive back references. If so, we must wrap it in atomic brackets. In
- any event, remove the block from the chain. */
+ /* If it was a capturing subpattern, remove the block from the chain. */
- if (capnumber > 0)
- {
- if (cb->open_caps->flag)
- {
- (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
- CU2BYTES(code - start_bracket));
- *start_bracket = OP_ONCE;
- code += 1 + LINK_SIZE;
- PUT(start_bracket, 1, (int)(code - start_bracket));
- *code = OP_KET;
- PUT(code, 1, (int)(code - start_bracket));
- code += 1 + LINK_SIZE;
- length += 2 + 2*LINK_SIZE;
- }
- cb->open_caps = cb->open_caps->next;
- }
+ if (capnumber > 0) cb->open_caps = cb->open_caps->next;
/* Set values to pass back */
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index fe8ffe5..ac96d2d 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -1759,13 +1759,11 @@ typedef struct pcre2_memctl {
/* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
-encountered. This is also used to identify subpatterns that contain recursive
-back references to themselves, so that they can be made atomic. */
+encountered. */
typedef struct open_capitem {
struct open_capitem *next; /* Chain link */
uint16_t number; /* Capture number */
- uint16_t flag; /* Set TRUE if recursive back ref */
uint16_t assert_depth; /* Assertion depth when opened */
} open_capitem;
diff --git a/testdata/testinput1 b/testdata/testinput1
index f5159d6..109de29 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -6386,4 +6386,11 @@ ef) x/x,mark
/^(?a)(?()b)((?<=b).*)$/
abc
+/^(a\1?){4}$/
+ aaaa
+ aaaaaa
+
+/^((\1+)|\d)+133X$/
+ 111133X
+
# End of testinput1
diff --git a/testdata/testinput2 b/testdata/testinput2
index 655e519..b700d9e 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -324,16 +324,7 @@
\= Expect no match
fooabar
-# This one is here because Perl behaves differently; see also the following.
-
-/^(a\1?){4}$/I
-\= Expect no match
- aaaa
- aaaaaa
-
-# Perl does not fail these two for the final subjects. Neither did PCRE until
-# release 8.01. The problem is in backtracking into a subpattern that contains
-# a recursive reference to itself. PCRE has now made these into atomic patterns.
+# Perl does not fail these two for the final subjects.
/^(xa|=?\1a){2}$/
xa=xaa
@@ -5772,4 +5763,13 @@ a)"xI
/(a)?a/I
manm
+/^(?|(\*)(*napla:\S*_(\2?+.+))|(\w)(?=\S*_(\2?+\1)))+_\2$/
+ *abc_12345abc
+
+/^(?|(\*)(*napla:\S*_(\3?+.+))|(\w)(?=\S*_((\2?+\1))))+_\2$/
+ *abc_12345abc
+
+/^((\1+)(?C)|\d)+133X$/
+ 111133X\=callout_capture
+
# End of testinput2
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index ad2175b..c425ed4 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -10112,4 +10112,18 @@ No match
1: a
2: c
+/^(a\1?){4}$/
+ aaaa
+ 0: aaaa
+ 1: a
+ aaaaaa
+ 0: aaaaaa
+ 1: aa
+
+/^((\1+)|\d)+133X$/
+ 111133X
+ 0: 111133X
+ 1: 11
+ 2: 11
+
# End of testinput1
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index c733c12..df2f230 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -809,24 +809,7 @@ Subject length lower bound = 3
fooabar
No match
-# This one is here because Perl behaves differently; see also the following.
-
-/^(a\1?){4}$/I
-Capture group count = 1
-Max back reference = 1
-Compile options: