diff --git a/ChangeLog b/ChangeLog index df46cd5..84d7e44 100644 --- a/ChangeLog +++ b/ChangeLog @@ -28,6 +28,10 @@ now correctly backtracked, so this unnecessary restriction has been removed. 7. Added PCRE2_SUBSTITUTE_MATCHED. +8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another +regex engine. The Perl regex folks are aware of this usage and have made a note +about it. + Version 10.34 21-November-2019 ------------------------------ diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index f365306..4fed554 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -2624,8 +2624,8 @@ backtracking into the assertion. However, there are some cases where non-atomic positive assertions can be useful. PCRE2 provides these using the following syntax:
- (*non_atomic_positive_lookahead: or (*napla: - (*non_atomic_positive_lookbehind: or (*naplb: + (*non_atomic_positive_lookahead: or (*napla: or (?* + (*non_atomic_positive_lookbehind: or (*naplb: or (?<*Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least twice in total. @@ -3833,7 +3833,7 @@ Cambridge, England.
-Last updated: 18 December 2019
+Last updated: 28 December 2019
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html
index 00f0513..3687b35 100644
--- a/doc/html/pcre2syntax.html
+++ b/doc/html/pcre2syntax.html
@@ -553,11 +553,13 @@ Each top-level branch of a lookbehind must be of a fixed length.
These assertions are specific to PCRE2 and are not Perl-compatible.
- (*napla:...) - (*non_atomic_positive_lookahead:...) + (?*...) ) + (*napla:...) ) synonyms + (*non_atomic_positive_lookahead:...) ) - (*naplb:...) - (*non_atomic_positive_lookbehind:...) + (?<*...) ) + (*naplb:...) ) synonyms + (*non_atomic_positive_lookbehind:...) )
-Last updated: 29 July 2019
+Last updated: 28 December 2019
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt
index f49ab40..e99dbef 100644
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
@@ -8354,8 +8354,8 @@ NON-ATOMIC ASSERTIONS
some cases where non-atomic positive assertions can be useful. PCRE2
provides these using the following syntax:
- (*non_atomic_positive_lookahead: or (*napla:
- (*non_atomic_positive_lookbehind: or (*naplb:
+ (*non_atomic_positive_lookahead: or (*napla: or (?*
+ (*non_atomic_positive_lookbehind: or (*naplb: or (?<*
Consider the problem of finding the right-most word in a string that
also appears earlier in the string, that is, it must appear at least
@@ -9487,7 +9487,7 @@ AUTHOR
REVISION
- Last updated: 18 December 2019
+ Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
@@ -10716,11 +10716,13 @@ NON-ATOMIC LOOKAROUND ASSERTIONS
These assertions are specific to PCRE2 and are not Perl-compatible.
- (*napla:...)
- (*non_atomic_positive_lookahead:...)
+ (?*...) )
+ (*napla:...) ) synonyms
+ (*non_atomic_positive_lookahead:...) )
- (*naplb:...)
- (*non_atomic_positive_lookbehind:...)
+ (?<*...) )
+ (*naplb:...) ) synonyms
+ (*non_atomic_positive_lookbehind:...) )
SCRIPT RUNS
@@ -10844,7 +10846,7 @@ AUTHOR
REVISION
- Last updated: 29 July 2019
+ Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
index dbf7634..4819187 100644
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "18 December 2019" "PCRE2 10.35"
+.TH PCRE2PATTERN 3 "28 December 2019" "PCRE2 10.35"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@@ -2637,8 +2637,8 @@ backtracking into the assertion. However, there are some cases where non-atomic
positive assertions can be useful. PCRE2 provides these using the following
syntax:
.sp
- (*non_atomic_positive_lookahead: or (*napla:
- (*non_atomic_positive_lookbehind: or (*naplb:
+ (*non_atomic_positive_lookahead: or (*napla: or (?*
+ (*non_atomic_positive_lookbehind: or (*naplb: or (?<*
.sp
Consider the problem of finding the right-most word in a string that also
appears earlier in the string, that is, it must appear at least twice in total.
@@ -3874,6 +3874,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 18 December 2019
+Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
.fi
diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3
index bbe418a..765ddbf 100644
--- a/doc/pcre2syntax.3
+++ b/doc/pcre2syntax.3
@@ -1,4 +1,4 @@
-.TH PCRE2SYNTAX 3 "29 July 2019" "PCRE2 10.34"
+.TH PCRE2SYNTAX 3 "28 December 2019" "PCRE2 10.35"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@@ -531,11 +531,13 @@ Each top-level branch of a lookbehind must be of a fixed length.
.sp
These assertions are specific to PCRE2 and are not Perl-compatible.
.sp
- (*napla:...)
- (*non_atomic_positive_lookahead:...)
-.sp
- (*naplb:...)
- (*non_atomic_positive_lookbehind:...)
+ (?*...) )
+ (*napla:...) ) synonyms
+ (*non_atomic_positive_lookahead:...) )
+.sp
+ (?<*...) )
+ (*naplb:...) ) synonyms
+ (*non_atomic_positive_lookbehind:...) )
.
.
.SH "SCRIPT RUNS"
@@ -670,6 +672,6 @@ Cambridge, England.
.rs
.sp
.nf
-Last updated: 29 July 2019
+Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
.fi
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 8ad4583..ed4fc74 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -3653,7 +3653,7 @@ while (ptr < ptrend)
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
/* If ( is not followed by ? it is either a capture or a special verb or an
- alpha assertion. */
+ alpha assertion or a positive non-atomic lookahead. */
if (*ptr != CHAR_QUESTION_MARK)
{
@@ -3685,10 +3685,10 @@ while (ptr < ptrend)
break;
/* Handle "alpha assertions" such as (*pla:...). Most of these are
- synonyms for the historical symbolic assertions, but the script run ones
- are new. They are distinguished by starting with a lower case letter.
- Checking both ends of the alphabet makes this work in all character
- codes. */
+ synonyms for the historical symbolic assertions, but the script run and
+ non-atomic lookaround ones are new. They are distinguished by starting
+ with a lower case letter. Checking both ends of the alphabet makes this
+ work in all character codes. */
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
{
@@ -3747,9 +3747,7 @@ while (ptr < ptrend)
goto POSITIVE_LOOK_AHEAD;
case META_LOOKAHEAD_NA:
- *parsed_pattern++ = meta;
- ptr++;
- goto POST_ASSERTION;
+ goto POSITIVE_NONATOMIC_LOOK_AHEAD;
case META_LOOKAHEADNOT:
goto NEGATIVE_LOOK_AHEAD;
@@ -4438,6 +4436,12 @@ while (ptr < ptrend)
ptr++;
goto POST_ASSERTION;
+ case CHAR_ASTERISK:
+ POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */
+ *parsed_pattern++ = META_LOOKAHEAD_NA;
+ ptr++;
+ goto POST_ASSERTION;
+
case CHAR_EXCLAMATION_MARK:
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
*parsed_pattern++ = META_LOOKAHEADNOT;
@@ -4447,20 +4451,23 @@ while (ptr < ptrend)
/* ---- Lookbehind assertions ---- */
- /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
- start of the name of a capturing group. */
+ /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
+ is the start of the name of a capturing group. */
case CHAR_LESS_THAN_SIGN:
if (ptrend - ptr <= 1 ||
- (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
+ (ptr[1] != CHAR_EQUALS_SIGN &&
+ ptr[1] != CHAR_EXCLAMATION_MARK &&
+ ptr[1] != CHAR_ASTERISK))
{
terminator = CHAR_GREATER_THAN_SIGN;
goto DEFINE_NAME;
}
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
- META_LOOKBEHIND : META_LOOKBEHINDNOT;
+ META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
+ META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
- POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
+ POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
*has_lookbehind = TRUE;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
@@ -4633,8 +4640,6 @@ while (ptr < ptrend)
*parsed_pattern++ = META_KET;
}
-
-
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--;
}
diff --git a/testdata/testinput2 b/testdata/testinput2
index 0dc0c47..a10613d 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -5670,6 +5670,9 @@ a)"xI
/\A(*napla:.*\b(\w++))(?>.*?\b\1\b){3}/
word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
+/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/
+ word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
+
/(*plb:(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
abcda\=offset=4
@@ -5678,6 +5681,10 @@ a)"xI
abcdb\=offset=4
abcda\=offset=4
+/(?<*(.)..|(.)...)(\1|\2)/
+ abcdb\=offset=4
+ abcda\=offset=4
+
/(*non_atomic_positive_lookahead:ab)/B
/(*non_atomic_positive_lookbehind:ab)/B
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 4649fae..438aefe 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -17088,6 +17088,11 @@ No match
0: word1 word3 word1 word2 word3 word2 word2 word1 word3
1: word3
+/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/
+ word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
+ 0: word1 word3 word1 word2 word3 word2 word2 word1 word3
+ 1: word3
+
/(*plb:(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
0: b
@@ -17109,6 +17114,18 @@ No match
2: a
3: a
+/(?<*(.)..|(.)...)(\1|\2)/
+ abcdb\=offset=4
+ 0: b
+ 1: b
+ 2: