diff --git a/ChangeLog b/ChangeLog index df46cd5..84d7e44 100644 --- a/ChangeLog +++ b/ChangeLog @@ -28,6 +28,10 @@ now correctly backtracked, so this unnecessary restriction has been removed. 7. Added PCRE2_SUBSTITUTE_MATCHED. +8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another +regex engine. The Perl regex folks are aware of this usage and have made a note +about it. + Version 10.34 21-November-2019 ------------------------------ diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index f365306..4fed554 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -2624,8 +2624,8 @@ backtracking into the assertion. However, there are some cases where non-atomic positive assertions can be useful. PCRE2 provides these using the following syntax:
-  (*non_atomic_positive_lookahead:  or (*napla:
-  (*non_atomic_positive_lookbehind: or (*naplb:
+  (*non_atomic_positive_lookahead:  or (*napla: or (?*
+  (*non_atomic_positive_lookbehind: or (*naplb: or (?<*
 
Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least twice in total. @@ -3833,7 +3833,7 @@ Cambridge, England.


REVISION

-Last updated: 18 December 2019 +Last updated: 28 December 2019
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/html/pcre2syntax.html b/doc/html/pcre2syntax.html index 00f0513..3687b35 100644 --- a/doc/html/pcre2syntax.html +++ b/doc/html/pcre2syntax.html @@ -553,11 +553,13 @@ Each top-level branch of a lookbehind must be of a fixed length.

These assertions are specific to PCRE2 and are not Perl-compatible.

-  (*napla:...)
-  (*non_atomic_positive_lookahead:...)
+  (?*...)                                )
+  (*napla:...)                           ) synonyms
+  (*non_atomic_positive_lookahead:...)   )
 
-  (*naplb:...)
-  (*non_atomic_positive_lookbehind:...)
+  (?<*...)                               )
+  (*naplb:...)                           ) synonyms
+  (*non_atomic_positive_lookbehind:...)  )
 


SCRIPT RUNS
@@ -683,7 +685,7 @@ Cambridge, England.


REVISION

-Last updated: 29 July 2019 +Last updated: 28 December 2019
Copyright © 1997-2019 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt index f49ab40..e99dbef 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -8354,8 +8354,8 @@ NON-ATOMIC ASSERTIONS some cases where non-atomic positive assertions can be useful. PCRE2 provides these using the following syntax: - (*non_atomic_positive_lookahead: or (*napla: - (*non_atomic_positive_lookbehind: or (*naplb: + (*non_atomic_positive_lookahead: or (*napla: or (?* + (*non_atomic_positive_lookbehind: or (*naplb: or (?<* Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least @@ -9487,7 +9487,7 @@ AUTHOR REVISION - Last updated: 18 December 2019 + Last updated: 28 December 2019 Copyright (c) 1997-2019 University of Cambridge. ------------------------------------------------------------------------------ @@ -10716,11 +10716,13 @@ NON-ATOMIC LOOKAROUND ASSERTIONS These assertions are specific to PCRE2 and are not Perl-compatible. - (*napla:...) - (*non_atomic_positive_lookahead:...) + (?*...) ) + (*napla:...) ) synonyms + (*non_atomic_positive_lookahead:...) ) - (*naplb:...) - (*non_atomic_positive_lookbehind:...) + (?<*...) ) + (*naplb:...) ) synonyms + (*non_atomic_positive_lookbehind:...) ) SCRIPT RUNS @@ -10844,7 +10846,7 @@ AUTHOR REVISION - Last updated: 29 July 2019 + Last updated: 28 December 2019 Copyright (c) 1997-2019 University of Cambridge. ------------------------------------------------------------------------------ diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index dbf7634..4819187 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "18 December 2019" "PCRE2 10.35" +.TH PCRE2PATTERN 3 "28 December 2019" "PCRE2 10.35" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -2637,8 +2637,8 @@ backtracking into the assertion. However, there are some cases where non-atomic positive assertions can be useful. 
PCRE2 provides these using the following syntax: .sp - (*non_atomic_positive_lookahead: or (*napla: - (*non_atomic_positive_lookbehind: or (*naplb: + (*non_atomic_positive_lookahead: or (*napla: or (?* + (*non_atomic_positive_lookbehind: or (*naplb: or (?<* .sp Consider the problem of finding the right-most word in a string that also appears earlier in the string, that is, it must appear at least twice in total. @@ -3874,6 +3874,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 18 December 2019 +Last updated: 28 December 2019 Copyright (c) 1997-2019 University of Cambridge. .fi diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index bbe418a..765ddbf 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -1,4 +1,4 @@ -.TH PCRE2SYNTAX 3 "29 July 2019" "PCRE2 10.34" +.TH PCRE2SYNTAX 3 "28 December 2019" "PCRE2 10.35" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" @@ -531,11 +531,13 @@ Each top-level branch of a lookbehind must be of a fixed length. .sp These assertions are specific to PCRE2 and are not Perl-compatible. .sp - (*napla:...) - (*non_atomic_positive_lookahead:...) -.sp - (*naplb:...) - (*non_atomic_positive_lookbehind:...) + (?*...) ) + (*napla:...) ) synonyms + (*non_atomic_positive_lookahead:...) ) +.sp + (?<*...) ) + (*naplb:...) ) synonyms + (*non_atomic_positive_lookbehind:...) ) . . .SH "SCRIPT RUNS" @@ -670,6 +672,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 29 July 2019 +Last updated: 28 December 2019 Copyright (c) 1997-2019 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 8ad4583..ed4fc74 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -3653,7 +3653,7 @@ while (ptr < ptrend) if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; /* If ( is not followed by ? it is either a capture or a special verb or an - alpha assertion. */ + alpha assertion or a positive non-atomic lookahead. 
*/ if (*ptr != CHAR_QUESTION_MARK) { @@ -3685,10 +3685,10 @@ while (ptr < ptrend) break; /* Handle "alpha assertions" such as (*pla:...). Most of these are - synonyms for the historical symbolic assertions, but the script run ones - are new. They are distinguished by starting with a lower case letter. - Checking both ends of the alphabet makes this work in all character - codes. */ + synonyms for the historical symbolic assertions, but the script run and + non-atomic lookaround ones are new. They are distinguished by starting + with a lower case letter. Checking both ends of the alphabet makes this + work in all character codes. */ else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0) { @@ -3747,9 +3747,7 @@ while (ptr < ptrend) goto POSITIVE_LOOK_AHEAD; case META_LOOKAHEAD_NA: - *parsed_pattern++ = meta; - ptr++; - goto POST_ASSERTION; + goto POSITIVE_NONATOMIC_LOOK_AHEAD; case META_LOOKAHEADNOT: goto NEGATIVE_LOOK_AHEAD; @@ -4438,6 +4436,12 @@ while (ptr < ptrend) ptr++; goto POST_ASSERTION; + case CHAR_ASTERISK: + POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */ + *parsed_pattern++ = META_LOOKAHEAD_NA; + ptr++; + goto POST_ASSERTION; + case CHAR_EXCLAMATION_MARK: NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */ *parsed_pattern++ = META_LOOKAHEADNOT; @@ -4447,20 +4451,23 @@ while (ptr < ptrend) /* ---- Lookbehind assertions ---- */ - /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the - start of the name of a capturing group. */ + /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< + is the start of the name of a capturing group. */ case CHAR_LESS_THAN_SIGN: if (ptrend - ptr <= 1 || - (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) + (ptr[1] != CHAR_EQUALS_SIGN && + ptr[1] != CHAR_EXCLAMATION_MARK && + ptr[1] != CHAR_ASTERISK)) { terminator = CHAR_GREATER_THAN_SIGN; goto DEFINE_NAME; } *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? 
- META_LOOKBEHIND : META_LOOKBEHINDNOT; + META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? + META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; - POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ + POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */ *has_lookbehind = TRUE; offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); PUTOFFSET(offset, parsed_pattern); @@ -4633,8 +4640,6 @@ while (ptr < ptrend) *parsed_pattern++ = META_KET; } - - if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; else top_nest--; } diff --git a/testdata/testinput2 b/testdata/testinput2 index 0dc0c47..a10613d 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5670,6 +5670,9 @@ a)"xI /\A(*napla:.*\b(\w++))(?>.*?\b\1\b){3}/ word1 word3 word1 word2 word3 word2 word2 word1 word3 word4 +/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/ + word1 word3 word1 word2 word3 word2 word2 word1 word3 word4 + /(*plb:(.)..|(.)...)(\1|\2)/ abcdb\=offset=4 abcda\=offset=4 @@ -5678,6 +5681,10 @@ a)"xI abcdb\=offset=4 abcda\=offset=4 +/(?<*(.)..|(.)...)(\1|\2)/ + abcdb\=offset=4 + abcda\=offset=4 + /(*non_atomic_positive_lookahead:ab)/B /(*non_atomic_positive_lookbehind:ab)/B diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 4649fae..438aefe 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -17088,6 +17088,11 @@ No match 0: word1 word3 word1 word2 word3 word2 word2 word1 word3 1: word3 +/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/ + word1 word3 word1 word2 word3 word2 word2 word1 word3 word4 + 0: word1 word3 word1 word2 word3 word2 word2 word1 word3 + 1: word3 + /(*plb:(.)..|(.)...)(\1|\2)/ abcdb\=offset=4 0: b @@ -17109,6 +17114,18 @@ No match 2: a 3: a +/(?<*(.)..|(.)...)(\1|\2)/ + abcdb\=offset=4 + 0: b + 1: b + 2: + 3: b + abcda\=offset=4 + 0: a + 1: + 2: a + 3: a + /(*non_atomic_positive_lookahead:ab)/B ------------------------------------------------------------------ Bra