Add (?* and (?<* synonyms for non-atomic lookarounds.

This commit is contained in:
Philip.Hazel 2019-12-28 13:53:59 +00:00
parent d170829b26
commit ac4ab7186d
9 changed files with 81 additions and 42 deletions

View File

@ -28,6 +28,10 @@ now correctly backtracked, so this unnecessary restriction has been removed.
7. Added PCRE2_SUBSTITUTE_MATCHED.
8. Added (?* and (?<* as synonyms for (*napla: and (*naplb: to match another
regex engine. The Perl regex folks are aware of this usage and have made a note
about it.
Version 10.34 21-November-2019
------------------------------

View File

@ -2624,8 +2624,8 @@ backtracking into the assertion. However, there are some cases where non-atomic
positive assertions can be useful. PCRE2 provides these using the following
syntax:
<pre>
(*non_atomic_positive_lookahead: or (*napla:
(*non_atomic_positive_lookbehind: or (*naplb:
(*non_atomic_positive_lookahead: or (*napla: or (?*
(*non_atomic_positive_lookbehind: or (*naplb: or (?&#60;*
</pre>
Consider the problem of finding the right-most word in a string that also
appears earlier in the string, that is, it must appear at least twice in total.
@ -3833,7 +3833,7 @@ Cambridge, England.
</P>
<br><a name="SEC32" href="#TOC1">REVISION</a><br>
<P>
Last updated: 18 December 2019
Last updated: 28 December 2019
<br>
Copyright &copy; 1997-2019 University of Cambridge.
<br>

View File

@ -553,11 +553,13 @@ Each top-level branch of a lookbehind must be of a fixed length.
<P>
These assertions are specific to PCRE2 and are not Perl-compatible.
<pre>
(*napla:...)
(*non_atomic_positive_lookahead:...)
(?*...) )
(*napla:...) ) synonyms
(*non_atomic_positive_lookahead:...) )
(*naplb:...)
(*non_atomic_positive_lookbehind:...)
(?&#60;*...) )
(*naplb:...) ) synonyms
(*non_atomic_positive_lookbehind:...) )
</PRE>
</P>
<br><a name="SEC21" href="#TOC1">SCRIPT RUNS</a><br>
@ -683,7 +685,7 @@ Cambridge, England.
</P>
<br><a name="SEC29" href="#TOC1">REVISION</a><br>
<P>
Last updated: 29 July 2019
Last updated: 28 December 2019
<br>
Copyright &copy; 1997-2019 University of Cambridge.
<br>

View File

@ -8354,8 +8354,8 @@ NON-ATOMIC ASSERTIONS
some cases where non-atomic positive assertions can be useful. PCRE2
provides these using the following syntax:
(*non_atomic_positive_lookahead: or (*napla:
(*non_atomic_positive_lookbehind: or (*naplb:
(*non_atomic_positive_lookahead: or (*napla: or (?*
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
Consider the problem of finding the right-most word in a string that
also appears earlier in the string, that is, it must appear at least
@ -9487,7 +9487,7 @@ AUTHOR
REVISION
Last updated: 18 December 2019
Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------
@ -10716,11 +10716,13 @@ NON-ATOMIC LOOKAROUND ASSERTIONS
These assertions are specific to PCRE2 and are not Perl-compatible.
(*napla:...)
(*non_atomic_positive_lookahead:...)
(?*...) )
(*napla:...) ) synonyms
(*non_atomic_positive_lookahead:...) )
(*naplb:...)
(*non_atomic_positive_lookbehind:...)
(?<*...) )
(*naplb:...) ) synonyms
(*non_atomic_positive_lookbehind:...) )
SCRIPT RUNS
@ -10844,7 +10846,7 @@ AUTHOR
REVISION
Last updated: 29 July 2019
Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
------------------------------------------------------------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "18 December 2019" "PCRE2 10.35"
.TH PCRE2PATTERN 3 "28 December 2019" "PCRE2 10.35"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -2637,8 +2637,8 @@ backtracking into the assertion. However, there are some cases where non-atomic
positive assertions can be useful. PCRE2 provides these using the following
syntax:
.sp
(*non_atomic_positive_lookahead: or (*napla:
(*non_atomic_positive_lookbehind: or (*naplb:
(*non_atomic_positive_lookahead: or (*napla: or (?*
(*non_atomic_positive_lookbehind: or (*naplb: or (?<*
.sp
Consider the problem of finding the right-most word in a string that also
appears earlier in the string, that is, it must appear at least twice in total.
@ -3874,6 +3874,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 18 December 2019
Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
.fi

View File

@ -1,4 +1,4 @@
.TH PCRE2SYNTAX 3 "29 July 2019" "PCRE2 10.34"
.TH PCRE2SYNTAX 3 "28 December 2019" "PCRE2 10.35"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
@ -531,11 +531,13 @@ Each top-level branch of a lookbehind must be of a fixed length.
.sp
These assertions are specific to PCRE2 and are not Perl-compatible.
.sp
(*napla:...)
(*non_atomic_positive_lookahead:...)
(?*...) )
(*napla:...) ) synonyms
(*non_atomic_positive_lookahead:...) )
.sp
(*naplb:...)
(*non_atomic_positive_lookbehind:...)
(?<*...) )
(*naplb:...) ) synonyms
(*non_atomic_positive_lookbehind:...) )
.
.
.SH "SCRIPT RUNS"
@ -670,6 +672,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 29 July 2019
Last updated: 28 December 2019
Copyright (c) 1997-2019 University of Cambridge.
.fi

View File

@ -3653,7 +3653,7 @@ while (ptr < ptrend)
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
/* If ( is not followed by ? it is either a capture or a special verb or an
alpha assertion. */
alpha assertion or a positive non-atomic lookahead. */
if (*ptr != CHAR_QUESTION_MARK)
{
@ -3685,10 +3685,10 @@ while (ptr < ptrend)
break;
/* Handle "alpha assertions" such as (*pla:...). Most of these are
synonyms for the historical symbolic assertions, but the script run ones
are new. They are distinguished by starting with a lower case letter.
Checking both ends of the alphabet makes this work in all character
codes. */
synonyms for the historical symbolic assertions, but the script run and
non-atomic lookaround ones are new. They are distinguished by starting
with a lower case letter. Checking both ends of the alphabet makes this
work in all character codes. */
else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
{
@ -3747,9 +3747,7 @@ while (ptr < ptrend)
goto POSITIVE_LOOK_AHEAD;
case META_LOOKAHEAD_NA:
*parsed_pattern++ = meta;
ptr++;
goto POST_ASSERTION;
goto POSITIVE_NONATOMIC_LOOK_AHEAD;
case META_LOOKAHEADNOT:
goto NEGATIVE_LOOK_AHEAD;
@ -4438,6 +4436,12 @@ while (ptr < ptrend)
ptr++;
goto POST_ASSERTION;
case CHAR_ASTERISK:
POSITIVE_NONATOMIC_LOOK_AHEAD: /* Come from (*napla: */
*parsed_pattern++ = META_LOOKAHEAD_NA;
ptr++;
goto POST_ASSERTION;
case CHAR_EXCLAMATION_MARK:
NEGATIVE_LOOK_AHEAD: /* Come from (*nla: */
*parsed_pattern++ = META_LOOKAHEADNOT;
@ -4447,20 +4451,23 @@ while (ptr < ptrend)
/* ---- Lookbehind assertions ---- */
/* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the
start of the name of a capturing group. */
/* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
is the start of the name of a capturing group. */
case CHAR_LESS_THAN_SIGN:
if (ptrend - ptr <= 1 ||
(ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK))
(ptr[1] != CHAR_EQUALS_SIGN &&
ptr[1] != CHAR_EXCLAMATION_MARK &&
ptr[1] != CHAR_ASTERISK))
{
terminator = CHAR_GREATER_THAN_SIGN;
goto DEFINE_NAME;
}
*parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
META_LOOKBEHIND : META_LOOKBEHINDNOT;
META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
POST_LOOKBEHIND: /* Come from (*plb: (*naplb: and (*nlb: */
*has_lookbehind = TRUE;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
@ -4633,8 +4640,6 @@ while (ptr < ptrend)
*parsed_pattern++ = META_KET;
}
if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
else top_nest--;
}

7
testdata/testinput2 vendored
View File

@ -5670,6 +5670,9 @@ a)"xI
/\A(*napla:.*\b(\w++))(?>.*?\b\1\b){3}/
word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/
word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
/(*plb:(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
abcda\=offset=4
@ -5678,6 +5681,10 @@ a)"xI
abcdb\=offset=4
abcda\=offset=4
/(?<*(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
abcda\=offset=4
/(*non_atomic_positive_lookahead:ab)/B
/(*non_atomic_positive_lookbehind:ab)/B

17
testdata/testoutput2 vendored
View File

@ -17088,6 +17088,11 @@ No match
0: word1 word3 word1 word2 word3 word2 word2 word1 word3
1: word3
/\A(?*.*\b(\w++))(?>.*?\b\1\b){3}/
word1 word3 word1 word2 word3 word2 word2 word1 word3 word4
0: word1 word3 word1 word2 word3 word2 word2 word1 word3
1: word3
/(*plb:(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
0: b
@ -17109,6 +17114,18 @@ No match
2: a
3: a
/(?<*(.)..|(.)...)(\1|\2)/
abcdb\=offset=4
0: b
1: b
2: <unset>
3: b
abcda\=offset=4
0: a
1: <unset>
2: a
3: a
/(*non_atomic_positive_lookahead:ab)/B
------------------------------------------------------------------
Bra