Implement PCRE2_ALT_CIRCUMFLEX.
This commit is contained in:
parent
fe2733263d
commit
d1a13b8e2e
|
@ -92,6 +92,8 @@ memory if the replication required a buffer to be extended, and it was not
|
|||
working properly in 16-bit and 32-bit modes. This issue was discovered by a
|
||||
fuzzer: see http://lcamtuf.coredump.cx/afl/.
|
||||
|
||||
23. Added the PCRE2_ALT_CIRCUMFLEX option.
|
||||
|
||||
|
||||
Version 10.10 06-March-2015
|
||||
---------------------------
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_COMPILE 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2_COMPILE 3 "22 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -37,6 +37,7 @@ or provide an external function for stack size checking. The option bits are:
|
|||
.sp
|
||||
PCRE2_ANCHORED Force pattern anchoring
|
||||
PCRE2_ALT_BSUX Alternative handling of \eu, \eU, and \ex
|
||||
PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode
|
||||
PCRE2_AUTO_CALLOUT Compile automatic callouts
|
||||
PCRE2_CASELESS Do caseless matching
|
||||
PCRE2_DOLLAR_ENDONLY $ not to match newline at end
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2API 3 "22 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1043,6 +1043,15 @@ hexadecimal digits, in which case the hexadecimal number defines the code point
|
|||
to match. By default, as in Perl, a hexadecimal number is always expected after
|
||||
\ex, but it may have zero, one, or two digits (so, for example, \exz matches a
|
||||
binary zero character followed by z).
|
||||
.sp
|
||||
PCRE2_ALT_CIRCUMFLEX
|
||||
.sp
|
||||
In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter
|
||||
matches at the start of the subject (unless PCRE2_NOTBOL is set), and also
|
||||
after any internal newline. However, it does not match after a newline at the
|
||||
end of the subject, for compatibility with Perl. If you want a multiline
|
||||
circumflex also to match after a terminating newline, you must set
|
||||
PCRE2_ALT_CIRCUMFLEX.
|
||||
.sp
|
||||
PCRE2_AUTO_CALLOUT
|
||||
.sp
|
||||
|
@ -1147,8 +1156,12 @@ When PCRE2_MULTILINE is set, the "start of line" and "end of line"
|
|||
constructs match immediately following or immediately before internal newlines
|
||||
in the subject string, respectively, as well as at the very start and end. This
|
||||
is equivalent to Perl's /m option, and it can be changed within a pattern by a
|
||||
(?m) option setting. If there are no newlines in a subject string, or no
|
||||
occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect.
|
||||
(?m) option setting. Note that the "start of line" metacharacter does not match
|
||||
after a newline at the end of the subject, for compatibility with Perl.
|
||||
However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If
|
||||
there are no newlines in a subject string, or no occurrences of ^ or $ in a
|
||||
pattern, setting PCRE2_MULTILINE has no effect.
|
||||
|
||||
.sp
|
||||
PCRE2_NEVER_BACKSLASH_C
|
||||
.sp
|
||||
|
@ -2927,6 +2940,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 22 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2PATTERN 3 "22 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -351,7 +351,7 @@ than the binary character it represents:
|
|||
\eo{ddd..} character with octal code ddd..
|
||||
\exhh character with hex code hh
|
||||
\ex{hhh..} character with hex code hhh.. (default mode)
|
||||
\euhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set)
|
||||
\euhhhh character with hex code hhhh (only when PCRE2_ALT_BSUX is set)
|
||||
.sp
|
||||
The precise effect of \ecx on ASCII characters is as follows: if x is a lower
|
||||
case letter, it is converted to upper case. Then bit 6 of the character (hex
|
||||
|
@ -1110,14 +1110,18 @@ regular expression.
|
|||
.sp
|
||||
The circumflex and dollar metacharacters are zero-width assertions. That is,
|
||||
they test for a particular condition being true without consuming any
|
||||
characters from the subject string.
|
||||
characters from the subject string. These two metacharacters are concerned with
|
||||
matching the starts and ends of lines. If the newline convention is set so that
|
||||
only the two-character sequence CRLF is recognized as a newline, isolated CR
|
||||
and LF characters are treated as ordinary data characters, and are not
|
||||
recognized as newlines.
|
||||
.P
|
||||
Outside a character class, in the default matching mode, the circumflex
|
||||
character is an assertion that is true only if the current matching point is at
|
||||
the start of the subject string. If the \fIstartoffset\fP argument of
|
||||
\fBpcre2_match()\fP is non-zero, circumflex can never match if the
|
||||
PCRE2_MULTILINE option is unset. Inside a character class, circumflex has an
|
||||
entirely different meaning
|
||||
\fBpcre2_match()\fP is non-zero, or if PCRE2_NOTBOL is set, circumflex can
|
||||
never match if the PCRE2_MULTILINE option is unset. Inside a character class,
|
||||
circumflex has an entirely different meaning
|
||||
.\" HTML <a href="#characterclass">
|
||||
.\" </a>
|
||||
(see below).
|
||||
|
@ -1133,22 +1137,23 @@ to be anchored.)
|
|||
.P
|
||||
The dollar character is an assertion that is true only if the current matching
|
||||
point is at the end of the subject string, or immediately before a newline at
|
||||
the end of the string (by default). Note, however, that it does not actually
|
||||
match the newline. Dollar need not be the last character of the pattern if a
|
||||
number of alternatives are involved, but it should be the last item in any
|
||||
branch in which it appears. Dollar has no special meaning in a character class.
|
||||
the end of the string (by default), unless PCRE2_NOTEOL is set. Note, however,
|
||||
that it does not actually match the newline. Dollar need not be the last
|
||||
character of the pattern if a number of alternatives are involved, but it
|
||||
should be the last item in any branch in which it appears. Dollar has no
|
||||
special meaning in a character class.
|
||||
.P
|
||||
The meaning of dollar can be changed so that it matches only at the very end of
|
||||
the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This
|
||||
does not affect the \eZ assertion.
|
||||
.P
|
||||
The meanings of the circumflex and dollar characters are changed if the
|
||||
PCRE2_MULTILINE option is set. When this is the case, a circumflex matches
|
||||
immediately after internal newlines as well as at the start of the subject
|
||||
string. It does not match after a newline that ends the string. A dollar
|
||||
matches before any newlines in the string, as well as at the very end, when
|
||||
PCRE2_MULTILINE is set. When newline is specified as the two-character
|
||||
sequence CRLF, isolated CR and LF characters do not indicate newlines.
|
||||
The meanings of the circumflex and dollar metacharacters are changed if the
|
||||
PCRE2_MULTILINE option is set. When this is the case, a dollar character
|
||||
matches before any newlines in the string, as well as at the very end, and a
|
||||
circumflex matches immediately after internal newlines as well as at the start
|
||||
of the subject string. It does not match after a newline that ends the string,
|
||||
for compatibility with Perl. However, this can be changed by setting the
|
||||
PCRE2_ALT_CIRCUMFLEX option.
|
||||
.P
|
||||
For example, the pattern /^abc$/ matches the subject string "def\enabc" (where
|
||||
\en represents a newline) in multiline mode, but not otherwise. Consequently,
|
||||
|
@ -3332,6 +3337,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 22 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2SYNTAX 3 "13 April 2015" "PCRE2 10.20"
|
||||
.TH PCRE2SYNTAX 3 "22 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY"
|
||||
|
@ -32,11 +32,17 @@ documentation. This document contains a quick-reference summary of the syntax.
|
|||
\e0dd character with octal code 0dd
|
||||
\eddd character with octal code ddd, or backreference
|
||||
\eo{ddd..} character with octal code ddd..
|
||||
\exhh character with hex code hh
|
||||
\eU "U" if PCRE2_ALT_BSUX is set (otherwise is an error)
|
||||
\euhhhh character with hex code hhhh (if PCRE2_ALT_BSUX is set)
|
||||
\exhh character with hex code hh
|
||||
\ex{hhh..} character with hex code hhh..
|
||||
.sp
|
||||
Note that \e0dd is always an octal code, and that \e8 and \e9 are the literal
|
||||
characters "8" and "9".
|
||||
characters "8" and "9". When \ex is not followed by {, from zero to two
|
||||
hexadecimal digits are read, but if PCRE2_ALT_BSUX is set, \ex must be followed
|
||||
by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise
|
||||
it matches a literal "x". Likewise, if \eu (in PCRE2_ALT_BSUX mode) is not followed
|
||||
by four hexadecimal digits, it matches a literal "u".
|
||||
.
|
||||
.
|
||||
.SH "CHARACTER TYPES"
|
||||
|
@ -322,13 +328,14 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use
|
|||
\eb word boundary
|
||||
\eB not a word boundary
|
||||
^ start of subject
|
||||
also after internal newline in multiline mode
|
||||
also after an internal newline in multiline mode
|
||||
(after any newline if PCRE2_ALT_CIRCUMFLEX is set)
|
||||
\eA start of subject
|
||||
$ end of subject
|
||||
also before newline at end of subject
|
||||
also before internal newline in multiline mode
|
||||
also before newline at end of subject
|
||||
also before internal newline in multiline mode
|
||||
\eZ end of subject
|
||||
also before newline at end of subject
|
||||
also before newline at end of subject
|
||||
\ez end of subject
|
||||
\eG first matching position in subject
|
||||
.
|
||||
|
@ -549,6 +556,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 22 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "13 April 2015" "PCRE 10.20"
|
||||
.TH PCRE2TEST 1 "22 April 2015" "PCRE2 10.20"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -437,6 +437,7 @@ for a description of their effects.
|
|||
.sp
|
||||
allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS
|
||||
alt_bsux set PCRE2_ALT_BSUX
|
||||
alt_circumflex set PCRE2_ALT_CIRCUMFLEX
|
||||
anchored set PCRE2_ANCHORED
|
||||
auto_callout set PCRE2_AUTO_CALLOUT
|
||||
/i caseless set PCRE2_CASELESS
|
||||
|
@ -1444,6 +1445,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 13 April 2015
|
||||
Last updated: 22 April 2015
|
||||
Copyright (c) 1997-2015 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -119,6 +119,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_UNGREEDY 0x00040000u /* C */
|
||||
#define PCRE2_UTF 0x00080000u /* C J M D */
|
||||
#define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */
|
||||
#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
|
@ -126,9 +127,10 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_JIT_PARTIAL_SOFT 0x00000002u
|
||||
#define PCRE2_JIT_PARTIAL_HARD 0x00000004u
|
||||
|
||||
/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED,
|
||||
and PCRE2_NO_UTF_CHECK can also be passed to these functions, so take care not
|
||||
to define synonyms by mistake. */
|
||||
/* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note
|
||||
that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
|
||||
functions (though pcre2_jit_match() ignores the latter since it bypasses all
|
||||
sanity checks). */
|
||||
|
||||
#define PCRE2_NOTBOL 0x00000001u
|
||||
#define PCRE2_NOTEOL 0x00000002u
|
||||
|
|
|
@ -553,9 +553,9 @@ static PCRE2_SPTR posix_substitutes[] = {
|
|||
/* Masks for checking option settings. */
|
||||
|
||||
#define PUBLIC_COMPILE_OPTIONS \
|
||||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_AUTO_CALLOUT| \
|
||||
PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \
|
||||
PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
(PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \
|
||||
PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \
|
||||
PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \
|
||||
PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \
|
||||
PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \
|
||||
PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \
|
||||
|
|
|
@ -802,7 +802,8 @@ for (;;)
|
|||
/*-----------------------------------------------------------------*/
|
||||
case OP_CIRCM:
|
||||
if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
|
||||
(ptr != end_subject && WAS_NEWLINE(ptr)))
|
||||
((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
|
||||
&& WAS_NEWLINE(ptr)))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
break;
|
||||
|
||||
|
|
|
@ -2156,13 +2156,16 @@ for (;;)
|
|||
ecode++;
|
||||
break;
|
||||
|
||||
/* Multiline mode: start of subject unless notbol, or after any newline. */
|
||||
/* Multiline mode: start of subject unless notbol, or after any newline
|
||||
except for one at the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */
|
||||
|
||||
case OP_CIRCM:
|
||||
if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
if (eptr != mb->start_subject &&
|
||||
(eptr == mb->end_subject || !WAS_NEWLINE(eptr)))
|
||||
((eptr == mb->end_subject &&
|
||||
(mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) ||
|
||||
!WAS_NEWLINE(eptr)))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
ecode++;
|
||||
break;
|
||||
|
|
|
@ -487,6 +487,7 @@ static modstruct modlist[] = {
|
|||
{ "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) },
|
||||
{ "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) },
|
||||
{ "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) },
|
||||
{ "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) },
|
||||
{ "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) },
|
||||
{ "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) },
|
||||
{ "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) },
|
||||
|
@ -3460,9 +3461,10 @@ static void
|
|||
show_compile_options(uint32_t options, const char *before, const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "",
|
||||
((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "",
|
||||
((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "",
|
||||
((options & PCRE2_ANCHORED) != 0)? " anchored" : "",
|
||||
((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "",
|
||||
|
|
|
@ -4273,4 +4273,10 @@ a random value. /Ix
|
|||
/(abc)*/
|
||||
\[abc]{5}
|
||||
|
||||
/^/gm
|
||||
\n\n\n
|
||||
|
||||
/^/gm,alt_circumflex
|
||||
\n\n\n
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -4850,4 +4850,10 @@
|
|||
bbb
|
||||
aaa
|
||||
|
||||
/^/gm
|
||||
\n\n\n
|
||||
|
||||
/^/gm,alt_circumflex
|
||||
\n\n\n
|
||||
|
||||
# End of testinput6
|
||||
|
|
|
@ -14305,4 +14305,17 @@ No match
|
|||
0: abcabcabcabcabc
|
||||
1: abc
|
||||
|
||||
/^/gm
|
||||
\n\n\n
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
|
||||
/^/gm,alt_circumflex
|
||||
\n\n\n
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -7925,4 +7925,17 @@ Callout (5): 'x\x00z'
|
|||
aaa
|
||||
No match
|
||||
|
||||
/^/gm
|
||||
\n\n\n
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
|
||||
/^/gm,alt_circumflex
|
||||
\n\n\n
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
0:
|
||||
|
||||
# End of testinput6
|
||||
|
|
Loading…
Reference in New Issue