diff --git a/ChangeLog b/ChangeLog index 647c675..cfcb9b6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -92,6 +92,8 @@ memory if the replication required a buffer to be extended, and it was not working properly in 16-bit and 32-bit modes. This issue was discovered by a fuzzer: see http://lcamtuf.coredump.cx/afl/. +23. Added the PCRE2_ALT_CIRCUMFLEX option. + Version 10.10 06-March-2015 --------------------------- diff --git a/doc/pcre2_compile.3 b/doc/pcre2_compile.3 index 3173804..60247cd 100644 --- a/doc/pcre2_compile.3 +++ b/doc/pcre2_compile.3 @@ -1,4 +1,4 @@ -.TH PCRE2_COMPILE 3 "13 April 2015" "PCRE2 10.20" +.TH PCRE2_COMPILE 3 "22 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH SYNOPSIS @@ -37,6 +37,7 @@ or provide an external function for stack size checking. The option bits are: .sp PCRE2_ANCHORED Force pattern anchoring PCRE2_ALT_BSUX Alternative handling of \eu, \eU, and \ex + PCRE2_ALT_CIRCUMFLEX Alternative handling of ^ in multiline mode PCRE2_AUTO_CALLOUT Compile automatic callouts PCRE2_CASELESS Do caseless matching PCRE2_DOLLAR_ENDONLY $ not to match newline at end diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 668db78..e8b44ff 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "13 April 2015" "PCRE2 10.20" +.TH PCRE2API 3 "22 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1043,6 +1043,15 @@ hexadecimal digits, in which case the hexadecimal number defines the code point to match. By default, as in Perl, a hexadecimal number is always expected after \ex, but it may have zero, one, or two digits (so, for example, \exz matches a binary zero character followed by z). +.sp + PCRE2_ALT_CIRCUMFLEX +.sp +In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter +matches at the start of the subject (unless PCRE2_NOTBOL is set), and also +after any internal newline. 
However, it does not match after a newline at the +end of the subject, for compatibility with Perl. If you want a multiline +circumflex also to match after a terminating newline, you must set +PCRE2_ALT_CIRCUMFLEX. .sp PCRE2_AUTO_CALLOUT .sp @@ -1147,8 +1156,12 @@ When PCRE2_MULTILINE it is set, the "start of line" and "end of line" constructs match immediately following or immediately before internal newlines in the subject string, respectively, as well as at the very start and end. This is equivalent to Perl's /m option, and it can be changed within a pattern by a -(?m) option setting. If there are no newlines in a subject string, or no -occurrences of ^ or $ in a pattern, setting PCRE2_MULTILINE has no effect. +(?m) option setting. Note that the "start of line" metacharacter does not match +after a newline at the end of the subject, for compatibility with Perl. +However, you can change this by setting the PCRE2_ALT_CIRCUMFLEX option. If +there are no newlines in a subject string, or no occurrences of ^ or $ in a +pattern, setting PCRE2_MULTILINE has no effect. + .sp PCRE2_NEVER_BACKSLASH_C .sp @@ -2927,6 +2940,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 April 2015 +Last updated: 22 April 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index cd52503..9a18ca5 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "13 April 2015" "PCRE2 10.20" +.TH PCRE2PATTERN 3 "22 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -351,7 +351,7 @@ than the binary character it represents: \eo{ddd..} character with octal code ddd.. \exhh character with hex code hh \ex{hhh..} character with hex code hhh.. 
(default mode) - \euhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set) + \euhhhh character with hex code hhhh (only when PCRE2_ALT_BSUX is set) .sp The precise effect of \ecx on ASCII characters is as follows: if x is a lower case letter, it is converted to upper case. Then bit 6 of the character (hex @@ -1110,14 +1110,18 @@ regular expression. .sp The circumflex and dollar metacharacters are zero-width assertions. That is, they test for a particular condition being true without consuming any -characters from the subject string. +characters from the subject string. These two metacharacters are concerned with +matching the starts and ends of lines. If the newline convention is set so that +only the two-character sequence CRLF is recognized as a newline, isolated CR +and LF characters are treated as ordinary data characters, and are not +recognized as newlines. .P Outside a character class, in the default matching mode, the circumflex character is an assertion that is true only if the current matching point is at the start of the subject string. If the \fIstartoffset\fP argument of -\fBpcre2_match()\fP is non-zero, circumflex can never match if the -PCRE2_MULTILINE option is unset. Inside a character class, circumflex has an -entirely different meaning +\fBpcre2_match()\fP is non-zero, or if PCRE2_NOTBOL is set, circumflex can +never match if the PCRE2_MULTILINE option is unset. Inside a character class, +circumflex has an entirely different meaning .\" HTML .\" (see below). @@ -1133,22 +1137,23 @@ to be anchored.) .P The dollar character is an assertion that is true only if the current matching point is at the end of the subject string, or immediately before a newline at -the end of the string (by default). Note, however, that it does not actually -match the newline. Dollar need not be the last character of the pattern if a -number of alternatives are involved, but it should be the last item in any -branch in which it appears. 
Dollar has no special meaning in a character class. +the end of the string (by default), unless PCRE2_NOTEOL is set. Note, however, +that it does not actually match the newline. Dollar need not be the last +character of the pattern if a number of alternatives are involved, but it +should be the last item in any branch in which it appears. Dollar has no +special meaning in a character class. .P The meaning of dollar can be changed so that it matches only at the very end of the string, by setting the PCRE2_DOLLAR_ENDONLY option at compile time. This does not affect the \eZ assertion. .P -The meanings of the circumflex and dollar characters are changed if the -PCRE2_MULTILINE option is set. When this is the case, a circumflex matches -immediately after internal newlines as well as at the start of the subject -string. It does not match after a newline that ends the string. A dollar -matches before any newlines in the string, as well as at the very end, when -PCRE2_MULTILINE is set. When newline is specified as the two-character -sequence CRLF, isolated CR and LF characters do not indicate newlines. +The meanings of the circumflex and dollar metacharacters are changed if the +PCRE2_MULTILINE option is set. When this is the case, a dollar character +matches before any newlines in the string, as well as at the very end, and a +circumflex matches immediately after internal newlines as well as at the start +of the subject string. It does not match after a newline that ends the string, +for compatibility with Perl. However, this can be changed by setting the +PCRE2_ALT_CIRCUMFLEX option. .P For example, the pattern /^abc$/ matches the subject string "def\enabc" (where \en represents a newline) in multiline mode, but not otherwise. Consequently, @@ -3332,6 +3337,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 April 2015 +Last updated: 22 April 2015 Copyright (c) 1997-2015 University of Cambridge. 
.fi diff --git a/doc/pcre2syntax.3 b/doc/pcre2syntax.3 index 10a1584..bd5eabc 100644 --- a/doc/pcre2syntax.3 +++ b/doc/pcre2syntax.3 @@ -1,4 +1,4 @@ -.TH PCRE2SYNTAX 3 "13 April 2015" "PCRE2 10.20" +.TH PCRE2SYNTAX 3 "22 April 2015" "PCRE2 10.20" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" @@ -32,11 +32,17 @@ documentation. This document contains a quick-reference summary of the syntax. \e0dd character with octal code 0dd \eddd character with octal code ddd, or backreference \eo{ddd..} character with octal code ddd.. - \exhh character with hex code hh + \eU "U" if PCRE2_ALT_BSUX is set (otherwise is an error) + \euhhhh character with hex code hhhh (if PCRE2_ALT_BSUX is set) + \exhh character with hex code hh \ex{hhh..} character with hex code hhh.. .sp Note that \e0dd is always an octal code, and that \e8 and \e9 are the literal -characters "8" and "9". +characters "8" and "9". When \ex is not followed by {, from zero to two +hexadecimal digits are read, but if PCRE2_ALT_BSUX is set, \ex must be followed +by two hexadecimal digits to be recognized as a hexadecimal escape; otherwise +it matches a literal "x". Likewise, if \eu (in ALT_BSUX mode) is not followed +by four hexadecimal digits, it matches a literal "u". . . .SH "CHARACTER TYPES" @@ -322,13 +328,14 @@ but some of them use Unicode properties if PCRE2_UCP is set. 
You can use \eb word boundary \eB not a word boundary ^ start of subject - also after internal newline in multiline mode + also after an internal newline in multiline mode + (after any newline if PCRE2_ALT_CIRCUMFLEX is set) \eA start of subject $ end of subject - also before newline at end of subject - also before internal newline in multiline mode + also before newline at end of subject + also before internal newline in multiline mode \eZ end of subject - also before newline at end of subject + also before newline at end of subject \ez end of subject \eG first matching position in subject . @@ -549,6 +556,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 April 2015 +Last updated: 22 April 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 63e3da6..1a5741e 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "13 April 2015" "PCRE 10.20" +.TH PCRE2TEST 1 "22 April 2015" "PCRE2 10.20" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -437,6 +437,7 @@ for a description of their effects. .sp allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS alt_bsux set PCRE2_ALT_BSUX + alt_circumflex set PCRE2_ALT_CIRCUMFLEX anchored set PCRE2_ANCHORED auto_callout set PCRE2_AUTO_CALLOUT /i caseless set PCRE2_CASELESS @@ -1444,6 +1445,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 13 April 2015 +Last updated: 22 April 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2.h.in b/src/pcre2.h.in index a752114..f6255ac 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -119,6 +119,7 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_UNGREEDY 0x00040000u /* C */ #define PCRE2_UTF 0x00080000u /* C J M D */ #define PCRE2_NEVER_BACKSLASH_C 0x00100000u /* C */ +#define PCRE2_ALT_CIRCUMFLEX 0x00200000u /* J M D */ /* These are for pcre2_jit_compile().
*/ @@ -126,9 +127,10 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_JIT_PARTIAL_SOFT 0x00000002u #define PCRE2_JIT_PARTIAL_HARD 0x00000004u -/* These are for pcre2_match() and pcre2_dfa_match(). Note that PCRE2_ANCHORED, -and PCRE2_NO_UTF_CHECK can also be passed to these functions, so take care not -to define synonyms by mistake. */ +/* These are for pcre2_match(), pcre2_dfa_match(), and pcre2_jit_match(). Note +that PCRE2_ANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these +functions (though pcre2_jit_match() ignores the latter since it bypasses all +sanity checks). */ #define PCRE2_NOTBOL 0x00000001u #define PCRE2_NOTEOL 0x00000002u diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 2276fff..c2405eb 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -553,9 +553,9 @@ static PCRE2_SPTR posix_substitutes[] = { /* Masks for checking option settings. */ #define PUBLIC_COMPILE_OPTIONS \ - (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_AUTO_CALLOUT| \ - PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL|PCRE2_DUPNAMES| \ - PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \ + (PCRE2_ANCHORED|PCRE2_ALLOW_EMPTY_CLASS|PCRE2_ALT_BSUX|PCRE2_ALT_CIRCUMFLEX| \ + PCRE2_AUTO_CALLOUT|PCRE2_CASELESS|PCRE2_DOLLAR_ENDONLY|PCRE2_DOTALL| \ + PCRE2_DUPNAMES|PCRE2_EXTENDED|PCRE2_FIRSTLINE|PCRE2_MATCH_UNSET_BACKREF| \ PCRE2_MULTILINE|PCRE2_NEVER_BACKSLASH_C|PCRE2_NEVER_UCP| \ PCRE2_NEVER_UTF|PCRE2_NO_AUTO_CAPTURE|PCRE2_NO_AUTO_POSSESS| \ PCRE2_NO_DOTSTAR_ANCHOR|PCRE2_NO_START_OPTIMIZE|PCRE2_NO_UTF_CHECK| \ diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 8f6ed62..45e97c1 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -802,7 +802,8 @@ for (;;) /*-----------------------------------------------------------------*/ case OP_CIRCM: if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) || - (ptr != end_subject && WAS_NEWLINE(ptr))) + ((ptr != end_subject || (mb->poptions & 
PCRE2_ALT_CIRCUMFLEX) != 0 ) + && WAS_NEWLINE(ptr))) { ADD_ACTIVE(state_offset + 1, 0); } break; diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 6719e40..e74f966 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -2156,13 +2156,16 @@ for (;;) ecode++; break; - /* Multiline mode: start of subject unless notbol, or after any newline. */ + /* Multiline mode: start of subject unless notbol, or after any newline + except for one at the very end, unless PCRE2_ALT_CIRCUMFLEX is set. */ case OP_CIRCM: if ((mb->moptions & PCRE2_NOTBOL) != 0 && eptr == mb->start_subject) RRETURN(MATCH_NOMATCH); if (eptr != mb->start_subject && - (eptr == mb->end_subject || !WAS_NEWLINE(eptr))) + ((eptr == mb->end_subject && + (mb->poptions & PCRE2_ALT_CIRCUMFLEX) == 0) || + !WAS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); ecode++; break; diff --git a/src/pcre2test.c b/src/pcre2test.c index 0552f84..1162244 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -487,6 +487,7 @@ static modstruct modlist[] = { { "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) }, { "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) }, { "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) }, + { "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) }, { "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) }, { "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) }, { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, @@ -3460,9 +3461,10 @@ static void show_compile_options(uint32_t options, const char *before, const char *after) { if (options == 0) fprintf(outfile, "%s %s", before, after); -else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", +else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", + ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? 
" allow_empty_class" : "", ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", ((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "", diff --git a/testdata/testinput2 b/testdata/testinput2 index 92f8882..9116d37 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4273,4 +4273,10 @@ a random value. /Ix /(abc)*/ \[abc]{5} +/^/gm + \n\n\n + +/^/gm,alt_circumflex + \n\n\n + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index de31b53..2651c91 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -4850,4 +4850,10 @@ bbb aaa +/^/gm + \n\n\n + +/^/gm,alt_circumflex + \n\n\n + # End of testinput6 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 053f65a..3ce9b71 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14305,4 +14305,17 @@ No match 0: abcabcabcabcabc 1: abc +/^/gm + \n\n\n + 0: + 0: + 0: + +/^/gm,alt_circumflex + \n\n\n + 0: + 0: + 0: + 0: + # End of testinput2 diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 3e33562..0fef124 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7925,4 +7925,17 @@ Callout (5): 'x\x00z' aaa No match +/^/gm + \n\n\n + 0: + 0: + 0: + +/^/gm,alt_circumflex + \n\n\n + 0: + 0: + 0: + 0: + # End of testinput6