Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF
This commit is contained in:
parent
8800191109
commit
69254c77f1
|
@ -20,6 +20,8 @@ wrong library in some environments.
|
|||
|
||||
5. Fix an xclass matching issue in JIT.
|
||||
|
||||
6. Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF (see Bugzilla 2315).
|
||||
|
||||
|
||||
Version 10.32 10-September-2018
|
||||
-------------------------------
|
||||
|
|
|
@ -32,6 +32,7 @@ options are:
|
|||
<pre>
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n
|
||||
PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines
|
||||
PCRE2_EXTRA_MATCH_WORD Pattern matches "words"
|
||||
</pre>
|
||||
|
|
|
@ -1872,6 +1872,14 @@ treated as single-character escapes. For example, \j is a literal "j" and
|
|||
\x{2z} is treated as the literal string "x{2z}". Setting this option means
|
||||
that typos in patterns may go undetected and have unexpected results. This is a
|
||||
dangerous option. Use with care.
|
||||
<pre>
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
</pre>
|
||||
There are some legacy applications where the escape sequence \r in a pattern
|
||||
is expected to match a newline. If this option is set, \r in a pattern is
|
||||
converted to \n so that it matches a LF (linefeed) instead of a CR (carriage
|
||||
return) character. The option does not affect a literal CR in the pattern, nor
|
||||
does it affect CR specified as an explicit code point such as \x{0D}.
|
||||
<pre>
|
||||
PCRE2_EXTRA_MATCH_LINE
|
||||
</pre>
|
||||
|
@ -3724,7 +3732,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC42" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 18 September 2018
|
||||
Last updated: 21 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -393,7 +393,7 @@ these escapes are as follows:
|
|||
\e escape (hex 1B)
|
||||
\f form feed (hex 0C)
|
||||
\n linefeed (hex 0A)
|
||||
\r carriage return (hex 0D)
|
||||
\r carriage return (hex 0D) (but see below)
|
||||
\t tab (hex 09)
|
||||
\0dd character with octal code 0dd
|
||||
\ddd character with octal code ddd, or backreference
|
||||
|
@ -403,6 +403,12 @@ these escapes are as follows:
|
|||
\N{U+hhh..} character with Unicode hex code point hhh..
|
||||
\uhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set)
|
||||
</pre>
|
||||
There are some legacy applications where the escape sequence \r is expected to
|
||||
match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \r in a
|
||||
pattern is converted to \n so that it matches a LF (linefeed) instead of a CR
|
||||
(carriage return) character.
|
||||
</P>
|
||||
<P>
|
||||
The \N{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
|
||||
is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
|
||||
\N{name} to specify characters by Unicode name; PCRE2 does not support this.
|
||||
|
@ -3624,7 +3630,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC30" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 04 September 2018
|
||||
Last updated: 21 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -606,6 +606,7 @@ for a description of the effects of these options.
|
|||
/s dotall set PCRE2_DOTALL
|
||||
dupnames set PCRE2_DUPNAMES
|
||||
endanchored set PCRE2_ENDANCHORED
|
||||
escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
/x extended set PCRE2_EXTENDED
|
||||
/xx extended_more set PCRE2_EXTENDED_MORE
|
||||
firstline set PCRE2_FIRSTLINE
|
||||
|
@ -2039,7 +2040,7 @@ Cambridge, England.
|
|||
</P>
|
||||
<br><a name="SEC21" href="#TOC1">REVISION</a><br>
|
||||
<P>
|
||||
Last updated: 17 September 2018
|
||||
Last updated: 21 September 2018
|
||||
<br>
|
||||
Copyright © 1997-2018 University of Cambridge.
|
||||
<br>
|
||||
|
|
|
@ -1849,6 +1849,15 @@ COMPILING A PATTERN
|
|||
option means that typos in patterns may go undetected and have unex-
|
||||
pected results. This is a dangerous option. Use with care.
|
||||
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
|
||||
There are some legacy applications where the escape sequence \r in a
|
||||
pattern is expected to match a newline. If this option is set, \r in a
|
||||
pattern is converted to \n so that it matches a LF (linefeed) instead
|
||||
of a CR (carriage return) character. The option does not affect a lit-
|
||||
eral CR in the pattern, nor does it affect CR specified as an explicit
|
||||
code point such as \x{0D}.
|
||||
|
||||
PCRE2_EXTRA_MATCH_LINE
|
||||
|
||||
This option is provided for use by the -x option of pcre2grep. It
|
||||
|
@ -3598,7 +3607,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 18 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
@ -6247,7 +6256,7 @@ BACKSLASH
|
|||
\e escape (hex 1B)
|
||||
\f form feed (hex 0C)
|
||||
\n linefeed (hex 0A)
|
||||
\r carriage return (hex 0D)
|
||||
\r carriage return (hex 0D) (but see below)
|
||||
\t tab (hex 09)
|
||||
\0dd character with octal code 0dd
|
||||
\ddd character with octal code ddd, or backreference
|
||||
|
@ -6257,6 +6266,11 @@ BACKSLASH
|
|||
\N{U+hhh..} character with Unicode hex code point hhh..
|
||||
\uhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set)
|
||||
|
||||
There are some legacy applications where the escape sequence \r is
|
||||
expected to match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option
|
||||
is set, \r in a pattern is converted to \n so that it matches a LF
|
||||
(linefeed) instead of a CR (carriage return) character.
|
||||
|
||||
The \N{U+hhh..} escape sequence is recognized only when the PCRE2_UTF
|
||||
option is set, that is, when PCRE2 is operating in a Unicode mode. Perl
|
||||
also uses \N{name} to specify characters by Unicode name; PCRE2 does
|
||||
|
@ -9165,7 +9179,7 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 04 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "16 June 2017" "PCRE2 10.30"
|
||||
.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "21 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH SYNOPSIS
|
||||
|
@ -24,6 +24,7 @@ options are:
|
|||
.\" JOIN
|
||||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as
|
||||
a literal following character
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \er as \en
|
||||
PCRE2_EXTRA_MATCH_LINE Pattern matches whole lines
|
||||
PCRE2_EXTRA_MATCH_WORD Pattern matches "words"
|
||||
.sp
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2API 3 "18 September 2018" "PCRE2 10.33"
|
||||
.TH PCRE2API 3 "21 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.sp
|
||||
|
@ -1824,6 +1824,14 @@ treated as single-character escapes. For example, \ej is a literal "j" and
|
|||
\ex{2z} is treated as the literal string "x{2z}". Setting this option means
|
||||
that typos in patterns may go undetected and have unexpected results. This is a
|
||||
dangerous option. Use with care.
|
||||
.sp
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
.sp
|
||||
There are some legacy applications where the escape sequence \er in a pattern
|
||||
is expected to match a newline. If this option is set, \er in a pattern is
|
||||
converted to \en so that it matches a LF (linefeed) instead of a CR (carriage
|
||||
return) character. The option does not affect a literal CR in the pattern, nor
|
||||
does it affect CR specified as an explicit code point such as \ex{0D}.
|
||||
.sp
|
||||
PCRE2_EXTRA_MATCH_LINE
|
||||
.sp
|
||||
|
@ -3729,6 +3737,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 18 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2PATTERN 3 "04 September 2018" "PCRE2 10.32"
|
||||
.TH PCRE2PATTERN 3 "21 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||
.SH "PCRE2 REGULAR EXPRESSION DETAILS"
|
||||
|
@ -370,7 +370,7 @@ these escapes are as follows:
|
|||
\ee escape (hex 1B)
|
||||
\ef form feed (hex 0C)
|
||||
\en linefeed (hex 0A)
|
||||
\er carriage return (hex 0D)
|
||||
\er carriage return (hex 0D) (but see below)
|
||||
\et tab (hex 09)
|
||||
\e0dd character with octal code 0dd
|
||||
\eddd character with octal code ddd, or backreference
|
||||
|
@ -380,6 +380,11 @@ these escapes are as follows:
|
|||
\eN{U+hhh..} character with Unicode hex code point hhh..
|
||||
\euhhhh character with hex code hhhh (when PCRE2_ALT_BSUX is set)
|
||||
.sp
|
||||
There are some legacy applications where the escape sequence \er is expected to
|
||||
match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \er in a
|
||||
pattern is converted to \en so that it matches a LF (linefeed) instead of a CR
|
||||
(carriage return) character.
|
||||
.P
|
||||
The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
|
||||
is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
|
||||
\eN{name} to specify characters by Unicode name; PCRE2 does not support this.
|
||||
|
@ -3655,6 +3660,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 04 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
.TH PCRE2TEST 1 "17 September 2018" "PCRE 10.33"
|
||||
.TH PCRE2TEST 1 "21 September 2018" "PCRE2 10.33"
|
||||
.SH NAME
|
||||
pcre2test - a program for testing Perl-compatible regular expressions.
|
||||
.SH SYNOPSIS
|
||||
|
@ -565,6 +565,7 @@ for a description of the effects of these options.
|
|||
/s dotall set PCRE2_DOTALL
|
||||
dupnames set PCRE2_DUPNAMES
|
||||
endanchored set PCRE2_ENDANCHORED
|
||||
escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
/x extended set PCRE2_EXTENDED
|
||||
/xx extended_more set PCRE2_EXTENDED_MORE
|
||||
firstline set PCRE2_FIRSTLINE
|
||||
|
@ -2021,6 +2022,6 @@ Cambridge, England.
|
|||
.rs
|
||||
.sp
|
||||
.nf
|
||||
Last updated: 17 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
.fi
|
||||
|
|
|
@ -544,6 +544,7 @@ PATTERN MODIFIERS
|
|||
/s dotall set PCRE2_DOTALL
|
||||
dupnames set PCRE2_DUPNAMES
|
||||
endanchored set PCRE2_ENDANCHORED
|
||||
escaped_cr_is_lf set PCRE2_EXTRA_ESCAPED_CR_IS_LF
|
||||
/x extended set PCRE2_EXTENDED
|
||||
/xx extended_more set PCRE2_EXTENDED_MORE
|
||||
firstline set PCRE2_FIRSTLINE
|
||||
|
@ -1852,5 +1853,5 @@ AUTHOR
|
|||
|
||||
REVISION
|
||||
|
||||
Last updated: 17 September 2018
|
||||
Last updated: 21 September 2018
|
||||
Copyright (c) 1997-2018 University of Cambridge.
|
||||
|
|
|
@ -158,6 +158,7 @@ D is inspected during pcre2_dfa_match() execution
|
|||
#define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */
|
||||
#define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */
|
||||
#define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */
|
||||
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
|
|
|
@ -714,7 +714,8 @@ are allowed. */
|
|||
|
||||
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
|
||||
(PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
|
||||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
|
||||
PCRE2_EXTRA_ESCAPED_CR_IS_LF)
|
||||
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the tables called eint1 and
|
||||
|
@ -1398,7 +1399,7 @@ Arguments:
|
|||
errorcodeptr points to the errorcode variable (containing zero)
|
||||
options the current options bits
|
||||
isclass TRUE if inside a character class
|
||||
cb compile data block
|
||||
cb compile data block or NULL when called from pcre2_substitute()
|
||||
|
||||
Returns: zero => a data character
|
||||
positive => a special escape sequence
|
||||
|
@ -1429,14 +1430,26 @@ GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
|
|||
|
||||
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
|
||||
value test saves a memory lookup for code points outside the alphanumeric
|
||||
range. Otherwise, do a table lookup. A non-zero result is something that can be
|
||||
returned immediately. Otherwise further processing is required. */
|
||||
range. */
|
||||
|
||||
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {} /* Definitely literal */
|
||||
|
||||
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
|
||||
positive value is a literal value for something like \n. A negative value is
|
||||
the negation of one of the ESC_ macros that is passed back for handling by the
|
||||
calling function. Some extra checking is needed for \N because only \N{U+dddd}
|
||||
is supported. If the value is zero, further processing is handled below. */
|
||||
|
||||
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
|
||||
{
|
||||
if (i > 0) c = (uint32_t)i; else /* Positive is a data character */
|
||||
if (i > 0)
|
||||
{
|
||||
c = (uint32_t)i;
|
||||
if (cb != NULL && c == CHAR_CR &&
|
||||
(cb->cx->extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
|
||||
c = CHAR_LF;
|
||||
}
|
||||
else /* Negative table entry */
|
||||
{
|
||||
escape = -i; /* Else return a special escape */
|
||||
if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
|
||||
|
@ -1486,9 +1499,9 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
|
|||
}
|
||||
}
|
||||
|
||||
/* Escapes that need further processing, including those that are unknown.
|
||||
When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
|
||||
when BSUX is set). */
|
||||
/* Escapes that need further processing, including those that are unknown, have
|
||||
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
|
||||
\o, and \x are recognized (and \u when BSUX is set). */
|
||||
|
||||
else
|
||||
{
|
||||
|
|
|
@ -629,6 +629,7 @@ static modstruct modlist[] = {
|
|||
{ "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) },
|
||||
{ "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) },
|
||||
{ "endanchored", MOD_PD, MOD_OPT, PCRE2_ENDANCHORED, PD(options) },
|
||||
{ "escaped_cr_is_lf", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ESCAPED_CR_IS_LF, CO(extra_options) },
|
||||
{ "expand", MOD_PAT, MOD_CTL, CTL_EXPAND, PO(control) },
|
||||
{ "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) },
|
||||
{ "extended_more", MOD_PATP, MOD_OPT, PCRE2_EXTENDED_MORE, PO(options) },
|
||||
|
@ -4173,12 +4174,13 @@ show_compile_extra_options(uint32_t options, const char *before,
|
|||
const char *after)
|
||||
{
|
||||
if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
|
||||
else fprintf(outfile, "%s%s%s%s%s%s",
|
||||
else fprintf(outfile, "%s%s%s%s%s%s%s",
|
||||
before,
|
||||
((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
|
||||
((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
|
||||
((options & PCRE2_EXTRA_MATCH_WORD) != 0)? " match_word" : "",
|
||||
((options & PCRE2_EXTRA_MATCH_LINE) != 0)? " match_line" : "",
|
||||
((options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)? " escaped_cr_is_lf" : "",
|
||||
after);
|
||||
}
|
||||
|
||||
|
|
|
@ -5517,4 +5517,12 @@ a)"xI
|
|||
/a(b)c|xyz/g,replace=<$0>,substitute_callout
|
||||
abcdefabcpqr
|
||||
|
||||
/abc\rdef/
|
||||
abc\ndef
|
||||
|
||||
/abc\rdef\x{0d}xyz/escaped_cr_is_lf
|
||||
abc\ndef\rxyz
|
||||
\= Expect no match
|
||||
abc\ndef\nxyz
|
||||
|
||||
# End of testinput2
|
||||
|
|
|
@ -16801,6 +16801,17 @@ Old 0 3 New 0 5
|
|||
Old 6 9 New 8 13
|
||||
2: <abc>def<abc>pqr
|
||||
|
||||
/abc\rdef/
|
||||
abc\ndef
|
||||
No match
|
||||
|
||||
/abc\rdef\x{0d}xyz/escaped_cr_is_lf
|
||||
abc\ndef\rxyz
|
||||
0: abc\x0adef\x0dxyz
|
||||
\= Expect no match
|
||||
abc\ndef\nxyz
|
||||
No match
|
||||
|
||||
# End of testinput2
|
||||
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
|
||||
Error -62: bad serialized data
|
||||
|
|
Loading…
Reference in New Issue