Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF

2018-09-21 16:59:48 +00:00 · 2018-09-21 16:59:48 +00:00 · 69254c77f1
parent 8800191109
commit 69254c77f1
16 changed files with 1950 additions and 1867 deletions
--- a/ChangeLog
+++ b/ChangeLog
@ -20,6 +20,8 @@ wrong library in some environments.

 5. Fix an xclass matching issue in JIT.

+6. Implement PCRE2_EXTRA_ESCAPED_CR_IS_LF (see Bugzilla 2315).
+

 Version 10.32 10-September-2018
 -------------------------------
--- a/doc/html/pcre2_set_compile_extra_options.html
+++ b/doc/html/pcre2_set_compile_extra_options.html
@ -32,6 +32,7 @@ options are:
 <pre>
  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as a literal following character
+  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \r as \n
  PCRE2_EXTRA_MATCH_LINE               Pattern matches whole lines
  PCRE2_EXTRA_MATCH_WORD               Pattern matches "words"
 </pre>
--- a/doc/html/pcre2api.html
+++ b/doc/html/pcre2api.html
@ -1872,6 +1872,14 @@ treated as single-character escapes. For example, \j is a literal "j" and
 \x{2z} is treated as the literal string "x{2z}". Setting this option means
 that typos in patterns may go undetected and have unexpected results. This is a
 dangerous option. Use with care.
+<pre>
+  PCRE2_EXTRA_ESCAPED_CR_IS_LF
+</pre>
+There are some legacy applications where the escape sequence \r in a pattern 
+is expected to match a newline. If this option is set, \r in a pattern is 
+converted to \n so that it matches a LF (linefeed) instead of a CR (carriage 
+return) character. The option does not affect a literal CR in the pattern, nor
+does it affect CR specified as an explicit code point such as \x{0D}.
 <pre>
  PCRE2_EXTRA_MATCH_LINE
 </pre>
@ -3724,7 +3732,7 @@ Cambridge, England.
 </P>
 <br><a name="SEC42" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 18 September 2018
+Last updated: 21 September 2018
 <br>
 Copyright &copy; 1997-2018 University of Cambridge.
 <br>
--- a/doc/html/pcre2pattern.html
+++ b/doc/html/pcre2pattern.html
@ -393,7 +393,7 @@ these escapes are as follows:
  \e          escape (hex 1B)
  \f          form feed (hex 0C)
  \n          linefeed (hex 0A)
-  \r          carriage return (hex 0D)
+  \r          carriage return (hex 0D) (but see below)
  \t          tab (hex 09)
  \0dd        character with octal code 0dd
  \ddd        character with octal code ddd, or backreference
@ -403,6 +403,12 @@ these escapes are as follows:
  \N{U+hhh..} character with Unicode hex code point hhh..
  \uhhhh      character with hex code hhhh (when PCRE2_ALT_BSUX is set)
 </pre>
+There are some legacy applications where the escape sequence \r is expected to
+match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \r in a
+pattern is converted to \n so that it matches a LF (linefeed) instead of a CR
+(carriage return) character.
+</P>
+<P>
 The \N{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
 is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
 \N{name} to specify characters by Unicode name; PCRE2 does not support this.
@ -3624,7 +3630,7 @@ Cambridge, England.
 </P>
 <br><a name="SEC30" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 04 September 2018
+Last updated: 21 September 2018
 <br>
 Copyright &copy; 1997-2018 University of Cambridge.
 <br>
--- a/doc/html/pcre2test.html
+++ b/doc/html/pcre2test.html
@ -606,6 +606,7 @@ for a description of the effects of these options.
  /s  dotall                    set PCRE2_DOTALL
      dupnames                  set PCRE2_DUPNAMES
      endanchored               set PCRE2_ENDANCHORED
+      escaped_cr_is_lf          set PCRE2_EXTRA_ESCAPED_CR_IS_LF 
  /x  extended                  set PCRE2_EXTENDED
  /xx extended_more             set PCRE2_EXTENDED_MORE
      firstline                 set PCRE2_FIRSTLINE
@ -2039,7 +2040,7 @@ Cambridge, England.
 </P>
 <br><a name="SEC21" href="#TOC1">REVISION</a><br>
 <P>
-Last updated: 17 September 2018
+Last updated: 21 September 2018
 <br>
 Copyright &copy; 1997-2018 University of Cambridge.
 <br>
--- a/doc/pcre2.txt
+++ b/doc/pcre2.txt
--- a/doc/pcre2_set_compile_extra_options.3
+++ b/doc/pcre2_set_compile_extra_options.3
@ -1,4 +1,4 @@
-.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "16 June 2017" "PCRE2 10.30"
+.TH PCRE2_SET_COMPILE_EXTRA_OPTIONS 3 "21 September 2018" "PCRE2 10.33"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH SYNOPSIS
@ -24,6 +24,7 @@ options are:
 .\" JOIN
  PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    Treat all invalid escapes as
                                         a literal following character
+  PCRE2_EXTRA_ESCAPED_CR_IS_LF         Interpret \er as \en
  PCRE2_EXTRA_MATCH_LINE               Pattern matches whole lines
  PCRE2_EXTRA_MATCH_WORD               Pattern matches "words"
 .sp
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "18 September 2018" "PCRE2 10.33"
+.TH PCRE2API 3 "21 September 2018" "PCRE2 10.33"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -1824,6 +1824,14 @@ treated as single-character escapes. For example, \ej is a literal "j" and
 \ex{2z} is treated as the literal string "x{2z}". Setting this option means
 that typos in patterns may go undetected and have unexpected results. This is a
 dangerous option. Use with care.
+.sp
+  PCRE2_EXTRA_ESCAPED_CR_IS_LF
+.sp
+There are some legacy applications where the escape sequence \er in a pattern 
+is expected to match a newline. If this option is set, \er in a pattern is 
+converted to \en so that it matches a LF (linefeed) instead of a CR (carriage 
+return) character. The option does not affect a literal CR in the pattern, nor
+does it affect CR specified as an explicit code point such as \ex{0D}.
 .sp
  PCRE2_EXTRA_MATCH_LINE
 .sp
@ -3729,6 +3737,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 18 September 2018
+Last updated: 21 September 2018
 Copyright (c) 1997-2018 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "04 September 2018" "PCRE2 10.32"
+.TH PCRE2PATTERN 3 "21 September 2018" "PCRE2 10.33"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -370,7 +370,7 @@ these escapes are as follows:
  \ee          escape (hex 1B)
  \ef          form feed (hex 0C)
  \en          linefeed (hex 0A)
-  \er          carriage return (hex 0D)
+  \er          carriage return (hex 0D) (but see below)
  \et          tab (hex 09)
  \e0dd        character with octal code 0dd
  \eddd        character with octal code ddd, or backreference
@ -380,6 +380,11 @@ these escapes are as follows:
  \eN{U+hhh..} character with Unicode hex code point hhh..
  \euhhhh      character with hex code hhhh (when PCRE2_ALT_BSUX is set)
 .sp
+There are some legacy applications where the escape sequence \er is expected to
+match a newline. If the PCRE2_EXTRA_ESCAPED_CR_IS_LF option is set, \er in a
+pattern is converted to \en so that it matches a LF (linefeed) instead of a CR
+(carriage return) character.
+.P
 The \eN{U+hhh..} escape sequence is recognized only when the PCRE2_UTF option
 is set, that is, when PCRE2 is operating in a Unicode mode. Perl also uses
 \eN{name} to specify characters by Unicode name; PCRE2 does not support this.
@ -3655,6 +3660,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 04 September 2018
+Last updated: 21 September 2018
 Copyright (c) 1997-2018 University of Cambridge.
 .fi
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "17 September 2018" "PCRE 10.33"
+.TH PCRE2TEST 1 "21 September 2018" "PCRE2 10.33"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@ -565,6 +565,7 @@ for a description of the effects of these options.
  /s  dotall                    set PCRE2_DOTALL
      dupnames                  set PCRE2_DUPNAMES
      endanchored               set PCRE2_ENDANCHORED
+      escaped_cr_is_lf          set PCRE2_EXTRA_ESCAPED_CR_IS_LF 
  /x  extended                  set PCRE2_EXTENDED
  /xx extended_more             set PCRE2_EXTENDED_MORE
      firstline                 set PCRE2_FIRSTLINE
@ -2021,6 +2022,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 17 September 2018
+Last updated: 21 September 2018
 Copyright (c) 1997-2018 University of Cambridge.
 .fi
--- a/doc/pcre2test.txt
+++ b/doc/pcre2test.txt
@ -544,6 +544,7 @@ PATTERN MODIFIERS
         /s  dotall                    set PCRE2_DOTALL
             dupnames                  set PCRE2_DUPNAMES
             endanchored               set PCRE2_ENDANCHORED
+             escaped_cr_is_lf          set PCRE2_EXTRA_ESCAPED_CR_IS_LF
         /x  extended                  set PCRE2_EXTENDED
         /xx extended_more             set PCRE2_EXTENDED_MORE
             firstline                 set PCRE2_FIRSTLINE
@ -1852,5 +1853,5 @@ AUTHOR

 REVISION

-       Last updated: 17 September 2018
+       Last updated: 21 September 2018
       Copyright (c) 1997-2018 University of Cambridge.
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@ -158,6 +158,7 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    0x00000002u  /* C */
 #define PCRE2_EXTRA_MATCH_WORD               0x00000004u  /* C */
 #define PCRE2_EXTRA_MATCH_LINE               0x00000008u  /* C */
+#define PCRE2_EXTRA_ESCAPED_CR_IS_LF         0x00000010u  /* C */

 /* These are for pcre2_jit_compile(). */

--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -714,7 +714,8 @@ are allowed. */

 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
-    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL)
+    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
+    PCRE2_EXTRA_ESCAPED_CR_IS_LF)

 /* Compile time error code numbers. They are given names so that they can more
 easily be tracked. When a new number is added, the tables called eint1 and
@ -1398,7 +1399,7 @@ Arguments:
  errorcodeptr   points to the errorcode variable (containing zero)
  options        the current options bits
  isclass        TRUE if inside a character class
-  cb             compile data block
+  cb             compile data block or NULL when called from pcre2_substitute()

 Returns:         zero => a data character
                 positive => a special escape sequence
@ -1429,14 +1430,26 @@ GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */

 /* Non-alphanumerics are literals, so we just leave the value in c. An initial
 value test saves a memory lookup for code points outside the alphanumeric
-range. Otherwise, do a table lookup. A non-zero result is something that can be
-returned immediately. Otherwise further processing is required. */
+range. */

 if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */

+/* Otherwise, do a table lookup. Non-zero values need little processing here. A
+positive value is a literal value for something like \n. A negative value is
+the negation of one of the ESC_ macros that is passed back for handling by the
+calling function. Some extra checking is needed for \N because only \N{U+dddd}
+is supported. If the value is zero, further processing is handled below. */
+
 else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
  {
-  if (i > 0) c = (uint32_t)i; else  /* Positive is a data character */
+  if (i > 0)
+    {
+    c = (uint32_t)i;
+    if (cb != NULL && c == CHAR_CR &&
+        (cb->cx->extra_options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
+      c = CHAR_LF;   
+    }
+  else  /* Negative table entry */  
    {
    escape = -i;                    /* Else return a special escape */
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
@ -1486,9 +1499,9 @@ else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
    }
  }

-/* Escapes that need further processing, including those that are unknown.
-When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
-when BSUX is set). */
+/* Escapes that need further processing, including those that are unknown, have 
+a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
+\o, and \x are recognized (and \u when BSUX is set). */

 else
  {
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@ -629,6 +629,7 @@ static modstruct modlist[] = {
  { "dotall",                     MOD_PATP, MOD_OPT, PCRE2_DOTALL,               PO(options) },
  { "dupnames",                   MOD_PATP, MOD_OPT, PCRE2_DUPNAMES,             PO(options) },
  { "endanchored",                MOD_PD,   MOD_OPT, PCRE2_ENDANCHORED,          PD(options) },
+  { "escaped_cr_is_lf",           MOD_CTC,  MOD_OPT, PCRE2_EXTRA_ESCAPED_CR_IS_LF, CO(extra_options) },
  { "expand",                     MOD_PAT,  MOD_CTL, CTL_EXPAND,                 PO(control) },
  { "extended",                   MOD_PATP, MOD_OPT, PCRE2_EXTENDED,             PO(options) },
  { "extended_more",              MOD_PATP, MOD_OPT, PCRE2_EXTENDED_MORE,        PO(options) },
@ -4173,12 +4174,13 @@ show_compile_extra_options(uint32_t options, const char *before,
  const char *after)
 {
 if (options == 0) fprintf(outfile, "%s <none>%s", before, after);
-else fprintf(outfile, "%s%s%s%s%s%s",
+else fprintf(outfile, "%s%s%s%s%s%s%s",
  before,
  ((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "",
  ((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "",
  ((options & PCRE2_EXTRA_MATCH_WORD) != 0)? " match_word" : "",
  ((options & PCRE2_EXTRA_MATCH_LINE) != 0)? " match_line" : "",
+  ((options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)? " escaped_cr_is_lf" : "", 
  after);
 }

--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -5517,4 +5517,12 @@ a)"xI
 /a(b)c|xyz/g,replace=<$0>,substitute_callout
    abcdefabcpqr

+/abc\rdef/
+    abc\ndef
+
+/abc\rdef\x{0d}xyz/escaped_cr_is_lf
+    abc\ndef\rxyz
+\= Expect no match     
+    abc\ndef\nxyz
+
 # End of testinput2
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -16801,6 +16801,17 @@ Old 0 3  New 0 5
 Old 6 9  New 8 13
 2: <abc>def<abc>pqr

+/abc\rdef/
+    abc\ndef
+No match
+
+/abc\rdef\x{0d}xyz/escaped_cr_is_lf
+    abc\ndef\rxyz
+ 0: abc\x0adef\x0dxyz
+\= Expect no match     
+    abc\ndef\nxyz
+No match
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data