From c19bd9a377891ba72113d24c69bdbc01fa226aec Mon Sep 17 00:00:00 2001
From: "Philip.Hazel" <Philip.Hazel@gmail.com>
Date: Wed, 12 Nov 2014 16:57:56 +0000
Subject: [PATCH] Substitution tests and documentation.

---
 doc/pcre2test.1       |  76 ++++++++++++++++++-------
 src/pcre2_error.c     |   4 +-
 src/pcre2_valid_utf.c |   8 ++-
 src/pcre2test.c       | 127 ++++++++++++++++++++++++++++--------------
 testdata/testinput10  |   3 +
 testdata/testinput2   |   6 ++
 testdata/testoutput10 |   4 ++
 testdata/testoutput2  |  10 ++++
 8 files changed, 171 insertions(+), 67 deletions(-)
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
index d616210..ce494f4 100644
--- a/doc/pcre2test.1
+++ b/doc/pcre2test.1
@@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "09 November 2014" "PCRE 10.00"
+.TH PCRE2TEST 1 "12 November 2014" "PCRE 10.00"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -645,6 +645,7 @@ not affect the compilation process.
       allusedtext         show all consulted text
   /g  global              global matching
       mark                show mark values
+      replace=<string>    specify a replacement string 
       startchar           show starting character when relevant
 .sp
 These modifiers may not appear in a \fB#pattern\fP command. If you want them as
@@ -719,6 +720,7 @@ pattern.
       offset=<n>                set starting offset
       ovector=<n>               set size of output vector
       recursion_limit=<n>       set a recursion limit
+      replace=<string>          specify a replacement string 
       startchar                 show startchar when relevant
       zero_terminate            pass the subject as zero-terminated
 .sp
@@ -797,6 +799,29 @@ Any value other than zero is used as a return from \fBpcre2test\fP's callout
 function.
 .
 .
+.SS "Finding all matches in a string"
+.rs
+.sp
+Searching for all possible matches within a subject can be requested by the
+\fBglobal\fP or \fBaltglobal\fP modifier. After finding a match, the matching
+function is called again to search the remainder of the subject. The difference
+between \fBglobal\fP and \fBaltglobal\fP is that the former uses the
+\fIstart_offset\fP argument to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP
+to start searching at a new point within the entire string (which is what Perl
+does), whereas the latter passes over a shortened substring. This makes a
+difference to the matching process if the pattern begins with a lookbehind
+assertion (including \eb or \eB).
+.P
+If an empty string is matched, the next match is done with the
+PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for
+another, non-empty, match at the same point in the subject. If this match
+fails, the start offset is advanced, and the normal match is retried. This
+imitates the way Perl handles such cases when using the \fB/g\fP modifier or
+the \fBsplit()\fP function. Normally, the start offset is advanced by one
+character, but if the newline convention recognizes CRLF as a newline, and the
+current character is CR followed by LF, an advance of two is used.
+.
+.
 .SS "Testing substring extraction functions"
 .rs
 .sp
@@ -821,27 +846,38 @@ length (that is, the return from the extraction function) is given in
 parentheses after each substring.
 .
 .
-.SS "Finding all matches in a string"
+.SS "Testing the substitution function"
 .rs
 .sp
-Searching for all possible matches within a subject can be requested by the
-\fBglobal\fP or \fB/altglobal\fP modifier. After finding a match, the matching
-function is called again to search the remainder of the subject. The difference
-between \fBglobal\fP and \fBaltglobal\fP is that the former uses the
-\fIstart_offset\fP argument to \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP
-to start searching at a new point within the entire string (which is what Perl
-does), whereas the latter passes over a shortened substring. This makes a
-difference to the matching process if the pattern begins with a lookbehind
-assertion (including \eb or \eB).
+If the \fBreplace\fP modifier is set, the \fBpcre2_substitute()\fP function is 
+called instead of one of the matching functions. Unlike subject strings,
+\fBpcre2test\fP does not process replacement strings for escape sequences. In
+UTF mode, a replacement string is checked to see if it is a valid UTF-8 string.
+If so, it is correctly converted to a UTF string of the appropriate code unit
+width. If it is not a valid UTF-8 string, the individual code units are copied
+directly. This provides a means of passing an invalid UTF-8 string for testing
+purposes. 
 .P
-If an empty string is matched, the next match is done with the
-PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set, in order to search for
-another, non-empty, match at the same point in the subject. If this match
-fails, the start offset is advanced, and the normal match is retried. This
-imitates the way Perl handles such cases when using the \fB/g\fP modifier or
-the \fBsplit()\fP function. Normally, the start offset is advanced by one
-character, but if the newline convention recognizes CRLF as a newline, and the
-current character is CR followed by LF, an advance of two is used.
+If the \fBglobal\fP modifier is set, PCRE2_SUBSTITUTE_GLOBAL is passed to
+\fBpcre2_substitute()\fP. After a successful substitution, the modified string
+is output, preceded by the number of replacements. This may be zero if there
+were no matches. Here is a simple example of a substitution test:
+.sp
+  /abc/replace=xxx
+      =abc=abc=
+   1: =xxx=abc=
+      =abc=abc=\e=global
+   2: =xxx=xxx=
+.sp
+Subject and replacement strings should be kept relatively short for 
+substitution tests, as fixed-size buffers are used. To make it easy to test for
+buffer overflow, if the replacement string starts with a number in square 
+brackets, that number is passed to \fBpcre2_substitute()\fP as the size of the 
+output buffer, with the replacement string starting at the next character.
+.P
+A replacement string is ignored with POSIX and DFA matching. Specifying partial 
+matching provokes an error return ("bad option value") from
+\fBpcre2_substitute()\fP.
 .
 .
 .SS "Setting the JIT stack size"
@@ -1200,6 +1236,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 09 November 2014
+Last updated: 12 November 2014
 Copyright (c) 1997-2014 University of Cambridge.
 .fi
diff --git a/src/pcre2_error.c b/src/pcre2_error.c
index bd38714..2fb5340 100644
--- a/src/pcre2_error.c
+++ b/src/pcre2_error.c
@@ -102,7 +102,7 @@ static const char compile_error_texts[] =
   /* 30 */
   "unknown POSIX class name\0"
   "internal error in pcre2_study(): should not occur\0"
-  "this version of PCRE does not have UTF or Unicode property support\0"
+  "this version of PCRE2 does not have Unicode support\0"
   "parentheses are too deeply nested (stack check)\0"
   "character code point value in \\x{} or \\o{} is too large\0"
   /* 35 */
@@ -118,7 +118,7 @@ static const char compile_error_texts[] =
   "two named subpatterns have the same name (PCRE2_DUPNAMES not set)\0"
   "group name must start with a non-digit\0"
   /* 45 */
-  "this version of PCRE does not have support for \\P, \\p, or \\X\0"
+  "this version of PCRE2 does not have support for \\P, \\p, or \\X\0"
   "malformed \\P or \\p sequence\0"
   "unknown property name after \\P or \\p\0"
   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c
index d0378c9..a97847a 100644
--- a/src/pcre2_valid_utf.c
+++ b/src/pcre2_valid_utf.c
@@ -40,14 +40,16 @@ POSSIBILITY OF SUCH DAMAGE.
 
 
 /* This module contains an internal function for validating UTF character
-strings. */
-
+strings. This file is also #included by the pcre2test program, which uses
+macros to change names from _pcre2_xxx to xxx, thereby avoiding name clashes
+with the library. In this case, PCRE2_PCRE2TEST is defined. */
 
+#ifndef PCRE2_PCRE2TEST           /* We're compiling the library */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
-
 #include "pcre2_internal.h"
+#endif /* PCRE2_PCRE2TEST */
 
 
 #ifndef SUPPORT_UNICODE
diff --git a/src/pcre2test.c b/src/pcre2test.c
index b73b03c..d935cb5 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -165,9 +165,14 @@ void vms_setsymbol( char *, char *, int );
 #define DEFAULT_OVECCOUNT 15    /* Default ovector count */
 #define JUNK_OFFSET 0xdeadbeef  /* For initializing ovector */
 #define LOOPREPEAT 500000       /* Default loop count for timing */
-#define REPLACE_BUFFSIZE 400    /* For replacement strings */
+#define REPLACE_MODSIZE 96      /* Field for reading 8-bit replacement */
 #define VERSION_SIZE 64         /* Size of buffer for the version strings */
 
+/* Make sure the buffer into which replacement strings are copied is big enough 
+to hold them as 32-bit code units. */
+
+#define REPLACE_BUFFSIZE (4*REPLACE_MODSIZE)
+
 /* Execution modes */
 
 #define PCRE8_MODE   8
@@ -258,6 +263,20 @@ these inclusions should not be changed. */
 
 #define PCRE2_SUFFIX(a) a
 
+/* We need to be able to check input text for UTF-8 validity, whatever code 
+widths are actually available, because the input to pcre2test is always in 
+8-bit code units. So we include the UTF validity checking function for 8-bit 
+code units. */
+
+extern int valid_utf(PCRE2_SPTR8, PCRE2_SIZE, PCRE2_SIZE *);
+
+#define  PCRE2_CODE_UNIT_WIDTH 8
+#undef   PCRE2_SPTR
+#define  PCRE2_SPTR PCRE2_SPTR8
+#include "pcre2_valid_utf.c"
+#undef   PCRE2_CODE_UNIT_WIDTH
+#undef   PCRE2_SPTR
+
 /* If we have 8-bit support, default to it; if there is also 16-or 32-bit
 support, it can be selected by a command-line option. If there is no 8-bit
 support, there must be 16- or 32-bit support, so default to one of them. The
@@ -369,15 +388,20 @@ data line. */
                     CTL_MARK|\
                     CTL_MEMORY|\
                     CTL_STARTCHAR)
+                    
+/* Structures for holding modifier information for patterns and subject strings 
+(data). Fields containing modifiers that can be set either for a pattern or a 
+subject must be at the start and in the same order in both cases so that the 
+same offset in the big table below works for both. */
 
 typedef struct patctl {    /* Structure for pattern modifiers. */
   uint32_t  options;       /* Must be in same position as datctl */
   uint32_t  control;       /* Must be in same position as datctl */
+   uint8_t  replacement[REPLACE_MODSIZE];  /* So must this */
   uint32_t  jit;
   uint32_t  stackguard_test;
   uint32_t  tables_id;
   uint8_t   locale[32];
-  uint8_t   replacement[REPLACE_BUFFSIZE];
 } patctl;
 
 #define MAXCPYGET 10
@@ -386,6 +410,7 @@ typedef struct patctl {    /* Structure for pattern modifiers. */
 typedef struct datctl {    /* Structure for data line modifiers. */
   uint32_t  options;       /* Must be in same position as patctl */
   uint32_t  control;       /* Must be in same position as patctl */
+   uint8_t  replacement[REPLACE_MODSIZE];  /* So must this */
   uint32_t  cfail[2];
    int32_t  callout_data;
    int32_t  copy_numbers[MAXCPYGET];
@@ -487,7 +512,7 @@ static modstruct modlist[] = {
   { "posix",               MOD_PAT,  MOD_CTL, CTL_POSIX,                 PO(control) },
   { "ps",                  MOD_DAT,  MOD_OPT, PCRE2_PARTIAL_SOFT,        DO(options) },
   { "recursion_limit",     MOD_CTM,  MOD_INT, 0,                         MO(recursion_limit) },
-  { "replace",             MOD_PAT,  MOD_STR, 0,                         PO(replacement) },
+  { "replace",             MOD_PND,  MOD_STR, 0,                         PO(replacement) },
   { "stackguard",          MOD_PAT,  MOD_INT, 0,                         PO(stackguard_test) },
   { "startchar",           MOD_PND,  MOD_CTL, CTL_STARTCHAR,             PO(control) },
   { "tables",              MOD_PAT,  MOD_INT, 0,                         PO(tables_id) },
@@ -4211,13 +4236,14 @@ uint32_t *q32 = NULL;
 
 /* Copy the default context and data control blocks to the active ones. Then
 copy from the pattern the controls that can be set in either the pattern or the
-data. This allows them to be unset in the data line. We do not do this for
+data. This allows them to be overridden in the data line. We do not do this for
 options because those that are common apply separately to compiling and
 matching. */
 
 DATCTXCPY(dat_context, default_dat_context);
 memcpy(&dat_datctl, &def_datctl, sizeof(datctl));
 dat_datctl.control |= (pat_patctl.control & CTL_ALLPD);
+strcpy((char *)dat_datctl.replacement, (char *)pat_patctl.replacement);
 
 /* Initialize for scanning the data line. */
 
@@ -4715,20 +4741,28 @@ else
   PCRE2_MATCH_DATA_FREE(match_data);
   PCRE2_MATCH_DATA_CREATE(match_data, max_oveccount, NULL);
   }
+  
+/* Replacement processing is ignored for DFA matching. */ 
+
+if (dat_datctl.replacement[0] != 0 && (dat_datctl.control & CTL_DFA) != 0)
+  {
+  fprintf(outfile, "** Ignored for DFA matching: replace\n");
+  dat_datctl.replacement[0] = 0;
+  }
 
 /* If a replacement string is provided, call pcre2_substitute() instead of one
 of the matching functions. First we have to convert the replacement string to
 the appropriate width. */
 
-if (pat_patctl.replacement[0] != 0)
+if (dat_datctl.replacement[0] != 0)
   {
   int rc;
   uint8_t *pr;
   uint8_t rbuffer[REPLACE_BUFFSIZE];
   uint8_t nbuffer[REPLACE_BUFFSIZE];
   uint32_t goption;
-  PCRE2_SIZE rlen;
-  PCRE2_SIZE nsize;
+  PCRE2_SIZE rlen, nsize, erroroffset;
+  BOOL badutf = FALSE;
 
 #ifdef SUPPORT_PCRE2_8
   uint8_t *r8 = NULL;
@@ -4740,10 +4774,13 @@ if (pat_patctl.replacement[0] != 0)
   uint32_t *r32 = NULL;
 #endif
 
-  goption = ((pat_patctl.control & CTL_GLOBAL) == 0)? 0 :
+  if (timeitm)
+    fprintf(outfile, "** Timing is not supported with replace: ignored\n"); 
+
+  goption = ((dat_datctl.control & CTL_GLOBAL) == 0)? 0 :
     PCRE2_SUBSTITUTE_GLOBAL;
   SETCASTPTR(r, rbuffer);  /* Sets r8, r16, or r32, as appropriate. */
-  pr = pat_patctl.replacement;
+  pr = dat_datctl.replacement;
 
   /* If the replacement starts with '[<number>]' we interpret that as length
   value for the replacement buffer. */
@@ -4767,52 +4804,58 @@ if (pat_patctl.replacement[0] != 0)
     nsize = n;
     }
 
-  /* Now copy the replacement string to a buffer of the appropriate width. */
+  /* Now copy the replacement string to a buffer of the appropriate width. No 
+  escape processing is done for replacements. In UTF mode, check for an invalid 
+  UTF-8 input string, and if it is invalid, just copy its code units without 
+  UTF interpretation. This provides a means of checking that an invalid string 
+  is detected. Otherwise, UTF-8 can be used to include wide characters in a 
+  replacement. */
+  
+  if (utf) badutf = valid_utf(pr, strlen((const char *)pr), &erroroffset);
 
-  while ((c = *pr++) != 0)
+  /* Not UTF or invalid UTF-8: just copy the code units. */
+  
+  if (!utf || badutf)
     {
-    if (utf && HASUTF8EXTRALEN(c)) { GETUTF8INC(c, pr); }
-
-    /* At present no escape processing is provided for replacements. */
+    while ((c = *pr++) != 0)
+      { 
+#ifdef SUPPORT_PCRE2_8
+      if (test_mode == PCRE8_MODE) *r8++ = c;
+#endif
+#ifdef SUPPORT_PCRE2_16
+      if (test_mode == PCRE16_MODE) *r16++ = c;
+#endif
+#ifdef SUPPORT_PCRE2_32
+      if (test_mode == PCRE32_MODE) *r32++ = c;
+#endif
+      }
+    }
+    
+  /* Valid UTF-8 replacement string */
+        
+  else while ((c = *pr++) != 0)
+    {
+    if (HASUTF8EXTRALEN(c)) { GETUTF8INC(c, pr); }
 
 #ifdef SUPPORT_PCRE2_8
-    if (test_mode == PCRE8_MODE)
-      {
-      if (utf)
-        {
-        r8 += ord2utf8(c, r8);
-        }
-      else
-        {
-        *r8++ = c;
-        }
-      }
+    if (test_mode == PCRE8_MODE) r8 += ord2utf8(c, r8);
 #endif
+
 #ifdef SUPPORT_PCRE2_16
     if (test_mode == PCRE16_MODE)
       {
-      if (utf)
+      if (c >= 0x10000u)
         {
-        if (c >= 0x10000u)
-          {
-          c-= 0x10000u;
-          *r16++ = 0xD800 | (c >> 10);
-          *r16++ = 0xDC00 | (c & 0x3ff);
-          }
-        else
-          *r16++ = c;
-        }
-      else
-        {
-        *r16++ = c;
+        c-= 0x10000u;
+        *r16++ = 0xD800 | (c >> 10);
+        *r16++ = 0xDC00 | (c & 0x3ff);
         }
+      else *r16++ = c;
       }
 #endif
+
 #ifdef SUPPORT_PCRE2_32
-    if (test_mode == PCRE32_MODE)
-      {
-      *r32++ = c;
-      }
+    if (test_mode == PCRE32_MODE) *r32++ = c;
 #endif
     }
 
diff --git a/testdata/testinput10 b/testdata/testinput10
index 8158623..4584f79 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -444,4 +444,7 @@
 
 /\x{3a3}B/IBi,utf
 
+/abc/utf,replace=�
+   abc
+
 # End of testinput10
diff --git a/testdata/testinput2 b/testdata/testinput2
index f42a2aa..c0640e0 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4067,6 +4067,12 @@ a random value. /Ix
 /abc/replace=xyz
     1abc2\=partial_hard
 
+/abc/replace=xyz
+    123abc456
+    123abc456\=replace=pqr
+    123abc456abc789
+    123abc456abc789\=g
+
 # End of substitute tests 
 
 # End of testinput2 
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index bc010c9..6e4bdc5 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -1546,4 +1546,8 @@ Starting code units: \xce \xcf
 Last code unit = 'B' (caseless)
 Subject length lower bound = 2
 
+/abc/utf,replace=�
+   abc
+Failed: error -3: UTF-8 error: 1 byte missing at end
+
 # End of testinput10
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index f999a1f..3d43e50 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -13689,6 +13689,16 @@ Failed: error -47: no more memory
     1abc2\=partial_hard
 Failed: error -34: bad option value
 
+/abc/replace=xyz
+    123abc456
+ 1: 123xyz456
+    123abc456\=replace=pqr
+ 1: 123pqr456
+    123abc456abc789
+ 1: 123xyz456abc789
+    123abc456abc789\=g
+ 2: 123xyz456xyz789
+
 # End of substitute tests 
 
 # End of testinput2