Complete escape processing for PCRE2_ALT_VERBNAMES

2015-09-01 17:32:42 +00:00 · 2015-09-01 17:32:42 +00:00 · cdf07ab585
parent d2e87a75af
commit cdf07ab585
5 changed files with 202 additions and 60 deletions
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -1060,7 +1060,10 @@ By default, for compatibility with Perl, the name in any verb sequence such as
 parenthesis. The name is not processed in any way, and it is not possible to
 include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
 option is set, normal backslash processing is applied to verb names and only an
-unescaped closing parenthesis terminates the name.
+unescaped closing parenthesis terminates the name. A closing parenthesis can be
+included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
+option is set, unescaped whitespace in verb names is skipped and #-comments are
+recognized, exactly as in the rest of the pattern.
 .sp
  PCRE2_AUTO_CALLOUT
 .sp
@ -2962,6 +2965,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -2953,7 +2953,10 @@ that does not include a closing parenthesis. The name is not processed in
 any way, and it is not possible to include a closing parenthesis in the name.
 However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing 
 is applied to verb names and only an unescaped closing parenthesis terminates 
-the name.
+the name. A closing parenthesis can be included in a name either as \e) or 
+between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace 
+in verb names is skipped and #-comments are recognized, exactly as in the rest 
+of the pattern.
 .P
 The maximum length of a name is 255 in the 8-bit library and 65535 in the
 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
@ -3383,6 +3386,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -2792,6 +2792,148 @@ return n8;



+/*************************************************
+*       Process (*VERB) name for escapes         *
+*************************************************/
+
+/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
+process the characters in a verb's name argument. It is called twice, once with
+codeptr == NULL, to find out the length of the processed name, and again to put
+the name into memory.
+
+Arguments:
+  ptrptr        pointer to the input pointer
+  codeptr       pointer to the compiled code pointer
+  errorcodeptr  pointer to the error code
+  options       the compile options (tested here for PCRE2_EXTENDED)
+  utf           TRUE if processing UTF
+  cb            compile data block
+
+Returns:        length of the processed name, or < 0 on error
+*/
+
+static int
+process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
+  uint32_t options, BOOL utf, compile_block *cb)
+{
+int arglen = 0;
+BOOL inescq = FALSE;
+PCRE2_SPTR ptr = *ptrptr;
+PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
+
+for (; ptr < cb->end_pattern; ptr++)
+  {
+  uint32_t x = *ptr;
+
+  /* Between \Q and \E every character is literal; only \E is recognized. */
+
+  if (inescq)
+    {
+    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
+      {
+      inescq = FALSE;
+      ptr++;
+      continue;
+      }
+    }
+
+  else  /* Not a literal character */
+    {
+    /* Skip over comments and whitespace in extended mode. Need a loop to handle
+    whitespace after a comment. */
+
+    if ((options & PCRE2_EXTENDED) != 0)
+      {
+      for (;;)
+        {
+        while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
+        if (x != CHAR_NUMBER_SIGN) break;
+        ptr++;
+        while (*ptr != CHAR_NULL)
+          {
+          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
+            {                          /* IS_NEWLINE sets cb->nllen. */
+            ptr += cb->nllen;
+            break;
+            }
+          ptr++;
+#ifdef SUPPORT_UNICODE
+          if (utf) FORWARDCHAR(ptr);
+#endif
+          }
+        x = *ptr;     /* Either NUL or the char after a newline */
+        }
+      if (ptr >= cb->end_pattern) break;
+      }
+
+    /* An unescaped closing parenthesis ends the name. Test for it only after
+    the skipping above, so that one after whitespace or a comment counts. */
+    if (x == CHAR_RIGHT_PARENTHESIS) break;
+
+    if (x == CHAR_BACKSLASH)   /* Process escapes */
+      {
+      int rc;
+      *errorcodeptr = 0;
+      rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
+      *ptrptr = ptr;   /* For possible error */
+      if (*errorcodeptr != 0) return -1;
+      if (rc != 0)   /* Of the non-zero results only ESC_Q, ESC_E are accepted */
+        {
+        if (rc == ESC_Q)
+          {
+          inescq = TRUE;
+          continue;
+          }
+        if (rc == ESC_E) continue;
+        *errorcodeptr = ERR40;
+        return -1;
+        }
+      }
+    }
+
+  /* We have the next character in the name. NOTE(review): unless it came from
+  an escape, x is one code unit, so in 8-bit UTF mode a literal multi-byte
+  character appears to be re-encoded unit by unit below - confirm. */
+#ifdef SUPPORT_UNICODE
+  if (utf)
+    {
+    if (code == NULL)   /* Just want the length */
+      {
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      int i;
+      for (i = 0; i < PRIV(utf8_table1_size); i++)
+        if ((int)x <= PRIV(utf8_table1)[i]) break;
+      arglen += i;
+#elif PCRE2_CODE_UNIT_WIDTH == 16
+      if (x > 0xffff) arglen++;
+#endif
+      }
+    else
+      {
+      PCRE2_UCHAR cbuff[8];
+      x = PRIV(ord2utf)(x, cbuff);
+      memcpy(code, cbuff, CU2BYTES(x));
+      code += x;
+      }
+    }
+  else
+#endif  /* SUPPORT_UNICODE */
+  /* Not UTF */
+    {
+    if (code != NULL) *code++ = x;
+    }
+
+  arglen++;
+  }
+
+/* Update the pointers before returning. */
+*ptrptr = ptr;
+if (codeptr != NULL) *codeptr = code;
+return arglen;
+}
+
+
+
 /*************************************************
 *      Scan regex to identify named groups       *
 *************************************************/
@ -5399,33 +5541,9 @@ for (;; ptr++)
          }
        else
          {
-          arglen = 0;
-          while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
-            {
-            if (*ptr == '\\')
-              {
-              uint32_t x;
-              *errorcodeptr = 0;
-              i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
-              if (*errorcodeptr != 0) goto FAILED;
-              if (i != 0)
-                {
-                *errorcodeptr = ERR40;
-                goto FAILED;
-                }
-#ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
-              for (i = 0; i < PRIV(utf8_table1_size); i++)
-                if ((int)x <= PRIV(utf8_table1)[i]) break;
-              arglen += i;
-#elif PCRE2_CODE_UNIT_WIDTH == 16
-              if (x > 0xffff) arglen++;
-#endif
-#endif
-              }
-            arglen++;
-            ptr++;
-            }
+          arglen = process_verb_name(&ptr, NULL, errorcodeptr, options, 
+            utf, cb);
+          if (arglen < 0) goto FAILED;
          }

        if ((unsigned int)arglen > MAX_MARK)
@ -5495,35 +5613,12 @@ for (;; ptr++)
              }
            setverb = *code++ = verbs[i].op_arg;
            *code++ = arglen;
-
-            /* If we are processing the argument for escapes, we don't need
-            to apply checks here because it was all checked above when
-            computing the length. */
-
            if ((options & PCRE2_ALT_VERBNAMES) != 0)
              {
-              for (; arg != ptr; arg++)
-                {
-                if (*arg == '\\')
-                  {
-                  uint32_t x;
-                  *errorcodeptr = 0;
-                  (void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
-                    cb);
-#ifdef SUPPORT_UNICODE
-                  if (utf)
-                    {
-                    PCRE2_UCHAR cbuff[8];
-                    x = PRIV(ord2utf)(x, cbuff);
-                    memcpy(code, cbuff, CU2BYTES(x));
-                    code += x;
-                    }
-                  else
-#endif
-                  *code++ = x;
-                  }
-                else *code++ = *arg;
-                }
+              PCRE2_UCHAR *memcode = code;  /* code is "register" */
+              (void)process_verb_name(&arg, &memcode, errorcodeptr, options, 
+                utf, cb);
+              code = memcode;  
              }
            else   /* No argument processing */
              {
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -4449,4 +4449,20 @@ a random value. /Ix
 /(*:ab\t(d\)c)xxx/alt_verbnames,mark
    cxxxz

+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+    x
+    
+/(*:A\ExxxB)x/alt_verbnames,mark
+    x 
+    
+/(*: A \ and #comment
+     \ B)x/x,alt_verbnames,mark
+    x  
+    
+/(*:A
+B)x/alt_verbnames,mark 
+    x
+
+/(*:abc\Qpqr)/alt_verbnames
+
 # End of testinput2 
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -14724,4 +14724,29 @@ Failed: error 122 at offset 12: unmatched closing parenthesis
 0: xxx
 MK: ab\x09(d)c

+/(*:A\Qxx)x\EB)x/alt_verbnames,mark
+    x
+ 0: x
+MK: Axx)xB
+    
+/(*:A\ExxxB)x/alt_verbnames,mark
+    x 
+ 0: x
+MK: AxxxB
+    
+/(*: A \ and #comment
+     \ B)x/x,alt_verbnames,mark
+    x  
+ 0: x
+MK: A and B
+    
+/(*:A
+B)x/alt_verbnames,mark 
+    x
+ 0: x
+MK: A\x0aB
+
+/(*:abc\Qpqr)/alt_verbnames
+Failed: error 160 at offset 12: (*VERB) not recognized or malformed
+
 # End of testinput2