Complete escape processing for PCRE2_ALT_VERBNAMES

2015-09-01 17:32:42 +00:00 · 2015-09-01 17:32:42 +00:00 · cdf07ab585
parent d2e87a75af
commit cdf07ab585
5 changed files with 202 additions and 60 deletions
--- a/doc/pcre2api.3
+++ b/doc/pcre2api.3
@ -1,4 +1,4 @@
-.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2API 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .sp
@ -1060,7 +1060,10 @@ By default, for compatibility with Perl, the name in any verb sequence such as
 parenthesis. The name is not processed in any way, and it is not possible to
 include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
 option is set, normal backslash processing is applied to verb names and only an
-unescaped closing parenthesis terminates the name.
+unescaped closing parenthesis terminates the name. A closing parenthesis can be
 included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
 option is set, unescaped whitespace in verb names is skipped and #-comments are
 recognized, exactly as in the rest of the pattern.
 .sp
  PCRE2_AUTO_CALLOUT
 .sp
@ -2962,6 +2965,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/doc/pcre2pattern.3
+++ b/doc/pcre2pattern.3
@ -1,4 +1,4 @@
-.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21"
+.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
 .SH NAME
 PCRE2 - Perl-compatible regular expressions (revised API)
 .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -2953,7 +2953,10 @@ that does not include a closing parenthesis. The name is not processed in
 any way, and it is not possible to include a closing parenthesis in the name.
 However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing 
 is applied to verb names and only an unescaped closing parenthesis terminates 
-the name.
+the name. A closing parenthesis can be included in a name either as \e) or 
 between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace 
 in verb names is skipped and #-comments are recognized, exactly as in the rest 
 of the pattern.
 .P
 The maximum length of a name is 255 in the 8-bit library and 65535 in the
 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
@ -3383,6 +3386,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 30 August 2015
+Last updated: 01 September 2015
 Copyright (c) 1997-2015 University of Cambridge.
 .fi
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@ -2792,6 +2792,148 @@ return n8;
 /*************************************************
 *       Process (*VERB) name for escapes         *
 *************************************************/
 /* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
 process the characters in a verb's name argument. It is called twice, once with
 codeptr == NULL, to find out the length of the processed name, and again to put
 the name into memory. Both passes run exactly the same scan, so the length
 computed by the first pass always matches what the second pass writes.
 Arguments:
   ptrptr        pointer to the input pointer
   codeptr       pointer to the compiled code pointer, or NULL for length only
   errorcodeptr  pointer to the error code
   options       the compile options (PCRE2_EXTENDED is tested here; the whole
                   value is passed on to check_escape())
   utf           TRUE if processing UTF
   cb            compile data block
 Returns:        length of the processed name, or < 0 on error
 On error, *errorcodeptr is set and *ptrptr is left where check_escape()
 stopped. On success, *ptrptr is left at the unescaped closing parenthesis (or at
 the end of the pattern if there is none; the caller must check), and *codeptr,
 when supplied, is advanced past the stored name.
 */
 static int
 process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
   uint32_t options, BOOL utf, compile_block *cb)
 {
 int arglen = 0;
 BOOL inescq = FALSE;
 PCRE2_SPTR ptr = *ptrptr;
 PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
 for (; ptr < cb->end_pattern; ptr++)
   {
   uint32_t x = *ptr;
 #ifdef SUPPORT_UNICODE
   /* Set only when x is a character value produced by an escape sequence, as
   opposed to a raw code unit copied from the pattern. */
   BOOL from_escape = FALSE;
 #endif
   /* Between \Q and \E everything is literal; only \E itself is special. */
   if (inescq)
     {
     if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
       {
       inescq = FALSE;
       ptr++;
       continue;
       }
     }
   else  /* Not a literal character */
     {
     if (x == CHAR_RIGHT_PARENTHESIS) break;
     /* Skip over comments and whitespace in extended mode. */
     if ((options & PCRE2_EXTENDED) != 0)
       {
       PCRE2_SPTR skip_start = ptr;
       while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
       if (x == CHAR_NUMBER_SIGN)
         {
         ptr++;
         /* Bound the comment by the pattern end rather than by a zero code
         unit, so that a binary zero inside a comment does not end it early. */
         while (ptr < cb->end_pattern)
           {
           if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
             {                          /* IS_NEWLINE sets cb->nllen. */
             ptr += cb->nllen;
             break;
             }
           ptr++;
 #ifdef SUPPORT_UNICODE
           if (utf) FORWARDCHAR(ptr);
 #endif
           }
         }
       /* If anything was skipped, restart the main loop at the current
       position. This re-applies the end-of-pattern and closing parenthesis
       tests to the character that follows the skipped text (so that "A )"
       ends the name at the parenthesis), and it also deals with whitespace or
       another comment following a comment. The decrement is cancelled by the
       loop's own increment. */
       if (ptr > skip_start)
         {
         ptr--;
         continue;
         }
       }
     /* Process escapes */
     if (x == CHAR_BACKSLASH)
       {
       int rc;
       *errorcodeptr = 0;
       rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
       *ptrptr = ptr;   /* For possible error */
       if (*errorcodeptr != 0) return -1;
       if (rc != 0)
         {
         if (rc == ESC_Q)
           {
           inescq = TRUE;
           continue;
           }
         if (rc == ESC_E) continue;   /* An isolated \E is ignored */
         *errorcodeptr = ERR40;       /* No other non-data escape is allowed */
         return -1;
         }
 #ifdef SUPPORT_UNICODE
       from_escape = TRUE;            /* x is now a character value */
 #endif
       }
     }
   /* We have the next character in the name. Only a value that came from an
   escape needs converting to code units in UTF mode. Anything else is a code
   unit taken directly from the pattern, which is already correctly encoded, so
   it is copied unchanged; re-encoding it would corrupt literal multi-unit
   characters. */
 #ifdef SUPPORT_UNICODE
   if (utf && from_escape)
     {
     if (code == NULL)   /* Just want the length */
       {
 #if PCRE2_CODE_UNIT_WIDTH == 8
       int i;
       for (i = 0; i < PRIV(utf8_table1_size); i++)
         if ((int)x <= PRIV(utf8_table1)[i]) break;
       arglen += i;      /* Additional code units beyond the first */
 #elif PCRE2_CODE_UNIT_WIDTH == 16
       if (x > 0xffff) arglen++;   /* Needs a surrogate pair */
 #endif
       }
     else
       {
       PCRE2_UCHAR cbuff[8];
       x = PRIV(ord2utf)(x, cbuff);   /* x becomes the number of code units */
       memcpy(code, cbuff, CU2BYTES(x));
       code += x;
       }
     }
   else
 #endif  /* SUPPORT_UNICODE */
   /* Not UTF, or a code unit copied directly from the pattern */
     {
     if (code != NULL) *code++ = x;
     }
   arglen++;
   }
 /* Update the pointers before returning. */
 *ptrptr = ptr;
 if (codeptr != NULL) *codeptr = code;
 return arglen;
 }
 /*************************************************
 *      Scan regex to identify named groups       *
 *************************************************/
@ -5399,33 +5541,9 @@ for (;; ptr++)
          }
        else
          {
-          arglen = 0;
+          arglen = process_verb_name(&ptr, NULL, errorcodeptr, options, 
-          while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS)
+            utf, cb);
-            {
+          if (arglen < 0) goto FAILED;
            if (*ptr == '\\')
              {
              uint32_t x;
              *errorcodeptr = 0;
              i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
              if (*errorcodeptr != 0) goto FAILED;
              if (i != 0)
                {
                *errorcodeptr = ERR40;
                goto FAILED;
                }
 #ifdef SUPPORT_UNICODE
 #if PCRE2_CODE_UNIT_WIDTH == 8
              for (i = 0; i < PRIV(utf8_table1_size); i++)
                if ((int)x <= PRIV(utf8_table1)[i]) break;
              arglen += i;
 #elif PCRE2_CODE_UNIT_WIDTH == 16
              if (x > 0xffff) arglen++;
 #endif
 #endif
              }
            arglen++;
            ptr++;
            }
          }
        if ((unsigned int)arglen > MAX_MARK)
@ -5495,35 +5613,12 @@ for (;; ptr++)
              }
            setverb = *code++ = verbs[i].op_arg;
            *code++ = arglen;
            /* If we are processing the argument for escapes, we don't need
            to apply checks here because it was all checked above when
            computing the length. */
            if ((options & PCRE2_ALT_VERBNAMES) != 0)
              {
-              for (; arg != ptr; arg++)
+              PCRE2_UCHAR *memcode = code;  /* code is "register" */
-                {
+              (void)process_verb_name(&arg, &memcode, errorcodeptr, options, 
-                if (*arg == '\\')
+                utf, cb);
-                  {
+              code = memcode;  
                  uint32_t x;
                  *errorcodeptr = 0;
                  (void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
                    cb);
 #ifdef SUPPORT_UNICODE
                  if (utf)
                    {
                    PCRE2_UCHAR cbuff[8];
                    x = PRIV(ord2utf)(x, cbuff);
                    memcpy(code, cbuff, CU2BYTES(x));
                    code += x;
                    }
                  else
 #endif
                  *code++ = x;
                  }
                else *code++ = *arg;
                }
              }
            else   /* No argument processing */
              {
--- a/testdata/testinput2
+++ b/testdata/testinput2
@ -4449,4 +4449,20 @@ a random value. /Ix
 /(*:ab\t(d\)c)xxx/alt_verbnames,mark
    cxxxz
 /(*:A\Qxx)x\EB)x/alt_verbnames,mark
    x
 /(*:A\ExxxB)x/alt_verbnames,mark
    x 
 /(*: A \ and #comment
     \ B)x/x,alt_verbnames,mark
    x  
 /(*:A
 B)x/alt_verbnames,mark 
    x
 /(*:abc\Qpqr)/alt_verbnames
 # End of testinput2 
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@ -14724,4 +14724,29 @@ Failed: error 122 at offset 12: unmatched closing parenthesis
 0: xxx
 MK: ab\x09(d)c
 /(*:A\Qxx)x\EB)x/alt_verbnames,mark
    x
 0: x
 MK: Axx)xB
 /(*:A\ExxxB)x/alt_verbnames,mark
    x 
 0: x
 MK: AxxxB
 /(*: A \ and #comment
     \ B)x/x,alt_verbnames,mark
    x  
 0: x
 MK: A and B
 /(*:A
 B)x/alt_verbnames,mark 
    x
 0: x
 MK: A\x0aB
 /(*:abc\Qpqr)/alt_verbnames
 Failed: error 160 at offset 12: (*VERB) not recognized or malformed
 # End of testinput2