diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 0f338da..deb1f8b 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21" +.TH PCRE2API 3 "01 September 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1060,7 +1060,10 @@ By default, for compatibility with Perl, the name in any verb sequence such as parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing is applied to verb names and only an -unescaped closing parenthesis terminates the name. +unescaped closing parenthesis terminates the name. A closing parenthesis can be +included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED +option is set, unescaped whitespace in verb names is skipped and #-comments are +recognized, exactly as in the rest of the pattern. .sp PCRE2_AUTO_CALLOUT .sp @@ -2962,6 +2965,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2015 +Last updated: 01 September 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index a1156ae..cbd10ea 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21" +.TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -2953,7 +2953,10 @@ that does not include a closing parenthesis. The name is not processed in any way, and it is not possible to include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing is applied to verb names and only an unescaped closing parenthesis terminates -the name. +the name. A closing parenthesis can be included in a name either as \e) or +between \eQ and \eE. 
If the PCRE2_EXTENDED option is set, unescaped whitespace +in verb names is skipped and #-comments are recognized, exactly as in the rest +of the pattern. .P The maximum length of a name is 255 in the 8-bit library and 65535 in the 16-bit and 32-bit libraries. If the name is empty, that is, if the closing @@ -3383,6 +3386,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2015 +Last updated: 01 September 2015 Copyright (c) 1997-2015 University of Cambridge. .fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 470c716..47ead0f 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -2792,6 +2792,148 @@ return n8; +/************************************************* +* Process (*VERB) name for escapes * +*************************************************/ + +/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to +process the characters in a verb's name argument. It is called twice, once with +codeptr == NULL, to find out the length of the processed name, and again to put +the name into memory. The options argument holds the compile option bits. + +Arguments: + ptrptr pointer to the input pointer + codeptr pointer to the compiled code pointer + errorcodeptr pointer to the error code + utf TRUE if processing UTF + cb compile data block + +Returns: length of the processed name, or < 0 on error +*/ + +static int +process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr, + uint32_t options, BOOL utf, compile_block *cb) +{ +int arglen = 0; +BOOL inescq = FALSE; +PCRE2_SPTR ptr = *ptrptr; +PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr; + +for (; ptr < cb->end_pattern; ptr++) + { + uint32_t x = *ptr; + + /* Skip over literals */ + + if (inescq) + { + if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E) + { + inescq = FALSE; + ptr++;; + continue; + } + } + + else /* Not a literal character */ + { + if (x == CHAR_RIGHT_PARENTHESIS) break; + + /* Skip over comments and whitespace in extended mode. Need a loop to handle + whitespace after a comment.
*/ + + if ((options & PCRE2_EXTENDED) != 0) + { + for (;;) + { + while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr); + if (x != CHAR_NUMBER_SIGN) break; + ptr++; + while (*ptr != CHAR_NULL) + { + if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ + { /* IS_NEWLINE sets cb->nllen. */ + ptr += cb->nllen; + break; + } + ptr++; +#ifdef SUPPORT_UNICODE + if (utf) FORWARDCHAR(ptr); +#endif + } + x = *ptr; /* Either NULL or the char after a newline */ + } + if (ptr >= cb->end_pattern) break; + } + + /* Process escapes */ + + if (x == '\\') + { + int rc; + *errorcodeptr = 0; + rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb); + *ptrptr = ptr; /* For possible error */ + if (*errorcodeptr != 0) return -1; + if (rc != 0) + { + if (rc == ESC_Q) + { + inescq = TRUE; + continue; + } + if (rc == ESC_E) continue; + *errorcodeptr = ERR40; + return -1; + } + } + } + + /* We have the next character in the name. */ + +#ifdef SUPPORT_UNICODE + if (utf) + { + if (code == NULL) /* Just want the length */ + { +#if PCRE2_CODE_UNIT_WIDTH == 8 + int i; + for (i = 0; i < PRIV(utf8_table1_size); i++) + if ((int)x <= PRIV(utf8_table1)[i]) break; + arglen += i; +#elif PCRE2_CODE_UNIT_WIDTH == 16 + if (x > 0xffff) arglen++; +#endif + } + else + { + PCRE2_UCHAR cbuff[8]; + x = PRIV(ord2utf)(x, cbuff); + memcpy(code, cbuff, CU2BYTES(x)); + code += x; + } + } + else +#endif /* SUPPORT_UNICODE */ + + /* Not UTF */ + { + if (code != NULL) *code++ = x; + } + + arglen++; + } + +/* Update the pointers before returning. 
*/ + +*ptrptr = ptr; +if (codeptr != NULL) *codeptr = code; +return arglen; +} + + + /************************************************* * Scan regex to identify named groups * *************************************************/ @@ -5399,33 +5541,9 @@ for (;; ptr++) } else { - arglen = 0; - while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) - { - if (*ptr == '\\') - { - uint32_t x; - *errorcodeptr = 0; - i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb); - if (*errorcodeptr != 0) goto FAILED; - if (i != 0) - { - *errorcodeptr = ERR40; - goto FAILED; - } -#ifdef SUPPORT_UNICODE -#if PCRE2_CODE_UNIT_WIDTH == 8 - for (i = 0; i < PRIV(utf8_table1_size); i++) - if ((int)x <= PRIV(utf8_table1)[i]) break; - arglen += i; -#elif PCRE2_CODE_UNIT_WIDTH == 16 - if (x > 0xffff) arglen++; -#endif -#endif - } - arglen++; - ptr++; - } + arglen = process_verb_name(&ptr, NULL, errorcodeptr, options, + utf, cb); + if (arglen < 0) goto FAILED; } if ((unsigned int)arglen > MAX_MARK) @@ -5495,35 +5613,12 @@ for (;; ptr++) } setverb = *code++ = verbs[i].op_arg; *code++ = arglen; - - /* If we are processing the argument for escapes, we don't need - to apply checks here because it was all checked above when - computing the length. 
*/ - if ((options & PCRE2_ALT_VERBNAMES) != 0) { - for (; arg != ptr; arg++) - { - if (*arg == '\\') - { - uint32_t x; - *errorcodeptr = 0; - (void)check_escape(&arg, &x, errorcodeptr, options, FALSE, - cb); -#ifdef SUPPORT_UNICODE - if (utf) - { - PCRE2_UCHAR cbuff[8]; - x = PRIV(ord2utf)(x, cbuff); - memcpy(code, cbuff, CU2BYTES(x)); - code += x; - } - else -#endif - *code++ = x; - } - else *code++ = *arg; - } + PCRE2_UCHAR *memcode = code; /* code is "register" */ + (void)process_verb_name(&arg, &memcode, errorcodeptr, options, + utf, cb); + code = memcode; } else /* No argument processing */ { diff --git a/testdata/testinput2 b/testdata/testinput2 index 52b89c7..3df7590 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4449,4 +4449,20 @@ a random value. /Ix /(*:ab\t(d\)c)xxx/alt_verbnames,mark cxxxz +/(*:A\Qxx)x\EB)x/alt_verbnames,mark + x + +/(*:A\ExxxB)x/alt_verbnames,mark + x + +/(*: A \ and #comment + \ B)x/x,alt_verbnames,mark + x + +/(*:A +B)x/alt_verbnames,mark + x + +/(*:abc\Qpqr)/alt_verbnames + # End of testinput2 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 3451d95..382ea58 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -14724,4 +14724,29 @@ Failed: error 122 at offset 12: unmatched closing parenthesis 0: xxx MK: ab\x09(d)c +/(*:A\Qxx)x\EB)x/alt_verbnames,mark + x + 0: x +MK: Axx)xB + +/(*:A\ExxxB)x/alt_verbnames,mark + x + 0: x +MK: AxxxB + +/(*: A \ and #comment + \ B)x/x,alt_verbnames,mark + x + 0: x +MK: A and B + +/(*:A +B)x/alt_verbnames,mark + x + 0: x +MK: A\x0aB + +/(*:abc\Qpqr)/alt_verbnames +Failed: error 160 at offset 12: (*VERB) not recognized or malformed + # End of testinput2