Complete escape processing for PCRE2_ALT_VERBNAMES

This commit is contained in:
Philip.Hazel 2015-09-01 17:32:42 +00:00
parent d2e87a75af
commit cdf07ab585
5 changed files with 202 additions and 60 deletions

View File

@ -1,4 +1,4 @@
.TH PCRE2API 3 "30 August 2015" "PCRE2 10.21" .TH PCRE2API 3 "01 September 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.sp .sp
@ -1060,7 +1060,10 @@ By default, for compatibility with Perl, the name in any verb sequence such as
parenthesis. The name is not processed in any way, and it is not possible to parenthesis. The name is not processed in any way, and it is not possible to
include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES include a closing parenthesis in the name. However, if the PCRE2_ALT_VERBNAMES
option is set, normal backslash processing is applied to verb names and only an option is set, normal backslash processing is applied to verb names and only an
unescaped closing parenthesis terminates the name. unescaped closing parenthesis terminates the name. A closing parenthesis can be
included in a name either as \e) or between \eQ and \eE. If the PCRE2_EXTENDED
option is set, unescaped whitespace in verb names is skipped and #-comments are
recognized, exactly as in the rest of the pattern.
.sp .sp
PCRE2_AUTO_CALLOUT PCRE2_AUTO_CALLOUT
.sp .sp
@ -2962,6 +2965,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 30 August 2015 Last updated: 01 September 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -1,4 +1,4 @@
.TH PCRE2PATTERN 3 "30 August 2015" "PCRE2 10.21" .TH PCRE2PATTERN 3 "01 September 2015" "PCRE2 10.21"
.SH NAME .SH NAME
PCRE2 - Perl-compatible regular expressions (revised API) PCRE2 - Perl-compatible regular expressions (revised API)
.SH "PCRE2 REGULAR EXPRESSION DETAILS" .SH "PCRE2 REGULAR EXPRESSION DETAILS"
@ -2953,7 +2953,10 @@ that does not include a closing parenthesis. The name is not processed in
any way, and it is not possible to include a closing parenthesis in the name. any way, and it is not possible to include a closing parenthesis in the name.
However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing However, if the PCRE2_ALT_VERBNAMES option is set, normal backslash processing
is applied to verb names and only an unescaped closing parenthesis terminates is applied to verb names and only an unescaped closing parenthesis terminates
the name. the name. A closing parenthesis can be included in a name either as \e) or
between \eQ and \eE. If the PCRE2_EXTENDED option is set, unescaped whitespace
in verb names is skipped and #-comments are recognized, exactly as in the rest
of the pattern.
.P .P
The maximum length of a name is 255 in the 8-bit library and 65535 in the The maximum length of a name is 255 in the 8-bit library and 65535 in the
16-bit and 32-bit libraries. If the name is empty, that is, if the closing 16-bit and 32-bit libraries. If the name is empty, that is, if the closing
@ -3383,6 +3386,6 @@ Cambridge, England.
.rs .rs
.sp .sp
.nf .nf
Last updated: 30 August 2015 Last updated: 01 September 2015
Copyright (c) 1997-2015 University of Cambridge. Copyright (c) 1997-2015 University of Cambridge.
.fi .fi

View File

@ -2792,6 +2792,148 @@ return n8;
/*************************************************
*      Process (*VERB) name for escapes          *
*************************************************/

/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
process the characters in a verb's name argument. It is called twice, once with
codeptr == NULL, to find out the length of the processed name, and again to put
the name into memory.

Scanning stops at the first closing parenthesis that is neither escaped nor
inside a \Q...\E sequence, or at the end of the pattern. When PCRE2_EXTENDED is
set, unescaped whitespace and #-comments outside \Q...\E are skipped and do not
contribute to the name.

Arguments:
  ptrptr        pointer to the input pointer; on a successful return it is
                  left at the terminating closing parenthesis, or at the end
                  of the pattern if no terminator was found; after a failing
                  escape it is left where check_escape() stopped
  codeptr       pointer to the compiled code pointer, or NULL when only the
                  length is wanted; when not NULL it is advanced past the
                  code units that were written
  errorcodeptr  pointer to the error code
  options       the compile options; PCRE2_EXTENDED is tested here and the
                  value is also passed on to check_escape()
  utf           TRUE if processing UTF
  cb            compile data block

Returns:        length of the processed name, or < 0 on error

Note: when codeptr is not NULL and utf is TRUE, the count is incremented once
per character rather than once per code unit, so only the value returned by
the length-finding call (codeptr == NULL) is a length in code units.
*/

static int
process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
  uint32_t options, BOOL utf, compile_block *cb)
{
int arglen = 0;
BOOL inescq = FALSE;
PCRE2_SPTR ptr = *ptrptr;
PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;

for (; ptr < cb->end_pattern; ptr++)
  {
  uint32_t x = *ptr;

  /* Inside \Q...\E every character is literal; only \E is special. */

  if (inescq)
    {
    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
      {
      inescq = FALSE;
      ptr++;
      continue;
      }
    }

  else   /* Not a literal character */
    {
    if (x == CHAR_RIGHT_PARENTHESIS) break;

    /* Skip over comments and whitespace in extended mode. Need a loop to
    handle whitespace after a comment. */

    if ((options & PCRE2_EXTENDED) != 0)
      {
      for (;;)
        {
        while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
        if (x != CHAR_NUMBER_SIGN) break;
        ptr++;
        while (*ptr != CHAR_NULL)
          {
          if (IS_NEWLINE(ptr))       /* For non-fixed-length newline cases, */
            {                        /* IS_NEWLINE sets cb->nllen. */
            ptr += cb->nllen;
            break;
            }
          ptr++;
#ifdef SUPPORT_UNICODE
          if (utf) FORWARDCHAR(ptr);
#endif
          }
        x = *ptr;     /* Either NULL or the char after a newline */
        }
      if (ptr >= cb->end_pattern) break;

      /* The skipping above may have stopped on the terminating parenthesis
      (for example "(*:A )" in extended mode). It must end the name here,
      exactly like the test made before skipping, instead of being copied
      into the name as an ordinary character. */

      if (x == CHAR_RIGHT_PARENTHESIS) break;
      }

    /* Process escapes */

    if (x == CHAR_BACKSLASH)
      {
      int rc;
      *errorcodeptr = 0;
      rc = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
      *ptrptr = ptr;   /* For possible error */
      if (*errorcodeptr != 0) return -1;
      if (rc != 0)
        {
        if (rc == ESC_Q)
          {
          inescq = TRUE;
          continue;
          }
        if (rc == ESC_E) continue;   /* \E with no open \Q is ignored */
        *errorcodeptr = ERR40;       /* Any other non-data escape is an error */
        return -1;
        }
      }
    }

  /* We have the next character in the name. */

#ifdef SUPPORT_UNICODE
  if (utf)
    {
    if (code == NULL)   /* Just want the length */
      {
      /* Add the extra code units beyond the first; the first one is counted
      by the common increment at the bottom of the loop. */

#if PCRE2_CODE_UNIT_WIDTH == 8
      int i;
      for (i = 0; i < PRIV(utf8_table1_size); i++)
        if ((int)x <= PRIV(utf8_table1)[i]) break;
      arglen += i;
#elif PCRE2_CODE_UNIT_WIDTH == 16
      if (x > 0xffff) arglen++;
#endif
      }
    else
      {
      PCRE2_UCHAR cbuff[8];
      x = PRIV(ord2utf)(x, cbuff);   /* x becomes the number of code units */
      memcpy(code, cbuff, CU2BYTES(x));
      code += x;
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF */
    {
    if (code != NULL) *code++ = x;
    }

  arglen++;
  }

/* Update the pointers before returning. */

*ptrptr = ptr;
if (codeptr != NULL) *codeptr = code;
return arglen;
}
/************************************************* /*************************************************
* Scan regex to identify named groups * * Scan regex to identify named groups *
*************************************************/ *************************************************/
@ -5399,33 +5541,9 @@ for (;; ptr++)
} }
else else
{ {
arglen = 0; arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) utf, cb);
{ if (arglen < 0) goto FAILED;
if (*ptr == '\\')
{
uint32_t x;
*errorcodeptr = 0;
i = check_escape(&ptr, &x, errorcodeptr, options, FALSE, cb);
if (*errorcodeptr != 0) goto FAILED;
if (i != 0)
{
*errorcodeptr = ERR40;
goto FAILED;
}
#ifdef SUPPORT_UNICODE
#if PCRE2_CODE_UNIT_WIDTH == 8
for (i = 0; i < PRIV(utf8_table1_size); i++)
if ((int)x <= PRIV(utf8_table1)[i]) break;
arglen += i;
#elif PCRE2_CODE_UNIT_WIDTH == 16
if (x > 0xffff) arglen++;
#endif
#endif
}
arglen++;
ptr++;
}
} }
if ((unsigned int)arglen > MAX_MARK) if ((unsigned int)arglen > MAX_MARK)
@ -5495,35 +5613,12 @@ for (;; ptr++)
} }
setverb = *code++ = verbs[i].op_arg; setverb = *code++ = verbs[i].op_arg;
*code++ = arglen; *code++ = arglen;
/* If we are processing the argument for escapes, we don't need
to apply checks here because it was all checked above when
computing the length. */
if ((options & PCRE2_ALT_VERBNAMES) != 0) if ((options & PCRE2_ALT_VERBNAMES) != 0)
{ {
for (; arg != ptr; arg++) PCRE2_UCHAR *memcode = code; /* code is "register" */
{ (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
if (*arg == '\\') utf, cb);
{ code = memcode;
uint32_t x;
*errorcodeptr = 0;
(void)check_escape(&arg, &x, errorcodeptr, options, FALSE,
cb);
#ifdef SUPPORT_UNICODE
if (utf)
{
PCRE2_UCHAR cbuff[8];
x = PRIV(ord2utf)(x, cbuff);
memcpy(code, cbuff, CU2BYTES(x));
code += x;
}
else
#endif
*code++ = x;
}
else *code++ = *arg;
}
} }
else /* No argument processing */ else /* No argument processing */
{ {

16
testdata/testinput2 vendored
View File

@ -4449,4 +4449,20 @@ a random value. /Ix
/(*:ab\t(d\)c)xxx/alt_verbnames,mark /(*:ab\t(d\)c)xxx/alt_verbnames,mark
cxxxz cxxxz
/(*:A\Qxx)x\EB)x/alt_verbnames,mark
x
/(*:A\ExxxB)x/alt_verbnames,mark
x
/(*: A \ and #comment
\ B)x/x,alt_verbnames,mark
x
/(*:A
B)x/alt_verbnames,mark
x
/(*:abc\Qpqr)/alt_verbnames
# End of testinput2 # End of testinput2

25
testdata/testoutput2 vendored
View File

@ -14724,4 +14724,29 @@ Failed: error 122 at offset 12: unmatched closing parenthesis
0: xxx 0: xxx
MK: ab\x09(d)c MK: ab\x09(d)c
/(*:A\Qxx)x\EB)x/alt_verbnames,mark
x
0: x
MK: Axx)xB
/(*:A\ExxxB)x/alt_verbnames,mark
x
0: x
MK: AxxxB
/(*: A \ and #comment
\ B)x/x,alt_verbnames,mark
x
0: x
MK: A and B
/(*:A
B)x/alt_verbnames,mark
x
0: x
MK: A\x0aB
/(*:abc\Qpqr)/alt_verbnames
Failed: error 160 at offset 12: (*VERB) not recognized or malformed
# End of testinput2 # End of testinput2