Give error for overlong lookbehind assertion.
This commit is contained in:
parent
7d7a92edef
commit
75181cca2e
|
@ -255,6 +255,8 @@ trouble in some environments.
|
||||||
73. The maximum lookbehind length was incorrectly calculated for patterns such
|
73. The maximum lookbehind length was incorrectly calculated for patterns such
|
||||||
as /(?<=(a)(?-1))x/ which have a recursion within a backreference.
|
as /(?<=(a)(?-1))x/ which have a recursion within a backreference.
|
||||||
|
|
||||||
|
74. Give an error if a lookbehind assertion is longer than 65535 code units.
|
||||||
|
|
||||||
|
|
||||||
Version 10.20 30-June-2015
|
Version 10.20 30-June-2015
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
.TH PCRE2LIMITS 3 "25 November 2014" "PCRE2 10.00"
|
.TH PCRE2LIMITS 3 "03 November 2015" "PCRE2 10.21"
|
||||||
.SH NAME
|
.SH NAME
|
||||||
PCRE2 - Perl-compatible regular expressions (revised API)
|
PCRE2 - Perl-compatible regular expressions (revised API)
|
||||||
.SH "SIZE AND OTHER LIMITATIONS"
|
.SH "SIZE AND OTHER LIMITATIONS"
|
||||||
|
@ -37,6 +37,8 @@ documentation.
|
||||||
.P
|
.P
|
||||||
All values in repeating quantifiers must be less than 65536.
|
All values in repeating quantifiers must be less than 65536.
|
||||||
.P
|
.P
|
||||||
|
The maximum length of a lookbehind assertion is 65535 characters.
|
||||||
|
.P
|
||||||
There is no limit to the number of parenthesized subpatterns, but there can be
|
There is no limit to the number of parenthesized subpatterns, but there can be
|
||||||
no more than 65535 capturing subpatterns. There is, however, a limit to the
|
no more than 65535 capturing subpatterns. There is, however, a limit to the
|
||||||
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
|
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
|
||||||
|
@ -69,6 +71,6 @@ Cambridge, England.
|
||||||
.rs
|
.rs
|
||||||
.sp
|
.sp
|
||||||
.nf
|
.nf
|
||||||
Last updated: 25 November 2014
|
Last updated: 03 November 2015
|
||||||
Copyright (c) 1997-2014 University of Cambridge.
|
Copyright (c) 1997-2015 University of Cambridge.
|
||||||
.fi
|
.fi
|
||||||
|
|
|
@ -571,8 +571,8 @@ static PCRE2_SPTR posix_substitutes[] = {
|
||||||
|
|
||||||
/* Compile time error code numbers. They are given names so that they can more
|
/* Compile time error code numbers. They are given names so that they can more
|
||||||
easily be tracked. When a new number is added, the tables called eint1 and
|
easily be tracked. When a new number is added, the tables called eint1 and
|
||||||
eint2 in pcre2posix.c must be updated, and a new error text must be added to
|
eint2 in pcre2posix.c may need to be updated, and a new error text must be
|
||||||
compile_error_texts in pcre2_error.c. */
|
added to compile_error_texts in pcre2_error.c. */
|
||||||
|
|
||||||
enum { ERR0 = COMPILE_ERROR_BASE,
|
enum { ERR0 = COMPILE_ERROR_BASE,
|
||||||
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
|
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
|
||||||
|
@ -583,7 +583,20 @@ enum { ERR0 = COMPILE_ERROR_BASE,
|
||||||
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
|
||||||
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
|
||||||
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
|
||||||
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86 };
|
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87 };
|
||||||
|
|
||||||
|
/* Error codes that correspond to negative error codes returned by
|
||||||
|
find_fixedlength(). The table is indexed by the negated return value. */
|
||||||
|
|
||||||
|
static int fixed_length_errors[] =
|
||||||
|
{
|
||||||
|
ERR0, /* Not an error */
|
||||||
|
ERR0, /* Not an error; -1 is used for "process later" */
|
||||||
|
ERR25, /* Lookbehind is not fixed length */
|
||||||
|
ERR36, /* \C in lookbehind is not allowed */
|
||||||
|
ERR87, /* Lookbehind is too long */
|
||||||
|
ERR70 /* Internal error: unknown opcode encountered */
|
||||||
|
};
|
||||||
|
|
||||||
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
/* This is a table of start-of-pattern options such as (*UTF) and settings such
|
||||||
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
|
||||||
|
@ -780,16 +793,19 @@ PUT(previous_callout, 1 + LINK_SIZE, length);
|
||||||
*************************************************/
|
*************************************************/
|
||||||
|
|
||||||
/* Scan a branch and compute the fixed length of subject that will match it, if
|
/* Scan a branch and compute the fixed length of subject that will match it, if
|
||||||
the length is fixed. This is needed for dealing with backward assertions. In
|
the length is fixed. This is needed for dealing with lookbehind assertions. In
|
||||||
UTF mode, the result is in code units rather than bytes. The branch is
|
UTF mode, the result is in code units rather than bytes. The branch is
|
||||||
temporarily terminated with OP_END when this function is called.
|
temporarily terminated with OP_END when this function is called.
|
||||||
|
|
||||||
This function is called when a backward assertion is encountered, so that if it
|
This function is called when a lookbehind assertion is encountered, so that if
|
||||||
fails, the error message can point to the correct place in the pattern.
|
it fails, the error message can point to the correct place in the pattern.
|
||||||
However, we cannot do this when the assertion contains subroutine calls,
|
However, we cannot do this when the assertion contains subroutine calls,
|
||||||
because they can be forward references. We solve this by remembering this case
|
because they can be forward references. We solve this by remembering this case
|
||||||
and doing the check at the end; a flag specifies which mode we are running in.
|
and doing the check at the end; a flag specifies which mode we are running in.
|
||||||
|
|
||||||
|
Lookbehind lengths are held in 16-bit fields and the maximum value is defined
|
||||||
|
as LOOKBEHIND_MAX.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
code points to the start of the pattern (the bracket)
|
code points to the start of the pattern (the bracket)
|
||||||
utf TRUE in UTF mode
|
utf TRUE in UTF mode
|
||||||
|
@ -797,13 +813,20 @@ Arguments:
|
||||||
cb the "compile data" structure
|
cb the "compile data" structure
|
||||||
recurses chain of recurse_check to catch mutual recursion
|
recurses chain of recurse_check to catch mutual recursion
|
||||||
|
|
||||||
Returns: the fixed length,
|
Returns: if non-negative, the fixed length,
|
||||||
or -1 if there is no fixed length,
|
or -1 if an OP_RECURSE item was encountered and atend is FALSE
|
||||||
or -2 if \C was encountered (in UTF-8 mode only)
|
or -2 if there is no fixed length,
|
||||||
or -3 if an OP_RECURSE item was encountered and atend is FALSE
|
or -3 if \C was encountered (in UTF-8 mode only)
|
||||||
or -4 if an unknown opcode was encountered (internal error)
|
or -4 if the length is too long
|
||||||
|
or -5 if an unknown opcode was encountered (internal error)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#define FFL_LATER (-1)
|
||||||
|
#define FFL_NOTFIXED (-2)
|
||||||
|
#define FFL_BACKSLASHC (-3)
|
||||||
|
#define FFL_TOOLONG (-4)
|
||||||
|
#define FFL_UNKNOWNOP (-5)
|
||||||
|
|
||||||
static int
|
static int
|
||||||
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
|
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
|
||||||
recurse_check *recurses)
|
recurse_check *recurses)
|
||||||
|
@ -822,6 +845,8 @@ for (;;)
|
||||||
PCRE2_UCHAR *ce, *cs;
|
PCRE2_UCHAR *ce, *cs;
|
||||||
register PCRE2_UCHAR op = *cc;
|
register PCRE2_UCHAR op = *cc;
|
||||||
|
|
||||||
|
if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
|
||||||
|
|
||||||
switch (op)
|
switch (op)
|
||||||
{
|
{
|
||||||
/* We only need to continue for OP_CBRA (normal capturing bracket) and
|
/* We only need to continue for OP_CBRA (normal capturing bracket) and
|
||||||
|
@ -854,7 +879,7 @@ for (;;)
|
||||||
case OP_ACCEPT:
|
case OP_ACCEPT:
|
||||||
case OP_ASSERT_ACCEPT:
|
case OP_ASSERT_ACCEPT:
|
||||||
if (length < 0) length = branchlength;
|
if (length < 0) length = branchlength;
|
||||||
else if (length != branchlength) return -1;
|
else if (length != branchlength) return FFL_NOTFIXED;
|
||||||
if (*cc != OP_ALT) return length;
|
if (*cc != OP_ALT) return length;
|
||||||
cc += 1 + LINK_SIZE;
|
cc += 1 + LINK_SIZE;
|
||||||
branchlength = 0;
|
branchlength = 0;
|
||||||
|
@ -862,18 +887,18 @@ for (;;)
|
||||||
|
|
||||||
/* A true recursion implies not fixed length, but a subroutine call may
|
/* A true recursion implies not fixed length, but a subroutine call may
|
||||||
be OK. If the subroutine is a forward reference, we can't deal with
|
be OK. If the subroutine is a forward reference, we can't deal with
|
||||||
it until the end of the pattern, so return -3. */
|
it until the end of the pattern, so return FFL_LATER. */
|
||||||
|
|
||||||
case OP_RECURSE:
|
case OP_RECURSE:
|
||||||
if (!atend) return -3;
|
if (!atend) return FFL_LATER;
|
||||||
cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
|
cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
|
||||||
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
|
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
|
||||||
if (cc > cs && cc < ce) return -1; /* Recursion */
|
if (cc > cs && cc < ce) return FFL_NOTFIXED; /* Recursion */
|
||||||
else /* Check for mutual recursion */
|
else /* Check for mutual recursion */
|
||||||
{
|
{
|
||||||
recurse_check *r = recurses;
|
recurse_check *r = recurses;
|
||||||
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
||||||
if (r != NULL) return -1; /* Mutual recursion */
|
if (r != NULL) return FFL_NOTFIXED; /* Mutual recursion */
|
||||||
}
|
}
|
||||||
this_recurse.prev = recurses;
|
this_recurse.prev = recurses;
|
||||||
this_recurse.group = cs;
|
this_recurse.group = cs;
|
||||||
|
@ -999,7 +1024,7 @@ for (;;)
|
||||||
otherwise \C is coded as OP_ALLANY. */
|
otherwise \C is coded as OP_ALLANY. */
|
||||||
|
|
||||||
case OP_ANYBYTE:
|
case OP_ANYBYTE:
|
||||||
return -2;
|
return FFL_BACKSLASHC;
|
||||||
|
|
||||||
/* Check a class for variable quantification */
|
/* Check a class for variable quantification */
|
||||||
|
|
||||||
|
@ -1028,12 +1053,12 @@ for (;;)
|
||||||
case OP_CRPOSSTAR:
|
case OP_CRPOSSTAR:
|
||||||
case OP_CRPOSPLUS:
|
case OP_CRPOSPLUS:
|
||||||
case OP_CRPOSQUERY:
|
case OP_CRPOSQUERY:
|
||||||
return -1;
|
return FFL_NOTFIXED;
|
||||||
|
|
||||||
case OP_CRRANGE:
|
case OP_CRRANGE:
|
||||||
case OP_CRMINRANGE:
|
case OP_CRMINRANGE:
|
||||||
case OP_CRPOSRANGE:
|
case OP_CRPOSRANGE:
|
||||||
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
|
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return FFL_NOTFIXED;
|
||||||
branchlength += (int)GET2(cc,1);
|
branchlength += (int)GET2(cc,1);
|
||||||
cc += 1 + 2 * IMM2_SIZE;
|
cc += 1 + 2 * IMM2_SIZE;
|
||||||
break;
|
break;
|
||||||
|
@ -1125,13 +1150,13 @@ for (;;)
|
||||||
case OP_TYPEUPTO:
|
case OP_TYPEUPTO:
|
||||||
case OP_UPTO:
|
case OP_UPTO:
|
||||||
case OP_UPTOI:
|
case OP_UPTOI:
|
||||||
return -1;
|
return FFL_NOTFIXED;
|
||||||
|
|
||||||
/* Catch unrecognized opcodes so that when new ones are added they
|
/* Catch unrecognized opcodes so that when new ones are added they
|
||||||
are not forgotten, as has happened in the past. */
|
are not forgotten, as has happened in the past. */
|
||||||
|
|
||||||
default:
|
default:
|
||||||
return -4;
|
return FFL_UNKNOWNOP;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Control never gets here */
|
/* Control never gets here */
|
||||||
|
@ -7459,11 +7484,11 @@ for (;;)
|
||||||
|
|
||||||
/* If lookbehind, check that this branch matches a fixed-length string, and
|
/* If lookbehind, check that this branch matches a fixed-length string, and
|
||||||
put the length into the OP_REVERSE item. Temporarily mark the end of the
|
put the length into the OP_REVERSE item. Temporarily mark the end of the
|
||||||
branch with OP_END. If the branch contains OP_RECURSE, the result is -3
|
branch with OP_END. If the branch contains OP_RECURSE, the result is
|
||||||
because there may be forward references that we can't check here. Set a
|
FFL_LATER (a negative value) because there may be forward references that
|
||||||
flag to cause another lookbehind check at the end. Why not do it all at the
|
we can't check here. Set a flag to cause another lookbehind check at the
|
||||||
end? Because common, erroneous checks are picked up here and the offset of
|
end. Why not do it all at the end? Because common errors can be picked up
|
||||||
the problem can be shown. */
|
here and the offset of the problem can be shown. */
|
||||||
|
|
||||||
if (lookbehind)
|
if (lookbehind)
|
||||||
{
|
{
|
||||||
|
@ -7471,14 +7496,13 @@ for (;;)
|
||||||
*code = OP_END;
|
*code = OP_END;
|
||||||
fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
|
fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
|
||||||
FALSE, cb, NULL);
|
FALSE, cb, NULL);
|
||||||
if (fixed_length == -3)
|
if (fixed_length == FFL_LATER)
|
||||||
{
|
{
|
||||||
cb->check_lookbehind = TRUE;
|
cb->check_lookbehind = TRUE;
|
||||||
}
|
}
|
||||||
else if (fixed_length < 0)
|
else if (fixed_length < 0)
|
||||||
{
|
{
|
||||||
*errorcodeptr = (fixed_length == -2)? ERR36 :
|
*errorcodeptr = fixed_length_errors[-fixed_length];
|
||||||
(fixed_length == -4)? ERR70: ERR25;
|
|
||||||
*ptrptr = ptr;
|
*ptrptr = ptr;
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
@ -8578,8 +8602,7 @@ if (errorcode == 0 && cb.check_lookbehind)
|
||||||
*be = end_op;
|
*be = end_op;
|
||||||
if (fixed_length < 0)
|
if (fixed_length < 0)
|
||||||
{
|
{
|
||||||
errorcode = (fixed_length == -2)? ERR36 :
|
errorcode = fixed_length_errors[-fixed_length];
|
||||||
(fixed_length == -4)? ERR70 : ERR25;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
|
if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
|
||||||
|
|
|
@ -171,6 +171,7 @@ static const char compile_error_texts[] =
|
||||||
/* 85 */
|
/* 85 */
|
||||||
"using \\C is disabled in this PCRE2 library\0"
|
"using \\C is disabled in this PCRE2 library\0"
|
||||||
"regular expression is too complicated\0"
|
"regular expression is too complicated\0"
|
||||||
|
"lookbehind assertion is too long\0"
|
||||||
;
|
;
|
||||||
|
|
||||||
/* Match-time and UTF error texts are in the same format. */
|
/* Match-time and UTF error texts are in the same format. */
|
||||||
|
|
|
@ -589,11 +589,17 @@ typedef struct pcre2_real_match_context {
|
||||||
defined specially because it is required in pcre2_serialize_decode() when
|
defined specially because it is required in pcre2_serialize_decode() when
|
||||||
copying the size from possibly unaligned memory into a variable of the same
|
copying the size from possibly unaligned memory into a variable of the same
|
||||||
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
||||||
file is included multiple times by pcre2test. */
|
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
||||||
|
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
||||||
|
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||||
|
here.) */
|
||||||
|
|
||||||
#undef CODE_BLOCKSIZE_TYPE
|
#undef CODE_BLOCKSIZE_TYPE
|
||||||
#define CODE_BLOCKSIZE_TYPE size_t
|
#define CODE_BLOCKSIZE_TYPE size_t
|
||||||
|
|
||||||
|
#undef LOOKBEHIND_MAX
|
||||||
|
#define LOOKBEHIND_MAX UINT16_MAX
|
||||||
|
|
||||||
typedef struct pcre2_real_code {
|
typedef struct pcre2_real_code {
|
||||||
pcre2_memctl memctl; /* Memory control fields */
|
pcre2_memctl memctl; /* Memory control fields */
|
||||||
const uint8_t *tables; /* The character tables */
|
const uint8_t *tables; /* The character tables */
|
||||||
|
|
|
@ -4592,4 +4592,8 @@ B)x/alt_verbnames,mark
|
||||||
/abc/replace=A$3123456789Z
|
/abc/replace=A$3123456789Z
|
||||||
abc
|
abc
|
||||||
|
|
||||||
|
/(?<!a{65535}a{5})x/I
|
||||||
|
|
||||||
|
/(?<!a{65535})x/I
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
|
@ -14681,4 +14681,13 @@ Failed: error 142 at offset 7: syntax error in subpattern name (missing terminat
|
||||||
abc
|
abc
|
||||||
Failed: error -49 at offset 3 in replacement: unknown substring
|
Failed: error -49 at offset 3 in replacement: unknown substring
|
||||||
|
|
||||||
|
/(?<!a{65535}a{5})x/I
|
||||||
|
Failed: error 187 at offset 16: lookbehind assertion is too long
|
||||||
|
|
||||||
|
/(?<!a{65535})x/I
|
||||||
|
Capturing subpattern count = 0
|
||||||
|
Max lookbehind = 65535
|
||||||
|
First code unit = 'x'
|
||||||
|
Subject length lower bound = 1
|
||||||
|
|
||||||
# End of testinput2
|
# End of testinput2
|
||||||
|
|
Loading…
Reference in New Issue