Give error for overlong lookbehind assertion.

This commit is contained in:
Philip.Hazel 2015-11-03 10:39:27 +00:00
parent 7d7a92edef
commit 75181cca2e
7 changed files with 82 additions and 35 deletions

View File

@ -255,6 +255,8 @@ trouble in some environments.
73. The maximum lookbehind length was incorrectly calculated for patterns such
as /(?<=(a)(?-1))x/ which have a recursion within a backreference.
74. Give an error if a lookbehind assertion is longer than 65535 code units.
Version 10.20 30-June-2015
--------------------------

View File

@ -1,4 +1,4 @@
.TH PCRE2LIMITS 3 "25 November 2014" "PCRE2 10.00"
.TH PCRE2LIMITS 3 "03 November 2015" "PCRE2 10.21"
.SH NAME
PCRE2 - Perl-compatible regular expressions (revised API)
.SH "SIZE AND OTHER LIMITATIONS"
@ -37,6 +37,8 @@ documentation.
.P
All values in repeating quantifiers must be less than 65536.
.P
The maximum length of a lookbehind assertion is 65535 code units.
.P
There is no limit to the number of parenthesized subpatterns, but there can be
no more than 65535 capturing subpatterns. There is, however, a limit to the
depth of nesting of parenthesized subpatterns of all kinds. This is imposed in
@ -69,6 +71,6 @@ Cambridge, England.
.rs
.sp
.nf
Last updated: 25 November 2014
Copyright (c) 1997-2014 University of Cambridge.
Last updated: 03 November 2015
Copyright (c) 1997-2015 University of Cambridge.
.fi

View File

@ -571,8 +571,8 @@ static PCRE2_SPTR posix_substitutes[] = {
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
eint2 in pcre2posix.c must be updated, and a new error text must be added to
compile_error_texts in pcre2_error.c. */
eint2 in pcre2posix.c may need to be updated, and a new error text must be
added to compile_error_texts in pcre2_error.c. */
enum { ERR0 = COMPILE_ERROR_BASE,
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
@ -583,7 +583,20 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86 };
ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87 };
/* Error codes that correspond to negative error codes returned by
find_fixedlength(). The table is indexed by the negated return value, so the
order of the entries must be kept in step with the FFL_xxx values. The first
two entries are ERR0 because a zero result and FFL_LATER ("process later") are
not errors. The table is only ever read, so it is declared const. */
static const int fixed_length_errors[] =
  {
  ERR0,    /* Not an error */
  ERR0,    /* Not an error; -1 is used for "process later" */
  ERR25,   /* Lookbehind is not fixed length */
  ERR36,   /* \C in lookbehind is not allowed */
  ERR87,   /* Lookbehind is too long */
  ERR70    /* Internal error: unknown opcode encountered */
  };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -780,16 +793,19 @@ PUT(previous_callout, 1 + LINK_SIZE, length);
*************************************************/
/* Scan a branch and compute the fixed length of subject that will match it, if
the length is fixed. This is needed for dealing with backward assertions. In
the length is fixed. This is needed for dealing with lookbehind assertions. In
UTF mode, the result is in code units rather than bytes. The branch is
temporarily terminated with OP_END when this function is called.
This function is called when a backward assertion is encountered, so that if it
fails, the error message can point to the correct place in the pattern.
This function is called when a lookbehind assertion is encountered, so that if
it fails, the error message can point to the correct place in the pattern.
However, we cannot do this when the assertion contains subroutine calls,
because they can be forward references. We solve this by remembering this case
and doing the check at the end; a flag specifies which mode we are running in.
Lookbehind lengths are held in 16-bit fields and the maximum value is defined
as LOOKBEHIND_MAX.
Arguments:
code points to the start of the pattern (the bracket)
utf TRUE in UTF mode
@ -797,13 +813,20 @@ Arguments:
cb the "compile data" structure
recurses chain of recurse_check to catch mutual recursion
Returns: the fixed length,
or -1 if there is no fixed length,
or -2 if \C was encountered (in UTF-8 mode only)
or -3 if an OP_RECURSE item was encountered and atend is FALSE
or -4 if an unknown opcode was encountered (internal error)
Returns: if non-negative, the fixed length,
or -1 if an OP_RECURSE item was encountered and atend is FALSE
or -2 if there is no fixed length,
or -3 if \C was encountered (in UTF-8 mode only)
or -4 if the length is too long
or -5 if an unknown opcode was encountered (internal error)
*/
/* Negative result codes returned by find_fixedlength(). The negated value is
used as an index into fixed_length_errors[], so the two must be kept in step. */
/* An OP_RECURSE item was encountered and atend is FALSE: check again later */
#define FFL_LATER (-1)
/* The branch does not have a fixed length */
#define FFL_NOTFIXED (-2)
/* \C was encountered (OP_ANYBYTE) */
#define FFL_BACKSLASHC (-3)
/* The branch length exceeds LOOKBEHIND_MAX */
#define FFL_TOOLONG (-4)
/* An unknown opcode was encountered (internal error) */
#define FFL_UNKNOWNOP (-5)
static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
recurse_check *recurses)
@ -821,6 +844,8 @@ for (;;)
int d;
PCRE2_UCHAR *ce, *cs;
register PCRE2_UCHAR op = *cc;
if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;
switch (op)
{
@ -854,7 +879,7 @@ for (;;)
case OP_ACCEPT:
case OP_ASSERT_ACCEPT:
if (length < 0) length = branchlength;
else if (length != branchlength) return -1;
else if (length != branchlength) return FFL_NOTFIXED;
if (*cc != OP_ALT) return length;
cc += 1 + LINK_SIZE;
branchlength = 0;
@ -862,18 +887,18 @@ for (;;)
/* A true recursion implies not fixed length, but a subroutine call may
be OK. If the subroutine is a forward reference, we can't deal with
it until the end of the pattern, so return -3. */
it until the end of the pattern, so return FFL_LATER. */
case OP_RECURSE:
if (!atend) return -3;
if (!atend) return FFL_LATER;
cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
if (cc > cs && cc < ce) return -1; /* Recursion */
if (cc > cs && cc < ce) return FFL_NOTFIXED; /* Recursion */
else /* Check for mutual recursion */
{
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) return -1; /* Mutual recursion */
if (r != NULL) return FFL_NOTFIXED; /* Mutual recursion */
}
this_recurse.prev = recurses;
this_recurse.group = cs;
@ -999,7 +1024,7 @@ for (;;)
otherwise \C is coded as OP_ALLANY. */
case OP_ANYBYTE:
return -2;
return FFL_BACKSLASHC;
/* Check a class for variable quantification */
@ -1028,12 +1053,12 @@ for (;;)
case OP_CRPOSSTAR:
case OP_CRPOSPLUS:
case OP_CRPOSQUERY:
return -1;
return FFL_NOTFIXED;
case OP_CRRANGE:
case OP_CRMINRANGE:
case OP_CRPOSRANGE:
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return FFL_NOTFIXED;
branchlength += (int)GET2(cc,1);
cc += 1 + 2 * IMM2_SIZE;
break;
@ -1125,13 +1150,13 @@ for (;;)
case OP_TYPEUPTO:
case OP_UPTO:
case OP_UPTOI:
return -1;
return FFL_NOTFIXED;
/* Catch unrecognized opcodes so that when new ones are added they
are not forgotten, as has happened in the past. */
default:
return -4;
return FFL_UNKNOWNOP;
}
}
/* Control never gets here */
@ -7459,11 +7484,11 @@ for (;;)
/* If lookbehind, check that this branch matches a fixed-length string, and
put the length into the OP_REVERSE item. Temporarily mark the end of the
branch with OP_END. If the branch contains OP_RECURSE, the result is -3
because there may be forward references that we can't check here. Set a
flag to cause another lookbehind check at the end. Why not do it all at the
end? Because common, erroneous checks are picked up here and the offset of
the problem can be shown. */
branch with OP_END. If the branch contains OP_RECURSE, the result is
FFL_LATER (a negative value) because there may be forward references that
we can't check here. Set a flag to cause another lookbehind check at the
end. Why not do it all at the end? Because common errors can be picked up
here and the offset of the problem can be shown. */
if (lookbehind)
{
@ -7471,14 +7496,13 @@ for (;;)
*code = OP_END;
fixed_length = find_fixedlength(last_branch, (options & PCRE2_UTF) != 0,
FALSE, cb, NULL);
if (fixed_length == -3)
if (fixed_length == FFL_LATER)
{
cb->check_lookbehind = TRUE;
}
else if (fixed_length < 0)
{
*errorcodeptr = (fixed_length == -2)? ERR36 :
(fixed_length == -4)? ERR70: ERR25;
*errorcodeptr = fixed_length_errors[-fixed_length];
*ptrptr = ptr;
return FALSE;
}
@ -8578,8 +8602,7 @@ if (errorcode == 0 && cb.check_lookbehind)
*be = end_op;
if (fixed_length < 0)
{
errorcode = (fixed_length == -2)? ERR36 :
(fixed_length == -4)? ERR70 : ERR25;
errorcode = fixed_length_errors[-fixed_length];
break;
}
if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;

View File

@ -171,6 +171,7 @@ static const char compile_error_texts[] =
/* 85 */
"using \\C is disabled in this PCRE2 library\0"
"regular expression is too complicated\0"
"lookbehind assertion is too long\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -589,11 +589,17 @@ typedef struct pcre2_real_match_context {
defined specially because it is required in pcre2_serialize_decode() when
copying the size from possibly unaligned memory into a variable of the same
type. Use a macro rather than a typedef to avoid compiler warnings when this
file is included multiple times by pcre2test. */
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
here.) */
/* Type of the code block size; a macro rather than a typedef (see above) */
#undef CODE_BLOCKSIZE_TYPE
#define CODE_BLOCKSIZE_TYPE size_t
/* Largest supported lookbehind length; the value must fit in a 16-bit field */
#undef LOOKBEHIND_MAX
#define LOOKBEHIND_MAX UINT16_MAX
typedef struct pcre2_real_code {
pcre2_memctl memctl; /* Memory control fields */
const uint8_t *tables; /* The character tables */

4
testdata/testinput2 vendored
View File

@ -4592,4 +4592,8 @@ B)x/alt_verbnames,mark
/abc/replace=A$3123456789Z
abc
/(?<!a{65535}a{5})x/I
/(?<!a{65535})x/I
# End of testinput2

View File

@ -14681,4 +14681,13 @@ Failed: error 142 at offset 7: syntax error in subpattern name (missing terminat
abc
Failed: error -49 at offset 3 in replacement: unknown substring
/(?<!a{65535}a{5})x/I
Failed: error 187 at offset 16: lookbehind assertion is too long
/(?<!a{65535})x/I
Capturing subpattern count = 0
Max lookbehind = 65535
First code unit = 'x'
Subject length lower bound = 1
# End of testinput2