Give an internal error for a bad opcode during auto-possessification. This can
stop a loop when compiling an invalid UTF string with PCRE2_NO_UTF_CHECK.
This commit is contained in:
Philip.Hazel 2015-02-06 16:47:15 +00:00
parent a398ae3bef
commit ad452f4036
5 changed files with 40 additions and 24 deletions

View File

@ -55,6 +55,13 @@ patterns.
10. The error message for an invalid quantifier has been changed from "nothing
to repeat" to "quantifier does not follow a repeatable item".
11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but
scanning the compiled pattern in subsequent auto-possessification can get out
of step and lead to an unknown opcode. Previously this could have caused an
infinite loop. Now it generates an "internal error" error. This is a tidyup,
not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an
undefined outcome.
Version 10.00 05-January-2015
-----------------------------

View File

@ -1090,17 +1090,20 @@ but some compilers complain about an unreachable statement. */
*************************************************/
/* Replaces single character iterations with their possessive alternatives
if appropriate. This function modifies the compiled opcode!
if appropriate. This function modifies the compiled opcode! Hitting a
non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a
bad UTF string was compiled with PCRE2_NO_UTF_CHECK.
Arguments:
code points to start of the byte code
utf TRUE in UTF mode
cb compile data block
Returns: nothing
Returns: 0 for success
-1 if a non-existent opcode is encountered
*/
void
int
PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
{
register PCRE2_UCHAR c;
@ -1111,7 +1114,9 @@ uint32_t list[8];
for (;;)
{
c = *code;
if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */
if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
{
c -= get_repeat_base(c) - OP_STAR;
@ -1207,7 +1212,7 @@ for (;;)
switch(c)
{
case OP_END:
return;
return 0;
case OP_TYPESTAR:
case OP_TYPEMINSTAR:

View File

@ -573,7 +573,7 @@ enum { ERR0 = COMPILE_ERROR_BASE,
ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79 };
ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80 };
/* This is a table of start-of-pattern options such as (*UTF) and settings such
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
@ -7802,7 +7802,9 @@ if (usedlength > length) errorcode = ERR23; else
}
/* Fill in any forward references that are required. There may be repeated
references; optimize for them, as searching a large regex takes time. */
references; optimize for them, as searching a large regex takes time. The
test of errorcode inside the loop means that nothing is done if it is already
non-zero. */
if (cb.hwm > cb.start_workspace)
{
@ -7832,23 +7834,23 @@ if (cb.workspace_size > COMPILE_WORK_SIZE)
ccontext->memctl.memory_data);
cb.start_workspace = NULL;
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
/* After a successful compile, give an error if there's a back reference to a
non-existent capturing subpattern. Then, unless disabled, check whether any
single character iterators can be auto-possessified. The function overwrites
the appropriate opcode values, so the type of the pointer must be cast. NOTE:
the intermediate variable "temp" is used in this code because at least one
compiler gives a warning about loss of "const" attribute if the cast
(PCRE2_UCHAR *)codestart is used directly in the function call. */
if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
/* Unless disabled, check whether any single character iterators can be
auto-possessified. The function overwrites the appropriate opcode values, so
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
used in this code because at least one compiler gives a warning about loss of
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
function call. */
if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
if (errorcode == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
PRIV(auto_possessify)(temp, utf, &cb);
}
if (re->top_backref > re->top_bracket) errorcode = ERR15;
else if ((options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
}
}
/* If there were any lookbehind assertions that contained OP_RECURSE
(recursions or subroutine calls), a flag is set for them to be checked here,
@ -7858,7 +7860,7 @@ OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
exceptional ones forgo this. We scan the pattern to check that they are fixed
length, and set their lengths. */
if (cb.check_lookbehind)
if (errorcode == 0 && cb.check_lookbehind)
{
PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;

View File

@ -159,6 +159,8 @@ static const char compile_error_texts[] =
"character code point value in \\u.... sequence is too large\0"
"digits missing in \\x{} or \\o{}\0"
"syntax error in (?(VERSION condition\0"
/* 80 */
"internal error: unknown opcode in auto_possessify()\0"
;
/* Match-time and UTF error texts are in the same format. */

View File

@ -1882,7 +1882,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
const compile_block *);
extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int);
extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,