From ad452f403685657450f3cef2da0bcd10180e0e08 Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Fri, 6 Feb 2015 16:47:15 +0000 Subject: [PATCH] Give an internal error for a bad opcode during auto-possessification. This can stop a loop when compiling an invalid UTF string with PCRE2_NO_UTF_CHECK. --- ChangeLog | 7 +++++++ src/pcre2_auto_possess.c | 15 ++++++++++----- src/pcre2_compile.c | 38 ++++++++++++++++++++------------------ src/pcre2_error.c | 2 ++ src/pcre2_internal.h | 2 +- 5 files changed, 40 insertions(+), 24 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9ab6986..d676c6b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -55,6 +55,13 @@ patterns. 10. The error message for an invalid quantifier has been changed from "nothing to repeat" to "quantifier does not follow a repeatable item". +11. If a bad UTF string is compiled with NO_UTF_CHECK, it may succeed, but +scanning the compiled pattern in subsequent auto-possessification can get out +of step and lead to an unknown opcode. Previously this could have caused an +infinite loop. Now it generates an "internal error" error. This is a tidyup, +not a bug fix; passing bad UTF with NO_UTF_CHECK is documented as having an +undefined outcome. + Version 10.00 05-January-2015 ----------------------------- diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c index f531398..0e050e6 100644 --- a/src/pcre2_auto_possess.c +++ b/src/pcre2_auto_possess.c @@ -1090,17 +1090,20 @@ but some compilers complain about an unreachable statement. */ *************************************************/ /* Replaces single character iterations with their possessive alternatives -if appropriate. This function modifies the compiled opcode! +if appropriate. This function modifies the compiled opcode! Hitting a +non-existent opcode may indicate a bug in PCRE2, but it can also be caused if a +bad UTF string was compiled with PCRE2_NO_UTF_CHECK. 
Arguments: code points to start of the byte code utf TRUE in UTF mode cb compile data block -Returns: nothing +Returns: 0 for success + -1 if a non-existent opcode is encountered */ -void +int PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb) { register PCRE2_UCHAR c; @@ -1111,7 +1114,9 @@ uint32_t list[8]; for (;;) { c = *code; - + + if (c > OP_TABLE_LENGTH) return -1; /* Something gone wrong */ + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) { c -= get_repeat_base(c) - OP_STAR; @@ -1207,7 +1212,7 @@ for (;;) switch(c) { case OP_END: - return; + return 0; case OP_TYPESTAR: case OP_TYPEMINSTAR: diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 27089b1..d829e28 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -573,7 +573,7 @@ enum { ERR0 = COMPILE_ERROR_BASE, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70, - ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79 }; + ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80 }; /* This is a table of start-of-pattern options such as (*UTF) and settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward @@ -7802,7 +7802,9 @@ if (usedlength > length) errorcode = ERR23; else } /* Fill in any forward references that are required. There may be repeated -references; optimize for them, as searching a large regex takes time. */ +references; optimize for them, as searching a large regex takes time. The +test of errorcode inside the loop means that nothing is done if it is already +non-zero. */ if (cb.hwm > cb.start_workspace) { @@ -7832,23 +7834,23 @@ if (cb.workspace_size > COMPILE_WORK_SIZE) ccontext->memctl.memory_data); cb.start_workspace = NULL; -/* Give an error if there's back reference to a non-existent capturing -subpattern. 
*/ +/* After a successful compile, give an error if there's back reference to a +non-existent capturing subpattern. Then, unless disabled, check whether any +single character iterators can be auto-possessified. The function overwrites +the appropriate opcode values, so the type of the pointer must be cast. NOTE: +the intermediate variable "temp" is used in this code because at least one +compiler gives a warning about loss of "const" attribute if the cast +(PCRE2_UCHAR *)codestart is used directly in the function call. */ -if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; - -/* Unless disabled, check whether any single character iterators can be -auto-possessified. The function overwrites the appropriate opcode values, so -the type of the pointer must be cast. NOTE: the intermediate variable "temp" is -used in this code because at least one compiler gives a warning about loss of -"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the -function call. */ - -if ((options & PCRE2_NO_AUTO_POSSESS) == 0) +if (errorcode == 0) { - PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; - PRIV(auto_possessify)(temp, utf, &cb); - } + if (re->top_backref > re->top_bracket) errorcode = ERR15; + else if ((options & PCRE2_NO_AUTO_POSSESS) == 0) + { + PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; + if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; + } + } /* If there were any lookbehind assertions that contained OP_RECURSE (recursions or subroutine calls), a flag is set for them to be checked here, @@ -7858,7 +7860,7 @@ OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The exceptional ones forgo this. We scan the pattern to check that they are fixed length, and set their lengths. 
*/ -if (cb.check_lookbehind) +if (errorcode == 0 && cb.check_lookbehind) { PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart; diff --git a/src/pcre2_error.c b/src/pcre2_error.c index b257c60..3cd8792 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -159,6 +159,8 @@ static const char compile_error_texts[] = "character code point value in \\u.... sequence is too large\0" "digits missing in \\x{} or \\o{}\0" "syntax error in (?(VERSION condition\0" + /* 80 */ + "internal error: unknown opcode in auto_possessify()\0" ; /* Match-time and UTF error texts are in the same format. */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index ebc0197..ba8906a 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1882,7 +1882,7 @@ is available. */ #define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_) #define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_) -extern void _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, +extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL, const compile_block *); extern PCRE2_SPTR _pcre2_find_bracket(PCRE2_SPTR, BOOL, int); extern BOOL _pcre2_is_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,