diff --git a/ChangeLog b/ChangeLog index f3e1cc5..c6d3480 100644 --- a/ChangeLog +++ b/ChangeLog @@ -77,6 +77,10 @@ whose condition was an assertion preceded by an explicit callout with a string argument might be incorrectly processed, especially if the string contained \Q. This bug was discovered by Karl Skomski with the LLVM fuzzer. +21. Compiling PCRE2 with the sanitize options of clang showed up a number of +very pedantic coding infelicities and a buffer overflow while checking a UTF-8 +string if the final multi-byte UTF-8 character was truncated. + Version 10.20 30-June-2015 -------------------------- diff --git a/RunTest b/RunTest index c4d659c..fb758fe 100755 --- a/RunTest +++ b/RunTest @@ -33,6 +33,10 @@ # For backwards compatibility, -nojit, -valgrind, -valgrind-log, and -sim may # be given without the leading "-" character. # +# When PCRE2 is compiled by clang with -fsanitize arguments, some tests need +# very much more stack than normal. In environments where the stack can be +# set at runtime, -bigstack sets a gigantic stack. +# # There are two special cases where only one argument is allowed: # # If the first and only argument is "ebcdic", the script runs the special @@ -184,6 +188,7 @@ arg8= arg16= arg32= nojit= +bigstack= sim= skip= valgrind= @@ -240,6 +245,7 @@ while [ $# -gt 0 ] ; do -8) arg8=yes;; -16) arg16=yes;; -32) arg32=yes;; + bigstack|-bigstack) bigstack=yes;; nojit|-nojit) nojit=yes;; sim|-sim) shift; sim=$1;; valgrind|-valgrind) valgrind="valgrind --tool=memcheck -q --smc-check=all";; @@ -287,13 +293,22 @@ fi # If it is possible to set the system stack size, arrange to set a value for # test 2, which needs more than the even the Linux default when PCRE2 has been -# compiled with -fsanitize=address. +# compiled by gcc with -fsanitize=address. When the compiler is clang, sanitize +# options require an even bigger stack for test 2, and an increased stack for +# some of the other tests. $sim ./pcre2test -S 1 /dev/null /dev/null if [ $? 
-eq 0 ] ; then - test2stack="-S 16" + if [ "$bigstack" = "" ] ; then + test2stack="-S 16" + defaultstack="" + else + test2stack="-S 1024" + defaultstack="-S 64" + fi else test2stack="" + defaultstack="" fi # All of 8-bit, 16-bit, and 32-bit character strings may be supported, but only @@ -438,7 +453,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $do1 = yes ] ; then echo $title1 for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput1 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput1 testtry checkresult $? 1 "$opt" done fi @@ -508,7 +523,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ "$locale" != "" ] ; then echo $title3 "(using '$locale' locale)" for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $infile testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $infile testtry if [ $? = 0 ] ; then case "$opt" in -jit) with=" with JIT";; @@ -545,7 +560,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput4 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput4 testtry checkresult $? 4 "$opt" done fi @@ -557,7 +572,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput5 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput5 testtry checkresult $? 5 "$opt" done fi @@ -567,7 +582,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $do6 = yes ] ; then echo $title6 - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput6 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput6 testtry checkresult $? 
6 "" fi @@ -576,7 +591,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput7 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput7 testtry checkresult $? 7 "" fi fi @@ -596,7 +611,7 @@ for bmode in "$test8" "$test16" "$test32"; do elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput8 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput8 testtry checkresult $? 8-$bits "" fi fi @@ -609,7 +624,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped when running 16/32-bit tests" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput9 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput9 testtry checkresult $? 9 "$opt" done fi @@ -625,7 +640,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput10 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput10 testtry checkresult $? 10 "$opt" done fi @@ -639,7 +654,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped when running 8-bit tests" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput11 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput11 testtry checkresult $? 
11-$bits "$opt" done fi @@ -656,7 +671,7 @@ for bmode in "$test8" "$test16" "$test32"; do echo " Skipped because UTF-$bits support is not available" else for opt in "" $jitopt; do - $sim $valgrind ./pcre2test -q $bmode $opt $testdata/testinput12 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $opt $testdata/testinput12 testtry checkresult $? 12-$bits "$opt" done fi @@ -669,7 +684,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ "$bits" = "8" ] ; then echo " Skipped when running 8-bit tests" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput13 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput13 testtry checkresult $? 13 "" fi fi @@ -678,7 +693,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $do14 = yes ] ; then echo $title14 - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput14 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput14 testtry checkresult $? 14 "" fi @@ -689,7 +704,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $jit -ne 0 ] ; then echo " Skipped because JIT is available" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput15 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput15 testtry checkresult $? 15 "" fi fi @@ -701,7 +716,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $jit -eq 0 -o "$nojit" = "yes" ] ; then echo " Skipped because JIT is not available or nojit was specified" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput16 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput16 testtry checkresult $? 16 "" fi fi @@ -713,7 +728,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ "$bits" = "16" -o "$bits" = "32" ] ; then echo " Skipped when running 16/32-bit tests" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput17 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput17 testtry checkresult $? 
17 "" fi fi @@ -727,7 +742,7 @@ for bmode in "$test8" "$test16" "$test32"; do elif [ $utf -eq 0 ] ; then echo " Skipped because UTF-$bits support is not available" else - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput18 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput18 testtry checkresult $? 18 "" fi fi @@ -736,7 +751,7 @@ for bmode in "$test8" "$test16" "$test32"; do if [ $do19 = yes ] ; then echo $title19 - $sim $valgrind ./pcre2test -q $bmode $testdata/testinput19 testtry + $sim $valgrind ./pcre2test -q $defaultstack $bmode $testdata/testinput19 testtry checkresult $? 19 "" fi diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 5fe9440..08ea585 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -270,7 +270,7 @@ in UTF-8 mode. It runs from '0' to 'z'. */ #ifndef EBCDIC #define ESCAPES_FIRST CHAR_0 #define ESCAPES_LAST CHAR_z -#define ESCAPES_UPPER_CASE (-32) /* Add this to upper case a letter */ +#define UPPER_CASE(c) ((c)-32) /* Convert a lower case letter to upper case */ static const short int escapes[] = { 0, 0, @@ -323,11 +323,11 @@ because it is defined as 'a', which of course picks up the ASCII value. */ #if 'a' == 0x81 /* Check for a real EBCDIC environment */ #define ESCAPES_FIRST CHAR_a #define ESCAPES_LAST CHAR_9 -#define ESCAPES_UPPER_CASE (+64) /* Add this to upper case a letter */ +#define UPPER_CASE(c) ((c)+64) /* Convert a lower case letter to upper case */ #else /* Testing in an ASCII environment */ #define ESCAPES_FIRST ((unsigned char)'\x81') /* EBCDIC 'a' */ #define ESCAPES_LAST ((unsigned char)'\xf9') /* EBCDIC '9' */ -#define ESCAPES_UPPER_CASE (-32) /* Add this to upper case a letter */ +#define UPPER_CASE(c) ((c)-32) /* Convert a lower case letter to upper case */ #endif static const short int escapes[] = { @@ -1884,7 +1884,7 @@ else s = cb->bracount - (s - 1); } - escape = -s; + escape = -(int)s; break; /* The handling of escape sequences consisting of a string of digits @@ -1909,7 +1909,7 @@ else { oldptr = ptr; /* The integer range is limited by the machine's int representation.
*/ - s = (int)(c - CHAR_0); + s = c - CHAR_0; overflow = FALSE; while (IS_DIGIT(ptr[1])) { @@ -1933,7 +1933,7 @@ else if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount) { - escape = -s; /* Indicates a back reference */ + escape = -(int)s; /* Indicates a back reference */ break; } ptr = oldptr; /* Put the pointer back and fall through */ @@ -1981,7 +1981,7 @@ else #if PCRE2_CODE_UNIT_WIDTH == 32 if (c >= 0x20000000l) { overflow = TRUE; break; } #endif - c = (c << 3) + cc - CHAR_0 ; + c = (c << 3) + (cc - CHAR_0); #if PCRE2_CODE_UNIT_WIDTH == 8 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } #elif PCRE2_CODE_UNIT_WIDTH == 16 @@ -2105,7 +2105,7 @@ else #endif c = *(++ptr); - if (c >= CHAR_a && c <= CHAR_z) c += ESCAPES_UPPER_CASE; + if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c); if (c == CHAR_NULL && ptr >= cb->end_pattern) { *errorcodeptr = ERR2; @@ -3532,7 +3532,7 @@ for (; ptr < cb->end_pattern; ptr++) if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL; else top_nest--; } - nest_depth--; + if (nest_depth > 0) nest_depth--; /* Can be 0 for unmatched ) */ break; } } @@ -3938,14 +3938,16 @@ for (;; ptr++) if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) { nestptr = ptr + 7; - ptr = sub_start_of_word - 1; + ptr = sub_start_of_word; /* Do not combine these statements; clang's */ + ptr--; /* sanitizer moans about a negative index. */ continue; } if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) { nestptr = ptr + 7; - ptr = sub_end_of_word - 1; + ptr = sub_end_of_word; /* Do not combine these statements; clang's */ + ptr--; /* sanitizer moans about a negative index. */ continue; } @@ -5960,7 +5962,7 @@ for (;; ptr++) goto FAILED; } if (refsign != 0) recno = (refsign == CHAR_MINUS)? 
- cb->bracount - recno + 1 : recno + cb->bracount; + (cb->bracount + 1) - recno : recno + cb->bracount; if (recno <= 0 || (uint32_t)recno > cb->final_bracount) { *errorcodeptr = ERR15; @@ -6490,7 +6492,7 @@ for (;; ptr++) *errorcodeptr = ERR58; goto FAILED; } - recno = cb->bracount - recno + 1; + recno = (int)(cb->bracount + 1) - recno; if (recno <= 0) { *errorcodeptr = ERR15; @@ -8183,7 +8185,7 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && while (IS_DIGIT(ptr[pp])) { if (c > UINT32_MAX / 10 - 1) break; /* Integer overflow */ - c = c*10 + ptr[pp++] - CHAR_0; + c = c*10 + (ptr[pp++] - CHAR_0); } if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS) { diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index b14477d..ff4d332 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3172,7 +3172,7 @@ occur. */ #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); #undef FF #undef OO diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 5d3e694..41113a5 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -194,7 +194,7 @@ if (caseless) GETCHARINC(c, eptr); GETCHARINC(d, p); ur = GET_UCD(d); - if (c != d && c != d + ur->other_case) + if (c != d && c != (uint32_t)((int)d + ur->other_case)) { const uint32_t *pp = PRIV(ucd_caseless_sets) + ur->caseset; for (;;) @@ -211,7 +211,7 @@ if (caseless) /* Not in UTF mode */ { - while (length-- > 0) + for (; length > 0; length--) { uint32_t cc, cp; if (eptr >= mb->end_subject) return 1; /* Partial match */ @@ -226,11 +226,11 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UT mode. */ +are in UTF mode. 
*/ else { - while (length-- > 0) + for (; length > 0; length--) { if (eptr >= mb->end_subject) return 1; /* Partial match */ if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /*No match */ @@ -3342,7 +3342,10 @@ for (;;) CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */ RRETURN(MATCH_NOMATCH); } - while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); + for (; length > 0; length--) + { + if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); + } } else #endif @@ -6513,7 +6516,7 @@ occur. */ #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET) #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART) -options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO)); +options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); #undef FF #undef OO @@ -6783,7 +6786,7 @@ for(;;) end_subject = t; } - /* Advance to a unique first code unit if there is one. In 8-bit mode, the + /* Advance to a unique first code unit if there is one. In 8-bit mode, the use of memchr() gives a big speed up. 
*/ if (has_first_cu) @@ -6801,8 +6804,8 @@ for(;;) #else start_match = memchr(start_match, first_cu, end_subject - start_match); if (start_match == NULL) start_match = end_subject; -#endif - } +#endif + } } /* Or to just after a linebreak for a multiline match */ diff --git a/src/pcre2_string_utils.c b/src/pcre2_string_utils.c index 888620e..1962cf3 100644 --- a/src/pcre2_string_utils.c +++ b/src/pcre2_string_utils.c @@ -121,7 +121,7 @@ int PRIV(strncmp)(PCRE2_SPTR str1, PCRE2_SPTR str2, size_t len) { PCRE2_UCHAR c1, c2; -while (len-- > 0) +for (; len > 0; len--) { c1 = *str1++; c2 = *str2++; @@ -150,7 +150,7 @@ int PRIV(strncmp_c8)(PCRE2_SPTR str1, const char *str2, size_t len) { PCRE2_UCHAR c1, c2; -while (len-- > 0) +for (; len > 0; len--) { c1 = *str1++; c2 = *str2++; diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c index a97847a..94e97d7 100644 --- a/src/pcre2_valid_utf.c +++ b/src/pcre2_valid_utf.c @@ -131,11 +131,13 @@ PCRE2_ERROR_UTF8_ERR20 Isolated 0x80 byte (not within UTF-8 character) PCRE2_ERROR_UTF8_ERR21 Byte with the illegal value 0xfe or 0xff */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; p++) { register uint32_t ab, d; c = *p; + length--; + if (c < 128) continue; /* ASCII character */ if (c < 0xc0) /* Isolated 10xx xxxx byte */ @@ -324,9 +326,10 @@ PCRE2_ERROR_UTF16_ERR2 Invalid low surrogate PCRE2_ERROR_UTF16_ERR3 Isolated low surrogate */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; p++) { c = *p; + length--; if ((c & 0xf800) != 0xd800) { @@ -368,7 +371,7 @@ PCRE2_ERROR_UTF32_ERR1 Surrogate character PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff */ -for (p = string; length-- > 0; p++) +for (p = string; length > 0; length--, p++) { c = *p; if ((c & 0xfffff800u) != 0xd800u) diff --git a/src/pcre2test.c b/src/pcre2test.c index 43a056a..540b53e 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -2606,7 +2606,7 @@ if (pbuffer16_size < 2*len + 2) pp = pbuffer16; if (!utf) { - while (len-- > 0) 
*pp++ = *p++; + for (; len > 0; len--) *pp++ = *p++; } else while (len > 0) { @@ -2683,7 +2683,7 @@ if (pbuffer32_size < 4*len + 4) pp = pbuffer32; if (!utf) { - while (len-- > 0) *pp++ = *p++; + for (; len > 0; len--) *pp++ = *p++; } else while (len > 0) { @@ -2723,9 +2723,8 @@ Returns: a possibly changed offset static PCRE2_SIZE backchars(uint8_t *subject, PCRE2_SIZE offset, uint32_t count, BOOL utf) { -long int yield; - -if (!utf || test_mode == PCRE32_MODE) yield = offset - count; +if (!utf || test_mode == PCRE32_MODE) + return (count >= offset)? 0 : (offset - count); else if (test_mode == PCRE8_MODE) { @@ -2735,7 +2734,7 @@ else if (test_mode == PCRE8_MODE) pp--; while ((*pp & 0xc0) == 0x80) pp--; } - yield = pp - (PCRE2_SPTR8)subject; + return pp - (PCRE2_SPTR8)subject; } else /* 16-bit mode */ @@ -2746,10 +2745,8 @@ else /* 16-bit mode */ pp--; if ((*pp & 0xfc00) == 0xdc00) pp--; } - yield = pp - (PCRE2_SPTR16)subject; + return pp - (PCRE2_SPTR16)subject; } - -return (yield >= 0)? yield : 0; } @@ -2936,7 +2933,7 @@ while (top > bot) if (c == 0) { if (len == mlen) return mid; - c = len - mlen; + c = (int)len - (int)mlen; } if (c > 0) bot = mid + 1; else top = mid; } @@ -3712,7 +3709,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (namecount > 0) { fprintf(outfile, "Named capturing subpatterns:\n"); - while (namecount-- > 0) + for (; namecount > 0; namecount--) { int imm2_size = test_mode == PCRE8_MODE ? 2 : 1; uint32_t length = (uint32_t)STRLEN(nametable + imm2_size); @@ -5378,7 +5375,7 @@ if (p[-1] != 0 && !decode_modifiers(p, CTX_DAT, NULL, &dat_datctl)) /* Check for mutually exclusive modifiers. */ c = dat_datctl.control & EXCLUSIVE_DAT_CONTROLS; -if (c - (c & -c) != 0) +if (c != 0 && c != (c & (~c+1))) { show_controls(c, "** Not allowed together:"); fprintf(outfile, "\n");