From f4e69b99d22e8003d11ad37daf856c0f01587347 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Marjam=C3=A4ki?=
Date: Fri, 30 Apr 2021 17:41:59 +0200
Subject: [PATCH] bump simplecpp

---
NOTE(review): this copy of the patch was rebuilt from an extract in which
line breaks were collapsed and every <...> span was stripped. The #include
targets, the template arguments (static_cast, std::numeric_limits, std::map,
std::vector) and all indentation were restored by hand; please verify them
against blobs 111ba4ba2 (simplecpp.cpp) and 80d025d6f (simplecpp.h). The
author address on the From: line could not be recovered.

 externals/simplecpp/simplecpp.cpp | 207 +++++++++++++++++++++++++++++-
 externals/simplecpp/simplecpp.h   |   5 +-
 2 files changed, 209 insertions(+), 3 deletions(-)

diff --git a/externals/simplecpp/simplecpp.cpp b/externals/simplecpp/simplecpp.cpp
index d174cbacb..111ba4ba2 100644
--- a/externals/simplecpp/simplecpp.cpp
+++ b/externals/simplecpp/simplecpp.cpp
@@ -23,6 +23,7 @@
 #include "simplecpp.h"
 
 #include <algorithm>
+#include <climits>
 #include <cstdlib>
 #include <cstring>
 #include <exception>
@@ -2304,6 +2305,208 @@ static void simplifyName(simplecpp::TokenList &expr)
     }
 }
 
+/*
+ * Reads at least minlen and at most maxlen digits (inc. prefix) in base base
+ * from s starting at position pos and converts them to a
+ * unsigned long long value, updating pos to point to the first
+ * unused element of s.
+ * Returns ULLONG_MAX if the result is not representable and
+ * throws if the above requirements were not possible to satisfy.
+ */
+static unsigned long long stringToULLbounded(
+    const std::string& s,
+    std::size_t& pos,
+    int base = 0,
+    std::ptrdiff_t minlen = 1,
+    std::size_t maxlen = std::string::npos
+)
+{
+    std::string sub = s.substr(pos, maxlen);
+    const char* start = sub.c_str();
+    char* end;
+    unsigned long long value = std::strtoull(start, &end, base);
+    pos += end - start;
+    if (end - start < minlen)
+        throw std::runtime_error("expected digit");
+    return value;
+}
+
+/* Converts character literal (including prefix, but not ud-suffix)
+ * to long long value.
+ *
+ * Assumes ASCII-compatible single-byte encoded str.
+ *
+ * For target assumes
+ * - UTF-8 execution character set encoding or encoding matching str
+ * - UTF-32 execution wide-character set encoding
+ * - requirements for __STDC_UTF_16__, __STDC_UTF_32__ and __STDC_ISO_10646__ satisfied
+ * - char16_t is 16bit wide
+ * - char32_t is 32bit wide
+ * - wchar_t is 32bit wide and unsigned
+ * - matching char signedness to host
+ * - matching sizeof(int) to host
+ *
+ * For host assumes
+ * - ASCII-compatible execution character set
+ *
+ * For host and target assumes
+ * - CHAR_BIT == 8
+ * - two's complement
+ *
+ * Implements multi-character narrow literals according to GCC's behavior,
+ * except multi code unit universal character names are not supported.
+ * Multi-character wide literals are not supported.
+ * Limited support of universal character names for non-UTF-8 execution character set encodings.
+ */
+long long simplecpp::characterLiteralToLL(const std::string& str)
+{
+    // default is wide/utf32
+    bool narrow = false;
+    bool utf8 = false;
+    bool utf16 = false;
+
+    std::size_t pos;
+
+    if (str.size() >= 1 && str[0] == '\'') {
+        narrow = true;
+        pos = 1;
+    } else if (str.size() >= 2 && str[0] == 'u' && str[1] == '\'') {
+        utf16 = true;
+        pos = 2;
+    } else if (str.size() >= 3 && str[0] == 'u' && str[1] == '8' && str[2] == '\'') {
+        utf8 = true;
+        pos = 3;
+    } else if (str.size() >= 2 && (str[0] == 'L' || str[0] == 'U') && str[1] == '\'') {
+        pos = 2;
+    } else
+        throw std::runtime_error("expected a character literal");
+
+    unsigned long long multivalue = 0;
+
+    std::size_t nbytes = 0;
+
+    while (pos + 1 < str.size()) {
+        if (str[pos] == '\'' || str[pos] == '\n')
+            throw std::runtime_error("raw single quotes and newlines not allowed in character literals");
+
+        if (nbytes >= 1 && !narrow)
+            throw std::runtime_error("multiple characters only supported in narrow character literals");
+
+        unsigned long long value;
+
+        if (str[pos] == '\\') {
+            pos++;
+            char escape = str[pos++];
+
+            if (pos >= str.size())
+                throw std::runtime_error("unexpected end of character literal");
+
+            switch (escape) {
+            case '\'':
+            case '"':
+            case '?':
+            case '\\':
+                value = static_cast<unsigned char>(escape);
+                break;
+
+            case 'a':
+                value = static_cast<unsigned char>('\a');
+                break;
+            case 'b':
+                value = static_cast<unsigned char>('\b');
+                break;
+            case 'f':
+                value = static_cast<unsigned char>('\f');
+                break;
+            case 'n':
+                value = static_cast<unsigned char>('\n');
+                break;
+            case 'r':
+                value = static_cast<unsigned char>('\r');
+                break;
+            case 't':
+                value = static_cast<unsigned char>('\t');
+                break;
+            case 'v':
+                value = static_cast<unsigned char>('\v');
+                break;
+
+            // ESC extension
+            case 'e':
+                value = static_cast<unsigned char>('\x1b');
+                break;
+
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+                // octal escape sequences consist of 1 to 3 digits
+                value = stringToULLbounded(str, --pos, 8, 1, 3);
+                break;
+
+            case 'x':
+                // hexadecimal escape sequences consist of at least 1 digit
+                value = stringToULLbounded(str, pos, 16);
+                break;
+
+            case 'u':
+            case 'U': {
+                // universal character names have exactly 4 or 8 digits
+                std::size_t ndigits = (escape == 'u' ? 4 : 8);
+                value = stringToULLbounded(str, pos, 16, ndigits, ndigits);
+
+                // UTF-8 encodes code points above 0x7f in multiple code units
+                // code points above 0x10ffff are not allowed
+                if (((narrow || utf8) && value > 0x7f) || (utf16 && value > 0xffff) || value > 0x10ffff)
+                    throw std::runtime_error("code point too large");
+
+                if (value >= 0xd800 && value <= 0xdfff)
+                    throw std::runtime_error("surrogate code points not allowed in universal character names");
+
+                break;
+            }
+
+            default:
+                throw std::runtime_error("invalid escape sequence");
+            }
+        } else {
+            value = static_cast<unsigned char>(str[pos++]);
+
+            if (!narrow && value > 0x7f)
+                throw std::runtime_error("non-ASCII source characters supported only in narrow character literals");
+        }
+
+        if (((narrow || utf8) && value > std::numeric_limits<unsigned char>::max()) || (utf16 && value >> 16) || value >> 32)
+            throw std::runtime_error("numeric escape sequence too large");
+
+        multivalue <<= CHAR_BIT;
+        multivalue |= value;
+        nbytes++;
+    }
+
+    if (pos + 1 != str.size() || str[pos] != '\'')
+        throw std::runtime_error("missing closing quote in character literal");
+
+    if (!nbytes)
+        throw std::runtime_error("empty character literal");
+
+    // ordinary narrow character literal's value is determined by (possibly signed) char
+    if (narrow && nbytes == 1)
+        return static_cast<char>(multivalue);
+
+    // while multi-character literal's value is determined by (signed) int
+    if (narrow)
+        return static_cast<int>(multivalue);
+
+    // All other cases are unsigned. Since long long is at least 64bit wide,
+    // while the literals at most 32bit wide, the conversion preserves all values.
+    return multivalue;
+}
+
 static void simplifyNumbers(simplecpp::TokenList &expr)
 {
     for (simplecpp::Token *tok = expr.front(); tok; tok = tok->next) {
@@ -2311,8 +2514,8 @@ static void simplifyNumbers(simplecpp::TokenList &expr)
             continue;
         if (tok->str().compare(0,2,"0x") == 0)
             tok->setstr(toString(stringToULL(tok->str())));
-        else if (tok->str()[0] == '\'')
-            tok->setstr(toString(tok->str()[1] & 0xffU));
+        else if (!tok->number && tok->str().find('\'') != tok->str().npos)
+            tok->setstr(toString(simplecpp::characterLiteralToLL(tok->str())));
     }
 }
 
diff --git a/externals/simplecpp/simplecpp.h b/externals/simplecpp/simplecpp.h
index 5ca74a179..80d025d6f 100644
--- a/externals/simplecpp/simplecpp.h
+++ b/externals/simplecpp/simplecpp.h
@@ -107,7 +107,8 @@ namespace simplecpp {
         }
 
         void flags() {
-            name = (std::isalpha((unsigned char)string[0]) || string[0] == '_' || string[0] == '$');
+            name = (std::isalpha((unsigned char)string[0]) || string[0] == '_' || string[0] == '$')
+                   && (string.find('\'') == string.npos);
             comment = string.size() > 1U && string[0] == '/' && (string[1] == '/' || string[1] == '*');
             number = std::isdigit((unsigned char)string[0]) || (string.size() > 1U && string[0] == '-' && std::isdigit((unsigned char)string[1]));
             op = (string.size() == 1U) ? string[0] : '\0';
@@ -299,6 +300,8 @@ namespace simplecpp {
         std::string std;
     };
 
+    SIMPLECPP_LIB long long characterLiteralToLL(const std::string& str);
+
     SIMPLECPP_LIB std::map<std::string, TokenList*> load(const TokenList &rawtokens, std::vector<std::string> &filenames, const DUI &dui, OutputList *outputList = NULL);
 
     /**