Bump simplecpp (#3255)

2021-05-10 20:46:31 +00:00 · 2021-05-10 20:46:31 +00:00 · c77caf637c
parent 75311fba0f
commit c77caf637c
1 changed files with 50 additions and 5 deletions
--- a/externals/simplecpp/simplecpp.cpp
+++ b/externals/simplecpp/simplecpp.cpp
@ -2334,10 +2334,11 @@ static unsigned long long stringToULLbounded(
 /* Converts character literal (including prefix, but not ud-suffix)
 * to long long value.
 *
- * Assumes ASCII-compatible single-byte encoded str.
+ * Assumes ASCII-compatible single-byte encoded str for narrow literals
+ * and UTF-8 otherwise.
 *
 * For target assumes
- * - UTF-8 execution character set encoding or encoding matching str
+ * - execution character set encoding matching str
 * - UTF-32 execution wide-character set encoding
 * - requirements for __STDC_UTF_16__, __STDC_UTF_32__ and __STDC_ISO_10646__ satisfied
 * - char16_t is 16bit wide
@ -2402,6 +2403,12 @@ long long simplecpp::characterLiteralToLL(const std::string& str)
                throw std::runtime_error("unexpected end of character literal");

            switch (escape) {
+            // obscure GCC extensions
+            case '%':
+            case '(':
+            case '[':
+            case '{':
+            // standard escape sequences
            case '\'':
            case '"':
            case '?':
@ -2431,8 +2438,9 @@ long long simplecpp::characterLiteralToLL(const std::string& str)
                value = static_cast<unsigned char>('\v');
                break;

-            // ESC extension
+            // GCC extension for ESC character
            case 'e':
+            case 'E':
                value = static_cast<unsigned char>('\x1b');
                break;

@ -2476,8 +2484,45 @@ long long simplecpp::characterLiteralToLL(const std::string& str)
        } else {
            value = static_cast<unsigned char>(str[pos++]);

-            if (!narrow && value > 0x7f)
-                throw std::runtime_error("non-ASCII source characters supported only in narrow character literals");
+            if (!narrow && value >= 0x80) {
+                // Assuming this is a UTF-8 encoded code point.
+                // This decoder may not completely validate the input.
+                // Noncharacters are neither rejected nor replaced.
+                
+                int additional_bytes;
+                if (value >= 0xf5)  // higher values would result in code points above 0x10ffff
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+                else if (value >= 0xf0)
+                    additional_bytes = 3;
+                else if (value >= 0xe0)
+                    additional_bytes = 2;
+                else if (value >= 0xc2) // 0xc0 and 0xc1 are always overlong 2-bytes encodings
+                    additional_bytes = 1;
+                else
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+                
+                value &= (1 << (6 - additional_bytes)) - 1;
+
+                while (additional_bytes--) {
+                    if(pos + 1 >= str.size())
+                        throw std::runtime_error("assumed UTF-8 encoded source, but character literal ends unexpectedly");
+                    
+                    unsigned char c = str[pos++];
+                    
+                    if (((c >> 6) != 2)    // ensure c has form 0xb10xxxxxx
+                        || (!value && additional_bytes == 1 && c < 0xa0)    // overlong 3-bytes encoding
+                        || (!value && additional_bytes == 2 && c < 0x90))   // overlong 4-bytes encoding
+                        throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+                    
+                    value = (value << 6) | (c & ((1 << 7) - 1));
+                }
+
+                if (value >= 0xd800 && value <= 0xdfff)
+                    throw std::runtime_error("assumed UTF-8 encoded source, but sequence is invalid");
+                
+                if ((utf8 && value > 0x7f) || (utf16 && value > 0xffff) || value > 0x10ffff)
+                    throw std::runtime_error("code point too large");
+            }
        }

        if (((narrow || utf8) && value > std::numeric_limits<unsigned char>::max()) || (utf16 && value >> 16) || value >> 32)