From 6a3dd9a18542467c317bc4b670259e12fc343497 Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn
Date: Sun, 10 Mar 2019 10:38:50 +0100
Subject: [PATCH] Handle concatenated string and char literals

This handles concatenated string and character literals coming from
simplecpp. Previously, L'c' would be preprocessed into the tokens "L"
and "'c'"; cppcheck would then remove the "L" token and mark "'c'" as a
wide character literal. Now the prefix is part of the literal token, so
cppcheck needs to strip the prefix instead. While doing this, also add
handling of UTF-32 encoded literals (U) and UTF-8 encoded literals (u8).
---
 lib/token.cpp               | 21 +++++++++++++++++----
 lib/tokenize.cpp            | 14 ++++++++------
 test/testsimplifytokens.cpp | 33 +++++++++++++++++++++++++++++++--
 test/testtokenize.cpp       |  5 ++++-
 4 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/lib/token.cpp b/lib/token.cpp
index 12366019f..58d002894 100644
--- a/lib/token.cpp
+++ b/lib/token.cpp
@@ -34,6 +34,19 @@
 #include
 #include
 
+static bool isStringCharLiteral(const std::string &str, char q)
+{
+
+    if (!endsWith(str, q))
+        return false;
+
+    const std::string prefix[5] = { "", "u8", "L", "U", "u" };
+    for (const std::string & p: prefix) {
+        if ((str.length() + 1) > p.length() && (str.find(p + q) == 0))
+            return true;
+    }
+    return false;
+}
 const std::list<ValueFlow::Value> TokenImpl::mEmptyValueList;
 
 Token::Token(TokensFrontBack *tokensFrontBack) :
@@ -73,6 +86,10 @@ void Token::update_property_info()
     if (!mStr.empty()) {
         if (mStr == "true" || mStr == "false")
             tokType(eBoolean);
+        else if (isStringCharLiteral(mStr, '\"'))
+            tokType(eString);
+        else if (isStringCharLiteral(mStr, '\''))
+            tokType(eChar);
         else if (std::isalpha((unsigned char)mStr[0]) || mStr[0] == '_' || mStr[0] == '$') { // Name
             if (mImpl->mVarId)
                 tokType(eVariable);
@@ -80,10 +97,6 @@ void Token::update_property_info()
                 tokType(eName);
         } else if (std::isdigit((unsigned char)mStr[0]) || (mStr.length() > 1 && mStr[0] == '-' && std::isdigit((unsigned char)mStr[1])))
             tokType(eNumber);
-        else if (mStr.length() > 1 && mStr[0] == '"' && endsWith(mStr,'"'))
-            tokType(eString);
-        else if (mStr.length() > 1 && mStr[0] == '\'' && endsWith(mStr,'\''))
-            tokType(eChar);
         else if (mStr == "=" || mStr == "<<=" || mStr == ">>=" ||
                  (mStr.size() == 2U && mStr[1] == '=' && std::strchr("+-*/%&^|", mStr[0])))
             tokType(eAssignmentOp);
diff --git a/lib/tokenize.cpp b/lib/tokenize.cpp
index b622eab12..0084c28f4 100644
--- a/lib/tokenize.cpp
+++ b/lib/tokenize.cpp
@@ -1926,13 +1926,15 @@ void Tokenizer::combineOperators()
 
 void Tokenizer::combineStringAndCharLiterals()
 {
-    // Combine wide strings and wide characters
     for (Token *tok = list.front(); tok; tok = tok->next()) {
-        if (Token::Match(tok, "[Lu] %char%|%str%")) {
-            // Combine 'L "string"' and 'L 'c''
-            tok->str(tok->next()->str());
-            tok->deleteNext();
-            tok->isLong(true);
+        const std::string prefix[4] = {"u8", "L", "U", "u"};
+        for (const std::string & p : prefix) {
+            if (((tok->tokType() == Token::eString) && (tok->str().find(p + "\"") == 0)) ||
+                ((tok->tokType() == Token::eChar) && (tok->str().find(p + "\'") == 0))) {
+                tok->str(tok->str().substr(p.size()));
+                tok->isLong(p != "u8");
+                break;
+            }
         }
     }
 
diff --git a/test/testsimplifytokens.cpp b/test/testsimplifytokens.cpp
index df36cbf44..24e1cdaf4 100644
--- a/test/testsimplifytokens.cpp
+++ b/test/testsimplifytokens.cpp
@@ -115,6 +115,8 @@ private:
         TEST_CASE(combine_wstrings);
         TEST_CASE(combine_ustrings);
+        TEST_CASE(combine_Ustrings);
+        TEST_CASE(combine_u8strings);
 
         // Simplify "not" to "!" (#345)
         TEST_CASE(not1);
@@ -507,9 +509,9 @@ private:
     }
 
     void combine_ustrings() {
-        const char code[] = "abc = u\"abc\";";
+        const char code[] = "abcd = u\"ab\" u\"cd\";";
 
-        const char expected[] = "abc = \"abc\" ;";
+        const char expected[] = "abcd = \"abcd\" ;";
 
         Tokenizer tokenizer(&settings0, this);
         std::istringstream istr(code);
@@ -519,6 +521,33 @@ private:
         ASSERT_EQUALS(true, tokenizer.tokens()->tokAt(2)->isLong());
     }
 
+    void combine_Ustrings() {
+        const char code[] = "abcd = U\"ab\" U\"cd\";";
+
+        const char expected[] = "abcd = \"abcd\" ;";
+
+        Tokenizer tokenizer(&settings0, this);
+        std::istringstream istr(code);
+        tokenizer.tokenize(istr, "test.cpp");
+
+        ASSERT_EQUALS(expected, tokenizer.tokens()->stringifyList(0, false));
+        ASSERT_EQUALS(true, tokenizer.tokens()->tokAt(2)->isLong());
+    }
+
+    void combine_u8strings() {
+        const char code[] = "abcd = u8\"ab\" u8\"cd\";";
+
+        const char expected[] = "abcd = \"abcd\" ;";
+
+
+        Tokenizer tokenizer(&settings0, this);
+        std::istringstream istr(code);
+        tokenizer.tokenize(istr, "test.cpp");
+
+        ASSERT_EQUALS(expected, tokenizer.tokens()->stringifyList(0, false));
+        ASSERT_EQUALS(false, tokenizer.tokens()->tokAt(2)->isLong());
+    }
+
     void double_plus() {
         {
             const char code1[] = "void foo( int a )\n"
diff --git a/test/testtokenize.cpp b/test/testtokenize.cpp
index eb3179417..594948c5b 100644
--- a/test/testtokenize.cpp
+++ b/test/testtokenize.cpp
@@ -8210,6 +8210,7 @@ private:
         if (!tokenList.list.createTokens(istr,"test.cpp"))
             return "ERROR";
 
+        tokenList.combineStringAndCharLiterals();
         tokenList.combineOperators();
         tokenList.createLinks();
         tokenList.createLinks2();
@@ -8295,9 +8296,11 @@ private:
         ASSERT_EQUALS("a\"\"=", testAst("a=\"\""));
         ASSERT_EQUALS("a\'\'=", testAst("a=\'\'"));
 
-        ASSERT_EQUALS("'X''a'>", testAst("('X' > 'a')"));
         ASSERT_EQUALS("'X''a'>", testAst("(L'X' > L'a')"));
+        ASSERT_EQUALS("'X''a'>", testAst("(u'X' > u'a')"));
+        ASSERT_EQUALS("'X''a'>", testAst("(U'X' > U'a')"));
+        ASSERT_EQUALS("'X''a'>", testAst("(u8'X' > u8'a')"));
 
         ASSERT_EQUALS("a0>bc/d:?", testAst("(a>0) ? (b/(c)) : d;"));
         ASSERT_EQUALS("abc/+d+", testAst("a + (b/(c)) + d;"));
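
A note on the transformation itself: the rule the patch applies can be shown
outside of cppcheck's Token machinery. The sketch below is only an illustration
of that rule under assumed names (stripEncodingPrefix, StrippedLiteral and the
main() driver are made up here, not cppcheck API); in the patch, the reworked
combineStringAndCharLiterals() applies the same check directly to each string
and char token.

    // Standalone sketch (assumed names, not cppcheck code) of the
    // prefix-stripping rule: try the known encoding prefixes and, if the
    // literal starts with one followed by a quote, drop the prefix and
    // remember whether the literal is still "long" (every prefix except u8).
    #include <iostream>
    #include <string>

    struct StrippedLiteral {
        std::string text; // literal with the encoding prefix removed
        bool isLong;      // true for L/U/u prefixes, false for u8 or none
    };

    static StrippedLiteral stripEncodingPrefix(const std::string &literal)
    {
        const std::string prefixes[4] = {"u8", "L", "U", "u"};
        for (const std::string &p : prefixes) {
            // A prefixed literal starts with the prefix followed by a quote.
            if (literal.compare(0, p.size() + 1, p + "\"") == 0 ||
                literal.compare(0, p.size() + 1, p + "'") == 0)
                return { literal.substr(p.size()), p != "u8" };
        }
        return { literal, false }; // no prefix: plain narrow literal
    }

    int main()
    {
        // Mirrors the new tests: L'c' -> 'c' (long), u8"ab" -> "ab" (not long)
        const std::string samples[4] = {"L'c'", "u8\"ab\"", "U\"cd\"", "'x'"};
        for (const std::string &lit : samples) {
            const StrippedLiteral s = stripEncodingPrefix(lit);
            std::cout << lit << " -> " << s.text
                      << (s.isLong ? "  (long)" : "") << '\n';
        }
    }

As in the diff's tok->isLong(p != "u8"), every prefix except u8 marks the
literal as long, which is why the new combine_u8strings test expects isLong()
to be false while combine_ustrings and combine_Ustrings expect true.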