From 297360920abb3df447d73c056d3fee578b1a353e Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Date: Wed, 16 Oct 2019 11:41:33 +0200
Subject: [PATCH] Keep prefix in string and char literals (#2272)

Keeping the prefix in the token allows cppcheck to print the correct
string and char literals in debug and error messages.

To achieve this, move some of the helper functions from token.cpp to
utils.h so that checks that look at string and char literals can reuse
them. This is a large part of this commit.

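Note that the only user-visible change is that when string and char
literals are printed in error messages, the prefix is now included.
Internally, Token::isLong() is now set only for L-prefixed literals;
u and U literals no longer set it.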

For example:

	int f() {
		return test.substr( 0 , 4 ) == U"Hello" ? 0 : 1 ;
	};

now prints U"Hello" instead of "Hello" in the error message.
---
 lib/checkstring.cpp         |  4 +--
 lib/mathlib.cpp             |  8 ++---
 lib/token.cpp               | 59 +++++++++++--------------------------
 lib/tokenize.cpp            |  7 +++--
 lib/utils.h                 | 50 +++++++++++++++++++++++++++++++
 test/testmathlib.cpp        |  2 ++
 test/testsimplifytokens.cpp | 12 ++++----
 test/teststring.cpp         |  6 ++--
 test/testtoken.cpp          |  5 +++-
 test/testtokenize.cpp       |  8 ++---
 10 files changed, 98 insertions(+), 63 deletions(-)
diff --git a/lib/checkstring.cpp b/lib/checkstring.cpp
index fff8215ed..82066c70f 100644
--- a/lib/checkstring.cpp
+++ b/lib/checkstring.cpp
@@ -327,9 +327,9 @@ void CheckString::incorrectStringCompareError(const Token *tok, const std::strin
 
 void CheckString::incorrectStringBooleanError(const Token *tok, const std::string& string)
 {
-    const bool charLiteral = string[0] == '\'';
+    const bool charLiteral = isCharLiteral(string);
     const std::string literalType = charLiteral ? "char" : "string";
-    const std::string result = (string == "\'\\0\'") ? "false" : "true";
+    const std::string result = getCharLiteral(string) == "\\0" ? "false" : "true";
     reportError(tok,
                 Severity::warning,
                 charLiteral ? "incorrectCharBooleanError" : "incorrectStringBooleanError",
diff --git a/lib/mathlib.cpp b/lib/mathlib.cpp
index 9dc0e9060..be73fa1f7 100644
--- a/lib/mathlib.cpp
+++ b/lib/mathlib.cpp
@@ -531,8 +531,8 @@ MathLib::bigint MathLib::toLongNumber(const std::string & str)
             return static_cast<bigint>(doubleval);
     }
 
-    if (str[0] == '\'' && str.size() >= 3U && endsWith(str,'\'')) {
-        return characterLiteralToLongNumber(str.substr(1,str.size()-2));
+    if (isCharLiteral(str)) {
+        return characterLiteralToLongNumber(getCharLiteral(str));
     }
 
     if (str[0] == '-') {
@@ -600,8 +600,8 @@ static double FloatHexToDoubleNumber(const std::string& str)
 
 double MathLib::toDoubleNumber(const std::string &str)
 {
-    if (str[0] == '\'' && str.size() >= 3U && endsWith(str,'\''))
-        return characterLiteralToLongNumber(str.substr(1,str.size()-2));
+    if (isCharLiteral(str))
+        return characterLiteralToLongNumber(getCharLiteral(str));
     if (isIntHex(str))
         return static_cast<double>(toLongNumber(str));
     // nullcheck
diff --git a/lib/token.cpp b/lib/token.cpp
index 1fddb3ec8..50bf1df4c 100644
--- a/lib/token.cpp
+++ b/lib/token.cpp
@@ -34,22 +34,6 @@
 #include <stack>
 #include <utility>
 
-static const std::string literal_prefix[4] = {"u8", "u", "U", "L"};
-
-static bool isStringCharLiteral(const std::string &str, char q)
-{
-
-    if (!endsWith(str, q))
-        return false;
-    if (str[0] == q && str.length() > 1)
-        return true;
-
-    for (const std::string & p: literal_prefix) {
-        if ((str.length() + 1) > p.length() && (str.compare(0, p.size() + 1, (p + q)) == 0))
-            return true;
-    }
-    return false;
-}
 const std::list<ValueFlow::Value> TokenImpl::mEmptyValueList;
 
 Token::Token(TokensFrontBack *tokensFrontBack) :
@@ -89,9 +73,9 @@ void Token::update_property_info()
     if (!mStr.empty()) {
         if (mStr == "true" || mStr == "false")
             tokType(eBoolean);
-        else if (isStringCharLiteral(mStr, '\"'))
+        else if (isStringLiteral(mStr))
             tokType(eString);
-        else if (isStringCharLiteral(mStr, '\''))
+        else if (isCharLiteral(mStr))
             tokType(eChar);
         else if (std::isalpha((unsigned char)mStr[0]) || mStr[0] == '_' || mStr[0] == '$') { // Name
             if (mImpl->mVarId)
@@ -168,17 +152,11 @@ void Token::update_property_isStandardType()
 
 void Token::update_property_char_string_literal()
 {
-    if (!(mTokType == Token::eString || mTokType == Token::eChar)) // Token has already been updated
+    if (mTokType != Token::eString && mTokType != Token::eChar)
         return;
 
-    for (const std::string & p : literal_prefix) {
-        if (((mTokType == Token::eString) && mStr.compare(0, p.size() + 1, p + "\"") == 0) ||
-            ((mTokType == Token::eChar) && (mStr.compare(0, p.size() +  1, p + "\'") == 0))) {
-            mStr = mStr.substr(p.size());
-            isLong(p != "u8");
-            break;
-        }
-    }
+    isLong(((mTokType == Token::eString) && isPrefixStringCharLiteral(mStr, '"', "L")) ||
+           ((mTokType == Token::eChar) && isPrefixStringCharLiteral(mStr, '\'', "L")));
 }
 
 bool Token::isUpperCaseName() const
@@ -195,7 +173,7 @@ bool Token::isUpperCaseName() const
 void Token::concatStr(std::string const& b)
 {
     mStr.erase(mStr.length() - 1);
-    mStr.append(b.begin() + 1, b.end());
+    mStr.append(getStringLiteral(b) + "\"");
 
     update_property_info();
 }
@@ -203,7 +181,7 @@ void Token::concatStr(std::string const& b)
 std::string Token::strValue() const
 {
     assert(mTokType == eString);
-    std::string ret(mStr.substr(1, mStr.length() - 2));
+    std::string ret(getStringLiteral(mStr));
     std::string::size_type pos = 0U;
     while ((pos = ret.find('\\', pos)) != std::string::npos) {
         ret.erase(pos,1U);
@@ -721,8 +699,9 @@ nonneg int Token::getStrLength(const Token *tok)
     assert(tok->mTokType == eString);
 
     int len = 0;
-    std::string::const_iterator it = tok->str().begin() + 1U;
-    const std::string::const_iterator end = tok->str().end() - 1U;
+    const std::string str(getStringLiteral(tok->str()));
+    std::string::const_iterator it = str.begin();
+    const std::string::const_iterator end = str.end();
 
     while (it != end) {
         if (*it == '\\') {
@@ -747,9 +726,9 @@ nonneg int Token::getStrSize(const Token *tok)
 {
     assert(tok != nullptr);
     assert(tok->tokType() == eString);
-    const std::string &str = tok->str();
+    const std::string str(getStringLiteral(tok->str()));
     int sizeofstring = 1;
-    for (int i = 1; i < (int)str.size() - 1; i++) {
+    for (int i = 0; i < (int)str.size(); i++) {
         if (str[i] == '\\')
             ++i;
         ++sizeofstring;
@@ -760,9 +739,9 @@ nonneg int Token::getStrSize(const Token *tok)
 std::string Token::getCharAt(const Token *tok, MathLib::bigint index)
 {
     assert(tok != nullptr);
-
-    std::string::const_iterator it = tok->str().begin() + 1U;
-    const std::string::const_iterator end = tok->str().end() - 1U;
+    std::string str(getStringLiteral(tok->str()));
+    std::string::const_iterator it = str.begin();
+    const std::string::const_iterator end = str.end();
 
     while (it != end) {
         if (index == 0) {
@@ -1161,9 +1140,7 @@ void Token::stringify(std::ostream& os, bool varid, bool attributes, bool macro)
         if (isComplex())
             os << "_Complex ";
         if (isLong()) {
-            if (mTokType == eString || mTokType == eChar)
-                os << "L";
-            else
+            if (!(mTokType == eString || mTokType == eChar))
                 os << "long ";
         }
     }
@@ -1428,8 +1405,8 @@ static std::string stringFromTokenRange(const Token* start, const Token* end)
     for (const Token *tok = start; tok && tok != end; tok = tok->next()) {
         if (tok->isUnsigned())
             ret << "unsigned ";
-        if (tok->isLong())
-            ret << (tok->isLiteral() ? "L" : "long ");
+        if (tok->isLong() && !tok->isLiteral())
+            ret << "long ";
         if (tok->originalName().empty() || tok->isUnsigned() || tok->isLong()) {
             ret << tok->str();
         } else
diff --git a/lib/tokenize.cpp b/lib/tokenize.cpp
index 4a44c9aa8..b73b40880 100644
--- a/lib/tokenize.cpp
+++ b/lib/tokenize.cpp
@@ -2484,7 +2484,7 @@ void Tokenizer::combineStringAndCharLiterals()
     for (Token *tok = list.front();
          tok;
          tok = tok->next()) {
-        if (tok->str()[0] != '"')
+        if (!isStringLiteral(tok->str()))
             continue;
 
         tok->str(simplifyString(tok->str()));
@@ -10637,8 +10637,11 @@ void Tokenizer::simplifyMicrosoftStringFunctions()
             tok->deleteNext();
             tok->deleteThis();
             tok->deleteNext();
-            if (!ansi)
+            if (!ansi) {
                 tok->isLong(true);
+                if (tok->str()[0] != 'L')
+                    tok->str("L" + tok->str());
+            }
             while (Token::Match(tok->next(), "_T|_TEXT|TEXT ( %char%|%str% )")) {
                 tok->next()->deleteNext();
                 tok->next()->deleteThis();
diff --git a/lib/utils.h b/lib/utils.h
index 62953e42b..15cf940b1 100644
--- a/lib/utils.h
+++ b/lib/utils.h
@@ -36,6 +36,56 @@ inline bool endsWith(const std::string &str, const char end[], std::size_t endle
     return (str.size() >= endlen) && (str.compare(str.size()-endlen, endlen, end)==0);
 }
 
+// True when str is p immediately followed by a q-quoted literal, e.g. p="L", q='"' matches L"abc".
+inline static bool isPrefixStringCharLiteral(const std::string &str, char q, const std::string& p)
+{
+    // Require prefix + opening quote + closing quote, so a lone quote or a bare `L"` is rejected.
+    if (str.length() < p.length() + 2U || !endsWith(str, q))
+        return false;
+    return str.compare(0, p.size() + 1, p + q) == 0;
+}
+
+// True if str is a q-delimited literal, bare or with a u8/u/U/L encoding prefix.
+inline static bool isStringCharLiteral(const std::string &str, char q)
+{
+    // Built once; a braced list of C strings would make a temporary std::string per iteration.
+    static const std::string prefixes[] = {"", "u8", "u", "U", "L"};
+    for (const std::string &prefix : prefixes)
+        if (isPrefixStringCharLiteral(str, q, prefix))
+            return true;
+    return false;
+}
+
+inline static bool isStringLiteral(const std::string &str) // double-quoted literal, bare or u8/u/U/L-prefixed
+{
+    return isStringCharLiteral(str, '"');
+}
+
+inline static bool isCharLiteral(const std::string &str) // single-quoted literal, bare or u8/u/U/L-prefixed
+{
+    return isStringCharLiteral(str, '\'');
+}
+
+inline static std::string getStringCharLiteral(const std::string &str, char q) // text between the quotes; assumes str contains q and ends with the closing quote
+{
+    const std::size_t quotePos = str.find(q); // first q is the opening quote; anything before it is the prefix
+    return str.substr(quotePos + 1U, str.size() - quotePos - 2U); // skip the opening quote, stop before the last character
+}
+
+// Contents of a string literal without its prefix and quotes; empty if str is not a string literal.
+inline static std::string getStringLiteral(const std::string &str)
+{
+    const bool quoted = isStringLiteral(str);
+    return quoted ? getStringCharLiteral(str, '"') : std::string();
+}
+
+// Contents of a char literal without its prefix and quotes; empty if str is not a char literal.
+inline static std::string getCharLiteral(const std::string &str)
+{
+    const bool quoted = isCharLiteral(str);
+    return quoted ? getStringCharLiteral(str, '\'') : std::string();
+}
+
 inline static const char *getOrdinalText(int i)
 {
     if (i == 1)
diff --git a/test/testmathlib.cpp b/test/testmathlib.cpp
index 0acc2449f..3489f6fdd 100644
--- a/test/testmathlib.cpp
+++ b/test/testmathlib.cpp
@@ -286,6 +286,7 @@ private:
         ASSERT_EQUALS((int)('\x10'), MathLib::toLongNumber("'\\x10'"));
         ASSERT_EQUALS((int)('\100'), MathLib::toLongNumber("'\\100'"));
         ASSERT_EQUALS((int)('\200'), MathLib::toLongNumber("'\\200'"));
+        ASSERT_EQUALS((int)(L'A'),   MathLib::toLongNumber("L'A'"));
 #ifdef __GNUC__
         // BEGIN Implementation-specific results
         ASSERT_EQUALS((int)('AB'),    MathLib::toLongNumber("'AB'"));
@@ -375,6 +376,7 @@ private:
         ASSERT_EQUALS_DOUBLE(0.0,    MathLib::toDoubleNumber("-0.0"),     0.000001);
         ASSERT_EQUALS_DOUBLE(0.0,    MathLib::toDoubleNumber("+0.0"),     0.000001);
         ASSERT_EQUALS_DOUBLE('0',    MathLib::toDoubleNumber("'0'"),      0.000001);
+        ASSERT_EQUALS_DOUBLE(L'0',   MathLib::toDoubleNumber("L'0'"),     0.000001);
 
         ASSERT_EQUALS_DOUBLE(192, MathLib::toDoubleNumber("0x0.3p10"), 0.000001);
         ASSERT_EQUALS_DOUBLE(5.42101e-20, MathLib::toDoubleNumber("0x1p-64"), 1e-20);
diff --git a/test/testsimplifytokens.cpp b/test/testsimplifytokens.cpp
index 4d23b0d2b..2cf3747e7 100644
--- a/test/testsimplifytokens.cpp
+++ b/test/testsimplifytokens.cpp
@@ -1804,7 +1804,7 @@ private:
     void combine_wstrings() {
         const char code[] =  "a = L\"hello \"  L\"world\" ;\n";
 
-        const char expected[] =  "a = \"hello world\" ;";
+        const char expected[] =  "a = L\"hello world\" ;";
 
         Tokenizer tokenizer(&settings0, this);
         std::istringstream istr(code);
@@ -1817,33 +1817,33 @@ private:
     void combine_ustrings() {
         const char code[] =  "abcd = u\"ab\" u\"cd\";";
 
-        const char expected[] =  "abcd = \"abcd\" ;";
+        const char expected[] =  "abcd = u\"abcd\" ;";
 
         Tokenizer tokenizer(&settings0, this);
         std::istringstream istr(code);
         tokenizer.tokenize(istr, "test.cpp");
 
         ASSERT_EQUALS(expected, tokenizer.tokens()->stringifyList(nullptr, false));
-        ASSERT_EQUALS(true, tokenizer.tokens()->tokAt(2)->isLong());
+        ASSERT_EQUALS(false, tokenizer.tokens()->tokAt(2)->isLong());
     }
 
     void combine_Ustrings() {
         const char code[] =  "abcd = U\"ab\" U\"cd\";";
 
-        const char expected[] =  "abcd = \"abcd\" ;";
+        const char expected[] =  "abcd = U\"abcd\" ;";
 
         Tokenizer tokenizer(&settings0, this);
         std::istringstream istr(code);
         tokenizer.tokenize(istr, "test.cpp");
 
         ASSERT_EQUALS(expected, tokenizer.tokens()->stringifyList(nullptr, false));
-        ASSERT_EQUALS(true, tokenizer.tokens()->tokAt(2)->isLong());
+        ASSERT_EQUALS(false, tokenizer.tokens()->tokAt(2)->isLong());
     }
 
     void combine_u8strings() {
         const char code[] =  "abcd = u8\"ab\" u8\"cd\";";
 
-        const char expected[] =  "abcd = \"abcd\" ;";
+        const char expected[] =  "abcd = u8\"abcd\" ;";
 
 
         Tokenizer tokenizer(&settings0, this);
diff --git a/test/teststring.cpp b/test/teststring.cpp
index 25664f8f4..4fd831298 100644
--- a/test/teststring.cpp
+++ b/test/teststring.cpp
@@ -603,7 +603,7 @@ private:
         check("int f() {\n"
               "    return test.substr( 0 , 4 ) == L\"Hello\" ? 0 : 1 ;\n"
               "}");
-        ASSERT_EQUALS("[test.cpp:2]: (warning) String literal \"Hello\" doesn't match length argument for substr().\n", errout.str());
+        ASSERT_EQUALS("[test.cpp:2]: (warning) String literal L\"Hello\" doesn't match length argument for substr().\n", errout.str());
 
         check("int f() {\n"
               "    return test.substr( 0 , 5 ) == \"Hello\" ? 0 : 1 ;\n"
@@ -688,7 +688,7 @@ private:
               "  int x = 'd' ? 1 : 2;\n"
               "}");
         ASSERT_EQUALS("[test.cpp:2]: (warning) Conversion of char literal 'a' to bool always evaluates to true.\n"
-                      "[test.cpp:3]: (warning) Conversion of char literal 'b' to bool always evaluates to true.\n"
+                      "[test.cpp:3]: (warning) Conversion of char literal L'b' to bool always evaluates to true.\n"
                       "[test.cpp:4]: (warning) Conversion of char literal 'c' to bool always evaluates to true.\n"
                       "[test.cpp:5]: (warning) Conversion of char literal 'd' to bool always evaluates to true.\n"
                       , errout.str());
@@ -704,7 +704,7 @@ private:
               "  if(L'\\0' || cond){}\n"
               "}");
         ASSERT_EQUALS("[test.cpp:2]: (warning) Conversion of char literal '\\0' to bool always evaluates to false.\n"
-                      "[test.cpp:3]: (warning) Conversion of char literal '\\0' to bool always evaluates to false.\n", errout.str());
+                      "[test.cpp:3]: (warning) Conversion of char literal L'\\0' to bool always evaluates to false.\n", errout.str());
     }
 
     void deadStrcmp() {
diff --git a/test/testtoken.cpp b/test/testtoken.cpp
index 3f11654f6..9c679dfb3 100644
--- a/test/testtoken.cpp
+++ b/test/testtoken.cpp
@@ -887,7 +887,7 @@ private:
         tok.concatStr("123");
 
         ASSERT_EQUALS(false, tok.isBoolean());
-        ASSERT_EQUALS("tru23", tok.str());
+        ASSERT_EQUALS("tru\"", tok.str());
     }
 
     void isNameGuarantees1() const {
@@ -990,6 +990,9 @@ private:
 
         givenACodeSampleToTokenize data4("return L\"a\";");
         ASSERT_EQUALS("returnL\"a\"", data4.tokens()->expressionString());
+
+        givenACodeSampleToTokenize data5("return U\"a\";");
+        ASSERT_EQUALS("returnU\"a\"", data5.tokens()->expressionString());
     }
 
     void hasKnownIntValue() {
diff --git a/test/testtokenize.cpp b/test/testtokenize.cpp
index 7178fcc34..384395ed4 100644
--- a/test/testtokenize.cpp
+++ b/test/testtokenize.cpp
@@ -7222,10 +7222,10 @@ private:
         ASSERT_EQUALS("a\"\"=", testAst("a=\"\""));
         ASSERT_EQUALS("a\'\'=", testAst("a=\'\'"));
         ASSERT_EQUALS("'X''a'>", testAst("('X' > 'a')"));
-        ASSERT_EQUALS("'X''a'>", testAst("(L'X' > L'a')"));
-        ASSERT_EQUALS("'X''a'>", testAst("(u'X' > u'a')"));
-        ASSERT_EQUALS("'X''a'>", testAst("(U'X' > U'a')"));
-        ASSERT_EQUALS("'X''a'>", testAst("(u8'X' > u8'a')"));
+        ASSERT_EQUALS("L'X'L'a'>", testAst("(L'X' > L'a')"));
+        ASSERT_EQUALS("u'X'u'a'>", testAst("(u'X' > u'a')"));
+        ASSERT_EQUALS("U'X'U'a'>", testAst("(U'X' > U'a')"));
+        ASSERT_EQUALS("u8'X'u8'a'>", testAst("(u8'X' > u8'a')"));
 
         ASSERT_EQUALS("a0>bc/d:?", testAst("(a>0) ? (b/(c)) : d;"));
         ASSERT_EQUALS("abc/+d+", testAst("a + (b/(c)) + d;"));