Improved tokenizer to handle '#' better. Previously, everything after a '#' was combined into a single token;

now '#' is treated more like an alphabetic character, with a few exceptions, e.g. "##" tokens.
This commit is contained in:
Reijo Tomperi 2009-03-15 23:09:27 +02:00
parent a6ad972aad
commit 21b687b301
4 changed files with 73 additions and 101 deletions

View File

@ -177,20 +177,10 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
if (ch < 0) if (ch < 0)
continue; continue;
if (ch == '\n')
{
// Add current token..
addtoken(CurrentToken.c_str(), lineno++, FileIndex);
CurrentToken.clear();
continue;
}
// char/string.. // char/string..
if (ch == '\'' || ch == '\"') if (ch == '\'' || ch == '\"')
{ {
// Add previous token std::string line;
addtoken(CurrentToken.c_str(), lineno, FileIndex);
CurrentToken.clear();
// read char // read char
bool special = false; bool special = false;
@ -198,7 +188,7 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
do do
{ {
// Append token.. // Append token..
CurrentToken += c; line += c;
if (c == '\n') if (c == '\n')
++lineno; ++lineno;
@ -213,60 +203,13 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
c = (char)code.get(); c = (char)code.get();
} }
while (code.good() && (special || c != ch)); while (code.good() && (special || c != ch));
CurrentToken += ch; line += ch;
// Add token and start on next.. // Handle #file "file.h"
addtoken(CurrentToken.c_str(), lineno, FileIndex); if (CurrentToken == "#file")
CurrentToken.clear();
continue;
}
if (ch == '#' && CurrentToken.empty())
{
// If previous token was "#" then append this to create a "##" token
if (Token::simpleMatch(_tokensBack, "#"))
{
_tokensBack->str("##");
continue;
}
std::string line("#");
{
char chPrev = '#';
bool skip = false;
while (code.good())
{
ch = (char)code.get();
if (chPrev != '\\' && ch == '\n')
break;
if (chPrev == '\\')
line += chPrev;
if (chPrev == '#' && ch == '#')
{
addtoken("##", lineno, FileIndex);
skip = true;
break;
}
if (ch != ' ')
chPrev = ch;
if (ch != '\\' && ch != '\n')
{
line += ch;
}
if (ch == '\n')
++lineno;
}
if (skip)
continue;
}
if (strncmp(line.c_str(), "#file", 5) == 0 &&
line.find("\"") != std::string::npos)
{ {
// Extract the filename // Extract the filename
line.erase(0, line.find("\"") + 1); line = line.substr(1, line.length() - 2);
if (line.find("\"") != std::string::npos)
line.erase(line.find("\""));
// Has this file been tokenized already? // Has this file been tokenized already?
++lineno; ++lineno;
@ -290,33 +233,23 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
} }
lineNumbers.push_back(lineno); lineNumbers.push_back(lineno);
lineno = 1; lineno = 0;
continue;
} }
else if (strncmp(line.c_str(), "#endfile", 8) == 0)
{
if (lineNumbers.empty() || fileIndexes.empty())
{
std::cerr << "####### Preprocessor bug! #######\n";
std::exit(0);
}
lineno = lineNumbers.back();
lineNumbers.pop_back();
FileIndex = fileIndexes.back();
fileIndexes.pop_back();
continue;
}
else else
{ {
// Add previous token
addtoken(CurrentToken.c_str(), lineno, FileIndex);
// Add content of the string
addtoken(line.c_str(), lineno, FileIndex); addtoken(line.c_str(), lineno, FileIndex);
} }
CurrentToken.clear();
continue;
} }
if (strchr("#+-*/%&|^?!=<>[](){};:,.~", ch)) if (strchr("+-*/%&|^?!=<>[](){};:,.~\n ", ch))
{ {
if (ch == '.' && if (ch == '.' &&
CurrentToken.length() > 0 && CurrentToken.length() > 0 &&
@ -333,8 +266,55 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
} }
else else
{ {
addtoken(CurrentToken.c_str(), lineno, FileIndex); if (CurrentToken == "#file")
{
// Handle this where strings are handled
continue;
}
else if (CurrentToken == "#endfile")
{
if (lineNumbers.empty() || fileIndexes.empty())
{
std::cerr << "####### Preprocessor bug! #######\n";
std::exit(0);
}
lineno = lineNumbers.back();
lineNumbers.pop_back();
FileIndex = fileIndexes.back();
fileIndexes.pop_back();
CurrentToken.clear();
continue;
}
// If token contains # characters, split it up
std::string temp;
for (std::string::size_type i = 0; i < CurrentToken.length(); ++i)
{
if (CurrentToken[i] == '#' && CurrentToken.length() + 1 > i && CurrentToken[i+1] == '#')
{
addtoken(temp.c_str(), lineno, FileIndex);
temp.clear();
addtoken("##", lineno, FileIndex);
++i;
}
else
temp += CurrentToken[i];
}
addtoken(temp.c_str(), lineno, FileIndex);
CurrentToken.clear(); CurrentToken.clear();
if (ch == '\n')
{
++lineno;
continue;
}
else if (ch == ' ')
{
continue;
}
CurrentToken += ch; CurrentToken += ch;
// Add "++", "--" or ">>" token // Add "++", "--" or ">>" token
if ((ch == '+' || ch == '-' || ch == '>') && (code.peek() == ch)) if ((ch == '+' || ch == '-' || ch == '>') && (code.peek() == ch))
@ -345,14 +325,6 @@ void Tokenizer::tokenize(std::istream &code, const char FileName[])
} }
} }
if (std::isspace(ch) || std::iscntrl(ch))
{
addtoken(CurrentToken.c_str(), lineno, FileIndex);
CurrentToken.clear();
continue;
}
CurrentToken += ch; CurrentToken += ch;
} }
addtoken(CurrentToken.c_str(), lineno, FileIndex); addtoken(CurrentToken.c_str(), lineno, FileIndex);

View File

@ -1522,8 +1522,8 @@ private:
check("class A\n" check("class A\n"
"{\n" "{\n"
"public:\n" "public:\n"
" int * p;\n" " int * p;\n"
" A() { p = new int; }\n" " A() { p = new int; }\n"
"};\n", true); "};\n", true);
ASSERT_EQUALS("[test.cpp:4]: (all) Memory leak: A::p\n", errout.str()); ASSERT_EQUALS("[test.cpp:4]: (all) Memory leak: A::p\n", errout.str());
} }
@ -1533,8 +1533,8 @@ private:
check("class A\n" check("class A\n"
"{\n" "{\n"
"public:\n" "public:\n"
" int * p;\n" " int * p;\n"
" A();\n" " A();\n"
"};\n" "};\n"
"A::A() : p(new int[10])\n" "A::A() : p(new int[10])\n"
"{ }", true); "{ }", true);

View File

@ -107,7 +107,7 @@ private:
TEST_CASE(multi_character_character); TEST_CASE(multi_character_character);
TEST_CASE(stringify); TEST_CASE(stringify);
// TODO TEST_CASE(stringify2); TEST_CASE(stringify2);
TEST_CASE(ifdefwithfile); TEST_CASE(ifdefwithfile);
TEST_CASE(pragma); TEST_CASE(pragma);
} }

View File

@ -912,10 +912,10 @@ private:
{ {
const std::string code("void func()\n" const std::string code("void func()\n"
"{\n" "{\n"
" char a[256] = \"test\";\n" "char a[256] = \"test\";\n"
" {\n" "{\n"
" char b[256] = \"test\";\n" "char b[256] = \"test\";\n"
" }\n" "}\n"
"}\n"); "}\n");
// tokenize.. // tokenize..