Handle UTF-16 files. Partial fix for ticket #2083

This commit is contained in:
Daniel Marjamäki 2012-06-10 11:00:27 +02:00
parent 905615e991
commit 171f570639
2 changed files with 86 additions and 5 deletions

View File

@ -59,15 +59,32 @@ void Preprocessor::writeError(const std::string &fileName, const unsigned int li
false)); false));
} }
static unsigned char readChar(std::istream &istr) static unsigned char readChar(std::istream &istr, unsigned int bom)
{ {
unsigned char ch = (unsigned char)istr.get(); unsigned char ch = (unsigned char)istr.get();
// For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the
// character is a non-ASCII character then replace it with 0xff
if (bom == 0xfeff || bom == 0xfffe) {
unsigned char ch2 = (unsigned char)istr.get();
int ch16 = (bom == 0xfeff) ? (ch<<8 | ch2) : (ch2<<8 | ch);
ch = (unsigned char)((ch16 >= 0x80) ? 0xff : ch16);
}
// Handling of newlines.. // Handling of newlines..
if (ch == '\r') { if (ch == '\r') {
ch = '\n'; ch = '\n';
if ((char)istr.peek() == '\n') if (bom == 0 && (char)istr.peek() == '\n')
(void)istr.get(); (void)istr.get();
else if (bom == 0xfeff || bom == 0xfffe) {
int c1 = istr.get();
int c2 = istr.get();
int ch16 = (bom == 0xfeff) ? (c1<<8 | c2) : (c2<<8 | c1);
if (ch16 != '\n') {
istr.unget();
istr.unget();
}
}
} }
return ch; return ch;
@ -108,6 +125,14 @@ static std::string unify(const std::string &s, char separator)
/** Just read the code into a string. Perform simple cleanup of the code */ /** Just read the code into a string. Perform simple cleanup of the code */
std::string Preprocessor::read(std::istream &istr, const std::string &filename) std::string Preprocessor::read(std::istream &istr, const std::string &filename)
{ {
// The UTF-16 BOM is 0xfffe or 0xfeff.
unsigned int bom = 0;
if (istr.peek() >= 0xfe) {
bom = (istr.get() << 8);
if (istr.peek() >= 0xfe)
bom |= istr.get();
}
// ------------------------------------------------------------------------------------------ // ------------------------------------------------------------------------------------------
// //
// handling <backslash><newline> // handling <backslash><newline>
@ -115,7 +140,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
// on the next <newline>, extra newlines will be added // on the next <newline>, extra newlines will be added
std::ostringstream code; std::ostringstream code;
unsigned int newlines = 0; unsigned int newlines = 0;
for (unsigned char ch = readChar(istr); istr.good(); ch = readChar(istr)) { for (unsigned char ch = readChar(istr,bom); istr.good(); ch = readChar(istr,bom)) {
// Replace assorted special chars with spaces.. // Replace assorted special chars with spaces..
if (((ch & 0x80) == 0) && (ch != '\n') && (std::isspace(ch) || std::iscntrl(ch))) if (((ch & 0x80) == 0) && (ch != '\n') && (std::isspace(ch) || std::iscntrl(ch)))
ch = ' '; ch = ' ';
@ -135,7 +160,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
if (chNext != '\n' && chNext != '\r' && if (chNext != '\n' && chNext != '\r' &&
(std::isspace(chNext) || std::iscntrl(chNext))) { (std::isspace(chNext) || std::iscntrl(chNext))) {
// Skip whitespace between <backslash> and <newline> // Skip whitespace between <backslash> and <newline>
(void)readChar(istr); (void)readChar(istr,bom);
continue; continue;
} }
@ -147,7 +172,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
#endif #endif
if (chNext == '\n' || chNext == '\r') { if (chNext == '\n' || chNext == '\r') {
++newlines; ++newlines;
(void)readChar(istr); // Skip the "<backslash><newline>" (void)readChar(istr,bom); // Skip the "<backslash><newline>"
} else } else
code << "\\"; code << "\\";
} else { } else {

View File

@ -62,6 +62,9 @@ private:
TEST_CASE(readCode1); TEST_CASE(readCode1);
TEST_CASE(readCode2); TEST_CASE(readCode2);
// reading utf-16 file
TEST_CASE(utf16);
// The bug that started the whole work with the new preprocessor // The bug that started the whole work with the new preprocessor
TEST_CASE(Bug2190219); TEST_CASE(Bug2190219);
@ -284,6 +287,59 @@ private:
} }
void utf16() {
    // Checks Preprocessor::read() on UTF-16 input that starts with a
    // byte order mark. Each scenario is exercised twice: once with the
    // BOM bytes 0xff 0xfe (low byte of every 16-bit unit first) and once
    // with 0xfe 0xff (high byte first).
    Settings settings;
    Preprocessor preprocessor(&settings, this);

    // A 16-bit unit holding plain 'a' is returned as "a"
    {
        const char bytes[] = { static_cast<char>(0xff), static_cast<char>(0xfe), 'a', '\0' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        ASSERT_EQUALS("a", preprocessor.read(input, "test.c"));
    }

    {
        const char bytes[] = { static_cast<char>(0xfe), static_cast<char>(0xff), '\0', 'a' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        ASSERT_EQUALS("a", preprocessor.read(input, "test.c"));
    }

    // A 16-bit unit outside the ASCII range (here 0x6161) is returned
    // as the single byte 0xff
    {
        const char bytes[] = { static_cast<char>(0xff), static_cast<char>(0xfe), 'a', 'a' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        const char placeholder[] = { static_cast<char>(0xff), 0 };
        ASSERT_EQUALS(placeholder, preprocessor.read(input, "test.c"));
    }

    {
        const char bytes[] = { static_cast<char>(0xfe), static_cast<char>(0xff), 'a', 'a' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        const char placeholder[] = { static_cast<char>(0xff), 0 };
        ASSERT_EQUALS(placeholder, preprocessor.read(input, "test.c"));
    }

    // A carriage return followed by a line feed is returned as one "\n"
    {
        const char bytes[] = { static_cast<char>(0xff), static_cast<char>(0xfe), '\r', '\0', '\n', '\0' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        ASSERT_EQUALS("\n", preprocessor.read(input, "test.c"));
    }

    {
        const char bytes[] = { static_cast<char>(0xfe), static_cast<char>(0xff), '\0', '\r', '\0', '\n' };
        const std::string raw(bytes, bytes + sizeof(bytes));
        std::istringstream input(raw);
        ASSERT_EQUALS("\n", preprocessor.read(input, "test.c"));
    }
}
void Bug2190219() { void Bug2190219() {
const char filedata[] = "int main()\n" const char filedata[] = "int main()\n"
"{\n" "{\n"