Preprocessor: read UTF-16 encoded source files.

Preprocessor::read() looks for a UTF-16 byte order mark (0xfeff or 0xfffe)
at the start of the stream and passes it to readChar(). readChar() then
decodes two bytes per character, returns every non-ASCII character as 0xff
and folds "\r\n" into a single '\n'. A first byte that is not part of a
complete BOM is left in the stream and read as ordinary 8-bit data.

The "index" lines below carry the blob ids of the original patch.

diff --git a/lib/preprocessor.cpp b/lib/preprocessor.cpp
index d5eb79785..8af9d50e0 100644
--- a/lib/preprocessor.cpp
+++ b/lib/preprocessor.cpp
@@ -59,15 +59,42 @@ void Preprocessor::writeError(const std::string &fileName, const unsigned int li
                 false));
 }
 
-static unsigned char readChar(std::istream &istr)
+/**
+ * Read one character from the stream. '\r' and "\r\n" are returned as '\n'.
+ * @param istr input stream
+ * @param bom  0 for 8-bit input, or the UTF-16 byte order mark
+ *             (0xfeff / 0xfffe) found at the start of the stream
+ * @return the character read; a UTF-16 character >= 0x80 is returned as 0xff
+ */
+static unsigned char readChar(std::istream &istr, unsigned int bom)
 {
     unsigned char ch = (unsigned char)istr.get();
 
+    // For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the
+    // character is a non-ASCII character then replace it with 0xff
+    if (bom == 0xfeff || bom == 0xfffe) {
+        unsigned char ch2 = (unsigned char)istr.get();
+        int ch16 = (bom == 0xfeff) ? (ch<<8 | ch2) : (ch2<<8 | ch);
+        ch = (unsigned char)((ch16 >= 0x80) ? 0xff : ch16);
+    }
+
     // Handling of newlines..
     if (ch == '\r') {
         ch = '\n';
-        if ((char)istr.peek() == '\n')
+        if (bom == 0 && (char)istr.peek() == '\n')
             (void)istr.get();
+        else if (bom == 0xfeff || bom == 0xfffe) {
+            // Only consume the next UTF-16 character when it is '\n'. At most
+            // one byte is put back; deeper putback is not guaranteed.
+            const int c1 = istr.get();
+            const int c2 = istr.peek();
+            const bool lf = (c1 >= 0) && (c2 >= 0) &&
+                            (((bom == 0xfeff) ? (c1<<8 | c2) : (c2<<8 | c1)) == '\n');
+            if (lf)
+                (void)istr.get();
+            else
+                istr.unget();
+        }
     }
 
     return ch;
@@ -108,6 +135,18 @@ static std::string unify(const std::string &s, char separator)
 /** Just read the code into a string. Perform simple cleanup of the code */
 std::string Preprocessor::read(std::istream &istr, const std::string &filename)
 {
+    // The UTF-16 BOM is 0xfffe or 0xfeff. A lone or repeated 0xfe/0xff byte
+    // is not a BOM, so it is left in the stream and read as ordinary data.
+    unsigned int bom = 0;
+    if (istr.peek() >= 0xfe) {
+        const int first = istr.get();
+        const int second = istr.peek();
+        if (second >= 0xfe && second != first)
+            bom = ((unsigned int)first << 8) | (unsigned int)istr.get();
+        else
+            istr.unget();
+    }
+
     // ------------------------------------------------------------------------------------------
     //
     // handling
@@ -115,7 +154,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
     // on the next , extra newlines will be added
     std::ostringstream code;
     unsigned int newlines = 0;
-    for (unsigned char ch = readChar(istr); istr.good(); ch = readChar(istr)) {
+    for (unsigned char ch = readChar(istr,bom); istr.good(); ch = readChar(istr,bom)) {
         // Replace assorted special chars with spaces..
         if (((ch & 0x80) == 0) && (ch != '\n') && (std::isspace(ch) || std::iscntrl(ch)))
             ch = ' ';
@@ -135,7 +174,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
                 if (chNext != '\n' && chNext != '\r' &&
                     (std::isspace(chNext) || std::iscntrl(chNext))) {
                     // Skip whitespace between and
-                    (void)readChar(istr);
+                    (void)readChar(istr,bom);
                     continue;
                 }
 
@@ -147,7 +186,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
 #endif
             if (chNext == '\n' || chNext == '\r') {
                 ++newlines;
-                (void)readChar(istr); // Skip the ""
+                (void)readChar(istr,bom); // Skip the ""
             } else
                 code << "\\";
         } else {
diff --git a/test/testpreprocessor.cpp b/test/testpreprocessor.cpp
index 40d31a171..d8f1d8e6e 100644
--- a/test/testpreprocessor.cpp
+++ b/test/testpreprocessor.cpp
@@ -62,6 +62,9 @@ private:
         TEST_CASE(readCode1);
         TEST_CASE(readCode2);
 
+        // reading utf-16 file
+        TEST_CASE(utf16);
+
         // The bug that started the whole work with the new preprocessor
         TEST_CASE(Bug2190219);
 
@@ -284,6 +287,59 @@ private:
     }
 
 
+    void utf16() {
+        Settings settings;
+        Preprocessor preprocessor(&settings, this);
+
+        // a => a
+        {
+            const char code[] = { (char)0xff, (char)0xfe, 'a', '\0' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            ASSERT_EQUALS("a", preprocessor.read(istr, "test.c"));
+        }
+
+        {
+            const char code[] = { (char)0xfe, (char)0xff, '\0', 'a' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            ASSERT_EQUALS("a", preprocessor.read(istr, "test.c"));
+        }
+
+        // extended char => 0xff
+        {
+            const char code[] = { (char)0xff, (char)0xfe, 'a', 'a' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            const char expected[] = { (char)0xff, 0 };
+            ASSERT_EQUALS(expected, preprocessor.read(istr, "test.c"));
+        }
+
+        {
+            const char code[] = { (char)0xfe, (char)0xff, 'a', 'a' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            const char expected[] = { (char)0xff, 0 };
+            ASSERT_EQUALS(expected, preprocessor.read(istr, "test.c"));
+        }
+
+        // \r\n => \n
+        {
+            const char code[] = { (char)0xff, (char)0xfe, '\r', '\0', '\n', '\0' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            ASSERT_EQUALS("\n", preprocessor.read(istr, "test.c"));
+        }
+
+        {
+            const char code[] = { (char)0xfe, (char)0xff, '\0', '\r', '\0', '\n' };
+            std::string s(code, sizeof(code));
+            std::istringstream istr(s);
+            ASSERT_EQUALS("\n", preprocessor.read(istr, "test.c"));
+        }
+    }
+
+
     void Bug2190219() {
         const char filedata[] = "int main()\n"
                                 "{\n"