Handle UTF-16 files. Partial fix for ticket #2083
This commit is contained in:
parent
905615e991
commit
171f570639
|
@ -59,15 +59,32 @@ void Preprocessor::writeError(const std::string &fileName, const unsigned int li
|
|||
false));
|
||||
}
|
||||
|
||||
static unsigned char readChar(std::istream &istr)
|
||||
static unsigned char readChar(std::istream &istr, unsigned int bom)
|
||||
{
|
||||
unsigned char ch = (unsigned char)istr.get();
|
||||
|
||||
// For UTF-16 encoded files the BOM is 0xfeff/0xfffe. If the
|
||||
// character is non-ASCII, replace it with 0xff
|
||||
if (bom == 0xfeff || bom == 0xfffe) {
|
||||
unsigned char ch2 = (unsigned char)istr.get();
|
||||
int ch16 = (bom == 0xfeff) ? (ch<<8 | ch2) : (ch2<<8 | ch);
|
||||
ch = (unsigned char)((ch16 >= 0x80) ? 0xff : ch16);
|
||||
}
|
||||
|
||||
// Handling of newlines..
|
||||
if (ch == '\r') {
|
||||
ch = '\n';
|
||||
if ((char)istr.peek() == '\n')
|
||||
if (bom == 0 && (char)istr.peek() == '\n')
|
||||
(void)istr.get();
|
||||
else if (bom == 0xfeff || bom == 0xfffe) {
|
||||
int c1 = istr.get();
|
||||
int c2 = istr.get();
|
||||
int ch16 = (bom == 0xfeff) ? (c1<<8 | c2) : (c2<<8 | c1);
|
||||
if (ch16 != '\n') {
|
||||
istr.unget();
|
||||
istr.unget();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ch;
|
||||
|
@ -108,6 +125,14 @@ static std::string unify(const std::string &s, char separator)
|
|||
/** Just read the code into a string. Perform simple cleanup of the code */
|
||||
std::string Preprocessor::read(std::istream &istr, const std::string &filename)
|
||||
{
|
||||
// The UTF-16 BOM is 0xfffe or 0xfeff.
|
||||
unsigned int bom = 0;
|
||||
if (istr.peek() >= 0xfe) {
|
||||
bom = (istr.get() << 8);
|
||||
if (istr.peek() >= 0xfe)
|
||||
bom |= istr.get();
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------------
|
||||
//
|
||||
// handling <backslash><newline>
|
||||
|
@ -115,7 +140,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
|
|||
// on the next <newline>, extra newlines will be added
|
||||
std::ostringstream code;
|
||||
unsigned int newlines = 0;
|
||||
for (unsigned char ch = readChar(istr); istr.good(); ch = readChar(istr)) {
|
||||
for (unsigned char ch = readChar(istr,bom); istr.good(); ch = readChar(istr,bom)) {
|
||||
// Replace assorted special chars with spaces..
|
||||
if (((ch & 0x80) == 0) && (ch != '\n') && (std::isspace(ch) || std::iscntrl(ch)))
|
||||
ch = ' ';
|
||||
|
@ -135,7 +160,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
|
|||
if (chNext != '\n' && chNext != '\r' &&
|
||||
(std::isspace(chNext) || std::iscntrl(chNext))) {
|
||||
// Skip whitespace between <backslash> and <newline>
|
||||
(void)readChar(istr);
|
||||
(void)readChar(istr,bom);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -147,7 +172,7 @@ std::string Preprocessor::read(std::istream &istr, const std::string &filename)
|
|||
#endif
|
||||
if (chNext == '\n' || chNext == '\r') {
|
||||
++newlines;
|
||||
(void)readChar(istr); // Skip the "<backslash><newline>"
|
||||
(void)readChar(istr,bom); // Skip the "<backslash><newline>"
|
||||
} else
|
||||
code << "\\";
|
||||
} else {
|
||||
|
|
|
@ -62,6 +62,9 @@ private:
|
|||
TEST_CASE(readCode1);
|
||||
TEST_CASE(readCode2);
|
||||
|
||||
// reading utf-16 file
|
||||
TEST_CASE(utf16);
|
||||
|
||||
// The bug that started the whole work with the new preprocessor
|
||||
TEST_CASE(Bug2190219);
|
||||
|
||||
|
@ -284,6 +287,59 @@ private:
|
|||
}
|
||||
|
||||
|
||||
void utf16() {
|
||||
Settings settings;
|
||||
Preprocessor preprocessor(&settings, this);
|
||||
|
||||
// a => a
|
||||
{
|
||||
const char code[] = { (char)0xff, (char)0xfe, 'a', '\0' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
ASSERT_EQUALS("a", preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
|
||||
{
|
||||
const char code[] = { (char)0xfe, (char)0xff, '\0', 'a' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
ASSERT_EQUALS("a", preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
|
||||
// extended char => 0xff
|
||||
{
|
||||
const char code[] = { (char)0xff, (char)0xfe, 'a', 'a' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
const char expected[] = { (char)0xff, 0 };
|
||||
ASSERT_EQUALS(expected, preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
|
||||
{
|
||||
const char code[] = { (char)0xfe, (char)0xff, 'a', 'a' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
const char expected[] = { (char)0xff, 0 };
|
||||
ASSERT_EQUALS(expected, preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
|
||||
// \r\n => \n
|
||||
{
|
||||
const char code[] = { (char)0xff, (char)0xfe, '\r', '\0', '\n', '\0' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
ASSERT_EQUALS("\n", preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
|
||||
{
|
||||
const char code[] = { (char)0xfe, (char)0xff, '\0', '\r', '\0', '\n' };
|
||||
std::string s(code, sizeof(code));
|
||||
std::istringstream istr(s);
|
||||
ASSERT_EQUALS("\n", preprocessor.read(istr, "test.c"));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Bug2190219() {
|
||||
const char filedata[] = "int main()\n"
|
||||
"{\n"
|
||||
|
|
Loading…
Reference in New Issue