Only accept well-formed UTF-8 sequences

Enable tests that were previously disabled, adjust one test,
and add more tests.
This commit is contained in:
Behdad Esfahbod 2014-07-11 16:10:58 -04:00
parent 7323d385cc
commit af2490c095
2 changed files with 66 additions and 32 deletions

View File

@ -1,5 +1,5 @@
/* /*
* Copyright © 2011,2012 Google, Inc. * Copyright © 2011,2012,2014 Google, Inc.
* *
* This is part of HarfBuzz, a text shaping library. * This is part of HarfBuzz, a text shaping library.
* *
@ -32,44 +32,75 @@
/* UTF-8 */ /* UTF-8 */
#define HB_UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) { Len = 1; Mask = 0x7f; } \
else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
else Len = 0;
static inline const uint8_t * static inline const uint8_t *
hb_utf_next (const uint8_t *text, hb_utf_next (const uint8_t *text,
const uint8_t *end, const uint8_t *end,
hb_codepoint_t *unicode) hb_codepoint_t *unicode)
{ {
hb_codepoint_t c = *text, mask; /* Written to only accept well-formed sequences.
unsigned int len; * Based on ideas from ICU's U8_NEXT.
* Generates a -1 for each ill-formed byte. */
/* TODO check for overlong sequences? */ hb_codepoint_t c = *text++;
HB_UTF8_COMPUTE (c, mask, len); if (c > 0x7Fu)
if (unlikely (!len || (unsigned int) (end - text) < len)) {
*unicode = -1;
return text + 1;
} else {
hb_codepoint_t result;
unsigned int i;
result = c & mask;
for (i = 1; i < len; i++)
{ {
if (unlikely ((text[i] & 0xc0) != 0x80)) if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
{ {
unsigned int t1;
if (likely (text < end &&
(t1 = text[0] - 0x80u) <= 0x3Fu))
{
c = ((c&0x1Fu)<<6) | t1;
text++;
}
else
goto error;
}
else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
{
unsigned int t1, t2;
if (likely (1 < end - text &&
(t1 = text[0] - 0x80u) <= 0x3Fu &&
(t2 = text[1] - 0x80u) <= 0x3Fu &&
(hb_in_range (c, 0xE1u, 0xECu) ||
hb_in_range (c, 0xEEu, 0xEFu) ||
(c == 0xE0u && t1 >= 0xA0u-0x80u) ||
(c == 0xEDu && t1 <= 0x9Fu-0x80u))))
{
c = ((c&0xFu)<<12) | (t1<<6) | t2;
text += 2;
}
else
goto error;
}
else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
{
unsigned int t1, t2, t3;
if (likely (2 < end - text &&
(t1 = text[0] - 0x80u) <= 0x3Fu &&
(t2 = text[1] - 0x80u) <= 0x3Fu &&
(t3 = text[2] - 0x80u) <= 0x3Fu &&
(hb_in_range (c, 0xF1u, 0xF3u) ||
(c == 0xF0u && t1 >= 0x90u-0x80u) ||
(c == 0xF4u && t1 <= 0x8Fu-0x80u))))
{
c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
text += 3;
}
else
goto error;
}
else
goto error;
}
*unicode = c;
return text;
error:
*unicode = -1; *unicode = -1;
return text + 1; return text;
}
result <<= 6;
result |= (text[i] & 0x3f);
}
*unicode = result;
return text + len;
}
} }
static inline const uint8_t * static inline const uint8_t *

View File

@ -449,11 +449,15 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
{ "\x7f", -1, 1, TRUE }, { "\x7f", -1, 1, TRUE },
{ "\xdf\xbf", -1, 2, TRUE }, { "\xdf\xbf", -1, 2, TRUE },
{ "\xef\xbf\xbf", -1, 0, TRUE }, { "\xef\xbf\xbf", -1, 0, TRUE },
{ "\xf7\xbf\xbf\xbf", -1, 0, TRUE }, { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
{ "\xf4\x90\xbf\xbf", -1, 0, FALSE },
{ "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
{ "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE }, { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
{ "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE }, { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
/* other boundary conditions */ /* other boundary conditions */
{ "\xed\x9f\xbf", -1, 3, TRUE }, { "\xed\x9f\xbf", -1, 3, TRUE },
{ "\xed\xa0\x80", -1, 0, FALSE },
{ "\xed\xbf\xbf", -1, 0, FALSE },
{ "\xee\x80\x80", -1, 3, TRUE }, { "\xee\x80\x80", -1, 3, TRUE },
{ "\xef\xbf\xbd", -1, 3, TRUE }, { "\xef\xbf\xbd", -1, 3, TRUE },
{ "\xf4\x8f\xbf\xbf", -1, 0, TRUE }, { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
@ -610,8 +614,6 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
/* impossible bytes */ /* impossible bytes */
{ "\x20\xfe\x20", -1, 1, FALSE }, { "\x20\xfe\x20", -1, 1, FALSE },
{ "\x20\xff\x20", -1, 1, FALSE }, { "\x20\xff\x20", -1, 1, FALSE },
#if 0
/* XXX fix these, or document that we don't detect them? */
/* overlong sequences */ /* overlong sequences */
{ "\x20\xc0\xaf\x20", -1, 1, FALSE }, { "\x20\xc0\xaf\x20", -1, 1, FALSE },
{ "\x20\xe0\x80\xaf\x20", -1, 1, FALSE }, { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
@ -644,6 +646,7 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
{ "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE }, { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
{ "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
{ "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
#if 0 /* We don't consider U+FFFE / U+FFFF and similar invalid. */
{ "\x20\xef\xbf\xbe\x20", -1, 1, FALSE }, { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
{ "\x20\xef\xbf\xbf\x20", -1, 1, FALSE }, { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
#endif #endif