Only accept well-formed UTF-8 sequences
Enable tests that were previously disabled, adjust one test, and add more tests.
This commit is contained in:
parent
7323d385cc
commit
af2490c095
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright © 2011,2012 Google, Inc.
|
||||
* Copyright © 2011,2012,2014 Google, Inc.
|
||||
*
|
||||
* This is part of HarfBuzz, a text shaping library.
|
||||
*
|
||||
|
@ -32,44 +32,75 @@
|
|||
|
||||
/* UTF-8 */
|
||||
|
||||
#define HB_UTF8_COMPUTE(Char, Mask, Len) \
|
||||
if (Char < 128) { Len = 1; Mask = 0x7f; } \
|
||||
else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
|
||||
else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
|
||||
else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
|
||||
else Len = 0;
|
||||
|
||||
static inline const uint8_t *
|
||||
hb_utf_next (const uint8_t *text,
|
||||
const uint8_t *end,
|
||||
hb_codepoint_t *unicode)
|
||||
{
|
||||
hb_codepoint_t c = *text, mask;
|
||||
unsigned int len;
|
||||
/* Written to only accept well-formed sequences.
|
||||
* Based on ideas from ICU's U8_NEXT.
|
||||
* Generates a -1 for each ill-formed byte. */
|
||||
|
||||
/* TODO check for overlong sequences? */
|
||||
hb_codepoint_t c = *text++;
|
||||
|
||||
HB_UTF8_COMPUTE (c, mask, len);
|
||||
if (unlikely (!len || (unsigned int) (end - text) < len)) {
|
||||
*unicode = -1;
|
||||
return text + 1;
|
||||
} else {
|
||||
hb_codepoint_t result;
|
||||
unsigned int i;
|
||||
result = c & mask;
|
||||
for (i = 1; i < len; i++)
|
||||
if (c > 0x7Fu)
|
||||
{
|
||||
if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
|
||||
{
|
||||
unsigned int t1;
|
||||
if (likely (text < end &&
|
||||
(t1 = text[0] - 0x80u) <= 0x3Fu))
|
||||
{
|
||||
if (unlikely ((text[i] & 0xc0) != 0x80))
|
||||
{
|
||||
*unicode = -1;
|
||||
return text + 1;
|
||||
}
|
||||
result <<= 6;
|
||||
result |= (text[i] & 0x3f);
|
||||
c = ((c&0x1Fu)<<6) | t1;
|
||||
text++;
|
||||
}
|
||||
*unicode = result;
|
||||
return text + len;
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
|
||||
{
|
||||
unsigned int t1, t2;
|
||||
if (likely (1 < end - text &&
|
||||
(t1 = text[0] - 0x80u) <= 0x3Fu &&
|
||||
(t2 = text[1] - 0x80u) <= 0x3Fu &&
|
||||
(hb_in_range (c, 0xE1u, 0xECu) ||
|
||||
hb_in_range (c, 0xEEu, 0xEFu) ||
|
||||
(c == 0xE0u && t1 >= 0xA0u-0x80u) ||
|
||||
(c == 0xEDu && t1 <= 0x9Fu-0x80u))))
|
||||
{
|
||||
c = ((c&0xFu)<<12) | (t1<<6) | t2;
|
||||
text += 2;
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
|
||||
{
|
||||
unsigned int t1, t2, t3;
|
||||
if (likely (2 < end - text &&
|
||||
(t1 = text[0] - 0x80u) <= 0x3Fu &&
|
||||
(t2 = text[1] - 0x80u) <= 0x3Fu &&
|
||||
(t3 = text[2] - 0x80u) <= 0x3Fu &&
|
||||
(hb_in_range (c, 0xF1u, 0xF3u) ||
|
||||
(c == 0xF0u && t1 >= 0x90u-0x80u) ||
|
||||
(c == 0xF4u && t1 <= 0x8Fu-0x80u))))
|
||||
{
|
||||
c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
|
||||
text += 3;
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
else
|
||||
goto error;
|
||||
}
|
||||
|
||||
*unicode = c;
|
||||
return text;
|
||||
|
||||
error:
|
||||
*unicode = -1;
|
||||
return text;
|
||||
}
|
||||
|
||||
static inline const uint8_t *
|
||||
|
|
|
@ -449,11 +449,15 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
|
|||
{ "\x7f", -1, 1, TRUE },
|
||||
{ "\xdf\xbf", -1, 2, TRUE },
|
||||
{ "\xef\xbf\xbf", -1, 0, TRUE },
|
||||
{ "\xf7\xbf\xbf\xbf", -1, 0, TRUE },
|
||||
{ "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
|
||||
{ "\xf4\x90\xbf\xbf", -1, 0, FALSE },
|
||||
{ "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
|
||||
{ "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
|
||||
{ "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
|
||||
/* other boundary conditions */
|
||||
{ "\xed\x9f\xbf", -1, 3, TRUE },
|
||||
{ "\xed\xa0\x80", -1, 0, FALSE },
|
||||
{ "\xed\xbf\xbf", -1, 0, FALSE },
|
||||
{ "\xee\x80\x80", -1, 3, TRUE },
|
||||
{ "\xef\xbf\xbd", -1, 3, TRUE },
|
||||
{ "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
|
||||
|
@ -610,8 +614,6 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
|
|||
/* impossible bytes */
|
||||
{ "\x20\xfe\x20", -1, 1, FALSE },
|
||||
{ "\x20\xff\x20", -1, 1, FALSE },
|
||||
#if 0
|
||||
/* XXX fix these, or document that we don't detect them? */
|
||||
/* overlong sequences */
|
||||
{ "\x20\xc0\xaf\x20", -1, 1, FALSE },
|
||||
{ "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
|
||||
|
@ -644,6 +646,7 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
|
|||
{ "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
|
||||
{ "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
|
||||
{ "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
|
||||
#if 0 /* We don't consider U+FFFE / U+FFFF and similar invalid. */
|
||||
{ "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
|
||||
{ "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue