diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh index 375d3b4fa..4c4fce1d5 100644 --- a/src/hb-utf-private.hh +++ b/src/hb-utf-private.hh @@ -1,5 +1,5 @@ /* - * Copyright © 2011,2012 Google, Inc. + * Copyright © 2011,2012,2014 Google, Inc. * * This is part of HarfBuzz, a text shaping library. * @@ -32,44 +32,75 @@ /* UTF-8 */ -#define HB_UTF8_COMPUTE(Char, Mask, Len) \ - if (Char < 128) { Len = 1; Mask = 0x7f; } \ - else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \ - else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \ - else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \ - else Len = 0; - static inline const uint8_t * hb_utf_next (const uint8_t *text, const uint8_t *end, hb_codepoint_t *unicode) { - hb_codepoint_t c = *text, mask; - unsigned int len; + /* Written to only accept well-formed sequences. + * Based on ideas from ICU's U8_NEXT. + * Generates a -1 for each ill-formed byte. */ - /* TODO check for overlong sequences? */ + hb_codepoint_t c = *text++; - HB_UTF8_COMPUTE (c, mask, len); - if (unlikely (!len || (unsigned int) (end - text) < len)) { - *unicode = -1; - return text + 1; - } else { - hb_codepoint_t result; - unsigned int i; - result = c & mask; - for (i = 1; i < len; i++) + if (c > 0x7Fu) + { + if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ + { + unsigned int t1; + if (likely (text < end && + (t1 = text[0] - 0x80u) <= 0x3Fu)) { - if (unlikely ((text[i] & 0xc0) != 0x80)) - { - *unicode = -1; - return text + 1; - } - result <<= 6; - result |= (text[i] & 0x3f); + c = ((c&0x1Fu)<<6) | t1; + text++; } - *unicode = result; - return text + len; + else + goto error; + } + else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ + { + unsigned int t1, t2; + if (likely (1 < end - text && + (t1 = text[0] - 0x80u) <= 0x3Fu && + (t2 = text[1] - 0x80u) <= 0x3Fu && + (hb_in_range (c, 0xE1u, 0xECu) || + hb_in_range (c, 0xEEu, 0xEFu) || + (c == 0xE0u && t1 >= 0xA0u-0x80u) || + (c == 0xEDu && t1 <= 0x9Fu-0x80u)))) + { + c = ((c&0xFu)<<12) | (t1<<6) | t2; + text += 2; + } + else + goto error; + } + else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ + { + unsigned int t1, t2, t3; + if (likely (2 < end - text && + (t1 = text[0] - 0x80u) <= 0x3Fu && + (t2 = text[1] - 0x80u) <= 0x3Fu && + (t3 = text[2] - 0x80u) <= 0x3Fu && + (hb_in_range (c, 0xF1u, 0xF3u) || + (c == 0xF0u && t1 >= 0x90u-0x80u) || + (c == 0xF4u && t1 <= 0x8Fu-0x80u)))) + { + c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; + text += 3; + } + else + goto error; + } + else + goto error; } + + *unicode = c; + return text; + +error: + *unicode = -1; + return text; } static inline const uint8_t * diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c index 0b70cf968..1956c92c2 100644 --- a/test/api/test-buffer.c +++ b/test/api/test-buffer.c @@ -449,11 +449,15 @@ static const utf8_validity_test_t utf8_validity_tests[] = { { "\x7f", -1, 1, TRUE }, { "\xdf\xbf", -1, 2, TRUE }, { "\xef\xbf\xbf", -1, 0, TRUE }, - { "\xf7\xbf\xbf\xbf", -1, 0, TRUE }, + { "\xf4\x8f\xbf\xbf", -1, 0, TRUE }, + { "\xf4\x90\xbf\xbf", -1, 0, FALSE }, + { "\xf7\xbf\xbf\xbf", -1, 0, FALSE }, { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE }, { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE }, /* other boundary conditions */ { "\xed\x9f\xbf", -1, 3, TRUE }, + { "\xed\xa0\x80", -1, 0, FALSE }, + { "\xed\xbf\xbf", -1, 0, FALSE }, { "\xee\x80\x80", -1, 3, TRUE }, { "\xef\xbf\xbd", -1, 3, TRUE }, { "\xf4\x8f\xbf\xbf", -1, 0, TRUE }, @@ -610,8 +614,6 @@ static const utf8_validity_test_t utf8_validity_tests[] = { /* impossible bytes */ { "\x20\xfe\x20", -1, 1, FALSE }, { "\x20\xff\x20", -1, 1, FALSE }, -#if 0 - /* XXX fix these, or document that we don't detect them? */ /* overlong sequences */ { "\x20\xc0\xaf\x20", -1, 1, FALSE }, { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE }, @@ -644,6 +646,7 @@ static const utf8_validity_test_t utf8_validity_tests[] = { { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, +#if 0 /* We don't consider U+FFFE / U+FFFF and similar invalid. */ { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE }, { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE }, #endif