Only accept well-formed UTF-8 sequences

Enable tests that were previously disabled, adjust one test, and add more tests.
2014-07-11 16:10:58 -04:00 · 2014-07-11 16:10:58 -04:00 · af2490c095
parent 7323d385cc
commit af2490c095
2 changed files with 66 additions and 32 deletions
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@ -1,5 +1,5 @@
 /*
- * Copyright © 2011,2012  Google, Inc.
+ * Copyright © 2011,2012,2014  Google, Inc.
 *
 *  This is part of HarfBuzz, a text shaping library.
 *
@ -32,44 +32,75 @@
 /* UTF-8 */
 /* Classify a UTF-8 lead byte by its high bits.
  *
  * Char: lead byte value.  It is evaluated more than once, so pass an
  *       expression without side effects.
  * Mask: receives the mask selecting the payload bits of the lead byte
  *       (0x7f, 0x1f, 0x0f or 0x07), or 0 when no pattern matches.
  * Len:  receives the sequence length in bytes (1 to 4), or 0 when the byte
  *       matches none of the four lead-byte patterns.
  *
  * Every argument is parenthesized so compound expressions such as `a | b`
  * are classified as a whole, and the body is wrapped in do { } while (0) so
  * the macro behaves as one statement.  Invoke it with a trailing semicolon. */
 #define HB_UTF8_COMPUTE(Char, Mask, Len) \
   do { \
     if ((Char) < 128) { (Len) = 1; (Mask) = 0x7f; } \
     else if (((Char) & 0xe0) == 0xc0) { (Len) = 2; (Mask) = 0x1f; } \
     else if (((Char) & 0xf0) == 0xe0) { (Len) = 3; (Mask) = 0x0f; } \
     else if (((Char) & 0xf8) == 0xf0) { (Len) = 4; (Mask) = 0x07; } \
     else { (Len) = 0; (Mask) = 0; } \
   } while (0)
 /* hb_utf_next (UTF-8): decode one sequence starting at `text`.
  *
  * Reads the lead byte at *text and advances past it; a lead byte <= 0x7F is
  * stored to *unicode unchanged.  Lead bytes 0xC2..0xDF, 0xE0..0xEF and
  * 0xF0..0xF4 require one, two and three trailing bytes respectively, all of
  * which must lie before `end` and fall in 0x80..0xBF.  The second byte is
  * further restricted for 0xE0 (>= 0xA0), 0xED (<= 0x9F), 0xF0 (>= 0x90) and
  * 0xF4 (<= 0x8F), which excludes overlong forms, the surrogate range
  * U+D800..U+DFFF and values above U+10FFFF.
  *
  * On success stores the decoded value in *unicode and returns the pointer
  * just past the sequence.  On any failure jumps to `error`, stores -1 in
  * *unicode and returns the pointer just past the lead byte only.
  *
  * NOTE(review): the lead byte is read without comparing `text` to `end`;
  * presumably callers guarantee text < end -- confirm.
  * NOTE(review): this listing is a diff hunk.  Lines marked '-' and the
  * unmarked lines that use `result`, `mask`, `len` or `i` appear to belong to
  * the implementation being removed -- confirm against the repository. */
 static inline const uint8_t *
 hb_utf_next (const uint8_t *text,
 	     const uint8_t *end,
 	     hb_codepoint_t *unicode)
 {
-  hb_codepoint_t c = *text, mask;
+  /* Written to only accept well-formed sequences.
-  unsigned int len;
+   * Based on ideas from ICU's U8_NEXT.
   * Generates a -1 for each ill-formed byte. */
-  /* TODO check for overlong sequences? */
+  hb_codepoint_t c = *text++;
-  HB_UTF8_COMPUTE (c, mask, len);
+  if (c > 0x7Fu)
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
+  {
-    *unicode = -1;
+    if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
-    return text + 1;
+    {
-  } else {
+      unsigned int t1;
-    hb_codepoint_t result;
+      if (likely (text < end &&
-    unsigned int i;
+		  (t1 = text[0] - 0x80u) <= 0x3Fu))
    result = c & mask;
    for (i = 1; i < len; i++)
      {
-	if (unlikely ((text[i] & 0xc0) != 0x80))
+	c = ((c&0x1Fu)<<6) | t1;
-	  {
+	text++;
 	    *unicode = -1;
 	    return text + 1;
 	  }
 	result <<= 6;
 	result |= (text[i] & 0x3f);
      }
-    *unicode = result;
+      else
-    return text + len;
+	goto error;
    }
    else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
    {
      unsigned int t1, t2;
      if (likely (1 < end - text &&
 		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
 		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
 		  (hb_in_range (c, 0xE1u, 0xECu) ||
 		   hb_in_range (c, 0xEEu, 0xEFu) ||
 		   (c == 0xE0u && t1 >= 0xA0u-0x80u) ||
 		   (c == 0xEDu && t1 <= 0x9Fu-0x80u))))
      {
 	c = ((c&0xFu)<<12) | (t1<<6) | t2;
 	text += 2;
      }
      else
 	goto error;
    }
    else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
    {
      unsigned int t1, t2, t3;
      if (likely (2 < end - text &&
 		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
 		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
 		  (t3 = text[2] - 0x80u) <= 0x3Fu &&
 		  (hb_in_range (c, 0xF1u, 0xF3u) ||
 		   (c == 0xF0u && t1 >= 0x90u-0x80u) ||
 		   (c == 0xF4u && t1 <= 0x8Fu-0x80u))))
      {
 	c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
 	text += 3;
      }
      else
 	goto error;
    }
    else
      goto error;
  }
  *unicode = c;
  return text;
 error:
  *unicode = -1;
  return text;
 }
 static inline const uint8_t *
--- a/test/api/test-buffer.c
+++ b/test/api/test-buffer.c
@ -449,11 +449,15 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  { "\x7f", -1, 1, TRUE },
  { "\xdf\xbf", -1, 2, TRUE },
  { "\xef\xbf\xbf", -1, 0, TRUE },
-  { "\xf7\xbf\xbf\xbf", -1, 0, TRUE },
+  { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
  { "\xf4\x90\xbf\xbf", -1, 0, FALSE },
  { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  /* other boundary conditions */
  { "\xed\x9f\xbf", -1, 3, TRUE },
  { "\xed\xa0\x80", -1, 0, FALSE },
  { "\xed\xbf\xbf", -1, 0, FALSE },
  { "\xee\x80\x80", -1, 3, TRUE },
  { "\xef\xbf\xbd", -1, 3, TRUE },
  { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
@ -610,8 +614,6 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  /* impossible bytes */
  { "\x20\xfe\x20", -1, 1, FALSE },
  { "\x20\xff\x20", -1, 1, FALSE },
 #if 0
  /* XXX fix these, or document that we don't detect them? */
  /* overlong sequences */
  { "\x20\xc0\xaf\x20", -1, 1, FALSE },
  { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
@ -644,6 +646,7 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
 #if 0 /* We don't consider U+FFFE / U+FFFF and similar invalid. */
  { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
  { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
 #endif