Only accept well-formed UTF-8 sequences

Enable the tests that were previously disabled, adjust one test, and add more tests.
2014-07-11 16:10:58 -04:00 · 2014-07-11 16:10:58 -04:00 · af2490c095
parent 7323d385cc
commit af2490c095
2 changed files with 66 additions and 32 deletions
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@ -1,5 +1,5 @@
 /*
- * Copyright © 2011,2012  Google, Inc.
+ * Copyright © 2011,2012,2014  Google, Inc.
 *
 *  This is part of HarfBuzz, a text shaping library.
 *
@ -32,44 +32,75 @@

 /* UTF-8 */

-#define HB_UTF8_COMPUTE(Char, Mask, Len) \
-  if (Char < 128) { Len = 1; Mask = 0x7f; } \
-  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
-  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
-  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
-  else Len = 0;
-
 static inline const uint8_t *
 hb_utf_next (const uint8_t *text,
 	     const uint8_t *end,
 	     hb_codepoint_t *unicode)
 {
-  hb_codepoint_t c = *text, mask;
-  unsigned int len;
+  /* Written to only accept well-formed sequences.
+   * Based on ideas from ICU's U8_NEXT.
+   * Generates a -1 for each ill-formed byte. */

-  /* TODO check for overlong sequences? */
+  hb_codepoint_t c = *text++;

-  HB_UTF8_COMPUTE (c, mask, len);
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
-    *unicode = -1;
-    return text + 1;
-  } else {
-    hb_codepoint_t result;
-    unsigned int i;
-    result = c & mask;
-    for (i = 1; i < len; i++)
+  if (c > 0x7Fu)
  {
-	if (unlikely ((text[i] & 0xc0) != 0x80))
+    if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
    {
+      unsigned int t1;
+      if (likely (text < end &&
+		  (t1 = text[0] - 0x80u) <= 0x3Fu))
+      {
+	c = ((c&0x1Fu)<<6) | t1;
+	text++;
+      }
+      else
+	goto error;
+    }
+    else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
+    {
+      unsigned int t1, t2;
+      if (likely (1 < end - text &&
+		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
+		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
+		  (hb_in_range (c, 0xE1u, 0xECu) ||
+		   hb_in_range (c, 0xEEu, 0xEFu) ||
+		   (c == 0xE0u && t1 >= 0xA0u-0x80u) ||
+		   (c == 0xEDu && t1 <= 0x9Fu-0x80u))))
+      {
+	c = ((c&0xFu)<<12) | (t1<<6) | t2;
+	text += 2;
+      }
+      else
+	goto error;
+    }
+    else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
+    {
+      unsigned int t1, t2, t3;
+      if (likely (2 < end - text &&
+		  (t1 = text[0] - 0x80u) <= 0x3Fu &&
+		  (t2 = text[1] - 0x80u) <= 0x3Fu &&
+		  (t3 = text[2] - 0x80u) <= 0x3Fu &&
+		  (hb_in_range (c, 0xF1u, 0xF3u) ||
+		   (c == 0xF0u && t1 >= 0x90u-0x80u) ||
+		   (c == 0xF4u && t1 <= 0x8Fu-0x80u))))
+      {
+	c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
+	text += 3;
+      }
+      else
+	goto error;
+    }
+    else
+      goto error;
+  }
+
+  *unicode = c;
+  return text;
+
+error:
  *unicode = -1;
-	    return text + 1;
-	  }
-	result <<= 6;
-	result |= (text[i] & 0x3f);
-      }
-    *unicode = result;
-    return text + len;
-  }
+  return text;
 }

 static inline const uint8_t *
--- a/test/api/test-buffer.c
+++ b/test/api/test-buffer.c
@ -449,11 +449,15 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  { "\x7f", -1, 1, TRUE },
  { "\xdf\xbf", -1, 2, TRUE },
  { "\xef\xbf\xbf", -1, 0, TRUE },
-  { "\xf7\xbf\xbf\xbf", -1, 0, TRUE },
+  { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
+  { "\xf4\x90\xbf\xbf", -1, 0, FALSE },
+  { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  /* other boundary conditions */
  { "\xed\x9f\xbf", -1, 3, TRUE },
+  { "\xed\xa0\x80", -1, 0, FALSE },
+  { "\xed\xbf\xbf", -1, 0, FALSE },
  { "\xee\x80\x80", -1, 3, TRUE },
  { "\xef\xbf\xbd", -1, 3, TRUE },
  { "\xf4\x8f\xbf\xbf", -1, 0, TRUE },
@ -610,8 +614,6 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  /* impossible bytes */
  { "\x20\xfe\x20", -1, 1, FALSE },
  { "\x20\xff\x20", -1, 1, FALSE },
-#if 0
-  /* XXX fix these, or document that we don't detect them? */
  /* overlong sequences */
  { "\x20\xc0\xaf\x20", -1, 1, FALSE },
  { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
@ -644,6 +646,7 @@ static const utf8_validity_test_t utf8_validity_tests[] = {
  { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
  { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
+#if 0 /* We don't consider U+FFFE / U+FFFF and similar invalid. */
  { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
  { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
 #endif