From 976c8f455221eb599d1c446eafd88d51d7d2aa65 Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod <behdad@behdad.org>
Date: Wed, 16 Jul 2014 15:34:20 -0400
Subject: [PATCH] New API: hb_buffer_[sg]et_replacement_codepoint()

With this change, we now by default replace invalid UTF-8/16/32 sequences
with U+FFFD.  This can be changed by calling new API on the buffer.
Previously the replacement value used to be (hb_codepoint_t)-1.

Note that hb_buffer_clear_contents() does NOT reset the replacement
character.

See discussion here:

https://github.com/behdad/harfbuzz/commit/6f13b6d62daae4989e3cc2fe4b168e5c59650964

New API:

  hb_buffer_set_replacement_codepoint()
  hb_buffer_get_replacement_codepoint()
---
 src/hb-buffer-private.hh |  1 +
 src/hb-buffer.cc         | 45 +++++++++++++++++++++++++++++++++++++---
 src/hb-buffer.h          | 15 +++++++++++++-
 src/hb-utf-private.hh    | 34 +++++++++++++++++-------------
 test/api/test-buffer.c   | 26 +++++++++++++----------
 5 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh
index 3a2b9ab48..5eccd3c31 100644
--- a/src/hb-buffer-private.hh
+++ b/src/hb-buffer-private.hh
@@ -52,6 +52,7 @@ struct hb_buffer_t {
   hb_unicode_funcs_t *unicode; /* Unicode functions */
   hb_segment_properties_t props; /* Script, language, direction */
   hb_buffer_flags_t flags; /* BOT / EOT / etc. */
+  hb_codepoint_t replacement; /* U+FFFD or something else. */
 
   /* Buffer contents */
 
diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc
index 242cded36..2377ba40d 100644
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@@ -178,6 +178,7 @@ hb_buffer_t::reset (void)
 
   hb_unicode_funcs_destroy (unicode);
   unicode = hb_unicode_funcs_get_default ();
+  replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT;
 
   clear ();
 }
@@ -703,6 +704,7 @@ hb_buffer_get_empty (void)
     const_cast<hb_unicode_funcs_t *> (&_hb_unicode_funcs_nil),
     HB_SEGMENT_PROPERTIES_DEFAULT,
     HB_BUFFER_FLAG_DEFAULT,
+    HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT,
 
     HB_BUFFER_CONTENT_TYPE_INVALID,
     true, /* in_error */
@@ -1047,6 +1049,42 @@ hb_buffer_get_flags (hb_buffer_t *buffer)
 }
 
 
+/**
+ * hb_buffer_set_replacement_codepoint:
+ * @buffer: a buffer.
+ * @replacement: the codepoint to substitute for invalid UTF-8/16/32 input.
+ *
+ * Sets the codepoint that replaces invalid UTF-8/16/32 input added to @buffer (default U+FFFD); does nothing if @buffer is inert.
+ *
+ * Since: 1.0
+ **/
+void
+hb_buffer_set_replacement_codepoint (hb_buffer_t    *buffer,
+				     hb_codepoint_t  replacement)
+{
+  if (unlikely (hb_object_is_inert (buffer)))
+    return;
+
+  buffer->replacement = replacement;
+}
+
+/**
+ * hb_buffer_get_replacement_codepoint:
+ * @buffer: a buffer.
+ *
+ * Gets the codepoint that @buffer substitutes for invalid UTF-8/16/32 input.
+ *
+ * Return value: the replacement codepoint currently stored in @buffer.
+ *
+ * Since: 1.0
+ **/
+hb_codepoint_t
+hb_buffer_get_replacement_codepoint (hb_buffer_t    *buffer)
+{
+  return buffer->replacement;
+}
+
+
 /**
  * hb_buffer_reset:
  * @buffer: a buffer.
@@ -1299,6 +1337,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
 		   int           item_length)
 {
   typedef hb_utf_t<T, true> utf_t;
+  const hb_codepoint_t replacement = buffer->replacement;
 
   assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
@@ -1330,7 +1369,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
     while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH)
     {
       hb_codepoint_t u;
-      prev = utf_t::prev (prev, start, &u);
+      prev = utf_t::prev (prev, start, &u, replacement);
       buffer->context[0][buffer->context_len[0]++] = u;
     }
   }
@@ -1341,7 +1380,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   {
     hb_codepoint_t u;
     const T *old_next = next;
-    next = utf_t::next (next, end, &u);
+    next = utf_t::next (next, end, &u, replacement);
     buffer->add (u, old_next - (const T *) text);
   }
 
@@ -1351,7 +1390,7 @@ hb_buffer_add_utf (hb_buffer_t  *buffer,
   while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH)
   {
     hb_codepoint_t u;
-    next = utf_t::next (next, end, &u);
+    next = utf_t::next (next, end, &u, replacement);
     buffer->context[1][buffer->context_len[1]++] = u;
   }
 
diff --git a/src/hb-buffer.h b/src/hb-buffer.h
index 777c3d980..7b0c92046 100644
--- a/src/hb-buffer.h
+++ b/src/hb-buffer.h
@@ -186,12 +186,25 @@ hb_buffer_flags_t
 hb_buffer_get_flags (hb_buffer_t *buffer);
 
 
+
+#define HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT 0xFFFDu
+
+/* Sets codepoint used to replace invalid UTF-8/16/32 entries.
+ * Default is 0xFFFDu. */
+void
+hb_buffer_set_replacement_codepoint (hb_buffer_t    *buffer,
+				     hb_codepoint_t  replacement);
+
+hb_codepoint_t
+hb_buffer_get_replacement_codepoint (hb_buffer_t    *buffer);
+
+
 /* Resets the buffer.  Afterwards it's as if it was just created,
  * except that it has a larger buffer allocated perhaps... */
 void
 hb_buffer_reset (hb_buffer_t *buffer);
 
-/* Like reset, but does NOT clear unicode_funcs. */
+/* Like reset, but does NOT clear unicode_funcs or replacement_codepoint. */
 void
 hb_buffer_clear_contents (hb_buffer_t *buffer);
 
diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh
index cbacd67ce..68216c45e 100644
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@@ -40,7 +40,8 @@ struct hb_utf_t<uint8_t, true>
   static inline const uint8_t *
   next (const uint8_t *text,
 	const uint8_t *end,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     /* Written to only accept well-formed sequences.
      * Based on ideas from ICU's U8_NEXT.
@@ -101,23 +102,24 @@ struct hb_utf_t<uint8_t, true>
     return text;
 
   error:
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint8_t *
   prev (const uint8_t *text,
 	const uint8_t *start,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     const uint8_t *end = text--;
     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
       text--;
 
-    if (likely (next (text, end, unicode) == end))
+    if (likely (next (text, end, unicode, replacement) == end))
       return text;
 
-    *unicode = -1;
+    *unicode = replacement;
     return end - 1;
   }
 
@@ -137,7 +139,8 @@ struct hb_utf_t<uint16_t, true>
   static inline const uint16_t *
   next (const uint16_t *text,
 	const uint16_t *end,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     hb_codepoint_t c = *text++;
 
@@ -161,14 +164,15 @@ struct hb_utf_t<uint16_t, true>
     }
 
     /* Lonely / out-of-order surrogate. */
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint16_t *
   prev (const uint16_t *text,
 	const uint16_t *start,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     const uint16_t *end = text--;
     hb_codepoint_t c = *text;
@@ -182,10 +186,10 @@ struct hb_utf_t<uint16_t, true>
     if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
       text--;
 
-    if (likely (next (text, end, unicode) == end))
+    if (likely (next (text, end, unicode, replacement) == end))
       return text;
 
-    *unicode = -1;
+    *unicode = replacement;
     return end - 1;
   }
 
@@ -208,7 +212,8 @@ struct hb_utf_t<uint32_t, validate>
   static inline const uint32_t *
   next (const uint32_t *text,
 	const uint32_t *end HB_UNUSED,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
     hb_codepoint_t c = *text++;
     if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
@@ -217,16 +222,17 @@ struct hb_utf_t<uint32_t, validate>
     return text;
 
   error:
-    *unicode = -1;
+    *unicode = replacement;
     return text;
   }
 
   static inline const uint32_t *
   prev (const uint32_t *text,
 	const uint32_t *start HB_UNUSED,
-	hb_codepoint_t *unicode)
+	hb_codepoint_t *unicode,
+	hb_codepoint_t replacement)
   {
-    next (text - 1, text, unicode);
+    next (text - 1, text, unicode, replacement);
     return text - 1;
   }
 
diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c
index 1be693170..af73c3f18 100644
--- a/test/api/test-buffer.c
+++ b/test/api/test-buffer.c
@@ -374,6 +374,7 @@ test_buffer_utf8_conversion (void)
   unsigned int bytes, chars, i, j, len;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++)
   {
@@ -388,7 +389,7 @@ test_buffer_utf8_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf8 (b, test->utf8, bytes,  1, bytes - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -660,6 +661,7 @@ test_buffer_utf8_validity (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++)
   {
@@ -678,7 +680,7 @@ test_buffer_utf8_validity (void)
     else
       segment_bytes = test->max_len;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf8 (b, test->utf8, text_bytes,  0, segment_bytes);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -718,6 +720,7 @@ test_buffer_utf16_conversion (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1);
 
   for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++)
   {
@@ -732,7 +735,7 @@ test_buffer_utf16_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf16 (b, test->utf16, u_len,  1, u_len - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);
@@ -752,15 +755,15 @@ typedef struct {
 
 /* note: we skip the first and last item from utf32 when adding to buffer */
 static const utf32_conversion_test_t utf32_conversion_tests[] = {
-  {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}},
+  {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -3, -3}},
   {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}},
-  {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}},
-  {{0x41, 0xD800, 0xDF02}, {-1}},
-  {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}},
-  {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}},
-  {{0x41, 0xDF00, 0x61}, {-1}},
+  {{0x41, 0xD800, 0xDF02, 0x61}, {-3, -3}},
+  {{0x41, 0xD800, 0xDF02}, {-3}},
+  {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -3}},
+  {{0x41, 0xD800, 0x61, 0xDF02}, {-3, 0x61}},
+  {{0x41, 0xDF00, 0x61}, {-3}},
   {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}},
-  {{0x41, 0x110000, 0x61}, {-1}},
+  {{0x41, 0x110000, 0x61}, {-3}},
   {{0x41, 0x61}, {0}}
 };
 
@@ -771,6 +774,7 @@ test_buffer_utf32_conversion (void)
   unsigned int i;
 
   b = hb_buffer_create ();
+  hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -3);
 
   for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++)
   {
@@ -785,7 +789,7 @@ test_buffer_utf32_conversion (void)
     for (chars = 0; test->codepoints[chars]; chars++)
       ;
 
-    hb_buffer_reset (b);
+    hb_buffer_clear_contents (b);
     hb_buffer_add_utf32 (b, test->utf32, u_len,  1, u_len - 2);
 
     glyphs = hb_buffer_get_glyph_infos (b, &len);