[buffer] Templatize UTF handling

Also move UTF routines into a separate file, to be reused from shapers that need it.
2012-09-25 11:22:28 -04:00 · 2012-09-25 11:22:28 -04:00 · 7f19ae7b9f
parent 0e0a4da9b7
commit 7f19ae7b9f
3 changed files with 152 additions and 113 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -51,6 +51,7 @@ HBSOURCES =  \
 	hb-tt-font.cc \
 	hb-unicode-private.hh \
 	hb-unicode.cc \
+	hb-utf-private.hh \
 	hb-warning.cc \
 	$(NULL)
 HBHEADERS = \
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@ -28,6 +28,7 @@
 */

 #include "hb-buffer-private.hh"
+#include "hb-utf-private.hh"

 #include <string.h>

@ -797,68 +798,44 @@ hb_buffer_guess_properties (hb_buffer_t *buffer)
  buffer->guess_properties ();
 }

-#define ADD_UTF(T) \
-	HB_STMT_START { \
-	  if (text_length == -1) { \
-	    text_length = 0; \
-	    const T *p = (const T *) text; \
-	    while (*p) { \
-	      text_length++; \
-	      p++; \
-	    } \
-	  } \
-	  if (item_length == -1) \
-	    item_length = text_length - item_offset; \
-	  buffer->ensure (buffer->len + item_length * sizeof (T) / 4); \
-	  const T *next = (const T *) text + item_offset; \
-	  const T *end = next + item_length; \
-	  while (next < end) { \
-	    hb_codepoint_t u; \
-	    const T *old_next = next; \
-	    next = UTF_NEXT (next, end, u); \
-	    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text); \
-	  } \
-	} HB_STMT_END
-
-
-#define UTF8_COMPUTE(Char, Mask, Len) \
-  if (Char < 128) { Len = 1; Mask = 0x7f; } \
-  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
-  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
-  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
-  else Len = 0;
-
-static inline const uint8_t *
-hb_utf8_next (const uint8_t *text,
-	      const uint8_t *end,
-	      hb_codepoint_t *unicode)
+/* Shared implementation behind the hb_buffer_add_utf8/16/32 entry points.
+ *
+ * Walks item_length code units of `text`, starting at item_offset, decoding
+ * with the hb_utf_next() overload matching T, and passes every decoded
+ * value to hb_buffer_add() together with the offset (in code units, from
+ * the start of `text`) at which its sequence began.
+ *
+ * text_length == -1: `text` is zero-terminated; its length is counted here.
+ * item_length == -1: run from item_offset to the end of `text`.
+ *
+ * Asserts that the buffer already holds Unicode content, or is empty with
+ * its content type still INVALID; returns without adding anything if the
+ * buffer is inert; otherwise marks the buffer as Unicode content once done. */
+template <typename T>
+static inline void
+hb_buffer_add_utf (hb_buffer_t  *buffer,
+		   const T      *text,
+		   int           text_length,
+		   unsigned int  item_offset,
+		   int           item_length)
 {
-  uint8_t c = *text;
-  unsigned int mask, len;
+  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
+	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));

-  /* TODO check for overlong sequences? */
+  if (unlikely (hb_object_is_inert (buffer)))
+    return;

-  UTF8_COMPUTE (c, mask, len);
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
-    *unicode = -1;
-    return text + 1;
-  } else {
-    hb_codepoint_t result;
-    unsigned int i;
-    result = c & mask;
-    for (i = 1; i < len; i++)
-      {
-	if (unlikely ((text[i] & 0xc0) != 0x80))
-	  {
-	    *unicode = -1;
-	    return text + 1;
-	  }
-	result <<= 6;
-	result |= (text[i] & 0x3f);
-      }
-    *unicode = result;
-    return text + len;
+  if (text_length == -1) {
+    text_length = 0;
+    const T *p = (const T *) text;
+    while (*p) {
+      text_length++;
+      p++;
+    }
  }
+
+  if (item_length == -1)
+    item_length = text_length - item_offset;
+
+  /* Pre-size using (bytes of input) / 4.  NOTE(review): presumably a lower
+   * bound on the number of code points to be added -- confirm intent. */
+  buffer->ensure (buffer->len + item_length * sizeof (T) / 4);
+
+  const T *next = (const T *) text + item_offset;
+  const T *end = next + item_length;
+  while (next < end) {
+    hb_codepoint_t u;
+    const T *old_next = next;
+    next = hb_utf_next (next, end, &u);
+    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text);
+  }
+
+  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 }

 void
@ -868,36 +845,7 @@ hb_buffer_add_utf8 (hb_buffer_t  *buffer,
 		    unsigned int  item_offset,
 		    int           item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf8_next (S, E, &(U))
-  ADD_UTF (uint8_t);
-#undef UTF_NEXT
-}
-
-static inline const uint16_t *
-hb_utf16_next (const uint16_t *text,
-	       const uint16_t *end,
-	       hb_codepoint_t *unicode)
-{
-  uint16_t c = *text++;
-
-  if (unlikely (c >= 0xd800 && c < 0xdc00)) {
-    /* high surrogate */
-    uint16_t l;
-    if (text < end && ((l = *text), likely (l >= 0xdc00 && l < 0xe000))) {
-      /* low surrogate */
-      *unicode = ((hb_codepoint_t) ((c) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000);
-       text++;
-    } else
-      *unicode = -1;
-  } else
-    *unicode = c;
-
-  return text;
+  hb_buffer_add_utf (buffer, (const uint8_t *) text, text_length, item_offset, item_length);
 }

 void
@ -907,23 +855,7 @@ hb_buffer_add_utf16 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int            item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf16_next (S, E, &(U))
-  ADD_UTF (uint16_t);
-#undef UTF_NEXT
-}
-
-static inline const uint32_t *
-hb_utf32_next (const uint32_t *text,
-	       const uint32_t *end,
-	       hb_codepoint_t *unicode)
-{
-  *unicode = *text;
-  return text + 1;
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 }

 void
@ -933,14 +865,7 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  if (unlikely (hb_object_is_inert (buffer)))
-    return;
-  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
-#define UTF_NEXT(S, E, U)	hb_utf32_next (S, E, &(U))
-  ADD_UTF (uint32_t);
-#undef UTF_NEXT
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 }


--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@ -0,0 +1,113 @@
+/*
+ * Copyright © 2011,2012  Google, Inc.
+ *
+ *  This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Google Author(s): Behdad Esfahbod
+ */
+
+#ifndef HB_UTF_PRIVATE_HH
+#define HB_UTF_PRIVATE_HH
+
+#include "hb-private.hh"
+
+
+/* UTF-8 */
+
+/* Classify a UTF-8 lead byte.
+ *
+ * Char: the first byte of a sequence.
+ * Len:  set to the total sequence length in bytes (1-4), or to 0 when Char
+ *       cannot start a sequence (a continuation byte, or 0xf8 and above).
+ * Mask: set to the bits of Char that carry code point payload; 0 when
+ *       Len is 0.
+ *
+ * Expands to an if/else chain, not an expression, so it can only be used
+ * in statement position.  Every parameter is parenthesized so that
+ * expression arguments group as intended.  Char may be evaluated up to
+ * four times and must therefore be free of side effects. */
+#define HB_UTF8_COMPUTE(Char, Mask, Len) \
+  if ((Char) < 128) { (Len) = 1; (Mask) = 0x7f; } \
+  else if (((Char) & 0xe0) == 0xc0) { (Len) = 2; (Mask) = 0x1f; } \
+  else if (((Char) & 0xf0) == 0xe0) { (Len) = 3; (Mask) = 0x0f; } \
+  else if (((Char) & 0xf8) == 0xf0) { (Len) = 4; (Mask) = 0x07; } \
+  else { (Len) = 0; (Mask) = 0; }
+
+/* Decode one UTF-8 sequence starting at `text`.
+ *
+ * Stores the decoded value in *unicode and returns the position just past
+ * the sequence.  If the lead byte cannot start a sequence, the sequence
+ * would run past `end`, or a trailing byte is not a continuation byte,
+ * stores -1 and returns text + 1 so that decoding resumes at the very
+ * next byte.
+ *
+ * `text` is dereferenced unconditionally, so callers must pass text < end.
+ * Overlong encodings are not rejected. */
+static inline const uint8_t *
+hb_utf_next (const uint8_t *text,
+	     const uint8_t *end,
+	     hb_codepoint_t *unicode)
+{
+  const uint8_t lead = *text;
+  unsigned int mask, len;
+
+  /* TODO check for overlong sequences? */
+
+  HB_UTF8_COMPUTE (lead, mask, len);
+
+  /* A usable lead byte whose whole sequence fits before `end`. */
+  bool valid = len && (unsigned int) (end - text) >= len;
+
+  hb_codepoint_t cp = 0;
+  if (likely (valid)) {
+    cp = lead & mask;
+    /* Fold in six payload bits from each continuation byte. */
+    for (const uint8_t *p = text + 1; p < text + len; p++) {
+      if (unlikely ((*p & 0xc0) != 0x80)) {
+	valid = false;
+	break;
+      }
+      cp = (cp << 6) | (*p & 0x3f);
+    }
+  }
+
+  if (unlikely (!valid)) {
+    *unicode = -1;
+    return text + 1;
+  }
+
+  *unicode = cp;
+  return text + len;
+}
+
+
+/* UTF-16 */
+
+/* Decode one UTF-16 sequence starting at `text`.
+ *
+ * A non-surrogate code unit is stored in *unicode as is.  A high surrogate
+ * immediately followed (before `end`) by a low surrogate is combined into
+ * a supplementary code point and both units are consumed.  Any other
+ * surrogate -- a high one without its partner, or a low one that does not
+ * follow a high one -- is ill-formed: -1 is stored and only that single
+ * unit is consumed.
+ *
+ * `text` is dereferenced unconditionally, so callers must pass text < end.
+ * Returns the position just past the consumed unit(s). */
+static inline const uint16_t *
+hb_utf_next (const uint16_t *text,
+	     const uint16_t *end,
+	     hb_codepoint_t *unicode)
+{
+  uint16_t c = *text++;
+
+  if (likely (c < 0xd800 || c >= 0xe000)) {
+    /* Not a surrogate: the code unit is the code point. */
+    *unicode = c;
+    return text;
+  }
+
+  if (c < 0xdc00 && text < end) {
+    /* high surrogate; only valid when a low surrogate follows */
+    uint16_t l = *text;
+    if (likely (l >= 0xdc00 && l < 0xe000)) {
+      /* low surrogate */
+      *unicode = 0x10000 + (((hb_codepoint_t) c - 0xd800) << 10) + (l - 0xdc00);
+      return text + 1;
+    }
+  }
+
+  /* Unpaired high surrogate, or a low surrogate with no high one before it. */
+  *unicode = -1;
+  return text;
+}
+
+
+/* UTF-32 */
+
+/* "Decode" one UTF-32 code unit: the unit is copied to *unicode unchanged
+ * and the position of the following unit is returned.  No range or
+ * surrogate validation is performed, and `end` is not consulted; it is
+ * accepted so that the signature matches the other hb_utf_next()
+ * overloads. */
+static inline const uint32_t *
+hb_utf_next (const uint32_t *text,
+	     const uint32_t *end,
+	     hb_codepoint_t *unicode)
+{
+  const uint32_t *after = text + 1;
+  *unicode = text[0];
+  return after;
+}
+
+
+#endif /* HB_UTF_PRIVATE_HH */