[buffer] Templatize UTF handling

Also move the UTF routines into a separate file so that shapers that need them can reuse them.
2012-09-25 11:22:28 -04:00 · 2012-09-25 11:22:28 -04:00 · 7f19ae7b9f
parent 0e0a4da9b7
commit 7f19ae7b9f
3 changed files with 152 additions and 113 deletions
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -51,6 +51,7 @@ HBSOURCES =  \
 	hb-tt-font.cc \
 	hb-unicode-private.hh \
 	hb-unicode.cc \
 	hb-utf-private.hh \
 	hb-warning.cc \
 	$(NULL)
 HBHEADERS = \
--- a/src/hb-buffer.cc
+++ b/src/hb-buffer.cc
@ -28,6 +28,7 @@
 */
 #include "hb-buffer-private.hh"
 #include "hb-utf-private.hh"
 #include <string.h>
@ -797,68 +798,44 @@ hb_buffer_guess_properties (hb_buffer_t *buffer)
  buffer->guess_properties ();
 }
-#define ADD_UTF(T) \
+template <typename T>
-	HB_STMT_START { \
+static inline void
-	  if (text_length == -1) { \
+hb_buffer_add_utf (hb_buffer_t  *buffer,
-	    text_length = 0; \
+		   const T      *text,
-	    const T *p = (const T *) text; \
+		   int           text_length,
-	    while (*p) { \
+		   unsigned int  item_offset,
-	      text_length++; \
+		   int           item_length)
 	      p++; \
 	    } \
 	  } \
 	  if (item_length == -1) \
 	    item_length = text_length - item_offset; \
 	  buffer->ensure (buffer->len + item_length * sizeof (T) / 4); \
 	  const T *next = (const T *) text + item_offset; \
 	  const T *end = next + item_length; \
 	  while (next < end) { \
 	    hb_codepoint_t u; \
 	    const T *old_next = next; \
 	    next = UTF_NEXT (next, end, u); \
 	    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text); \
 	  } \
 	} HB_STMT_END
 #define UTF8_COMPUTE(Char, Mask, Len) \
  if (Char < 128) { Len = 1; Mask = 0x7f; } \
  else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \
  else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \
  else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \
  else Len = 0;
 static inline const uint8_t *
 hb_utf8_next (const uint8_t *text,
 	      const uint8_t *end,
 	      hb_codepoint_t *unicode)
 {
-  uint8_t c = *text;
+  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
-  unsigned int mask, len;
+	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
-  /* TODO check for overlong sequences? */
+  if (unlikely (hb_object_is_inert (buffer)))
    return;
-  UTF8_COMPUTE (c, mask, len);
+  if (text_length == -1) {
-  if (unlikely (!len || (unsigned int) (end - text) < len)) {
+    text_length = 0;
-    *unicode = -1;
+    const T *p = (const T *) text;
-    return text + 1;
+    while (*p) {
-  } else {
+      text_length++;
-    hb_codepoint_t result;
+      p++;
-    unsigned int i;
+    }
    result = c & mask;
    for (i = 1; i < len; i++)
      {
 	if (unlikely ((text[i] & 0xc0) != 0x80))
 	  {
 	    *unicode = -1;
 	    return text + 1;
 	  }
 	result <<= 6;
 	result |= (text[i] & 0x3f);
      }
    *unicode = result;
    return text + len;
  }
  if (item_length == -1)
    item_length = text_length - item_offset;
  buffer->ensure (buffer->len + item_length * sizeof (T) / 4);
  const T *next = (const T *) text + item_offset;
  const T *end = next + item_length;
  while (next < end) {
    hb_codepoint_t u;
    const T *old_next = next;
    next = hb_utf_next (next, end, &u);
    hb_buffer_add (buffer, u, 1,  old_next - (const T *) text);
  }
  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 }
 void
@ -868,36 +845,7 @@ hb_buffer_add_utf8 (hb_buffer_t  *buffer,
 		    unsigned int  item_offset,
 		    int           item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
+  hb_buffer_add_utf (buffer, (const uint8_t *) text, text_length, item_offset, item_length);
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
  if (unlikely (hb_object_is_inert (buffer)))
    return;
  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 #define UTF_NEXT(S, E, U)	hb_utf8_next (S, E, &(U))
  ADD_UTF (uint8_t);
 #undef UTF_NEXT
 }
 static inline const uint16_t *
 hb_utf16_next (const uint16_t *text,
 	       const uint16_t *end,
 	       hb_codepoint_t *unicode)
 {
  uint16_t c = *text++;
  if (unlikely (c >= 0xd800 && c < 0xdc00)) {
    /* high surrogate */
    uint16_t l;
    if (text < end && ((l = *text), likely (l >= 0xdc00 && l < 0xe000))) {
      /* low surrogate */
      *unicode = ((hb_codepoint_t) ((c) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000);
       text++;
    } else
      *unicode = -1;
  } else
    *unicode = c;
  return text;
 }
 void
@ -907,23 +855,7 @@ hb_buffer_add_utf16 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int            item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
  if (unlikely (hb_object_is_inert (buffer)))
    return;
  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 #define UTF_NEXT(S, E, U)	hb_utf16_next (S, E, &(U))
  ADD_UTF (uint16_t);
 #undef UTF_NEXT
 }
 static inline const uint32_t *
 hb_utf32_next (const uint32_t *text,
 	       const uint32_t *end,
 	       hb_codepoint_t *unicode)
 {
  *unicode = *text;
  return text + 1;
 }
 void
@ -933,14 +865,7 @@ hb_buffer_add_utf32 (hb_buffer_t    *buffer,
 		     unsigned int    item_offset,
 		     int             item_length)
 {
-  assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE ||
+  hb_buffer_add_utf (buffer, text, text_length, item_offset, item_length);
 	  (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID));
  if (unlikely (hb_object_is_inert (buffer)))
    return;
  buffer->content_type = HB_BUFFER_CONTENT_TYPE_UNICODE;
 #define UTF_NEXT(S, E, U)	hb_utf32_next (S, E, &(U))
  ADD_UTF (uint32_t);
 #undef UTF_NEXT
 }
--- a/src/hb-utf-private.hh
+++ b/src/hb-utf-private.hh
@ -0,0 +1,113 @@
 /*
 * Copyright © 2011,2012  Google, Inc.
 *
 *  This is part of HarfBuzz, a text shaping library.
 *
 * Permission is hereby granted, without written agreement and without
 * license or royalty fees, to use, copy, modify, and distribute this
 * software and its documentation for any purpose, provided that the
 * above copyright notice and the following two paragraphs appear in
 * all copies of this software.
 *
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 *
 * Google Author(s): Behdad Esfahbod
 */
 #ifndef HB_UTF_PRIVATE_HH
 #define HB_UTF_PRIVATE_HH
 #include "hb-private.hh"
 /* UTF-8 */
 /*
  * Classify a UTF-8 lead byte.
  *
  * Char: the lead byte (may be any expression; it is evaluated more than
  *       once, so it must be free of side effects).
  * Mask: lvalue that receives the bit mask selecting the payload bits of
  *       the lead byte.  Left untouched when Len is set to 0.
  * Len:  lvalue that receives the total sequence length in bytes (1-4),
  *       or 0 when Char cannot start a sequence.
  *
  * Every argument is parenthesized so that operator expressions expand
  * correctly, and the body is wrapped in do/while (0) so the macro behaves
  * as one statement (safe as the unbraced body of an if/else).  Invoke it
  * with a trailing semicolon.
  */
 #define HB_UTF8_COMPUTE(Char, Mask, Len) \
   do { \
     if ((Char) < 128) { (Len) = 1; (Mask) = 0x7f; } \
     else if (((Char) & 0xe0) == 0xc0) { (Len) = 2; (Mask) = 0x1f; } \
     else if (((Char) & 0xf0) == 0xe0) { (Len) = 3; (Mask) = 0x0f; } \
     else if (((Char) & 0xf8) == 0xf0) { (Len) = 4; (Mask) = 0x07; } \
     else (Len) = 0; \
   } while (0)
 /*
  * Decode one code point from the UTF-8 sequence that starts at text.
  *
  * text:    first byte of the sequence.  It is dereferenced before end is
  *          consulted, so the caller must guarantee text < end.
  * end:     one past the last readable byte.
  * unicode: receives the decoded code point, or -1 (converted to
  *          hb_codepoint_t) when the bytes at text are not well-formed:
  *          a byte that cannot lead a sequence, a truncated sequence, a
  *          bad continuation byte, an overlong encoding, an encoded
  *          surrogate (U+D800..U+DFFF), or a value above U+10FFFF.
  *
  * Returns where decoding should continue: text + sequence length on
  * success, text + 1 on any failure so the caller resynchronizes on the
  * very next byte.
  */
 static inline const uint8_t *
 hb_utf_next (const uint8_t *text,
 	     const uint8_t *end,
 	     hb_codepoint_t *unicode)
 {
   uint8_t c = *text;
   unsigned int mask, len;

   HB_UTF8_COMPUTE (c, mask, len);

   /* Not a lead byte, or the sequence would run past the end of input. */
   if (unlikely (!len || (unsigned int) (end - text) < len)) {
     *unicode = -1;
     return text + 1;
   }

   hb_codepoint_t result = c & mask;
   for (unsigned int i = 1; i < len; i++)
   {
     /* Every trailing byte must look like 10xxxxxx. */
     if (unlikely ((text[i] & 0xc0) != 0x80)) {
       *unicode = -1;
       return text + 1;
     }
     result = (result << 6) | (text[i] & 0x3f);
   }

   /* Reject values that a well-formed sequence of this length cannot carry:
    * overlong forms (value fits in a shorter sequence), surrogates, and
    * anything beyond the Unicode range. */
   if (unlikely ((len == 2 && result < 0x80) ||
 		(len == 3 && (result < 0x800 ||
 			      (result >= 0xd800 && result <= 0xdfff))) ||
 		(len == 4 && (result < 0x10000 || result > 0x10ffff)))) {
     *unicode = -1;
     return text + 1;
   }

   *unicode = result;
   return text + len;
 }
 /* UTF-16 */
 /*
  * Decode one code point from the UTF-16 sequence that starts at text.
  *
  * text:    first code unit.  It is dereferenced before end is consulted,
  *          so the caller must guarantee text < end.
  * end:     one past the last readable code unit; only checked before
  *          reading the second half of a surrogate pair.
  * unicode: receives the decoded code point, or -1 (converted to
  *          hb_codepoint_t) for an unpaired surrogate: a high surrogate
  *          not followed by a low one, or a low surrogate on its own.
  *
  * Returns where decoding should continue: text + 2 after a valid
  * surrogate pair, text + 1 in every other case.
  */
 static inline const uint16_t *
 hb_utf_next (const uint16_t *text,
 	     const uint16_t *end,
 	     hb_codepoint_t *unicode)
 {
   uint16_t c = *text++;

   /* Common case: a code unit outside the surrogate range stands alone. */
   if (likely (c < 0xd800 || c >= 0xe000)) {
     *unicode = c;
     return text;
   }

   /* High surrogate: combine with the low surrogate that must follow. */
   if (c < 0xdc00 && text < end) {
     uint16_t l = *text;
     if (likely (l >= 0xdc00 && l < 0xe000)) {
       *unicode = ((hb_codepoint_t) (c - 0xd800) * 0x400 + l - 0xdc00 + 0x10000);
       return text + 1;
     }
   }

   /* Unpaired surrogate (lone high, or low with no preceding high). */
   *unicode = -1;
   return text;
 }
 /* UTF-32 */
 /*
  * Fetch one code point from UTF-32 text.
  *
  * text:    code unit to read; copied to *unicode as-is, with no range
  *          or surrogate validation.
  * end:     accepted for signature parity with the UTF-8 and UTF-16
  *          overloads; not consulted here.
  * unicode: receives the value stored at text.
  *
  * Returns text + 1.
  */
 static inline const uint32_t *
 hb_utf_next (const uint32_t *text,
 	     const uint32_t *end,
 	     hb_codepoint_t *unicode)
 {
   const uint32_t *following = text + 1;

   unicode[0] = text[0];
   return following;
 }
 #endif /* HB_UTF_PRIVATE_HH */