From 976c8f455221eb599d1c446eafd88d51d7d2aa65 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Wed, 16 Jul 2014 15:34:20 -0400 Subject: [PATCH] New API: hb_buffer_[sg]et_replacement_codepoint() With this change, we now by default replace broken UTF-8/16/32 bits with U+FFFD. This can be changed by calling new API on the buffer. Previously the replacement value used to be (hb_codepoint_t)-1. Note that hb_buffer_clear_contents() does NOT reset the replacement character. See discussion here: https://github.com/behdad/harfbuzz/commit/6f13b6d62daae4989e3cc2fe4b168e5c59650964 New API: hb_buffer_set_replacement_codepoint() hb_buffer_get_replacement_codepoint() --- src/hb-buffer-private.hh | 1 + src/hb-buffer.cc | 45 +++++++++++++++++++++++++++++++++++++--- src/hb-buffer.h | 15 +++++++++++++- src/hb-utf-private.hh | 34 +++++++++++++++++------------- test/api/test-buffer.c | 26 +++++++++++++---------- 5 files changed, 92 insertions(+), 29 deletions(-) diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh index 3a2b9ab48..5eccd3c31 100644 --- a/src/hb-buffer-private.hh +++ b/src/hb-buffer-private.hh @@ -52,6 +52,7 @@ struct hb_buffer_t { hb_unicode_funcs_t *unicode; /* Unicode functions */ hb_segment_properties_t props; /* Script, language, direction */ hb_buffer_flags_t flags; /* BOT / EOT / etc. */ + hb_codepoint_t replacement; /* U+FFFD or something else. 
*/ /* Buffer contents */ diff --git a/src/hb-buffer.cc b/src/hb-buffer.cc index 242cded36..2377ba40d 100644 --- a/src/hb-buffer.cc +++ b/src/hb-buffer.cc @@ -178,6 +178,7 @@ hb_buffer_t::reset (void) hb_unicode_funcs_destroy (unicode); unicode = hb_unicode_funcs_get_default (); + replacement = HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT; clear (); } @@ -703,6 +704,7 @@ hb_buffer_get_empty (void) const_cast (&_hb_unicode_funcs_nil), HB_SEGMENT_PROPERTIES_DEFAULT, HB_BUFFER_FLAG_DEFAULT, + HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT, HB_BUFFER_CONTENT_TYPE_INVALID, true, /* in_error */ @@ -1047,6 +1049,42 @@ hb_buffer_get_flags (hb_buffer_t *buffer) } +/** + * hb_buffer_set_replacement_codepoint: + * @buffer: a buffer. + * @replacement: the codepoint to store as the replacement for @buffer. + * + * Sets the codepoint used to replace invalid UTF-8/16/32 entries; returns without changing anything when hb_object_is_inert() is true for @buffer. + * + * Since: 1.0 + **/ +void +hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer, + hb_codepoint_t replacement) +{ + if (unlikely (hb_object_is_inert (buffer))) + return; + + buffer->replacement = replacement; +} + +/** + * hb_buffer_get_replacement_codepoint: + * @buffer: a buffer. + * + * Gets the replacement codepoint currently stored in @buffer. + * + * Return value: the value of the replacement field of @buffer. + * + * Since: 1.0 + **/ +hb_codepoint_t +hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer) +{ + return buffer->replacement; +} + + /** * hb_buffer_reset: * @buffer: a buffer. 
@@ -1299,6 +1337,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, int item_length) { typedef hb_utf_t utf_t; + const hb_codepoint_t replacement = buffer->replacement; assert (buffer->content_type == HB_BUFFER_CONTENT_TYPE_UNICODE || (!buffer->len && buffer->content_type == HB_BUFFER_CONTENT_TYPE_INVALID)); @@ -1330,7 +1369,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (start < prev && buffer->context_len[0] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - prev = utf_t::prev (prev, start, &u); + prev = utf_t::prev (prev, start, &u, replacement); buffer->context[0][buffer->context_len[0]++] = u; } } @@ -1341,7 +1380,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, { hb_codepoint_t u; const T *old_next = next; - next = utf_t::next (next, end, &u); + next = utf_t::next (next, end, &u, replacement); buffer->add (u, old_next - (const T *) text); } @@ -1351,7 +1390,7 @@ hb_buffer_add_utf (hb_buffer_t *buffer, while (next < end && buffer->context_len[1] < buffer->CONTEXT_LENGTH) { hb_codepoint_t u; - next = utf_t::next (next, end, &u); + next = utf_t::next (next, end, &u, replacement); buffer->context[1][buffer->context_len[1]++] = u; } diff --git a/src/hb-buffer.h b/src/hb-buffer.h index 777c3d980..7b0c92046 100644 --- a/src/hb-buffer.h +++ b/src/hb-buffer.h @@ -186,12 +186,25 @@ hb_buffer_flags_t hb_buffer_get_flags (hb_buffer_t *buffer); + +#define HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT 0xFFFDu + +/* Sets codepoint used to replace invalid UTF-8/16/32 entries. + * Default is 0xFFFDu. */ +void +hb_buffer_set_replacement_codepoint (hb_buffer_t *buffer, + hb_codepoint_t replacement); + +hb_codepoint_t +hb_buffer_get_replacement_codepoint (hb_buffer_t *buffer); + + /* Resets the buffer. Afterwards it's as if it was just created, * except that it has a larger buffer allocated perhaps... */ void hb_buffer_reset (hb_buffer_t *buffer); -/* Like reset, but does NOT clear unicode_funcs. */ +/* Like reset, but does NOT clear unicode_funcs or replacement_codepoint. 
*/ void hb_buffer_clear_contents (hb_buffer_t *buffer); diff --git a/src/hb-utf-private.hh b/src/hb-utf-private.hh index cbacd67ce..68216c45e 100644 --- a/src/hb-utf-private.hh +++ b/src/hb-utf-private.hh @@ -40,7 +40,8 @@ struct hb_utf_t static inline const uint8_t * next (const uint8_t *text, const uint8_t *end, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { /* Written to only accept well-formed sequences. * Based on ideas from ICU's U8_NEXT. @@ -101,23 +102,24 @@ struct hb_utf_t return text; error: - *unicode = -1; + *unicode = replacement; return text; } static inline const uint8_t * prev (const uint8_t *text, const uint8_t *start, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { const uint8_t *end = text--; while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) text--; - if (likely (next (text, end, unicode) == end)) + if (likely (next (text, end, unicode, replacement) == end)) return text; - *unicode = -1; + *unicode = replacement; return end - 1; } @@ -137,7 +139,8 @@ struct hb_utf_t static inline const uint16_t * next (const uint16_t *text, const uint16_t *end, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { hb_codepoint_t c = *text++; @@ -161,14 +164,15 @@ struct hb_utf_t } /* Lonely / out-of-order surrogate. 
*/ - *unicode = -1; + *unicode = replacement; return text; } static inline const uint16_t * prev (const uint16_t *text, const uint16_t *start, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { const uint16_t *end = text--; hb_codepoint_t c = *text; @@ -182,10 +186,10 @@ struct hb_utf_t if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) text--; - if (likely (next (text, end, unicode) == end)) + if (likely (next (text, end, unicode, replacement) == end)) return text; - *unicode = -1; + *unicode = replacement; return end - 1; } @@ -208,7 +212,8 @@ struct hb_utf_t static inline const uint32_t * next (const uint32_t *text, const uint32_t *end HB_UNUSED, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { hb_codepoint_t c = *text++; if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) @@ -217,16 +222,17 @@ struct hb_utf_t return text; error: - *unicode = -1; + *unicode = replacement; return text; } static inline const uint32_t * prev (const uint32_t *text, const uint32_t *start HB_UNUSED, - hb_codepoint_t *unicode) + hb_codepoint_t *unicode, + hb_codepoint_t replacement) { - next (text - 1, text, unicode); + next (text - 1, text, unicode, replacement); return text - 1; } diff --git a/test/api/test-buffer.c b/test/api/test-buffer.c index 1be693170..af73c3f18 100644 --- a/test/api/test-buffer.c +++ b/test/api/test-buffer.c @@ -374,6 +374,7 @@ test_buffer_utf8_conversion (void) unsigned int bytes, chars, i, j, len; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf8_conversion_tests); i++) { @@ -388,7 +389,7 @@ test_buffer_utf8_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf8 (b, test->utf8, bytes, 1, bytes - 2); glyphs = hb_buffer_get_glyph_infos (b, &len); @@ -660,6 +661,7 @@ test_buffer_utf8_validity 
(void) unsigned int i; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf8_validity_tests); i++) { @@ -678,7 +680,7 @@ test_buffer_utf8_validity (void) else segment_bytes = test->max_len; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf8 (b, test->utf8, text_bytes, 0, segment_bytes); glyphs = hb_buffer_get_glyph_infos (b, &len); @@ -718,6 +720,7 @@ test_buffer_utf16_conversion (void) unsigned int i; b = hb_buffer_create (); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -1); for (i = 0; i < G_N_ELEMENTS (utf16_conversion_tests); i++) { @@ -732,7 +735,7 @@ test_buffer_utf16_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf16 (b, test->utf16, u_len, 1, u_len - 2); glyphs = hb_buffer_get_glyph_infos (b, &len); @@ -752,15 +755,15 @@ typedef struct { /* note: we skip the first and last item from utf32 when adding to buffer */ static const utf32_conversion_test_t utf32_conversion_tests[] = { - {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -1, -1}}, + {{0x41, 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x61} , {0x004D, 0x0430, 0x4E8C, -3, -3}}, {{0x41, 0x004D, 0x0430, 0x4E8C, 0x10302, 0x61} , {0x004D, 0x0430, 0x4E8C, 0x10302}}, - {{0x41, 0xD800, 0xDF02, 0x61}, {-1, -1}}, - {{0x41, 0xD800, 0xDF02}, {-1}}, - {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -1}}, - {{0x41, 0xD800, 0x61, 0xDF02}, {-1, 0x61}}, - {{0x41, 0xDF00, 0x61}, {-1}}, + {{0x41, 0xD800, 0xDF02, 0x61}, {-3, -3}}, + {{0x41, 0xD800, 0xDF02}, {-3}}, + {{0x41, 0x61, 0xD800, 0xDF02}, {0x61, -3}}, + {{0x41, 0xD800, 0x61, 0xDF02}, {-3, 0x61}}, + {{0x41, 0xDF00, 0x61}, {-3}}, {{0x41, 0x10FFFF, 0x61}, {0x10FFFF}}, - {{0x41, 0x110000, 0x61}, {-1}}, + {{0x41, 0x110000, 0x61}, {-3}}, {{0x41, 0x61}, {0}} }; @@ -771,6 +774,7 @@ test_buffer_utf32_conversion (void) unsigned int i; b = hb_buffer_create 
(); + hb_buffer_set_replacement_codepoint (b, (hb_codepoint_t) -3); for (i = 0; i < G_N_ELEMENTS (utf32_conversion_tests); i++) { @@ -785,7 +789,7 @@ test_buffer_utf32_conversion (void) for (chars = 0; test->codepoints[chars]; chars++) ; - hb_buffer_reset (b); + hb_buffer_clear_contents (b); hb_buffer_add_utf32 (b, test->utf32, u_len, 1, u_len - 2); glyphs = hb_buffer_get_glyph_infos (b, &len);