From 3b7831851052ecf2611a115cc2b80ef970d83df8 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Wed, 3 Oct 2018 19:44:15 +0200 Subject: [PATCH] [emoji] Mark emoji continuation sequences as continuation This adds a new grapheme bit. Not used yet. Part of https://github.com/harfbuzz/harfbuzz/issues/1159 --- src/hb-ot-layout.hh | 83 +++++++++++++++++++++------------------------ src/hb-ot-shape.cc | 24 +++++++++++++ src/hb-unicode.hh | 5 --- 3 files changed, 63 insertions(+), 49 deletions(-) diff --git a/src/hb-ot-layout.hh b/src/hb-ot-layout.hh index 7a787b77e..24ff8ec56 100644 --- a/src/hb-ot-layout.hh +++ b/src/hb-ot-layout.hh @@ -160,12 +160,12 @@ hb_ot_layout_position_finish_offsets (hb_font_t *font, #define foreach_syllable(buffer, start, end) \ for (unsigned int \ _count = buffer->len, \ - start = 0, end = _count ? _next_syllable (buffer, 0) : 0; \ + start = 0, end = _count ? _hb_next_syllable (buffer, 0) : 0; \ start < _count; \ - start = end, end = _next_syllable (buffer, start)) + start = end, end = _hb_next_syllable (buffer, start)) static inline unsigned int -_next_syllable (hb_buffer_t *buffer, unsigned int start) +_hb_next_syllable (hb_buffer_t *buffer, unsigned int start) { hb_glyph_info_t *info = buffer->info; unsigned int count = buffer->len; @@ -188,7 +188,7 @@ _next_syllable (hb_buffer_t *buffer, unsigned int start) * * Whether it's one of the three Mongolian Free Variation Selectors, * CGJ, or other characters that are hidden but should not be ignored * like most other Default_Ignorable()s do during matching. - * * One free bit right now. + * * Whether it's a grapheme continuation. * * The high-byte has different meanings, switched by the Gen-Cat: * - For Mn,Mc,Me: the modified Combining_Class. 
@@ -202,6 +202,7 @@ enum hb_unicode_props_flags_t { UPROPS_MASK_IGNORABLE = 0x0020u, UPROPS_MASK_HIDDEN = 0x0040u, /* MONGOLIAN FREE VARIATION SELECTOR 1..3, * or TAG characters */ + UPROPS_MASK_CONTINUATION=0x0080u, /* If GEN_CAT=FORMAT, top byte masks: */ UPROPS_MASK_Cf_ZWJ = 0x0100u, @@ -220,6 +221,7 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer) if (u >= 0x80) { buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_NON_ASCII; + if (unlikely (unicode->is_default_ignorable (u))) { buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_DEFAULT_IGNORABLES; @@ -245,24 +247,10 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer) props |= UPROPS_MASK_HIDDEN; } } - else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK (gen_cat))) - { - /* The above check is just an optimization to let in only things we need further - * processing on. */ - /* Only Mn and Mc can have non-zero ccc: - * https://unicode.org/policies/stability_policy.html#Property_Value - * """ - * Canonical_Combining_Class, General_Category - * All characters other than those with General_Category property values - * Spacing_Mark (Mc) and Nonspacing_Mark (Mn) have the Canonical_Combining_Class - * property value 0. - * 1.1.5+ - * """ - * - * Also, all Mn's that are Default_Ignorable, have ccc=0, hence - * the "else if". - */ + if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_MARK (gen_cat))) + { + props |= UPROPS_MASK_CONTINUATION; props |= unicode->modified_combining_class (u)<<8; } } @@ -302,29 +290,6 @@ _hb_glyph_info_get_modified_combining_class (const hb_glyph_info_t *info) { return _hb_glyph_info_is_unicode_mark (info) ? info->unicode_props()>>8 : 0; } - - -/* Loop over grapheme. Based on foreach_cluster(). */ -#define foreach_grapheme(buffer, start, end) \ - for (unsigned int \ - _count = buffer->len, \ - start = 0, end = _count ? 
_next_grapheme (buffer, 0) : 0; \ - start < _count; \ - start = end, end = _next_grapheme (buffer, start)) - -static inline unsigned int -_next_grapheme (hb_buffer_t *buffer, unsigned int start) -{ - hb_glyph_info_t *info = buffer->info; - unsigned int count = buffer->len; - - while (++start < count && _hb_glyph_info_is_unicode_mark (&info[start])) - ; - - return start; -} - - #define info_cc(info) (_hb_glyph_info_get_modified_combining_class (&(info))) static inline bool @@ -369,6 +334,36 @@ _hb_glyph_info_unhide (hb_glyph_info_t *info) info->unicode_props() &= ~ UPROPS_MASK_HIDDEN; } +static inline void +_hb_glyph_info_set_continuation (hb_glyph_info_t *info) +{ + info->unicode_props() |= UPROPS_MASK_CONTINUATION; +} +static inline bool +_hb_glyph_info_is_continuation (const hb_glyph_info_t *info) +{ + return info->unicode_props() & UPROPS_MASK_CONTINUATION; +} +/* Loop over grapheme. Based on foreach_cluster(). */ +#define foreach_grapheme(buffer, start, end) \ + for (unsigned int \ + _count = buffer->len, \ + start = 0, end = _count ? _hb_next_grapheme (buffer, 0) : 0; \ + start < _count; \ + start = end, end = _hb_next_grapheme (buffer, start)) + +static inline unsigned int +_hb_next_grapheme (hb_buffer_t *buffer, unsigned int start) +{ + hb_glyph_info_t *info = buffer->info; + unsigned int count = buffer->len; + + while (++start < count && _hb_glyph_info_is_continuation (&info[start])) + ; + + return start; +} + static inline bool _hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info) { diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc index 0cea9c662..9e73e3580 100644 --- a/src/hb-ot-shape.cc +++ b/src/hb-ot-shape.cc @@ -275,10 +275,34 @@ struct hb_ot_shape_context_t static void hb_set_unicode_props (hb_buffer_t *buffer) { + /* Implement enough of Unicode Graphemes here that shaping + * in reverse-direction wouldn't break graphemes. Namely, + * we mark all marks and ZWJ and ZWJ,Extended_Pictographic + * sequences as continuations. 
The foreach_grapheme() + * macro uses this bit. + * + * https://www.unicode.org/reports/tr29/#Regex_Definitions + */ unsigned int count = buffer->len; hb_glyph_info_t *info = buffer->info; for (unsigned int i = 0; i < count; i++) + { _hb_glyph_info_set_unicode_props (&info[i], buffer); + + /* Marks are already set as continuation by the above line. + * Handle ZWJ-continuation. */ + if (unlikely (_hb_glyph_info_is_zwj (&info[i]))) + { + _hb_glyph_info_set_continuation (&info[i]); + if (i + 1 < count && + _hb_unicode_is_emoji_Extended_Pictographic (info[i + 1].codepoint)) + { + i++; + _hb_glyph_info_set_unicode_props (&info[i], buffer); + _hb_glyph_info_set_continuation (&info[i]); + } + } + } } static void diff --git a/src/hb-unicode.hh b/src/hb-unicode.hh index eac52eaa5..106fbf0aa 100644 --- a/src/hb-unicode.hh +++ b/src/hb-unicode.hh @@ -381,11 +381,6 @@ DECLARE_NULL_INSTANCE (hb_unicode_funcs_t); FLAG (HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) | \ FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK))) -#define HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK(gen_cat) \ - (FLAG_UNSAFE (gen_cat) & \ - (FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) | \ - FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK))) - /* * Ranges, used for bsearch tables.