[emoji] Mark emoji continuation sequences as continuation
This adds a new grapheme bit. Not used yet. Part of https://github.com/harfbuzz/harfbuzz/issues/1159
This commit is contained in:
parent
123326e20a
commit
3b78318510
|
@ -160,12 +160,12 @@ hb_ot_layout_position_finish_offsets (hb_font_t *font,
|
|||
#define foreach_syllable(buffer, start, end) \
|
||||
for (unsigned int \
|
||||
_count = buffer->len, \
|
||||
start = 0, end = _count ? _next_syllable (buffer, 0) : 0; \
|
||||
start = 0, end = _count ? _hb_next_syllable (buffer, 0) : 0; \
|
||||
start < _count; \
|
||||
start = end, end = _next_syllable (buffer, start))
|
||||
start = end, end = _hb_next_syllable (buffer, start))
|
||||
|
||||
static inline unsigned int
|
||||
_next_syllable (hb_buffer_t *buffer, unsigned int start)
|
||||
_hb_next_syllable (hb_buffer_t *buffer, unsigned int start)
|
||||
{
|
||||
hb_glyph_info_t *info = buffer->info;
|
||||
unsigned int count = buffer->len;
|
||||
|
@ -188,7 +188,7 @@ _next_syllable (hb_buffer_t *buffer, unsigned int start)
|
|||
* * Whether it's one of the three Mongolian Free Variation Selectors,
|
||||
* CGJ, or other characters that are hidden but should not be ignored
|
||||
* like most other Default_Ignorable()s do during matching.
|
||||
* * One free bit right now.
|
||||
* * Whether it's a grapheme continuation.
|
||||
*
|
||||
* The high-byte has different meanings, switched by the Gen-Cat:
|
||||
* - For Mn,Mc,Me: the modified Combining_Class.
|
||||
|
@ -202,6 +202,7 @@ enum hb_unicode_props_flags_t {
|
|||
UPROPS_MASK_IGNORABLE = 0x0020u,
|
||||
UPROPS_MASK_HIDDEN = 0x0040u, /* MONGOLIAN FREE VARIATION SELECTOR 1..3,
|
||||
* or TAG characters */
|
||||
UPROPS_MASK_CONTINUATION=0x0080u,
|
||||
|
||||
/* If GEN_CAT=FORMAT, top byte masks: */
|
||||
UPROPS_MASK_Cf_ZWJ = 0x0100u,
|
||||
|
@ -220,6 +221,7 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer)
|
|||
if (u >= 0x80)
|
||||
{
|
||||
buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_NON_ASCII;
|
||||
|
||||
if (unlikely (unicode->is_default_ignorable (u)))
|
||||
{
|
||||
buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_DEFAULT_IGNORABLES;
|
||||
|
@ -245,24 +247,10 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer)
|
|||
props |= UPROPS_MASK_HIDDEN;
|
||||
}
|
||||
}
|
||||
else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK (gen_cat)))
|
||||
{
|
||||
/* The above check is just an optimization to let in only things we need further
|
||||
* processing on. */
|
||||
|
||||
/* Only Mn and Mc can have non-zero ccc:
|
||||
* https://unicode.org/policies/stability_policy.html#Property_Value
|
||||
* """
|
||||
* Canonical_Combining_Class, General_Category
|
||||
* All characters other than those with General_Category property values
|
||||
* Spacing_Mark (Mc) and Nonspacing_Mark (Mn) have the Canonical_Combining_Class
|
||||
* property value 0.
|
||||
* 1.1.5+
|
||||
* """
|
||||
*
|
||||
* Also, all Mn's that are Default_Ignorable, have ccc=0, hence
|
||||
* the "else if".
|
||||
*/
|
||||
if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_MARK (gen_cat)))
|
||||
{
|
||||
props |= UPROPS_MASK_CONTINUATION;
|
||||
props |= unicode->modified_combining_class (u)<<8;
|
||||
}
|
||||
}
|
||||
|
@ -302,29 +290,6 @@ _hb_glyph_info_get_modified_combining_class (const hb_glyph_info_t *info)
|
|||
{
|
||||
return _hb_glyph_info_is_unicode_mark (info) ? info->unicode_props()>>8 : 0;
|
||||
}
|
||||
|
||||
|
||||
/* Loop over grapheme. Based on foreach_cluster(). */
|
||||
#define foreach_grapheme(buffer, start, end) \
|
||||
for (unsigned int \
|
||||
_count = buffer->len, \
|
||||
start = 0, end = _count ? _next_grapheme (buffer, 0) : 0; \
|
||||
start < _count; \
|
||||
start = end, end = _next_grapheme (buffer, start))
|
||||
|
||||
static inline unsigned int
|
||||
_next_grapheme (hb_buffer_t *buffer, unsigned int start)
|
||||
{
|
||||
hb_glyph_info_t *info = buffer->info;
|
||||
unsigned int count = buffer->len;
|
||||
|
||||
while (++start < count && _hb_glyph_info_is_unicode_mark (&info[start]))
|
||||
;
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
#define info_cc(info) (_hb_glyph_info_get_modified_combining_class (&(info)))
|
||||
|
||||
static inline bool
|
||||
|
@ -369,6 +334,36 @@ _hb_glyph_info_unhide (hb_glyph_info_t *info)
|
|||
info->unicode_props() &= ~ UPROPS_MASK_HIDDEN;
|
||||
}
|
||||
|
||||
/* Sets the UPROPS_MASK_CONTINUATION bit in info's unicode_props().
 * The bit is OR-ed in, so all other bits keep their current value. */
static inline void
|
||||
_hb_glyph_info_set_continuation (hb_glyph_info_t *info)
|
||||
{
|
||||
info->unicode_props() |= UPROPS_MASK_CONTINUATION;
|
||||
}
|
||||
/* Returns true when the UPROPS_MASK_CONTINUATION bit is set in info's
 * unicode_props(), false otherwise. Does not modify info. */
static inline bool
|
||||
_hb_glyph_info_is_continuation (const hb_glyph_info_t *info)
|
||||
{
|
||||
return info->unicode_props() & UPROPS_MASK_CONTINUATION;
|
||||
}
|
||||
/* Loop over graphemes. Based on foreach_cluster().
 *
 * Expands to a for-statement header that declares `start` and `end` as
 * unsigned int loop variables. Each iteration sees the half-open range
 * [start, end), where `end` comes from _hb_next_grapheme(). A buffer with
 * len == 0 runs zero iterations and never calls _hb_next_grapheme().
 *
 * `buffer` is expanded several times, so pass an expression without side
 * effects. The length is read once into `_count` when the loop starts. */
|
||||
#define foreach_grapheme(buffer, start, end) \
|
||||
for (unsigned int \
|
||||
_count = buffer->len, \
|
||||
start = 0, end = _count ? _hb_next_grapheme (buffer, 0) : 0; \
|
||||
start < _count; \
|
||||
start = end, end = _hb_next_grapheme (buffer, start))
|
||||
|
||||
/* Returns the index just past the grapheme that begins at `start`.
 *
 * The glyph at `start` itself is never tested: the index is advanced first,
 * then every following glyph for which _hb_glyph_info_is_continuation() is
 * true is skipped. For start < buffer->len the result is at most
 * buffer->len (reached when the continuation run extends to the end). */
static inline unsigned int
|
||||
_hb_next_grapheme (hb_buffer_t *buffer, unsigned int start)
|
||||
{
|
||||
hb_glyph_info_t *info = buffer->info;
|
||||
unsigned int count = buffer->len;
|
||||
|
||||
/* The bounds check comes first, so info[start] is only read in range. */
while (++start < count && _hb_glyph_info_is_continuation (&info[start]))
|
||||
/* Empty body: all the work happens in the loop condition. */
;
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
_hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info)
|
||||
{
|
||||
|
|
|
@ -275,10 +275,34 @@ struct hb_ot_shape_context_t
|
|||
/* Sets the unicode props of every glyph in `buffer` by calling
 * _hb_glyph_info_set_unicode_props() on each one, and additionally flags
 * each ZWJ, plus an Extended_Pictographic codepoint directly following it,
 * as a grapheme continuation. */
static void
|
||||
hb_set_unicode_props (hb_buffer_t *buffer)
|
||||
{
|
||||
/* Implement enough of Unicode Graphemes here that shaping
|
||||
* in reverse-direction wouldn't break graphemes. Namely,
|
||||
* we mark all marks and ZWJ and ZWJ,Extended_Pictographic
|
||||
* sequences as continuations. The foreach_grapheme()
|
||||
* macro uses this bit.
|
||||
*
|
||||
* https://www.unicode.org/reports/tr29/#Regex_Definitions
|
||||
*/
|
||||
unsigned int count = buffer->len;
|
||||
hb_glyph_info_t *info = buffer->info;
|
||||
for (unsigned int i = 0; i < count; i++)
|
||||
{
|
||||
_hb_glyph_info_set_unicode_props (&info[i], buffer);
|
||||
|
||||
/* Marks are already set as continuation by the above line.
|
||||
* Handle ZWJ-continuation. */
|
||||
if (unlikely (_hb_glyph_info_is_zwj (&info[i])))
|
||||
{
|
||||
_hb_glyph_info_set_continuation (&info[i]);
|
||||
/* Only look at the following glyph when one exists. */
if (i + 1 < count &&
|
||||
_hb_unicode_is_emoji_Extended_Pictographic (info[i + 1].codepoint))
|
||||
{
|
||||
/* Consume the pictographic glyph here: the outer loop's i++ then moves
 * past it, so its props have to be computed in this branch.
 * NOTE(review): props are set before the continuation bit, presumably
 * because _hb_glyph_info_set_unicode_props() rewrites the whole props
 * word -- confirm against its definition. */
i++;
|
||||
_hb_glyph_info_set_unicode_props (&info[i], buffer);
|
||||
_hb_glyph_info_set_continuation (&info[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -381,11 +381,6 @@ DECLARE_NULL_INSTANCE (hb_unicode_funcs_t);
|
|||
FLAG (HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) | \
|
||||
FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)))
|
||||
|
||||
#define HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK(gen_cat) \
|
||||
(FLAG_UNSAFE (gen_cat) & \
|
||||
(FLAG (HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK) | \
|
||||
FLAG (HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)))
|
||||
|
||||
|
||||
/*
|
||||
* Ranges, used for bsearch tables.
|
||||
|
|
Loading…
Reference in New Issue