Improve CGJ skipping logic

Previously we made CGJ unskippable.  Now, if CGJ did NOT prevent
any reordering, allow skipping over it.  To make this work we
had to make changes to the Arabic mark reordering algorithm
implementation to renumber moved MCM marks.  See comments.

Fixes https://github.com/harfbuzz/harfbuzz/issues/554
This commit is contained in:
Behdad Esfahbod 2018-01-05 12:46:12 +00:00
parent 293e443529
commit 8c0d1916a4
5 changed files with 58 additions and 25 deletions

View File

@ -69,6 +69,7 @@ enum hb_buffer_scratch_flags_t {
HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK = 0x00000004u, HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK = 0x00000004u,
HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT = 0x00000008u, HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT = 0x00000008u,
HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK = 0x00000010u, HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK = 0x00000010u,
HB_BUFFER_SCRATCH_FLAG_HAS_CGJ = 0x00000020u,
/* Reserved for complex shapers' internal use. */ /* Reserved for complex shapers' internal use. */
HB_BUFFER_SCRATCH_FLAG_COMPLEX0 = 0x01000000u, HB_BUFFER_SCRATCH_FLAG_COMPLEX0 = 0x01000000u,

View File

@ -280,7 +280,11 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer)
else if (unlikely (hb_in_range (u, 0xE0020u, 0xE007Fu))) props |= UPROPS_MASK_HIDDEN; else if (unlikely (hb_in_range (u, 0xE0020u, 0xE007Fu))) props |= UPROPS_MASK_HIDDEN;
/* COMBINING GRAPHEME JOINER should not be skipped; at least some times. /* COMBINING GRAPHEME JOINER should not be skipped; at least some times.
* https://github.com/harfbuzz/harfbuzz/issues/554 */ * https://github.com/harfbuzz/harfbuzz/issues/554 */
else if (unlikely (u == 0x034Fu)) props |= UPROPS_MASK_HIDDEN; else if (unlikely (u == 0x034Fu))
{
buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_CGJ;
props |= UPROPS_MASK_HIDDEN;
}
} }
else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK_OR_MODIFIER_SYMBOL (gen_cat))) else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK_OR_MODIFIER_SYMBOL (gen_cat)))
{ {
@ -388,6 +392,11 @@ _hb_glyph_info_is_default_ignorable_and_not_hidden (const hb_glyph_info_t *info)
== UPROPS_MASK_IGNORABLE) && == UPROPS_MASK_IGNORABLE) &&
!_hb_glyph_info_ligated (info); !_hb_glyph_info_ligated (info);
} }
static inline void
_hb_glyph_info_unhide (hb_glyph_info_t *info)
{
info->unicode_props() &= ~ UPROPS_MASK_HIDDEN;
}
static inline bool static inline bool
_hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info) _hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info)

View File

@ -644,13 +644,15 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
{ {
hb_glyph_info_t *info = buffer->info; hb_glyph_info_t *info = buffer->info;
DEBUG_MSG (ARABIC, buffer, "Reordering marks from %d to %d", start, end);
unsigned int i = start; unsigned int i = start;
for (unsigned int cc = 220; cc <= 230; cc += 10) for (unsigned int cc = 220; cc <= 230; cc += 10)
{ {
DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d\n", cc, i); DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d", cc, i);
while (i < end && info_cc(info[i]) < cc) while (i < end && info_cc(info[i]) < cc)
i++; i++;
DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d\n", cc, i); DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d", cc, i);
if (i == end) if (i == end)
break; break;
@ -658,20 +660,17 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
if (info_cc(info[i]) > cc) if (info_cc(info[i]) > cc)
continue; continue;
/* Technically we should also check "info_cc(info[j]) == cc"
* in the following loop. But not doing it is safe; we might
* end up moving all the 220 MCMs and 230 MCMs together in one
* move and be done. */
unsigned int j = i; unsigned int j = i;
while (j < end && info_is_mcm (info[j])) while (j < end && info_cc(info[j]) == cc && info_is_mcm (info[j]))
j++; j++;
DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d\n", cc, i, j);
if (i == j) if (i == j)
continue; continue;
DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d", cc, i, j);
/* Shift it! */ /* Shift it! */
DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d\n", cc, i, j); DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d", cc, i, j);
hb_glyph_info_t temp[HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS]; hb_glyph_info_t temp[HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS];
assert (j - i <= ARRAY_LENGTH (temp)); assert (j - i <= ARRAY_LENGTH (temp));
buffer->merge_clusters (start, j); buffer->merge_clusters (start, j);
@ -679,7 +678,25 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
memmove (&info[start + j - i], &info[start], (i - start) * sizeof (hb_glyph_info_t)); memmove (&info[start + j - i], &info[start], (i - start) * sizeof (hb_glyph_info_t));
memmove (&info[start], temp, (j - i) * sizeof (hb_glyph_info_t)); memmove (&info[start], temp, (j - i) * sizeof (hb_glyph_info_t));
start += j - i; /* Renumber CC such that the reordered sequence is still sorted.
* 22 and 26 are chosen because they are smaller than all Arabic categories,
* and are folded back to 220/230 respectively during fallback mark positioning.
*
* We do this because the CGJ-handling logic in the normalizer relies on
* mark sequences having an increasing order even after this reordering.
* https://github.com/harfbuzz/harfbuzz/issues/554
* This, however, does break some obscure sequences, where the normalizer
* might compose a sequence that it should not. For example, in the seequence
* ALEF, HAMZAH, MADDAH, we should NOT try to compose ALEF+MADDAH, but with this
* renumbering, we will.
*/
unsigned int new_start = start + j - i;
unsigned int new_cc = cc == 220 ? HB_MODIFIED_COMBINING_CLASS_CCC22 : HB_MODIFIED_COMBINING_CLASS_CCC26;
while (start < new_start)
{
_hb_glyph_info_set_modified_combining_class (&info[start], new_cc);
start++;
}
i = j; i = j;
} }

View File

@ -345,9 +345,8 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0) if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
break; break;
/* We are going to do a O(n^2). Only do this if the sequence is short, /* We are going to do a O(n^2). Only do this if the sequence is short. */
* but not too short ;). */ if (end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
if (end - i < 2 || end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
i = end; i = end;
continue; continue;
} }
@ -373,13 +372,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
buffer->clear_output (); buffer->clear_output ();
count = buffer->len; count = buffer->len;
unsigned int starter = 0; unsigned int starter = 0;
bool combine = true;
buffer->next_glyph (); buffer->next_glyph ();
while (buffer->idx < count && !buffer->in_error) while (buffer->idx < count && !buffer->in_error)
{ {
hb_codepoint_t composed, glyph; hb_codepoint_t composed, glyph;
if (combine && if (/* We don't try to compose a non-mark character with it's preceding starter.
/* We don't try to compose a non-mark character with it's preceding starter.
* This is both an optimization to avoid trying to compose every two neighboring * This is both an optimization to avoid trying to compose every two neighboring
* glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul * glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
* fonts are not designed to mix-and-match pre-composed syllables and Jamo. */ * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */
@ -410,22 +407,27 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
continue; continue;
} }
else if (/* We sometimes custom-tailor the sorted order of marks. In that case, stop
* trying to combine as soon as combining-class drops. */
starter < buffer->out_len - 1 &&
info_cc (buffer->prev()) > info_cc (buffer->cur()))
combine = false;
} }
/* Blocked, or doesn't compose. */ /* Blocked, or doesn't compose. */
buffer->next_glyph (); buffer->next_glyph ();
if (info_cc (buffer->prev()) == 0) if (info_cc (buffer->prev()) == 0)
{
starter = buffer->out_len - 1; starter = buffer->out_len - 1;
combine = true;
}
} }
buffer->swap_buffers (); buffer->swap_buffers ();
if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ)
{
/* For all CGJ, check if it prevented any reordering at all.
* If it did NOT, then make it skippable.
* https://github.com/harfbuzz/harfbuzz/issues/554
*/
for (unsigned int i = 1; i + 1 < buffer->len; i++)
if (buffer->info[i].codepoint == 0x034Fu/*CGJ*/ &&
info_cc(buffer->info[i-1]) <= info_cc(buffer->info[i+1]))
{
_hb_glyph_info_unhide (&buffer->info[i]);
}
}
} }

View File

@ -1,2 +1,6 @@
fonts/sha1sum/94a5d6fb15a27521fba9ea4aee9cb39b2d03322a.ttf::U+064A,U+064E,U+0670,U+0653,U+0640,U+0654,U+064E,U+0627:[afii57415.zz04=7+481|afii57454=4@25,975+0|uni0654=4@-50,50+0|afii57440=4+650|uni0670_uni0653=0@75,400+0|afii57454=0@750,1125+0|afii57450.calt=0+1331] fonts/sha1sum/94a5d6fb15a27521fba9ea4aee9cb39b2d03322a.ttf::U+064A,U+064E,U+0670,U+0653,U+0640,U+0654,U+064E,U+0627:[afii57415.zz04=7+481|afii57454=4@25,975+0|uni0654=4@-50,50+0|afii57440=4+650|uni0670_uni0653=0@75,400+0|afii57454=0@750,1125+0|afii57450.calt=0+1331]
fonts/sha1sum/24b8d24d00ae86f49791b746da4c9d3f717a51a8.ttf::U+0628,U+0618,U+0619,U+064E,U+064F,U+0654,U+0658,U+0653,U+0654,U+0651,U+0656,U+0651,U+065C,U+0655,U+0650:[uni0653.small=0@266,2508+0|uni0654=0@308,2151+0|uni0655=0@518,-1544+0|uni065C=0@501,-1453+0|uni0656=0@573,-659+0|uni0650=0@500,133+0|uni0619=0@300,1807+0|uni0618=0@357,1674+0|uni0651064E=0@387,1178+0|uni0651=0@402,764+0|uni0658=0@424,404+0|uni0654064F=0@540,-435+0|uni0628=0+1352] fonts/sha1sum/24b8d24d00ae86f49791b746da4c9d3f717a51a8.ttf::U+0628,U+0618,U+0619,U+064E,U+064F,U+0654,U+0658,U+0653,U+0654,U+0651,U+0656,U+0651,U+065C,U+0655,U+0650:[uni0653.small=0@266,2508+0|uni0654=0@308,2151+0|uni0655=0@518,-1544+0|uni065C=0@501,-1453+0|uni0656=0@573,-659+0|uni0650=0@500,133+0|uni0619=0@300,1807+0|uni0618=0@357,1674+0|uni0651064E=0@387,1178+0|uni0651=0@402,764+0|uni0658=0@424,404+0|uni0654064F=0@540,-435+0|uni0628=0+1352]
fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+034F,U+0650:[uni0650.small2=0@727,-774+0|space=0+0|uni0655=0@727,-209+0|uni0649=0+1566]
fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+0650:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566]
fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+0655:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566]
fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+034F,U+0655:[uni0655=0+0|space=0+0|uni0650=0@166,0+0|uni0649=0+1566]