Improve CGJ skipping logic

Previously we made CGJ unskippable. Now, if CGJ did NOT prevent any reordering, allow skipping over it. To make this work we had to make changes to the Arabic mark reordering algorithm implementation to renumber moved MCM marks. See comments. Fixes https://github.com/harfbuzz/harfbuzz/issues/554
2018-01-05 12:46:12 +00:00 · 2018-01-05 12:46:12 +00:00 · 8c0d1916a4
parent 293e443529
commit 8c0d1916a4
5 changed files with 58 additions and 25 deletions
--- a/src/hb-buffer-private.hh
+++ b/src/hb-buffer-private.hh
@ -69,6 +69,7 @@ enum hb_buffer_scratch_flags_t {
  HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK		= 0x00000004u,
  HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT		= 0x00000008u,
  HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK		= 0x00000010u,
+  HB_BUFFER_SCRATCH_FLAG_HAS_CGJ			= 0x00000020u,

  /* Reserved for complex shapers' internal use. */
  HB_BUFFER_SCRATCH_FLAG_COMPLEX0			= 0x01000000u,
--- a/src/hb-ot-layout-private.hh
+++ b/src/hb-ot-layout-private.hh
@ -280,7 +280,11 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer)
      else if (unlikely (hb_in_range (u, 0xE0020u, 0xE007Fu))) props |= UPROPS_MASK_HIDDEN;
      /* COMBINING GRAPHEME JOINER should not be skipped; at least some times.
       * https://github.com/harfbuzz/harfbuzz/issues/554 */
-      else if (unlikely (u == 0x034Fu)) props |= UPROPS_MASK_HIDDEN;
+      else if (unlikely (u == 0x034Fu))
+      {
+	buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_CGJ;
+	props |= UPROPS_MASK_HIDDEN;
+      }
    }
    else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK_OR_MODIFIER_SYMBOL (gen_cat)))
    {
@ -388,6 +392,11 @@ _hb_glyph_info_is_default_ignorable_and_not_hidden (const hb_glyph_info_t *info)
 	  == UPROPS_MASK_IGNORABLE) &&
 	 !_hb_glyph_info_ligated (info);
 }
+static inline void
+_hb_glyph_info_unhide (hb_glyph_info_t *info)
+{
+  info->unicode_props() &= ~ UPROPS_MASK_HIDDEN;
+}

 static inline bool
 _hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info)
--- a/src/hb-ot-shape-complex-arabic.cc
+++ b/src/hb-ot-shape-complex-arabic.cc
@ -644,13 +644,15 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
 {
  hb_glyph_info_t *info = buffer->info;

+  DEBUG_MSG (ARABIC, buffer, "Reordering marks from %d to %d", start, end);
+
  unsigned int i = start;
  for (unsigned int cc = 220; cc <= 230; cc += 10)
  {
-    DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d\n", cc, i);
+    DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d", cc, i);
    while (i < end && info_cc(info[i]) < cc)
      i++;
-    DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d\n", cc, i);
+    DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d", cc, i);

    if (i == end)
      break;
@ -658,20 +660,17 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
    if (info_cc(info[i]) > cc)
      continue;

-    /* Technically we should also check "info_cc(info[j]) == cc"
-     * in the following loop.  But not doing it is safe; we might
-     * end up moving all the 220 MCMs and 230 MCMs together in one
-     * move and be done. */
    unsigned int j = i;
-    while (j < end && info_is_mcm (info[j]))
+    while (j < end && info_cc(info[j]) == cc && info_is_mcm (info[j]))
      j++;
-    DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d\n", cc, i, j);

    if (i == j)
      continue;

+    DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d", cc, i, j);
+
    /* Shift it! */
-    DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d\n", cc, i, j);
+    DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d", cc, i, j);
    hb_glyph_info_t temp[HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS];
    assert (j - i <= ARRAY_LENGTH (temp));
    buffer->merge_clusters (start, j);
@ -679,7 +678,25 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan,
    memmove (&info[start + j - i], &info[start], (i - start) * sizeof (hb_glyph_info_t));
    memmove (&info[start], temp, (j - i) * sizeof (hb_glyph_info_t));

-    start += j - i;
+    /* Renumber CC such that the reordered sequence is still sorted.
+     * 22 and 26 are chosen because they are smaller than all Arabic categories,
+     * and are folded back to 220/230 respectively during fallback mark positioning.
+     *
+     * We do this because the CGJ-handling logic in the normalizer relies on
+     * mark sequences having an increasing order even after this reordering.
+     * https://github.com/harfbuzz/harfbuzz/issues/554
+     * This, however, does break some obscure sequences, where the normalizer
+     * might compose a sequence that it should not.  For example, in the seequence
+     * ALEF, HAMZAH, MADDAH, we should NOT try to compose ALEF+MADDAH, but with this
+     * renumbering, we will.
+     */
+    unsigned int new_start = start + j - i;
+    unsigned int new_cc = cc == 220 ? HB_MODIFIED_COMBINING_CLASS_CCC22 : HB_MODIFIED_COMBINING_CLASS_CCC26;
+    while (start < new_start)
+    {
+      _hb_glyph_info_set_modified_combining_class (&info[start], new_cc);
+      start++;
+    }

    i = j;
  }
--- a/src/hb-ot-shape-normalize.cc
+++ b/src/hb-ot-shape-normalize.cc
@ -345,9 +345,8 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
      if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0)
        break;

-    /* We are going to do a O(n^2).  Only do this if the sequence is short,
-     * but not too short ;). */
-    if (end - i < 2 || end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
+    /* We are going to do a O(n^2).  Only do this if the sequence is short. */
+    if (end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) {
      i = end;
      continue;
    }
@ -373,13 +372,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,
  buffer->clear_output ();
  count = buffer->len;
  unsigned int starter = 0;
-  bool combine = true;
  buffer->next_glyph ();
  while (buffer->idx < count && !buffer->in_error)
  {
    hb_codepoint_t composed, glyph;
-    if (combine &&
-	/* We don't try to compose a non-mark character with it's preceding starter.
+    if (/* We don't try to compose a non-mark character with it's preceding starter.
 	 * This is both an optimization to avoid trying to compose every two neighboring
 	 * glyphs in most scripts AND a desired feature for Hangul.  Apparently Hangul
 	 * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */
@ -410,22 +407,27 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan,

 	continue;
      }
-      else if (/* We sometimes custom-tailor the sorted order of marks. In that case, stop
-		* trying to combine as soon as combining-class drops. */
-	       starter < buffer->out_len - 1 &&
-	       info_cc (buffer->prev()) > info_cc (buffer->cur()))
-        combine = false;
    }

    /* Blocked, or doesn't compose. */
    buffer->next_glyph ();

    if (info_cc (buffer->prev()) == 0)
-    {
      starter = buffer->out_len - 1;
-      combine = true;
-    }
  }
  buffer->swap_buffers ();

+  if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ)
+  {
+    /* For all CGJ, check if it prevented any reordering at all.
+     * If it did NOT, then make it skippable.
+     * https://github.com/harfbuzz/harfbuzz/issues/554
+     */
+    for (unsigned int i = 1; i + 1 < buffer->len; i++)
+      if (buffer->info[i].codepoint == 0x034Fu/*CGJ*/ &&
+	  info_cc(buffer->info[i-1]) <= info_cc(buffer->info[i+1]))
+      {
+	_hb_glyph_info_unhide (&buffer->info[i]);
+      }
+  }
 }
--- a/test/shaping/tests/arabic-mark-order.tests
+++ b/test/shaping/tests/arabic-mark-order.tests
@ -1,2 +1,6 @@
 fonts/sha1sum/94a5d6fb15a27521fba9ea4aee9cb39b2d03322a.ttf::U+064A,U+064E,U+0670,U+0653,U+0640,U+0654,U+064E,U+0627:[afii57415.zz04=7+481|afii57454=4@25,975+0|uni0654=4@-50,50+0|afii57440=4+650|uni0670_uni0653=0@75,400+0|afii57454=0@750,1125+0|afii57450.calt=0+1331]
 fonts/sha1sum/24b8d24d00ae86f49791b746da4c9d3f717a51a8.ttf::U+0628,U+0618,U+0619,U+064E,U+064F,U+0654,U+0658,U+0653,U+0654,U+0651,U+0656,U+0651,U+065C,U+0655,U+0650:[uni0653.small=0@266,2508+0|uni0654=0@308,2151+0|uni0655=0@518,-1544+0|uni065C=0@501,-1453+0|uni0656=0@573,-659+0|uni0650=0@500,133+0|uni0619=0@300,1807+0|uni0618=0@357,1674+0|uni0651064E=0@387,1178+0|uni0651=0@402,764+0|uni0658=0@424,404+0|uni0654064F=0@540,-435+0|uni0628=0+1352]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+034F,U+0650:[uni0650.small2=0@727,-774+0|space=0+0|uni0655=0@727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+0650:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+0655:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566]
+fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+034F,U+0655:[uni0655=0+0|space=0+0|uni0650=0@166,0+0|uni0649=0+1566]