From 6e74c64211b6aaac48bae8c87f9420d8dc03fd93 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Mon, 11 Feb 2013 06:50:17 -0500 Subject: [PATCH] Improve normalization heuristic Before, for most scripts, we were not trying to recompose two characters if the second one had ccc=0. That fails for Myanmar where U+1026 decomposes to U+1025,U+102E, both of which have ccc=0. However, we do want to try to recompose those. We now check whether the second is a mark, using general category instead. At the same time, remove optimization that was conflicting with this. [Let the Ngapi hackfest begin!] --- src/hb-ot-shape-normalize.cc | 39 ++++++++++++------------------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index c5325e42b..344c0ff84 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -192,30 +192,23 @@ decompose_compatibility (const hb_ot_shape_normalize_context_t *c, hb_codepoint_ } /* Returns true if recomposition may be benefitial. */ -static inline bool +static inline void decompose_current_character (const hb_ot_shape_normalize_context_t *c, bool shortest) { hb_buffer_t * const buffer = c->buffer; hb_codepoint_t glyph; - unsigned int len = 1; /* Kind of a cute waterfall here... */ if (shortest && c->font->get_glyph (buffer->cur().codepoint, 0, &glyph)) next_char (buffer, glyph); - else if ((len = decompose (c, shortest, buffer->cur().codepoint))) + else if (decompose (c, shortest, buffer->cur().codepoint)) skip_char (buffer); else if (!shortest && c->font->get_glyph (buffer->cur().codepoint, 0, &glyph)) next_char (buffer, glyph); - else if ((len = decompose_compatibility (c, buffer->cur().codepoint))) + else if (decompose_compatibility (c, buffer->cur().codepoint)) skip_char (buffer); else next_char (buffer, glyph); /* glyph is initialized in earlier branches. 
*/ - - /* - * A recomposition would only be useful if we decomposed into at least three - * characters... - */ - return len > 2; } static inline void @@ -239,7 +232,7 @@ handle_variation_selector_cluster (const hb_ot_shape_normalize_context_t *c, uns } /* Returns true if recomposition may be benefitial. */ -static inline bool +static inline void decompose_multi_char_cluster (const hb_ot_shape_normalize_context_t *c, unsigned int end) { hb_buffer_t * const buffer = c->buffer; @@ -247,23 +240,20 @@ decompose_multi_char_cluster (const hb_ot_shape_normalize_context_t *c, unsigned for (unsigned int i = buffer->idx; i < end; i++) if (unlikely (buffer->unicode->is_variation_selector (buffer->info[i].codepoint))) { handle_variation_selector_cluster (c, end); - return false; + return; } while (buffer->idx < end) decompose_current_character (c, false); - /* We can be smarter here and only return true if there are at least two ccc!=0 marks. - * But does not matter. */ - return true; } -static inline bool +static inline void decompose_cluster (const hb_ot_shape_normalize_context_t *c, bool short_circuit, unsigned int end) { if (likely (c->buffer->idx + 1 == end)) - return decompose_current_character (c, short_circuit); + decompose_current_character (c, short_circuit); else - return decompose_multi_char_cluster (c, end); + decompose_multi_char_cluster (c, end); } @@ -296,7 +286,6 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, bool short_circuit = mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED && mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT; - bool can_use_recompose = false; unsigned int count; /* We do a fairly straightforward yet custom normalization process in three @@ -317,15 +306,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, if (buffer->cur().cluster != buffer->info[end].cluster) break; - can_use_recompose = decompose_cluster (&c, short_circuit, end) || can_use_recompose; + decompose_cluster (&c, short_circuit, 
end); } buffer->swap_buffers (); - if (mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL && !can_use_recompose) - return; /* Done! */ - - /* Second round, reorder (inplace) */ count = buffer->len; @@ -369,9 +354,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, { hb_codepoint_t composed, glyph; if (/* If mode is NOT COMPOSED_FULL (ie. it's COMPOSED_DIACRITICS), we don't try to - * compose a CCC=0 character with it's preceding starter. */ + * compose a non-mark character with its preceding starter. This is just an + * optimization to avoid trying to compose every two neighboring glyphs in most + * scripts. */ (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_FULL || - _hb_glyph_info_get_modified_combining_class (&buffer->cur()) != 0) && + HB_UNICODE_GENERAL_CATEGORY_IS_MARK (_hb_glyph_info_get_general_category (&buffer->cur()))) && /* If there's anything between the starter and this char, they should have CCC * smaller than this character's. */ (starter == buffer->out_len - 1 ||