From c98b7183f7dc453d5bac1f2503017cded317a495 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Tue, 31 Dec 2013 15:55:40 +0800 Subject: [PATCH] [ot] Add Hangul shaper Not exhaustively tested, but I think I got the intended logic right. The logic can perhaps be simplified. Maybe we should disable normalization with this shaper. Then again, for now focusing on correctness. --- src/Makefile.am | 1 + src/hb-ot-shape-complex-default.cc | 13 -- src/hb-ot-shape-complex-hangul.cc | 232 +++++++++++++++++++++++++++++ src/hb-ot-shape-complex-private.hh | 10 +- src/hb-private.hh | 6 + 5 files changed, 240 insertions(+), 22 deletions(-) create mode 100644 src/hb-ot-shape-complex-hangul.cc diff --git a/src/Makefile.am b/src/Makefile.am index 67a328c5e..62544dbcb 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -92,6 +92,7 @@ HBSOURCES += \ hb-ot-shape-complex-arabic-fallback.hh \ hb-ot-shape-complex-arabic-table.hh \ hb-ot-shape-complex-default.cc \ + hb-ot-shape-complex-hangul.cc \ hb-ot-shape-complex-indic.cc \ hb-ot-shape-complex-indic-machine.hh \ hb-ot-shape-complex-indic-private.hh \ diff --git a/src/hb-ot-shape-complex-default.cc b/src/hb-ot-shape-complex-default.cc index d6afa0e1c..519790c80 100644 --- a/src/hb-ot-shape-complex-default.cc +++ b/src/hb-ot-shape-complex-default.cc @@ -32,14 +32,6 @@ /* The default shaper *only* adds additional per-script features.*/ -static const hb_tag_t hangul_features[] = -{ - HB_TAG('l','j','m','o'), - HB_TAG('v','j','m','o'), - HB_TAG('t','j','m','o'), - HB_TAG_NONE -}; - static const hb_tag_t tibetan_features[] = { HB_TAG('a','b','v','s'), @@ -56,11 +48,6 @@ collect_features_default (hb_ot_shape_planner_t *plan) switch ((hb_tag_t) plan->props.script) { - /* Unicode-1.1 additions */ - case HB_SCRIPT_HANGUL: - script_features = hangul_features; - break; - /* Unicode-2.0 additions */ case HB_SCRIPT_TIBETAN: script_features = tibetan_features; diff --git a/src/hb-ot-shape-complex-hangul.cc b/src/hb-ot-shape-complex-hangul.cc 
new file mode 100644 index 000000000..1b89f2096 --- /dev/null +++ b/src/hb-ot-shape-complex-hangul.cc @@ -0,0 +1,232 @@ +/* + * Copyright © 2013 Google, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+ *
+ * Google Author(s): Behdad Esfahbod
+ */
+
+#include "hb-ot-shape-complex-private.hh"
+
+
+/* Hangul shaper */
+
+
+static const hb_tag_t hangul_features[] =
+{
+  HB_TAG('l','j','m','o'),
+  HB_TAG('v','j','m','o'),
+  HB_TAG('t','j','m','o'),
+  HB_TAG_NONE
+};
+
+static void
+collect_features_hangul (hb_ot_shape_planner_t *plan)
+{
+  for (const hb_tag_t *script_features = hangul_features; script_features && *script_features; script_features++)
+    plan->map.add_global_bool_feature (*script_features);
+}
+
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define SBase 0xAC00
+#define NCount (VCount * TCount)
+#define SCount (LCount * NCount)
+
+#define isCombiningL(u) (hb_in_range<hb_codepoint_t> ((u), LBase, LBase+LCount-1))
+#define isCombiningV(u) (hb_in_range<hb_codepoint_t> ((u), VBase, VBase+VCount-1))
+#define isCombiningT(u) (hb_in_range<hb_codepoint_t> ((u), TBase+1, TBase+TCount-1))
+#define isCombinedS(u) (hb_in_range<hb_codepoint_t> ((u), SBase, SBase+SCount-1))
+
+#define isT(u) (hb_in_ranges<hb_codepoint_t> ((u), 0x11A8, 0x11FF, 0xD7C8, 0xD7FF))
+
+static void
+preprocess_text_hangul (const hb_ot_shape_plan_t *plan,
+                        hb_buffer_t              *buffer,
+                        hb_font_t                *font)
+{
+  /* Hangul syllables come in two shapes: LV, and LVT.  Of those:
+   *
+   *   - LV can be precomposed, or decomposed.  Let's call those
+   *     <LV> and <L,V>,
+   *   - LVT can be fully precomposed, partially precomposed, or
+   *     fully decomposed.  Ie. <LVT>, <LV,T>, or <L,V,T>.
+   *
+   * The composition / decomposition is mechanical.  However, not
+   * all <L,V> sequences compose, and not all <LV,T> sequences
+   * compose.
+   *
+   * Here are the specifics:
+   *
+   *   - <L>: U+1100..115F, U+A960..A97F
+   *   - <V>: U+1160..11A7, U+D7B0..D7C7
+   *   - <T>: U+11A8..11FF, U+D7C8..D7FF
+   *
+   *   - Only the <L,V> sequences for the 11xx ranges combine.
+   *   - Only <LV,T> sequences for T in U+11A8..11C2 combine.
+   *
+   * Here is what we want to accomplish in this shaper:
+   *
+   *   - If the whole syllable can be precomposed, do that,
+   *   - Otherwise, fully decompose.
+   *
+   * That is, of the different possible syllables:
+   *
+   *   <L>
+   *   <L,V>
+   *   <L,V,T>
+   *   <LV>
+   *   <LVT>
+   *   <LV,T>
+   *
+   * - <L> needs no work.
+   *
+   * - <LV> and <LVT> can stay the way they are if the font supports them, otherwise we
+   *   should fully decompose them if font supports.
+   *
+   * - <L,V> and <L,V,T> we should compose if the whole thing can be composed.
+   *
+   * - <LV,T> we should compose if the whole thing can be composed, otherwise we should
+   *   decompose.
+   */
+
+  buffer->clear_output ();
+  unsigned int count = buffer->len;
+  for (buffer->idx = 0; buffer->idx < count;)
+  {
+    hb_codepoint_t u = buffer->cur().codepoint;
+
+    if (isCombiningL(u) && buffer->idx + 1 < count)
+    {
+      hb_codepoint_t l = u;
+      hb_codepoint_t v = buffer->cur(+1).codepoint;
+      if (isCombiningV(v))
+      {
+        /* Have <L,V> or <L,V,T>. */
+        unsigned int len = 2;
+        unsigned int tindex = 0;
+        if (buffer->idx + 2 < count)
+        {
+          hb_codepoint_t t = buffer->cur(+2).codepoint;
+          if (isCombiningT(t))
+          {
+            len = 3;
+            tindex = t - TBase;
+          }
+          else if (isT (t))
+          {
+            /* Old T jamo.  Doesn't combine.  Don't combine *anything*. */
+            len = 0;
+          }
+        }
+
+        if (len)
+        {
+          hb_codepoint_t s = SBase + (l - LBase) * NCount + (v - VBase) * TCount + tindex;
+          hb_codepoint_t glyph;
+          if (font->get_glyph (s, 0, &glyph))
+          {
+            buffer->replace_glyphs (len, 1, &s);
+            if (unlikely (buffer->in_error))
+              return;
+            continue;
+          }
+        }
+      }
+    }
+
+    else if (isCombinedS(u))
+    {
+      /* Have <LV>, <LVT>, or <LV,T> */
+      hb_codepoint_t s = u;
+      hb_codepoint_t glyph;
+      bool has_glyph = font->get_glyph (s, 0, &glyph);
+      unsigned int lindex = (s - SBase) / NCount;
+      unsigned int nindex = (s - SBase) % NCount; /* == vindex * TCount + tindex */
+      unsigned int vindex = nindex / TCount;
+      unsigned int tindex = nindex % TCount;
+
+      if (tindex && has_glyph)
+        goto next; /* <LVT> supported.  Nothing to do. */
+
+      if (!tindex &&
+          buffer->idx + 1 < count &&
+          isCombiningT (buffer->cur(+1).codepoint))
+      {
+        /* <LV,T>, try to combine.  tindex must stay 0 if this fails and we fall through. */
+        unsigned int new_tindex = buffer->cur(+1).codepoint - TBase;
+        hb_codepoint_t new_s = s + new_tindex;
+        if (font->get_glyph (new_s, 0, &glyph))
+        {
+          buffer->replace_glyphs (2, 1, &new_s);
+          if (unlikely (buffer->in_error))
+            return;
+          continue;
+        }
+      }
+
+      /* Otherwise, decompose if font doesn't support <LV> / <LVT>,
+       * or if having non-combining <LV,T>.  Note that we
+       * already handled combining <LV,T> above. */
+      if (!has_glyph ||
+          (buffer->idx + 1 < count &&
+           isT (buffer->cur(+1).codepoint)))
+      {
+        hb_codepoint_t decomposed[3] = {LBase + lindex,
+                                        VBase + vindex,
+                                        TBase + tindex};
+        if (font->get_glyph (decomposed[0], 0, &glyph) &&
+            font->get_glyph (decomposed[1], 0, &glyph) &&
+            (!tindex || font->get_glyph (decomposed[2], 0, &glyph))) /* T glyph only needed for <LVT> */
+        {
+          buffer->replace_glyphs (1, tindex ? 3 : 2, decomposed);
+          if (unlikely (buffer->in_error))
+            return;
+          continue;
+        }
+      }
+    }
+
+  next:
+    buffer->next_glyph ();
+  }
+  buffer->swap_buffers ();
+}
+
+const hb_ot_complex_shaper_t _hb_ot_complex_shaper_hangul =
+{
+  "hangul",
+  collect_features_hangul,
+  NULL, /* override_features */
+  NULL, /* data_create */
+  NULL, /* data_destroy */
+  preprocess_text_hangul,
+  NULL, /* normalization_preference */
+  NULL, /* decompose */
+  NULL, /* compose */
+  NULL, /* setup_masks */
+  HB_OT_SHAPE_ZERO_WIDTH_MARKS_BY_UNICODE_LATE,
+  false, /* fallback_position */
+};
diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh
index ac0072ba5..17b95e2bc 100644
--- a/src/hb-ot-shape-complex-private.hh
+++ b/src/hb-ot-shape-complex-private.hh
@@ -52,6 +52,7 @@ enum hb_ot_shape_zero_width_marks_type_t {
 #define HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS \
   HB_COMPLEX_SHAPER_IMPLEMENT (default) /* should be first */ \
   HB_COMPLEX_SHAPER_IMPLEMENT (arabic) \
+  HB_COMPLEX_SHAPER_IMPLEMENT (hangul) \
   HB_COMPLEX_SHAPER_IMPLEMENT (indic) \
   HB_COMPLEX_SHAPER_IMPLEMENT (myanmar) \
   HB_COMPLEX_SHAPER_IMPLEMENT (sea) \
@@ -189,19 +190,10 @@ hb_ot_shape_complex_categorize (const hb_ot_shape_planner_t *planner)
       return
&_hb_ot_complex_shaper_thai; -#if 0 - /* Note: - * Currently we don't have a separate Hangul shaper. The default shaper handles - * Hangul by enabling jamo features. We may want to implement a separate shaper - * in the future. See this thread for details of what such a shaper would do: - * - * http://lists.freedesktop.org/archives/harfbuzz/2013-April/003070.html - */ /* Unicode-1.1 additions */ case HB_SCRIPT_HANGUL: return &_hb_ot_complex_shaper_hangul; -#endif /* ^--- Add new shapers here */ diff --git a/src/hb-private.hh b/src/hb-private.hh index 4b72260ed..680b21e2c 100644 --- a/src/hb-private.hh +++ b/src/hb-private.hh @@ -807,6 +807,12 @@ hb_in_range (T u, T lo, T hi) return lo <= u && u <= hi; } +template <typename T> static inline bool +hb_in_ranges (T u, T lo1, T hi1, T lo2, T hi2) +{ + return hb_in_range (u, lo1, hi1) || hb_in_range (u, lo2, hi2); /* u in [lo1,hi1] or [lo2,hi2] */ +} + template <typename T> static inline bool hb_in_ranges (T u, T lo1, T hi1, T lo2, T hi2, T lo3, T hi3) {