From 4a7f4f3e56f8f7640ae7337aa1b3324f31e0d4ab Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Mon, 23 Jul 2012 13:15:33 -0400 Subject: [PATCH] [Thai] Adjust SARA AM reordering to match Uniscribe Adjust the list of marks before SARA AM that get the reordering treatment. Also adjust cluster formation to match Uniscribe. With Wikipedia test data, now I see: - For Thai, with the Angsana New font from Win7, I see 54 failures out of over 4M tests (0.00129107%). Of the 54, two are legitimate reordering issues (fix coming soon), and the other 52 are simply Uniscribe using a zero-width space char instead of an unknown character for missing glyphs. No idea why. The missing-glyph sequences include one that is a Thai character followed by an Arabic Sokun. Someone confused it with Nikhahit I assume! - For Lao, with the Dokchampa font from Win7, 33 tests fail out of 54k (0.0615167%). All seem to be insignificant mark positioning with two marks on a base. Have to investigate. --- src/hb-ot-shape-complex-misc.cc | 42 ++++++++++++------- src/hb-private.hh | 6 +++ .../texts/in-tree/shaper-thai/MANIFEST | 1 + .../in-tree/shaper-thai/script-lao/MANIFEST | 1 + .../shaper-thai/script-lao/misc/MANIFEST | 1 + .../shaper-thai/script-lao/misc/sara-am.txt | 20 +++++++++ .../shaper-thai/script-thai/misc/sara-am.txt | 18 +++++++- 7 files changed, 72 insertions(+), 17 deletions(-) create mode 100644 test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST create mode 100644 test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST create mode 100644 test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc index 7a11876e6..17e26254e 100644 --- a/src/hb-ot-shape-complex-misc.cc +++ b/src/hb-ot-shape-complex-misc.cc @@ -121,19 +121,20 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED, /* The following is NOT specified in the MS OT Thai spec, however, it seems * to be what 
Uniscribe and other engines implement. According to Eric Muller: * - * When you have a sara am, decompose it in nikhahit + sara a, *and* mode the - * nihka hit backwards over any *tone* mark (0E48-0E4B). + * When you have a SARA AM, decompose it in NIKHAHIT + SARA AA, *and* move the + * NIKHAHIT backwards over any tone mark (0E48-0E4B). * * <0E14, 0E4B, 0E33> -> <0E14, 0E4D, 0E4B, 0E32> * - * This reordering is legit only when the nikhahit comes from a sara am, not + * This reordering is legit only when the NIKHAHIT comes from a SARA AM, not * when it's there to start with. The string <0E14, 0E4B, 0E4D> is probably - * not what a u↪ser wanted, but the rendering is nevertheless nikhahit above + * not what a user wanted, but the rendering is nevertheless nikhahit above * chattawa. * * Same for Lao. */ + /* * Here are the characters of significance: * @@ -142,9 +143,9 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED, * SARA AA: U+0E32 U+0EB2 * Nikhahit: U+0E4D U+0ECD * - * Tone marks: - * Thai: <0E48..0E4B> CCC=107 - * Lao: <0EC8..0ECB> CCC=122 + * Testing shows that Uniscribe reorder the following marks: + * Thai: <0E31..0E37,0E47..0E4E> + * Lao: <0EB1..0EB7,0EC7..0ECE> * * Note how the Lao versions are the same as Thai + 0x80. 
*/ @@ -154,7 +155,7 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED, #define IS_SARA_AM(x) (((x) & ~0x0080) == 0x0E33) #define NIKHAHIT_FROM_SARA_AM(x) ((x) - 0xE33 + 0xE4D) #define SARA_AA_FROM_SARA_AM(x) ((x) - 1) -#define IS_TONE_MARK(x) (((x) & ~0x0083) == 0x0E48) +#define IS_TONE_MARK(x) (hb_in_ranges<hb_codepoint_t> ((x) & ~0x0080, 0x0E31, 0x0E37, 0x0E47, 0x0E4E)) buffer->clear_output (); unsigned int count = buffer->len; @@ -179,14 +180,23 @@ _hb_ot_shape_complex_setup_masks_thai (hb_ot_map_t *map HB_UNUSED, while (start > 0 && IS_TONE_MARK (buffer->out_info[start - 1].codepoint)) start--; - /* Move Nikhahit (end-2) to the beginning */ - hb_glyph_info_t t = buffer->out_info[end - 2]; - memmove (buffer->out_info + start + 1, - buffer->out_info + start, - sizeof (buffer->out_info[0]) * (end - start - 2)); - buffer->out_info[start] = t; - - buffer->merge_out_clusters (start, end); + if (start + 2 < end) + { + /* Move Nikhahit (end-2) to the beginning */ + buffer->merge_out_clusters (start, end); + hb_glyph_info_t t = buffer->out_info[end - 2]; + memmove (buffer->out_info + start + 1, + buffer->out_info + start, + sizeof (buffer->out_info[0]) * (end - start - 2)); + buffer->out_info[start] = t; + } + else + { + /* Since we decomposed, and NIKHAHIT is combining, merge clusters with the + * previous cluster. */ + if (start) + buffer->merge_out_clusters (start - 1, end); + } } buffer->swap_buffers (); } diff --git a/src/hb-private.hh b/src/hb-private.hh index 3f710edaa..0b9c4efce 100644 --- a/src/hb-private.hh +++ b/src/hb-private.hh @@ -729,6 +729,12 @@ hb_in_range (T u, T lo, T hi) return lo <= u && u <= hi; } +template <typename T> static inline bool +hb_in_ranges (T u, T lo1, T hi1, T lo2, T hi2) +{ + return hb_in_range (u, lo1, hi1) || hb_in_range (u, lo2, hi2); +} + /* Useful for set-operations on small enums. 
* For example, for testing "x ∈ {x1, x2, x3}" use: diff --git a/test/shaping/texts/in-tree/shaper-thai/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/MANIFEST index 22bc0edfb..32b54765b 100644 --- a/test/shaping/texts/in-tree/shaper-thai/MANIFEST +++ b/test/shaping/texts/in-tree/shaper-thai/MANIFEST @@ -1 +1,2 @@ +script-lao script-thai diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST new file mode 100644 index 000000000..b8752e7b9 --- /dev/null +++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/MANIFEST @@ -0,0 +1 @@ +misc diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST new file mode 100644 index 000000000..ffd16f106 --- /dev/null +++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/MANIFEST @@ -0,0 +1 @@ +sara-am.txt diff --git a/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt new file mode 100644 index 000000000..234d8c06e --- /dev/null +++ b/test/shaping/texts/in-tree/shaper-thai/script-lao/misc/sara-am.txt @@ -0,0 +1,20 @@ +ດຳ +ດ໋ຳ +ດໍ໋າ +ດ໋ໍາ +ມັຳ +ມິຳ +ມີຳ +ມຶຳ +ມືຳ +ມຸຳ +ມູຳ +ມ຺ຳ +ມ໇ຳ +ມ່ຳ +ມ້ຳ +ມ໊ຳ +ມ໋ຳ +ມ໌ຳ +ມໍຳ +ມ໎ຳ diff --git a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt index 6d385ef1d..9f044ce84 100644 --- a/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt +++ b/test/shaping/texts/in-tree/shaper-thai/script-thai/misc/sara-am.txt @@ -1,4 +1,20 @@ -ำ ดำ ด๋ำ ดํ๋า +ด๋ํา +มัำ +มิำ +มีำ +มึำ +มืำ +มุำ +มูำ +มฺำ +ม็ำ +ม่ำ +ม้ำ +ม๊ำ +ม๋ำ +ม์ำ +มํำ +ม๎ำ