diff --git a/src/hb-ot-tag.cc b/src/hb-ot-tag.cc
index 62c1b1a32..5d471159b 100644
--- a/src/hb-ot-tag.cc
+++ b/src/hb-ot-tag.cc
@@ -40,7 +40,12 @@ static hb_tag_t
 hb_ot_old_tag_from_script (hb_script_t script)
 {
   switch ((hb_tag_t) script) {
+    case HB_SCRIPT_INVALID: return HB_OT_TAG_DEFAULT_SCRIPT;
+
+    /* KATAKANA and HIRAGANA both map to 'kana' */
     case HB_SCRIPT_HIRAGANA: return HB_TAG('k','a','n','a');
+
+    /* Spaces at the end are preserved, unlike ISO 15924 */
     case HB_SCRIPT_LAO: return HB_TAG('l','a','o',' ');
     case HB_SCRIPT_YI: return HB_TAG('y','i',' ',' ');
     /* Unicode-5.0 additions */
@@ -48,7 +53,6 @@ hb_ot_old_tag_from_script (hb_script_t script)
     /* Unicode-5.1 additions */
     case HB_SCRIPT_VAI: return HB_TAG('v','a','i',' ');
     /* Unicode-5.2 additions */
-    case HB_SCRIPT_MEETEI_MAYEK: return HB_TAG('m','y','e','i');
     /* Unicode-6.0 additions */
   }
 
@@ -59,20 +63,19 @@ hb_ot_old_tag_from_script (hb_script_t script)
 static hb_script_t
 hb_ot_old_tag_to_script (hb_tag_t tag)
 {
-  switch (tag) {
-    case HB_TAG('k','a','n','a'): return HB_SCRIPT_HIRAGANA;
-    case HB_TAG('l','a','o',' '): return HB_SCRIPT_LAO;
-    case HB_TAG('y','i',' ',' '): return HB_SCRIPT_YI;
-    /* Unicode-5.0 additions */
-    case HB_TAG('n','k','o',' '): return HB_SCRIPT_NKO;
-    /* Unicode-5.1 additions */
-    case HB_TAG('v','a','i',' '): return HB_SCRIPT_VAI;
-    /* Unicode-5.2 additions */
-    case HB_TAG('m','y','e','i'): return HB_SCRIPT_MEETEI_MAYEK;
-    /* Unicode-6.0 additions */
-  }
+  if (unlikely (tag == HB_OT_TAG_DEFAULT_SCRIPT))
+    return HB_SCRIPT_INVALID;
 
-  /* Else, just change first char to uppercase and return */
+  /* This side of the conversion is fully algorithmic. */
+
+  /* Any spaces at the end of the tag are replaced by repeating the last
+   * letter. Eg 'nko ' -> 'Nkoo' */
+  if (unlikely ((tag & 0x0000FF00) == 0x00002000))
+    tag |= (tag >> 8) & 0x0000FF00; /* Copy second letter to third; the slot holds 0x20, so OR-ing is enough */
+  if (unlikely ((tag & 0x000000FF) == 0x00000020))
+    tag |= (tag >> 8) & 0x000000FF; /* Copy third letter to fourth, after the third has been filled in above */
+
+  /* Change first char to uppercase and return */
   return (hb_script_t) (tag & ~0x20000000);
 }
 
@@ -91,7 +94,7 @@ hb_ot_new_tag_from_script (hb_script_t script)
     case HB_SCRIPT_TELUGU: return HB_TAG('t','e','l','2');
   }
 
-  return HB_TAG_NONE;
+  return HB_OT_TAG_DEFAULT_SCRIPT;
 }
 
 static hb_script_t
@@ -114,7 +117,8 @@ hb_ot_new_tag_to_script (hb_tag_t tag)
 
 /*
  * Complete list at:
- * http://www.microsoft.com/typography/otspec/scripttags.htm
+ * https://www.microsoft.com/typography/otspec/scripttags.htm
+ * https://www.microsoft.com/typography/otspec160/scripttagsProposed.htm
  *
  * Most of the script tags are the same as the ISO 15924 tag but lowercased.
  * So we just do that, and handle the exceptional cases in a switch.
@@ -127,11 +131,11 @@ hb_ot_tags_from_script (hb_script_t script,
 {
   hb_tag_t new_tag;
 
-  *script_tag_2 = HB_TAG_NONE;
+  *script_tag_2 = HB_OT_TAG_DEFAULT_SCRIPT;
   *script_tag_1 = hb_ot_old_tag_from_script (script);
 
   new_tag = hb_ot_new_tag_from_script (script);
-  if (unlikely (new_tag != HB_TAG_NONE)) {
+  if (unlikely (new_tag != HB_OT_TAG_DEFAULT_SCRIPT)) {
     *script_tag_2 = *script_tag_1;
     *script_tag_1 = new_tag;
   }
@@ -165,6 +169,7 @@ typedef struct {
  * Many items still missing. Those are commented out at the end.
  * Keep sorted for bsearch.
  */
+
 static const LangTag ot_languages[] = {
   {"aa", HB_TAG('A','F','R',' ')}, /* Afar */
   {"ab", HB_TAG('A','B','K',' ')}, /* Abkhazian */
@@ -451,11 +456,6 @@ static const LangTag ot_languages[] = {
   {"yi", HB_TAG('J','I','I',' ')}, /* Yiddish */
   {"yo", HB_TAG('Y','B','A',' ')}, /* Yoruba */
   {"yso", HB_TAG('N','I','S',' ')}, /* Nisi (China) */
-  {"zh-cn", HB_TAG('Z','H','S',' ')}, /* Chinese (China) */
-  {"zh-hk", HB_TAG('Z','H','H',' ')}, /* Chinese (Hong Kong) */
-  {"zh-mo", HB_TAG('Z','H','T',' ')}, /* Chinese (Macao) */
-  {"zh-sg", HB_TAG('Z','H','S',' ')}, /* Chinese (Singapore) */
-  {"zh-tw", HB_TAG('Z','H','T',' ')}, /* Chinese (Taiwan) */
   {"zne", HB_TAG('Z','N','D',' ')}, /* Zande */
   {"zu", HB_TAG('Z','U','L',' ')} /* Zulu */
 
@@ -571,6 +571,14 @@ static const LangTag ot_languages[] = {
   /*{"??", HB_TAG('Z','H','P',' ')},*/ /* Chinese Phonetic */
 };
 
+static const LangTag ot_languages_zh[] = {
+  {"zh-cn", HB_TAG('Z','H','S',' ')}, /* Chinese (China) */
+  {"zh-hk", HB_TAG('Z','H','H',' ')}, /* Chinese (Hong Kong) */
+  {"zh-mo", HB_TAG('Z','H','T',' ')}, /* Chinese (Macao) */
+  {"zh-sg", HB_TAG('Z','H','S',' ')}, /* Chinese (Singapore) */
+  {"zh-tw", HB_TAG('Z','H','T',' ')} /* Chinese (Taiwan) */
+};
+
 static int
 lang_compare_first_component (const char *a,
                               const char *b)
@@ -592,66 +600,58 @@ lang_matches (const char *lang_str, const char *spec)
 {
   unsigned int len = strlen (spec);
 
-  return lang_str && strncmp (lang_str, spec, len) == 0 &&
+  return strncmp (lang_str, spec, len) == 0 &&
          (lang_str[len] == '\0' || lang_str[len] == '-');
 }
 
 hb_tag_t
 hb_ot_tag_from_language (hb_language_t language)
 {
-  const char *lang_str;
-  LangTag *lang_tag;
+  const char *lang_str, *s;
+  const LangTag *lang_tag;
 
   if (language == NULL)
     return HB_OT_TAG_DEFAULT_LANGUAGE;
 
   lang_str = hb_language_to_string (language);
 
-  if (0 == strncmp (lang_str, "x-hbot", 6)) {
+  s = strstr (lang_str, "x-hbot");
+  if (s) {
     char tag[4];
     int i;
-    lang_str += 6;
-    for (i = 0; i < 4 && ISALPHA (lang_str[i]); i++)
-      tag[i] = TOUPPER (lang_str[i]);
-    for (; i < 4; i++)
-      tag[i] = ' ';
-    return HB_TAG_CHAR4 (tag);
+    s += 6;
+    for (i = 0; i < 4 && ISALPHA (s[i]); i++)
+      tag[i] = TOUPPER (s[i]);
+    if (i) {
+      for (; i < 4; i++)
+        tag[i] = ' ';
+      return HB_TAG_CHAR4 (tag);
+    }
   }
 
-  /* find a language matching in the first component */
+  /* Find a language matching in the first component */
   lang_tag = (LangTag *) bsearch (lang_str, ot_languages,
                                   ARRAY_LENGTH (ot_languages), sizeof (LangTag),
                                   (hb_compare_func_t) lang_compare_first_component);
-
-  /* we now need to find the best language matching */
-  if (lang_tag)
-  {
-    hb_bool_t found = FALSE;
-
-    /* go to the final one matching in the first component */
-    while (lang_tag + 1 < ot_languages + ARRAY_LENGTH (ot_languages) &&
-           lang_compare_first_component (lang_str, (lang_tag + 1)->language) == 0)
-      lang_tag++;
-
-    /* go back, find which one matches completely */
-    while (lang_tag >= ot_languages &&
-           lang_compare_first_component (lang_str, lang_tag->language) == 0)
-    {
-      if (lang_matches (lang_str, lang_tag->language)) {
-        found = TRUE;
-        break;
-      }
-
-      lang_tag--;
-    }
-
-    if (!found)
-      lang_tag = NULL;
-  }
-
   if (lang_tag)
     return lang_tag->tag;
 
+  /* Otherwise, check the Chinese ones */
+  if (0 == lang_compare_first_component (lang_str, "zh"))
+  {
+    unsigned int i;
+
+    for (i = 0; i < ARRAY_LENGTH (ot_languages_zh); i++)
+    {
+      lang_tag = &ot_languages_zh[i];
+      if (lang_matches (lang_str, lang_tag->language)) /* requested language first, table entry is the spec */
+        return lang_tag->tag;
+    }
+
+    /* Otherwise just return 'ZHS ' */
+    return HB_TAG('Z','H','S',' ');
+  }
+
   return HB_OT_TAG_DEFAULT_LANGUAGE;
 }
 
@@ -659,18 +659,45 @@ hb_language_t
 hb_ot_tag_to_language (hb_tag_t tag)
 {
   unsigned int i;
-  unsigned char buf[11] = "x-hbot";
+
+  if (tag == HB_OT_TAG_DEFAULT_LANGUAGE)
+    return NULL;
 
   for (i = 0; i < ARRAY_LENGTH (ot_languages); i++)
     if (ot_languages[i].tag == tag)
       return hb_language_from_string (ot_languages[i].language);
 
-  buf[6] = tag >> 24;
-  buf[7] = (tag >> 16) & 0xFF;
-  buf[8] = (tag >> 8) & 0xFF;
-  buf[9] = tag & 0xFF;
-  buf[10] = '\0';
-  return hb_language_from_string ((char *) buf);
+  /* If tag starts with ZH, it's Chinese */
+  if ((tag & 0xFFFF0000) == 0x5A480000) {
+    switch (tag) {
+      case HB_TAG('Z','H','H',' '): return hb_language_from_string ("zh-hk"); /* Hong Kong */
+      default: {
+        /* Encode the tag... */
+        unsigned char buf[14] = "zh-x-hbot";
+        buf[9] = tag >> 24;
+        buf[10] = (tag >> 16) & 0xFF;
+        buf[11] = (tag >> 8) & 0xFF;
+        buf[12] = tag & 0xFF;
+        if (buf[12] == 0x20)
+          buf[12] = '\0';
+        buf[13] = '\0';
+        return hb_language_from_string ((char *) buf);
+      }
+    }
+  }
+
+  /* Else return a custom language in the form of "x-hbotXXXX" */
+  {
+    unsigned char buf[11] = "x-hbot";
+    buf[6] = tag >> 24;
+    buf[7] = (tag >> 16) & 0xFF;
+    buf[8] = (tag >> 8) & 0xFF;
+    buf[9] = tag & 0xFF;
+    if (buf[9] == 0x20)
+      buf[9] = '\0';
+    buf[10] = '\0';
+    return hb_language_from_string ((char *) buf);
+  }
 }
 
 
diff --git a/test/Makefile.am b/test/Makefile.am
index f2eaff02a..973d6b4ae 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -21,6 +21,9 @@ TEST_PROGS += \
   test-unicode \
   $(NULL)
 
+TEST_PROGS += \
+  test-ot-tag \
+  $(NULL)
 
 # Tests for header compilation
 TEST_PROGS += \
diff --git a/test/test-ot-tag.c b/test/test-ot-tag.c
new file mode 100644
index 000000000..bae0af523
--- /dev/null
+++ b/test/test-ot-tag.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright © 2011 Google, Inc.
+ *
+ * This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Google Author(s): Behdad Esfahbod
+ */
+
+#include "hb-test.h"
+
+#include <hb-ot.h>
+
+/* Unit tests for hb-ot-tag.h */
+
+
+/* https://www.microsoft.com/typography/otspec/scripttags.htm */
+
+static void
+test_simple_tags (const char *s, hb_script_t script)
+{
+  hb_tag_t tag;
+  hb_tag_t t1, t2;
+
+  g_test_message ("Testing script %c%c%c%c: tag %s", HB_UNTAG (hb_script_to_iso15924_tag (script)), s);
+  tag = hb_tag_from_string (s);
+
+  hb_ot_tags_from_script (script, &t1, &t2);
+
+  g_assert_cmphex (t1, ==, tag);
+  g_assert_cmphex (t2, ==, HB_OT_TAG_DEFAULT_SCRIPT);
+
+  g_assert_cmphex (hb_ot_tag_to_script (tag), ==, script);
+}
+
+static void
+test_indic_tags (const char *s1, const char *s2, hb_script_t script)
+{
+  hb_tag_t tag1, tag2;
+  hb_tag_t t1, t2;
+
+  g_test_message ("Testing script %c%c%c%c: new tag %s, old tag %s", HB_UNTAG (hb_script_to_iso15924_tag (script)), s1, s2);
+  tag1 = hb_tag_from_string (s1);
+  tag2 = hb_tag_from_string (s2);
+
+  hb_ot_tags_from_script (script, &t1, &t2);
+
+  g_assert_cmphex (t1, ==, tag1);
+  g_assert_cmphex (t2, ==, tag2);
+
+  g_assert_cmphex (hb_ot_tag_to_script (tag1), ==, script);
+  g_assert_cmphex (hb_ot_tag_to_script (tag2), ==, script);
+}
+
+static void
+test_ot_tag_script_degenerate (void)
+{
+  hb_tag_t t1, t2;
+
+  g_assert_cmphex (HB_TAG_CHAR4 ("DFLT"), ==, HB_OT_TAG_DEFAULT_SCRIPT);
+
+  /* HIRAGANA and KATAKANA both map to 'kana' */
+  test_simple_tags ("kana", HB_SCRIPT_KATAKANA);
+  hb_ot_tags_from_script (HB_SCRIPT_HIRAGANA, &t1, &t2);
+  g_assert_cmphex (t1, ==, HB_TAG_CHAR4 ("kana"));
+  g_assert_cmphex (t2, ==, HB_OT_TAG_DEFAULT_SCRIPT);
+
+  test_simple_tags ("DFLT", HB_SCRIPT_INVALID);
+
+  /* Trailing spaces are replaced by repeating the last letter; the literal needs all four bytes */
+  g_assert_cmphex (hb_ot_tag_to_script (HB_TAG_CHAR4 ("be  ")), ==, hb_script_from_string ("Beee"));
+}
+
+static void
+test_ot_tag_script_simple (void)
+{
+  /* Arbitrary non-existent script */
+  test_simple_tags ("wwyz", hb_script_from_string ("wWyZ"));
+
+  /* These we don't really care about */
+  test_simple_tags ("zyyy", HB_SCRIPT_COMMON);
+  test_simple_tags ("zinh", HB_SCRIPT_INHERITED);
+  test_simple_tags ("zzzz", HB_SCRIPT_UNKNOWN);
+
+  test_simple_tags ("arab", HB_SCRIPT_ARABIC);
+  test_simple_tags ("copt", HB_SCRIPT_COPTIC);
+  test_simple_tags ("kana", HB_SCRIPT_KATAKANA);
+  test_simple_tags ("latn", HB_SCRIPT_LATIN);
+
+  /* These are trickier since their OT script tags have space. */
+  test_simple_tags ("lao ", HB_SCRIPT_LAO);
+  test_simple_tags ("yi  ", HB_SCRIPT_YI);
+  /* Unicode-5.0 additions */
+  test_simple_tags ("nko ", HB_SCRIPT_NKO);
+  /* Unicode-5.1 additions */
+  test_simple_tags ("vai ", HB_SCRIPT_VAI);
+
+  /* https://www.microsoft.com/typography/otspec160/scripttagsProposed.htm */
+
+  /* Unicode-5.2 additions */
+  test_simple_tags ("mtei", HB_SCRIPT_MEETEI_MAYEK);
+  /* Unicode-6.0 additions */
+  test_simple_tags ("mand", HB_SCRIPT_MANDAIC);
+}
+
+static void
+test_ot_tag_script_indic (void)
+{
+  test_indic_tags ("bng2", "beng", HB_SCRIPT_BENGALI);
+  test_indic_tags ("dev2", "deva", HB_SCRIPT_DEVANAGARI);
+  test_indic_tags ("gjr2", "gujr", HB_SCRIPT_GUJARATI);
+  test_indic_tags ("gur2", "guru", HB_SCRIPT_GURMUKHI);
+  test_indic_tags ("knd2", "knda", HB_SCRIPT_KANNADA);
+  test_indic_tags ("mlm2", "mlym", HB_SCRIPT_MALAYALAM);
+  test_indic_tags ("ory2", "orya", HB_SCRIPT_ORIYA);
+  test_indic_tags ("tml2", "taml", HB_SCRIPT_TAMIL);
+  test_indic_tags ("tel2", "telu", HB_SCRIPT_TELUGU);
+}
+
+
+
+/* https://www.microsoft.com/typography/otspec/languagetags.htm */
+
+static void
+test_language_two_way (const char *tag_s, const char *lang_s)
+{
+  hb_language_t lang = hb_language_from_string (lang_s);
+  hb_tag_t tag = hb_tag_from_string (tag_s);
+
+  g_test_message ("Testing language %s <-> tag %s", lang_s, tag_s);
+
+  g_assert_cmphex (tag, ==, hb_ot_tag_from_language (lang));
+  g_assert (lang == hb_ot_tag_to_language (tag));
+}
+
+static void
+test_tag_from_language (const char *tag_s, const char *lang_s)
+{
+  hb_language_t lang = hb_language_from_string (lang_s);
+  hb_tag_t tag = hb_tag_from_string (tag_s);
+
+  g_test_message ("Testing language %s -> tag %s", lang_s, tag_s);
+
+  g_assert_cmphex (tag, ==, hb_ot_tag_from_language (lang));
+}
+
+static void
+test_tag_to_language (const char *tag_s, const char *lang_s)
+{
+  hb_language_t lang = hb_language_from_string (lang_s);
+  hb_tag_t tag = hb_tag_from_string (tag_s);
+
+  g_test_message ("Testing tag %s -> language %s", tag_s, lang_s);
+
+  g_assert (lang == hb_ot_tag_to_language (tag));
+}
+
+static void
+test_ot_tag_language (void)
+{
+  g_assert_cmphex (HB_TAG_CHAR4 ("dflt"), ==, HB_OT_TAG_DEFAULT_LANGUAGE);
+  test_language_two_way ("dflt", NULL);
+
+  test_language_two_way ("ARA", "ar");
+
+  test_language_two_way ("AZE", "az");
+  test_tag_from_language ("AZE", "az-ir");
+  test_tag_from_language ("AZE", "az-az");
+
+  test_language_two_way ("ENG", "en");
+  test_tag_from_language ("ENG", "en_US");
+
+  test_language_two_way ("EVN", "eve");
+
+  test_language_two_way ("FAR", "fa");
+  test_tag_from_language ("FAR", "fa_IR");
+
+  test_language_two_way ("ZHH", "zh-hk"); /* Chinese (Hong Kong) */
+
+  test_tag_from_language ("ZHS", "zh-cn"); /* Chinese (China) */
+  test_tag_from_language ("ZHS", "zh-sg"); /* Chinese (Singapore) */
+  test_tag_from_language ("ZHT", "zh-mo"); /* Chinese (Macao) */
+  test_tag_from_language ("ZHT", "zh-tw"); /* Chinese (Taiwan) */
+
+  test_tag_from_language ("ZHS", "zh"); /* Chinese */
+  test_tag_from_language ("ZHS", "zh-xx");
+
+  test_tag_to_language ("ZHS", "zh-x-hbotzhs");
+  test_tag_to_language ("ZHT", "zh-x-hbotzht");
+  test_tag_to_language ("ZHP", "zh-x-hbotzhp");
+
+  test_language_two_way ("ABC", "x-hbotabc");
+  test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc-zxc");
+
+  test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc");
+}
+
+int
+main (int argc, char **argv)
+{
+  hb_test_init (&argc, &argv);
+
+  hb_test_add (test_ot_tag_script_degenerate);
+  hb_test_add (test_ot_tag_script_simple);
+  hb_test_add (test_ot_tag_script_indic);
+
+  hb_test_add (test_ot_tag_language);
+
+  return hb_test_run();
+}