From b207eab842bf7c18e3de49338726b26138c77c69 Mon Sep 17 00:00:00 2001 From: David Corbett Date: Thu, 30 Apr 2020 11:53:21 -0400 Subject: [PATCH] Round-trip OpenType tags through BCP 47 --- src/hb-algs.hh | 6 ++++ src/hb-ot-tag.cc | 72 ++++++++++++++++++++++++------------------ test/api/test-ot-tag.c | 58 ++++++++++++++++++++++++++-------- 3 files changed, 92 insertions(+), 44 deletions(-) diff --git a/src/hb-algs.hh b/src/hb-algs.hh index 5d2a48c46..30b5812e1 100644 --- a/src/hb-algs.hh +++ b/src/hb-algs.hh @@ -577,6 +577,12 @@ static inline unsigned char TOUPPER (unsigned char c) { return (c >= 'a' && c <= 'z') ? c - 'a' + 'A' : c; } static inline unsigned char TOLOWER (unsigned char c) { return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c; } +static inline bool ISHEX (unsigned char c) +{ return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } +static inline unsigned char TOHEX (uint8_t c) +{ return (c & 0xF) <= 9 ? (c & 0xF) + '0' : (c & 0xF) + 'a' - 10; } +static inline uint8_t FROMHEX (unsigned char c) +{ return (c >= '0' && c <= '9') ? c - '0' : TOLOWER (c) - 'a' + 10; } static inline unsigned int DIV_CEIL (const unsigned int a, unsigned int b) { return (a + (b - 1)) / b; } diff --git a/src/hb-ot-tag.cc b/src/hb-ot-tag.cc index 8ad917ae7..7ec91c581 100644 --- a/src/hb-ot-tag.cc +++ b/src/hb-ot-tag.cc @@ -319,12 +319,26 @@ parse_private_use_subtag (const char *private_use_subtag, char tag[4]; int i; s += strlen (prefix); - for (i = 0; i < 4 && ISALNUM (s[i]); i++) - tag[i] = normalize (s[i]); - if (!i) return false; + if (s[0] == '-') { + s += 1; + char c; + for (i = 0; i < 8 && ISHEX (s[i]); i++) + { + c = FROMHEX (s[i]); + if (i % 2 == 0) + tag[i / 2] = c << 4; + else + tag[i / 2] += c; + } + if (i != 8) return false; + } else { + for (i = 0; i < 4 && ISALNUM (s[i]); i++) + tag[i] = normalize (s[i]); + if (!i) return false; - for (; i < 4; i++) - tag[i] = ' '; + for (; i < 4; i++) + tag[i] = ' '; + } tags[0] = HB_TAG (tag[0], tag[1], tag[2], tag[3]); if ((tags[0] & 0xDFDFDFDF) == HB_OT_TAG_DEFAULT_SCRIPT) tags[0] ^= ~0xDFDFDFDF; @@ -434,30 +448,28 @@ hb_ot_tag_to_language (hb_tag_t tag) if (ot_languages[i].tag == tag) return hb_language_from_string (ot_languages[i].language, -1); - /* If it's three letters long, assume it's ISO 639-3 and lower-case and use it - * (if it's not a registered tag, calling hb_ot_tag_from_language on the - * result might not return the same tag as the original tag). - * Else return a custom language in the form of "x-hbotABCD". */ + /* Return a custom language in the form of "x-hbot-AABBCCDD". + * If it's three letters long, also guess it's ISO 639-3 and lower-case and + * prepend it (if it's not a registered tag, the private use subtags will + * ensure that calling hb_ot_tag_from_language on the result will still return + * the same tag as the original tag). + */ { - char buf[11] = "x-hbot"; + char buf[20]; char *str = buf; - buf[6] = tag >> 24; - buf[7] = (tag >> 16) & 0xFF; - buf[8] = (tag >> 8) & 0xFF; - buf[9] = tag & 0xFF; - if (buf[9] == 0x20) + if (ISALPHA (tag >> 24) + && ISALPHA ((tag >> 16) & 0xFF) + && ISALPHA ((tag >> 8) & 0xFF) + && (tag & 0xFF) == ' ') { - buf[9] = '\0'; - if (ISALPHA (buf[6]) && ISALPHA (buf[7]) && ISALPHA (buf[8])) - { - buf[6] = TOLOWER (buf[6]); - buf[7] = TOLOWER (buf[7]); - buf[8] = TOLOWER (buf[8]); - str += 6; - } + buf[0] = TOLOWER (tag >> 24); + buf[1] = TOLOWER ((tag >> 16) & 0xFF); + buf[2] = TOLOWER ((tag >> 8) & 0xFF); + buf[3] = '-'; + str += 4; } - buf[10] = '\0'; - return hb_language_from_string (str, -1); + snprintf (str, 16, "x-hbot-%08x", tag); + return hb_language_from_string (&*buf, -1); } } @@ -498,13 +510,14 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag, unsigned char *buf; const char *lang_str = hb_language_to_string (*language); size_t len = strlen (lang_str); - buf = (unsigned char *) malloc (len + 11); + buf = (unsigned char *) malloc (len + 16); if (unlikely (!buf)) { *language = nullptr; } else { + int shift; memcpy (buf, lang_str, len); if (lang_str[0] != 'x' || lang_str[1] != '-') { buf[len++] = '-'; @@ -515,10 +528,9 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag, buf[len++] = 'b'; buf[len++] = 's'; buf[len++] = 'c'; - buf[len++] = script_tag >> 24; - buf[len++] = (script_tag >> 16) & 0xFF; - buf[len++] = (script_tag >> 8) & 0xFF; - buf[len++] = script_tag & 0xFF; + buf[len++] = '-'; + for (shift = 28; shift >= 0; shift -= 4) + buf[len++] = TOHEX (script_tag >> shift); *language = hb_language_from_string ((char *) buf, len); free (buf); } diff --git a/test/api/test-ot-tag.c b/test/api/test-ot-tag.c index 958fd6b2b..78ebc03db 100644 --- a/test/api/test-ot-tag.c +++ b/test/api/test-ot-tag.c @@ -164,10 +164,16 @@ test_ot_tag_script_from_language (void) test_script_tags_from_language ("copt", "en", HB_SCRIPT_COPTIC); test_script_tags_from_language (NULL, "x-hbsc", HB_SCRIPT_INVALID); test_script_tags_from_language ("copt", "x-hbsc", HB_SCRIPT_COPTIC); + test_script_tags_from_language (NULL, "x-hbsc-", HB_SCRIPT_INVALID); + test_script_tags_from_language (NULL, "x-hbsc-1", HB_SCRIPT_INVALID); + test_script_tags_from_language (NULL, "x-hbsc-1a", HB_SCRIPT_INVALID); + test_script_tags_from_language (NULL, "x-hbsc-1a2b3c4x", HB_SCRIPT_INVALID); + test_script_tags_from_language ("2lon", "x-hbsc-326c6f6e67", HB_SCRIPT_INVALID); test_script_tags_from_language ("abc ", "x-hbscabc", HB_SCRIPT_INVALID); test_script_tags_from_language ("deva", "x-hbscdeva", HB_SCRIPT_INVALID); test_script_tags_from_language ("dev2", "x-hbscdev2", HB_SCRIPT_INVALID); test_script_tags_from_language ("dev3", "x-hbscdev3", HB_SCRIPT_INVALID); + test_script_tags_from_language ("dev3", "x-hbsc-64657633", HB_SCRIPT_INVALID); test_script_tags_from_language ("copt", "x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID); test_script_tags_from_language (NULL, "en-x-hbsc", HB_SCRIPT_INVALID); test_script_tags_from_language ("copt", "en-x-hbsc", HB_SCRIPT_COPTIC); @@ -175,6 +181,7 @@ test_ot_tag_script_from_language (void) test_script_tags_from_language ("deva", "en-x-hbscdeva", HB_SCRIPT_INVALID); test_script_tags_from_language ("dev2", "en-x-hbscdev2", HB_SCRIPT_INVALID); test_script_tags_from_language ("dev3", "en-x-hbscdev3", HB_SCRIPT_INVALID); + test_script_tags_from_language ("dev3", "en-x-hbsc-64657633", HB_SCRIPT_INVALID); test_script_tags_from_language ("copt", "en-x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID); } @@ -266,12 +273,12 @@ test_tags_to_script_and_language (const char *script_tag_s, static void test_ot_tags_to_script_and_language (void) { - test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbscdflt"); + test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbsc-44464c54"); test_tags_to_script_and_language ("latn", "ENG", "Latn", "en"); - test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbscdeva"); - test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbscdev2"); + test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbsc-64657661"); + test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbsc-64657632"); test_tags_to_script_and_language ("dev3", "MAR", "Deva", "mr"); - test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbotqtz0-hbscqaa"); + test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbot-51545a30-hbsc-71616120"); } static void @@ -291,8 +298,9 @@ test_ot_tag_language (void) test_language_two_way ("ENG", "en"); test_tag_from_language ("ENG", "en_US"); - test_language_two_way ("CJA", "cja"); /* Western Cham */ - test_language_two_way ("CJM", "cjm"); /* Eastern Cham */ + test_language_two_way ("CJA", "cja-x-hbot-434a4120"); /* Western Cham */ + test_language_two_way ("CJM", "cjm-x-hbot-434a4d20"); /* Eastern Cham */ + test_tag_from_language ("CJM", "cjm"); test_language_two_way ("EVN", "eve"); test_language_two_way ("HAL", "cfm"); /* BCP47 and current ISO639-3 code for Halam/Falam Chin */ @@ -351,13 +359,20 @@ test_ot_tag_language (void) test_tag_from_language ("ZHH", "yue-Hant"); test_tag_from_language ("ZHS", "yue-Hans"); - test_language_two_way ("ABC", "abc"); - test_language_two_way ("ABCD", "x-hbotabcd"); + test_language_two_way ("ABC", "abc-x-hbot-41424320"); + test_language_two_way ("ABCD", "x-hbot-41424344"); test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc-zxc"); test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc"); test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbotabcd"); + test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320-zxc"); + test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320"); + test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbot-41424344"); + test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot"); test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc"); + test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc-414243"); + test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-414243"); + test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-4142432"); test_tag_from_language ("dflt", "xy"); test_tag_from_language ("XYZ", "xyz"); /* Unknown ISO 639-3 */ @@ -423,12 +438,27 @@ test_ot_tag_language (void) test_language_two_way ("SYRN", "und-Syrn"); /* Test that x-hbot overrides the base language */ - test_tag_from_language ("ABC", "fa-x-hbotabc-zxc"); - test_tag_from_language ("ABC", "fa-ir-x-hbotabc-zxc"); - test_tag_from_language ("ABC", "zh-x-hbotabc-zxc"); - test_tag_from_language ("ABC", "zh-cn-x-hbotabc-zxc"); - test_tag_from_language ("ABC", "zh-xy-x-hbotabc-zxc"); - test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc"); + test_tag_from_language ("ABC", "fa-x-hbotabc-hbot-41686121-zxc"); + test_tag_from_language ("ABC", "fa-ir-x-hbotabc-hbot-41686121-zxc"); + test_tag_from_language ("ABC", "zh-x-hbotabc-hbot-41686121-zxc"); + test_tag_from_language ("ABC", "zh-cn-x-hbotabc-hbot-41686121-zxc"); + test_tag_from_language ("ABC", "zh-xy-x-hbotabc-hbot-41686121-zxc"); + test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-hbot-41686121-zxc"); + + test_tag_from_language ("Aha!", "fa-x-hbot-41686121-hbotabc-zxc"); + test_tag_from_language ("Aha!", "fa-ir-x-hbot-41686121-hbotabc-zxc"); + test_tag_from_language ("Aha!", "zh-x-hbot-41686121-hbotabc-zxc"); + test_tag_from_language ("Aha!", "zh-cn-x-hbot-41686121-hbotabc-zxc"); + test_tag_from_language ("Aha!", "zh-xy-x-hbot-41686121-hbotabc-zxc"); + test_tag_from_language ("Aha!", "xyz-xy-x-hbot-41686121-hbotabc-zxc"); + + /* Invalid x-hbot */ + test_tag_from_language ("dflt", "x-hbot"); + test_tag_from_language ("dflt", "x-hbot-"); + test_tag_from_language ("dflt", "x-hbot-1"); + test_tag_from_language ("dflt", "x-hbot-1a"); + test_tag_from_language ("dflt", "x-hbot-1a2b3c4x"); + test_tag_from_language ("2lon", "x-hbot-326c6f6e67"); /* Unnormalized BCP 47 tags */ test_tag_from_language ("ARA", "ar-aao");