Round-trip OpenType tags through BCP 47

This commit is contained in:
David Corbett 2020-04-30 11:53:21 -04:00 committed by Behdad Esfahbod
parent 10d6605bbe
commit b207eab842
3 changed files with 92 additions and 44 deletions

View File

@ -577,6 +577,12 @@ static inline unsigned char TOUPPER (unsigned char c)
{ return (c >= 'a' && c <= 'z') ? c - 'a' + 'A' : c; }
/* ASCII-only lower-casing: maps 'A'..'Z' onto 'a'..'z' and hands back every
 * other byte value untouched. */
static inline unsigned char TOLOWER (unsigned char c)
{
  if (c < 'A' || c > 'Z')
    return c;
  return c - 'A' + 'a';
}
/* True when c is an ASCII hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'. */
static inline bool ISHEX (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
/* Lower-case hexadecimal digit for the low nibble of c.
 * The upper four bits of c are ignored. */
static inline unsigned char TOHEX (uint8_t c)
{
  const unsigned int nibble = c & 0xF;
  if (nibble > 9)
    return 'a' + (nibble - 10);
  return '0' + nibble;
}
/* Numeric value of one hexadecimal digit character.
 * Decimal digits map to 0..9; any other input is lower-cased with TOLOWER and
 * measured from 'a', so 'a'/'A' yield 10 and 'f'/'F' yield 15.  No validation
 * is done here: the result for a non-hex character is whatever that
 * arithmetic produces, truncated to uint8_t. */
static inline uint8_t FROMHEX (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  return TOLOWER (c) - 'a' + 10;
}
/* Ceiling of a / b for unsigned operands; b must be non-zero.
 * Written as quotient plus a remainder test rather than (a + b - 1) / b,
 * because that sum wraps around when a + b - 1 exceeds UINT_MAX and then
 * yields a result that is too small. */
static inline unsigned int DIV_CEIL (const unsigned int a, unsigned int b)
{ return a / b + (a % b != 0); }

View File

@ -319,12 +319,26 @@ parse_private_use_subtag (const char *private_use_subtag,
char tag[4];
int i;
s += strlen (prefix);
for (i = 0; i < 4 && ISALNUM (s[i]); i++)
tag[i] = normalize (s[i]);
if (!i) return false;
if (s[0] == '-') {
s += 1;
char c;
for (i = 0; i < 8 && ISHEX (s[i]); i++)
{
c = FROMHEX (s[i]);
if (i % 2 == 0)
tag[i / 2] = c << 4;
else
tag[i / 2] += c;
}
if (i != 8) return false;
} else {
for (i = 0; i < 4 && ISALNUM (s[i]); i++)
tag[i] = normalize (s[i]);
if (!i) return false;
for (; i < 4; i++)
tag[i] = ' ';
for (; i < 4; i++)
tag[i] = ' ';
}
tags[0] = HB_TAG (tag[0], tag[1], tag[2], tag[3]);
if ((tags[0] & 0xDFDFDFDF) == HB_OT_TAG_DEFAULT_SCRIPT)
tags[0] ^= ~0xDFDFDFDF;
@ -434,30 +448,28 @@ hb_ot_tag_to_language (hb_tag_t tag)
if (ot_languages[i].tag == tag)
return hb_language_from_string (ot_languages[i].language, -1);
/* If it's three letters long, assume it's ISO 639-3 and lower-case and use it
* (if it's not a registered tag, calling hb_ot_tag_from_language on the
* result might not return the same tag as the original tag).
* Else return a custom language in the form of "x-hbotABCD". */
/* Return a custom language in the form of "x-hbot-AABBCCDD".
* If it's three letters long, also guess it's ISO 639-3 and lower-case and
* prepend it (if it's not a registered tag, the private use subtags will
* ensure that calling hb_ot_tag_from_language on the result will still return
* the same tag as the original tag).
*/
{
char buf[11] = "x-hbot";
char buf[20];
char *str = buf;
buf[6] = tag >> 24;
buf[7] = (tag >> 16) & 0xFF;
buf[8] = (tag >> 8) & 0xFF;
buf[9] = tag & 0xFF;
if (buf[9] == 0x20)
if (ISALPHA (tag >> 24)
&& ISALPHA ((tag >> 16) & 0xFF)
&& ISALPHA ((tag >> 8) & 0xFF)
&& (tag & 0xFF) == ' ')
{
buf[9] = '\0';
if (ISALPHA (buf[6]) && ISALPHA (buf[7]) && ISALPHA (buf[8]))
{
buf[6] = TOLOWER (buf[6]);
buf[7] = TOLOWER (buf[7]);
buf[8] = TOLOWER (buf[8]);
str += 6;
}
buf[0] = TOLOWER (tag >> 24);
buf[1] = TOLOWER ((tag >> 16) & 0xFF);
buf[2] = TOLOWER ((tag >> 8) & 0xFF);
buf[3] = '-';
str += 4;
}
buf[10] = '\0';
return hb_language_from_string (str, -1);
snprintf (str, 16, "x-hbot-%08x", tag);
return hb_language_from_string (&*buf, -1);
}
}
@ -498,13 +510,14 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
unsigned char *buf;
const char *lang_str = hb_language_to_string (*language);
size_t len = strlen (lang_str);
buf = (unsigned char *) malloc (len + 11);
buf = (unsigned char *) malloc (len + 16);
if (unlikely (!buf))
{
*language = nullptr;
}
else
{
int shift;
memcpy (buf, lang_str, len);
if (lang_str[0] != 'x' || lang_str[1] != '-') {
buf[len++] = '-';
@ -515,10 +528,9 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
buf[len++] = 'b';
buf[len++] = 's';
buf[len++] = 'c';
buf[len++] = script_tag >> 24;
buf[len++] = (script_tag >> 16) & 0xFF;
buf[len++] = (script_tag >> 8) & 0xFF;
buf[len++] = script_tag & 0xFF;
buf[len++] = '-';
for (shift = 28; shift >= 0; shift -= 4)
buf[len++] = TOHEX (script_tag >> shift);
*language = hb_language_from_string ((char *) buf, len);
free (buf);
}

View File

@ -164,10 +164,16 @@ test_ot_tag_script_from_language (void)
test_script_tags_from_language ("copt", "en", HB_SCRIPT_COPTIC);
test_script_tags_from_language (NULL, "x-hbsc", HB_SCRIPT_INVALID);
test_script_tags_from_language ("copt", "x-hbsc", HB_SCRIPT_COPTIC);
test_script_tags_from_language (NULL, "x-hbsc-", HB_SCRIPT_INVALID);
test_script_tags_from_language (NULL, "x-hbsc-1", HB_SCRIPT_INVALID);
test_script_tags_from_language (NULL, "x-hbsc-1a", HB_SCRIPT_INVALID);
test_script_tags_from_language (NULL, "x-hbsc-1a2b3c4x", HB_SCRIPT_INVALID);
test_script_tags_from_language ("2lon", "x-hbsc-326c6f6e67", HB_SCRIPT_INVALID);
test_script_tags_from_language ("abc ", "x-hbscabc", HB_SCRIPT_INVALID);
test_script_tags_from_language ("deva", "x-hbscdeva", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev2", "x-hbscdev2", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev3", "x-hbscdev3", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev3", "x-hbsc-64657633", HB_SCRIPT_INVALID);
test_script_tags_from_language ("copt", "x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID);
test_script_tags_from_language (NULL, "en-x-hbsc", HB_SCRIPT_INVALID);
test_script_tags_from_language ("copt", "en-x-hbsc", HB_SCRIPT_COPTIC);
@ -175,6 +181,7 @@ test_ot_tag_script_from_language (void)
test_script_tags_from_language ("deva", "en-x-hbscdeva", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev2", "en-x-hbscdev2", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev3", "en-x-hbscdev3", HB_SCRIPT_INVALID);
test_script_tags_from_language ("dev3", "en-x-hbsc-64657633", HB_SCRIPT_INVALID);
test_script_tags_from_language ("copt", "en-x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID);
}
@ -266,12 +273,12 @@ test_tags_to_script_and_language (const char *script_tag_s,
/* Runs test_tags_to_script_and_language over a fixed list of cases.
 * Each call passes four strings; judging by their values they are
 * presumably an OpenType script tag, an OpenType language tag, the expected
 * script, and the expected BCP 47 language string -- TODO confirm against
 * the helper's full signature, which is not visible here.
 *
 * NOTE(review): several inputs appear twice with different expected
 * language strings (e.g. "DFLT"/"ENG" expects both "en-x-hbscdflt" and
 * "en-x-hbsc-44464c54").  Both cannot hold at once; this looks like the
 * old "x-hbscXXXX" form next to the new hex-encoded "x-hbsc-XXXXXXXX"
 * form -- confirm which set is live in the actual file. */
static void
test_ot_tags_to_script_and_language (void)
{
test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbscdflt");
test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbsc-44464c54");
test_tags_to_script_and_language ("latn", "ENG", "Latn", "en");
test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbscdeva");
test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbscdev2");
test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbsc-64657661");
test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbsc-64657632");
test_tags_to_script_and_language ("dev3", "MAR", "Deva", "mr");
test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbotqtz0-hbscqaa");
test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbot-51545a30-hbsc-71616120");
}
static void
@ -291,8 +298,9 @@ test_ot_tag_language (void)
test_language_two_way ("ENG", "en");
test_tag_from_language ("ENG", "en_US");
test_language_two_way ("CJA", "cja"); /* Western Cham */
test_language_two_way ("CJM", "cjm"); /* Eastern Cham */
test_language_two_way ("CJA", "cja-x-hbot-434a4120"); /* Western Cham */
test_language_two_way ("CJM", "cjm-x-hbot-434a4d20"); /* Eastern Cham */
test_tag_from_language ("CJM", "cjm");
test_language_two_way ("EVN", "eve");
test_language_two_way ("HAL", "cfm"); /* BCP47 and current ISO639-3 code for Halam/Falam Chin */
@ -351,13 +359,20 @@ test_ot_tag_language (void)
test_tag_from_language ("ZHH", "yue-Hant");
test_tag_from_language ("ZHS", "yue-Hans");
test_language_two_way ("ABC", "abc");
test_language_two_way ("ABCD", "x-hbotabcd");
test_language_two_way ("ABC", "abc-x-hbot-41424320");
test_language_two_way ("ABCD", "x-hbot-41424344");
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc-zxc");
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc");
test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbotabcd");
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320-zxc");
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320");
test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbot-41424344");
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot");
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc");
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc-414243");
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-414243");
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-4142432");
test_tag_from_language ("dflt", "xy");
test_tag_from_language ("XYZ", "xyz"); /* Unknown ISO 639-3 */
@ -423,12 +438,27 @@ test_ot_tag_language (void)
test_language_two_way ("SYRN", "und-Syrn");
/* Test that x-hbot overrides the base language */
test_tag_from_language ("ABC", "fa-x-hbotabc-zxc");
test_tag_from_language ("ABC", "fa-ir-x-hbotabc-zxc");
test_tag_from_language ("ABC", "zh-x-hbotabc-zxc");
test_tag_from_language ("ABC", "zh-cn-x-hbotabc-zxc");
test_tag_from_language ("ABC", "zh-xy-x-hbotabc-zxc");
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc");
test_tag_from_language ("ABC", "fa-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("ABC", "fa-ir-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("ABC", "zh-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("ABC", "zh-cn-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("ABC", "zh-xy-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-hbot-41686121-zxc");
test_tag_from_language ("Aha!", "fa-x-hbot-41686121-hbotabc-zxc");
test_tag_from_language ("Aha!", "fa-ir-x-hbot-41686121-hbotabc-zxc");
test_tag_from_language ("Aha!", "zh-x-hbot-41686121-hbotabc-zxc");
test_tag_from_language ("Aha!", "zh-cn-x-hbot-41686121-hbotabc-zxc");
test_tag_from_language ("Aha!", "zh-xy-x-hbot-41686121-hbotabc-zxc");
test_tag_from_language ("Aha!", "xyz-xy-x-hbot-41686121-hbotabc-zxc");
/* Invalid x-hbot */
test_tag_from_language ("dflt", "x-hbot");
test_tag_from_language ("dflt", "x-hbot-");
test_tag_from_language ("dflt", "x-hbot-1");
test_tag_from_language ("dflt", "x-hbot-1a");
test_tag_from_language ("dflt", "x-hbot-1a2b3c4x");
test_tag_from_language ("2lon", "x-hbot-326c6f6e67");
/* Unnormalized BCP 47 tags */
test_tag_from_language ("ARA", "ar-aao");