Round-trip OpenType tags through BCP 47
This commit is contained in:
parent
10d6605bbe
commit
b207eab842
|
@ -577,6 +577,12 @@ static inline unsigned char TOUPPER (unsigned char c)
|
|||
{ return (c >= 'a' && c <= 'z') ? c - 'a' + 'A' : c; }
|
||||
/* Maps an ASCII upper-case letter ('A'..'Z') to its lower-case counterpart.
 * Any other byte value is returned unchanged. */
static inline unsigned char TOLOWER (unsigned char c)
|
||||
{ return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c; }
|
||||
/* Returns true when c is an ASCII hexadecimal digit:
 * '0'..'9', 'a'..'f' or 'A'..'F'. */
static inline bool ISHEX (unsigned char c)
|
||||
{ return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); }
|
||||
/* Returns the lower-case ASCII hex digit ('0'..'9', 'a'..'f') for the low
 * four bits of c.  The high four bits are masked off and ignored. */
static inline unsigned char TOHEX (uint8_t c)
|
||||
{ return (c & 0xF) <= 9 ? (c & 0xF) + '0' : (c & 0xF) + 'a' - 10; }
|
||||
/* Converts an ASCII hex digit to its numeric value.
 * '0'..'9' map to 0..9; any other byte is lower-cased with TOLOWER and
 * measured from 'a', so 'a'..'f' and 'A'..'F' map to 10..15.
 * No validation is performed: the result for a non-hex byte is not
 * meaningful, so check the input (e.g. with ISHEX) before calling. */
static inline uint8_t FROMHEX (unsigned char c)
|
||||
{ return (c >= '0' && c <= '9') ? c - '0' : TOLOWER (c) - 'a' + 10; }
|
||||
|
||||
/* Returns a divided by b, rounded up to the next integer (ceiling division).
 *
 * The quotient and the remainder are computed separately instead of adding
 * (b - 1) to a first: that sum wraps around for large a and would yield a
 * quotient that is far too small.
 *
 * b must be non-zero; division by zero is undefined behaviour. */
static inline unsigned int DIV_CEIL (const unsigned int a, unsigned int b)
{
  /* One extra unit is needed exactly when b does not divide a evenly. */
  return a / b + (a % b != 0 ? 1u : 0u);
}
|
||||
|
|
|
@ -319,12 +319,26 @@ parse_private_use_subtag (const char *private_use_subtag,
|
|||
char tag[4];
|
||||
int i;
|
||||
s += strlen (prefix);
|
||||
if (s[0] == '-') {
|
||||
s += 1;
|
||||
char c;
|
||||
for (i = 0; i < 8 && ISHEX (s[i]); i++)
|
||||
{
|
||||
c = FROMHEX (s[i]);
|
||||
if (i % 2 == 0)
|
||||
tag[i / 2] = c << 4;
|
||||
else
|
||||
tag[i / 2] += c;
|
||||
}
|
||||
if (i != 8) return false;
|
||||
} else {
|
||||
for (i = 0; i < 4 && ISALNUM (s[i]); i++)
|
||||
tag[i] = normalize (s[i]);
|
||||
if (!i) return false;
|
||||
|
||||
for (; i < 4; i++)
|
||||
tag[i] = ' ';
|
||||
}
|
||||
tags[0] = HB_TAG (tag[0], tag[1], tag[2], tag[3]);
|
||||
if ((tags[0] & 0xDFDFDFDF) == HB_OT_TAG_DEFAULT_SCRIPT)
|
||||
tags[0] ^= ~0xDFDFDFDF;
|
||||
|
@ -434,30 +448,28 @@ hb_ot_tag_to_language (hb_tag_t tag)
|
|||
if (ot_languages[i].tag == tag)
|
||||
return hb_language_from_string (ot_languages[i].language, -1);
|
||||
|
||||
/* If it's three letters long, assume it's ISO 639-3 and lower-case and use it
|
||||
* (if it's not a registered tag, calling hb_ot_tag_from_language on the
|
||||
* result might not return the same tag as the original tag).
|
||||
* Else return a custom language in the form of "x-hbotABCD". */
|
||||
/* Return a custom language in the form of "x-hbot-AABBCCDD".
|
||||
* If it's three letters long, also guess it's ISO 639-3 and lower-case and
|
||||
* prepend it (if it's not a registered tag, the private use subtags will
|
||||
* ensure that calling hb_ot_tag_from_language on the result will still return
|
||||
* the same tag as the original tag).
|
||||
*/
|
||||
{
|
||||
char buf[11] = "x-hbot";
|
||||
char buf[20];
|
||||
char *str = buf;
|
||||
buf[6] = tag >> 24;
|
||||
buf[7] = (tag >> 16) & 0xFF;
|
||||
buf[8] = (tag >> 8) & 0xFF;
|
||||
buf[9] = tag & 0xFF;
|
||||
if (buf[9] == 0x20)
|
||||
if (ISALPHA (tag >> 24)
|
||||
&& ISALPHA ((tag >> 16) & 0xFF)
|
||||
&& ISALPHA ((tag >> 8) & 0xFF)
|
||||
&& (tag & 0xFF) == ' ')
|
||||
{
|
||||
buf[9] = '\0';
|
||||
if (ISALPHA (buf[6]) && ISALPHA (buf[7]) && ISALPHA (buf[8]))
|
||||
{
|
||||
buf[6] = TOLOWER (buf[6]);
|
||||
buf[7] = TOLOWER (buf[7]);
|
||||
buf[8] = TOLOWER (buf[8]);
|
||||
str += 6;
|
||||
buf[0] = TOLOWER (tag >> 24);
|
||||
buf[1] = TOLOWER ((tag >> 16) & 0xFF);
|
||||
buf[2] = TOLOWER ((tag >> 8) & 0xFF);
|
||||
buf[3] = '-';
|
||||
str += 4;
|
||||
}
|
||||
}
|
||||
buf[10] = '\0';
|
||||
return hb_language_from_string (str, -1);
|
||||
snprintf (str, 16, "x-hbot-%08x", tag);
|
||||
return hb_language_from_string (&*buf, -1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -498,13 +510,14 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
|
|||
unsigned char *buf;
|
||||
const char *lang_str = hb_language_to_string (*language);
|
||||
size_t len = strlen (lang_str);
|
||||
buf = (unsigned char *) malloc (len + 11);
|
||||
buf = (unsigned char *) malloc (len + 16);
|
||||
if (unlikely (!buf))
|
||||
{
|
||||
*language = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
int shift;
|
||||
memcpy (buf, lang_str, len);
|
||||
if (lang_str[0] != 'x' || lang_str[1] != '-') {
|
||||
buf[len++] = '-';
|
||||
|
@ -515,10 +528,9 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
|
|||
buf[len++] = 'b';
|
||||
buf[len++] = 's';
|
||||
buf[len++] = 'c';
|
||||
buf[len++] = script_tag >> 24;
|
||||
buf[len++] = (script_tag >> 16) & 0xFF;
|
||||
buf[len++] = (script_tag >> 8) & 0xFF;
|
||||
buf[len++] = script_tag & 0xFF;
|
||||
buf[len++] = '-';
|
||||
for (shift = 28; shift >= 0; shift -= 4)
|
||||
buf[len++] = TOHEX (script_tag >> shift);
|
||||
*language = hb_language_from_string ((char *) buf, len);
|
||||
free (buf);
|
||||
}
|
||||
|
|
|
@ -164,10 +164,16 @@ test_ot_tag_script_from_language (void)
|
|||
test_script_tags_from_language ("copt", "en", HB_SCRIPT_COPTIC);
|
||||
test_script_tags_from_language (NULL, "x-hbsc", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("copt", "x-hbsc", HB_SCRIPT_COPTIC);
|
||||
test_script_tags_from_language (NULL, "x-hbsc-", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language (NULL, "x-hbsc-1", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language (NULL, "x-hbsc-1a", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language (NULL, "x-hbsc-1a2b3c4x", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("2lon", "x-hbsc-326c6f6e67", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("abc ", "x-hbscabc", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("deva", "x-hbscdeva", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev2", "x-hbscdev2", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev3", "x-hbscdev3", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev3", "x-hbsc-64657633", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("copt", "x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language (NULL, "en-x-hbsc", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("copt", "en-x-hbsc", HB_SCRIPT_COPTIC);
|
||||
|
@ -175,6 +181,7 @@ test_ot_tag_script_from_language (void)
|
|||
test_script_tags_from_language ("deva", "en-x-hbscdeva", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev2", "en-x-hbscdev2", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev3", "en-x-hbscdev3", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("dev3", "en-x-hbsc-64657633", HB_SCRIPT_INVALID);
|
||||
test_script_tags_from_language ("copt", "en-x-hbotpap0-hbsccopt", HB_SCRIPT_INVALID);
|
||||
}
|
||||
|
||||
|
@ -266,12 +273,12 @@ test_tags_to_script_and_language (const char *script_tag_s,
|
|||
/* Test case: feeds a series of tag combinations to the
 * test_tags_to_script_and_language helper.
 *
 * NOTE(review): the helper's definition is not visible here; the four string
 * arguments look like (OpenType script tag, OpenType language tag, expected
 * script, expected language string) -- confirm against the helper.
 *
 * NOTE(review): several inputs appear twice with different expected language
 * strings, e.g. "mr-x-hbscdeva" and "mr-x-hbsc-64657661".  In the second form
 * the eight hex digits spell the four bytes of the tag ("deva" is
 * 64 65 76 61).  This looks like a diff view listing both the old and the new
 * expectation for the same call -- confirm which set is current. */
static void
|
||||
test_ot_tags_to_script_and_language (void)
|
||||
{
|
||||
test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbscdflt");
|
||||
test_tags_to_script_and_language ("DFLT", "ENG", "", "en-x-hbsc-44464c54");
|
||||
test_tags_to_script_and_language ("latn", "ENG", "Latn", "en");
|
||||
test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbscdeva");
|
||||
test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbscdev2");
|
||||
test_tags_to_script_and_language ("deva", "MAR", "Deva", "mr-x-hbsc-64657661");
|
||||
test_tags_to_script_and_language ("dev2", "MAR", "Deva", "mr-x-hbsc-64657632");
|
||||
test_tags_to_script_and_language ("dev3", "MAR", "Deva", "mr");
|
||||
test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbotqtz0-hbscqaa");
|
||||
test_tags_to_script_and_language ("qaa", "QTZ0", "Qaaa", "x-hbot-51545a30-hbsc-71616120");
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -291,8 +298,9 @@ test_ot_tag_language (void)
|
|||
test_language_two_way ("ENG", "en");
|
||||
test_tag_from_language ("ENG", "en_US");
|
||||
|
||||
test_language_two_way ("CJA", "cja"); /* Western Cham */
|
||||
test_language_two_way ("CJM", "cjm"); /* Eastern Cham */
|
||||
test_language_two_way ("CJA", "cja-x-hbot-434a4120"); /* Western Cham */
|
||||
test_language_two_way ("CJM", "cjm-x-hbot-434a4d20"); /* Eastern Cham */
|
||||
test_tag_from_language ("CJM", "cjm");
|
||||
test_language_two_way ("EVN", "eve");
|
||||
|
||||
test_language_two_way ("HAL", "cfm"); /* BCP47 and current ISO639-3 code for Halam/Falam Chin */
|
||||
|
@ -351,13 +359,20 @@ test_ot_tag_language (void)
|
|||
test_tag_from_language ("ZHH", "yue-Hant");
|
||||
test_tag_from_language ("ZHS", "yue-Hans");
|
||||
|
||||
test_language_two_way ("ABC", "abc");
|
||||
test_language_two_way ("ABCD", "x-hbotabcd");
|
||||
test_language_two_way ("ABC", "abc-x-hbot-41424320");
|
||||
test_language_two_way ("ABCD", "x-hbot-41424344");
|
||||
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbotabc");
|
||||
test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbotabcd");
|
||||
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320-zxc");
|
||||
test_tag_from_language ("ABC", "asdf-asdf-wer-x-hbot-41424320");
|
||||
test_tag_from_language ("ABCD", "asdf-asdf-wer-x-hbot-41424344");
|
||||
|
||||
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot");
|
||||
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc");
|
||||
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-zxc-414243");
|
||||
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-414243");
|
||||
test_tag_from_language ("dflt", "asdf-asdf-wer-x-hbot-4142432");
|
||||
|
||||
test_tag_from_language ("dflt", "xy");
|
||||
test_tag_from_language ("XYZ", "xyz"); /* Unknown ISO 639-3 */
|
||||
|
@ -423,12 +438,27 @@ test_ot_tag_language (void)
|
|||
test_language_two_way ("SYRN", "und-Syrn");
|
||||
|
||||
/* Test that x-hbot overrides the base language */
|
||||
test_tag_from_language ("ABC", "fa-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "fa-ir-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "zh-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "zh-cn-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "zh-xy-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc");
|
||||
test_tag_from_language ("ABC", "fa-x-hbotabc-hbot-41686121-zxc");
|
||||
test_tag_from_language ("ABC", "fa-ir-x-hbotabc-hbot-41686121-zxc");
|
||||
test_tag_from_language ("ABC", "zh-x-hbotabc-hbot-41686121-zxc");
|
||||
test_tag_from_language ("ABC", "zh-cn-x-hbotabc-hbot-41686121-zxc");
|
||||
test_tag_from_language ("ABC", "zh-xy-x-hbotabc-hbot-41686121-zxc");
|
||||
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-hbot-41686121-zxc");
|
||||
|
||||
test_tag_from_language ("Aha!", "fa-x-hbot-41686121-hbotabc-zxc");
|
||||
test_tag_from_language ("Aha!", "fa-ir-x-hbot-41686121-hbotabc-zxc");
|
||||
test_tag_from_language ("Aha!", "zh-x-hbot-41686121-hbotabc-zxc");
|
||||
test_tag_from_language ("Aha!", "zh-cn-x-hbot-41686121-hbotabc-zxc");
|
||||
test_tag_from_language ("Aha!", "zh-xy-x-hbot-41686121-hbotabc-zxc");
|
||||
test_tag_from_language ("Aha!", "xyz-xy-x-hbot-41686121-hbotabc-zxc");
|
||||
|
||||
/* Invalid x-hbot */
|
||||
test_tag_from_language ("dflt", "x-hbot");
|
||||
test_tag_from_language ("dflt", "x-hbot-");
|
||||
test_tag_from_language ("dflt", "x-hbot-1");
|
||||
test_tag_from_language ("dflt", "x-hbot-1a");
|
||||
test_tag_from_language ("dflt", "x-hbot-1a2b3c4x");
|
||||
test_tag_from_language ("2lon", "x-hbot-326c6f6e67");
|
||||
|
||||
/* Unnormalized BCP 47 tags */
|
||||
test_tag_from_language ("ARA", "ar-aao");
|
||||
|
|
Loading…
Reference in New Issue