Match extlang subtags

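If the second subtag of a BCP 47 tag is exactly three letters long, it
denotes an extended language (extlang) subtag. In that case the tag
converter ignores the primary language subtag and uses the extended
language subtag instead. A three-digit second subtag, such as the `001`
in `ar-001`, is a UN M.49 region code and is not treated as an extended
language.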

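Some grandfathered tags whose second subtag merely looks like an extended
language (such as `zh-min`, `zh-min-nan` and `no-bok`) are exceptions to
this rule; they are matched explicitly and handled earlier, before the
extended-language check runs.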
This commit is contained in:
David Corbett 2018-01-20 15:53:09 -05:00 committed by Behdad Esfahbod
parent 2f1f961cc0
commit 7c7cb2a989
4 changed files with 38 additions and 2 deletions

View File

@ -884,7 +884,7 @@ def print_subtag_matches (subtag):
for language, tags in sorted (ot.from_bcp_47.items (), key=lambda i: (-len (i[0]), i[0])):
lt = LanguageTag (language)
if len (lt.subtags) == 1 or lt.grandfathered and ot.from_bcp_47[lt.subtags[0]] == tags:
if len (lt.subtags) == 1 or lt.grandfathered and len (lt.subtags[1]) != 3 and ot.from_bcp_47[lt.subtags[0]] == tags:
continue
print (' if (', end='')
if (lt.language == 'und' or

View File

@ -1279,6 +1279,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "zh-min-nan"))
{
/* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (lang_matches (lang_str, "cdo-hans"))
{
/* Min Dong Chinese; Han (Simplified variant) */
@ -1791,6 +1798,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "no-bok"))
{
/* Norwegian Bokmal */
tags[0] = HB_TAG('N','O','R',' '); /* Norwegian */
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "no-nyn"))
{
/* Norwegian Nynorsk */
@ -1822,6 +1836,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "zh-min"))
{
/* Min, Fuzhou, Hokkien, Amoy, or Taiwanese */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "i-hak"))
{
/* Hakka */

View File

@ -249,8 +249,17 @@ hb_ot_tags_from_language (const char *lang_str,
return;
/* Find a language matching in the first component. */
s = strchr (lang_str, '-');
{
const LangTag *lang_tag;
if (s && limit - lang_str >= 6)
{
const char *extlang_end = strchr (s + 1, '-');
/* If there is an extended language tag, use it. */
if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) &&
ISALPHA (s[1]))
lang_str = s + 1;
}
lang_tag = (LangTag *) bsearch (lang_str, ot_languages,
ARRAY_LENGTH (ot_languages), sizeof (LangTag),
lang_compare_first_component);
@ -264,7 +273,6 @@ hb_ot_tags_from_language (const char *lang_str,
}
}
s = strchr (lang_str, '-');
if (!s)
s = lang_str + strlen (lang_str);
if (s - lang_str == 3) {

View File

@ -369,9 +369,13 @@ test_ot_tag_language (void)
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc");
/* Unnormalized BCP 47 tags */
test_tag_from_language ("ARA", "ar-aao");
test_tag_from_language ("JBO", "art-lojban");
test_tag_from_language ("KOK", "kok-gom");
test_tag_from_language ("LTZ", "i-lux");
test_tag_from_language ("MNG", "drh");
test_tag_from_language ("MOR", "ar-ary");
test_tag_from_language ("MOR", "ar-ary-DZ");
test_tag_from_language ("NOR", "no-bok");
test_tag_from_language ("NYN", "no-nyn");
test_tag_from_language ("ZHS", "i-hak");
@ -379,6 +383,9 @@ test_ot_tag_language (void)
test_tag_from_language ("ZHS", "zh-min");
test_tag_from_language ("ZHS", "zh-min-nan");
test_tag_from_language ("ZHS", "zh-xiang");
/* A UN M.49 region code, not an extended language subtag */
test_tag_from_language ("ARA", "ar-001");
}
static void