Don’t always inherit from macrolanguages

If an OpenType tag maps to a BCP 47 macrolanguage, that is presumably to
support the use of the macrolanguage as a vague stand-in for one of its
individual languages. For example, "ar" and "zh" are often used for
"arb" and "cmn". When the OpenType tag maps to a macrolanguage and some
but not all of its individual languages, that indicates that the
OpenType tag corresponds only to the listed individual languages (which
may be referred to using the macrolanguage subtag) and not to the missing
individual languages. In particular, INUK (Nunavik Inuktitut) is mapped
to "ike" (Eastern Canadian Inuktitut) and "iu" (Inuktitut) but not to
"ikt" (Inuinnaqtun), so "ikt" should not inherit the INUK mapping from
its macrolanguage "iu".
This commit is contained in:
David Corbett 2022-01-30 13:28:23 -05:00
parent 0b1bf89cc2
commit a184c5f851
2 changed files with 33 additions and 14 deletions

View File

@ -467,6 +467,14 @@ class OpenTypeRegistryParser (HTMLParser):
explicit mapping, so it inherits from sq (Albanian) the mapping
to SQI.
However, if an OpenType tag maps to a BCP 47 macrolanguage and
some but not all of its individual languages, the mapping is not
inherited from the macrolanguage to the missing individual
languages. For example, INUK (Nunavik Inuktitut) is mapped to
ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to
ikt (Inuinnaqtun, which is an individual language of iu), so
this method does not add a mapping from ikt to INUK.
If a BCP 47 tag for a macrolanguage has no OpenType mapping but
some of its individual languages do, their mappings are copied
to the macrolanguage.
@ -476,12 +484,30 @@ class OpenTypeRegistryParser (HTMLParser):
if first_time:
self.from_bcp_47_uninherited = dict (self.from_bcp_47)
for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ()))
ot_macrolanguages = {
ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ())
}
blocked_ot_macrolanguages = set ()
if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''):
for ot_macrolanguage in ot_macrolanguages:
round_trip_macrolanguages = {
l for l in self.to_bcp_47[ot_macrolanguage]
if 'retired code' not in bcp_47.scopes.get (l, '')
}
round_trip_languages = {
l for l in languages
if 'retired code' not in bcp_47.scopes.get (l, '')
}
intersection = round_trip_macrolanguages & round_trip_languages
if intersection and intersection != round_trip_languages:
blocked_ot_macrolanguages.add (ot_macrolanguage)
if ot_macrolanguages:
for ot_macrolanguage in ot_macrolanguages:
for language in languages:
self.add_language (language, ot_macrolanguage)
self.ranks[ot_macrolanguage] += 1
if ot_macrolanguage not in blocked_ot_macrolanguages:
for language in languages:
self.add_language (language, ot_macrolanguage)
if not blocked_ot_macrolanguages:
self.ranks[ot_macrolanguage] += 1
elif first_time:
for language in languages:
if language in self.from_bcp_47_uninherited:
@ -715,6 +741,7 @@ ot.add_language ('no', 'NOR')
ot.add_language ('oc-provenc', 'PRO')
ot.remove_language_ot ('QUZ')
ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')

View File

@ -628,7 +628,6 @@ static const LangTag ot_languages[] = {
{"ike", HB_TAG('I','N','U',' ')}, /* Eastern Canadian Inuktitut -> Inuktitut */
{"ike", HB_TAG('I','N','U','K')}, /* Eastern Canadian Inuktitut -> Nunavik Inuktitut */
{"ikt", HB_TAG('I','N','U',' ')}, /* Inuinnaqtun -> Inuktitut */
{"ikt", HB_TAG('I','N','U','K')}, /* Inuinnaqtun -> Nunavik Inuktitut */
/*{"ilo", HB_TAG('I','L','O',' ')},*/ /* Iloko -> Ilokano */
{"in", HB_TAG('I','N','D',' ')}, /* Indonesian (retired code) */
{"in", HB_TAG('M','L','Y',' ')}, /* Indonesian (retired code) -> Malay */
@ -1044,7 +1043,6 @@ static const LangTag ot_languages[] = {
{"nln", HB_TAG('N','A','H',' ')}, /* Durango Nahuatl (retired code) -> Nahuatl */
{"nlv", HB_TAG('N','A','H',' ')}, /* Orizaba Nahuatl -> Nahuatl */
{"nn", HB_TAG('N','Y','N',' ')}, /* Norwegian Nynorsk (Nynorsk, Norwegian) */
{"nn", HB_TAG('N','O','R',' ')}, /* Norwegian Nynorsk -> Norwegian */
{"nnh", HB_TAG('B','M','L',' ')}, /* Ngiemboon -> Bamileke */
{"nnz", HB_TAG('B','M','L',' ')}, /* Nda'nda' -> Bamileke */
{"no", HB_TAG('N','O','R',' ')}, /* Norwegian [macrolanguage] */
@ -2615,14 +2613,8 @@ hb_ot_tags_from_complex_language (const char *lang_str,
if (0 == strcmp (&lang_str[1], "o-nyn"))
{
/* Norwegian Nynorsk (retired code) */
unsigned int i;
hb_tag_t possible_tags[] = {
HB_TAG('N','Y','N',' '), /* Norwegian Nynorsk (Nynorsk, Norwegian) */
HB_TAG('N','O','R',' '), /* Norwegian */
};
for (i = 0; i < 2 && i < *count; i++)
tags[i] = possible_tags[i];
*count = i;
tags[0] = HB_TAG('N','Y','N',' '); /* Norwegian Nynorsk (Nynorsk, Norwegian) */
*count = 1;
return true;
}
break;