From a184c5f8518ab92b95947f23848ddde677e8cac1 Mon Sep 17 00:00:00 2001 From: David Corbett Date: Sun, 30 Jan 2022 13:28:23 -0500 Subject: [PATCH] =?UTF-8?q?Don=E2=80=99t=20always=20inherit=20from=20macro?= =?UTF-8?q?languages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If an OpenType tag maps to a BCP 47 macrolanguage, that is presumably to support the use of the macrolanguage as a vague stand-in for one of its individual languages. For example, "ar" and "zh" are often used for "arb" and "cmn". When the OpenType tag maps to a macrolanguage and some but not all of its individual languages, that indicates that the OpenType tag only corresponds to the listed individual languages (which may be referred to using the macrolanguage subtag) but not the missing individual languages. In particular, INUK (Nunavik Inuktitut) is mapped to "ike" (Eastern Canadian Inuktitut) and "iu" (Inuktitut) but not to "ikt" (Inuinnaqtun), so "ikt" should not inherit the INUK mapping from its macrolanguage "iu". --- src/gen-tag-table.py | 35 +++++++++++++++++++++++++++++++---- src/hb-ot-tag-table.hh | 12 ++---------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index 3064240ee..d1f8fe286 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -467,6 +467,14 @@ class OpenTypeRegistryParser (HTMLParser): explicit mapping, so it inherits from sq (Albanian) the mapping to SQI. + However, if an OpenType tag maps to a BCP 47 macrolanguage and + some but not all of its individual languages, the mapping is not + inherited from the macrolanguage to the missing individual + languages. For example, INUK (Nunavik Inuktitut) is mapped to + ike (Eastern Canadian Inuktitut) and iu (Inuktitut) but not to + ikt (Inuinnaqtun, which is an individual language of iu), so + this method does not add a mapping from ikt to INUK. + If a BCP 47 tag for a macrolanguage has no OpenType mapping but some of its individual languages do, their mappings are copied to the macrolanguage. @@ -476,12 +484,30 @@ class OpenTypeRegistryParser (HTMLParser): if first_time: self.from_bcp_47_uninherited = dict (self.from_bcp_47) for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): - ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ())) + ot_macrolanguages = { + ot_macrolanguage for ot_macrolanguage in self.from_bcp_47_uninherited.get (macrolanguage, set ()) + } + blocked_ot_macrolanguages = set () + if 'retired code' not in bcp_47.scopes.get (macrolanguage, ''): + for ot_macrolanguage in ot_macrolanguages: + round_trip_macrolanguages = { + l for l in self.to_bcp_47[ot_macrolanguage] + if 'retired code' not in bcp_47.scopes.get (l, '') + } + round_trip_languages = { + l for l in languages + if 'retired code' not in bcp_47.scopes.get (l, '') + } + intersection = round_trip_macrolanguages & round_trip_languages + if intersection and intersection != round_trip_languages: + blocked_ot_macrolanguages.add (ot_macrolanguage) if ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages: - for language in languages: - self.add_language (language, ot_macrolanguage) - self.ranks[ot_macrolanguage] += 1 + if ot_macrolanguage not in blocked_ot_macrolanguages: + for language in languages: + self.add_language (language, ot_macrolanguage) + if not blocked_ot_macrolanguages: + self.ranks[ot_macrolanguage] += 1 elif first_time: for language in languages: if language in self.from_bcp_47_uninherited: @@ -715,6 +741,7 @@ ot.add_language ('no', 'NOR') ot.add_language ('oc-provenc', 'PRO') +ot.remove_language_ot ('QUZ') ot.add_language ('qu', 'QUZ') ot.add_language ('qub', 'QWH') ot.add_language ('qud', 'QVI') diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index a71d539d1..463e7a02a 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -628,7 +628,6 @@ static const LangTag ot_languages[] = { {"ike", HB_TAG('I','N','U',' ')}, /* Eastern Canadian Inuktitut -> Inuktitut */ {"ike", HB_TAG('I','N','U','K')}, /* Eastern Canadian Inuktitut -> Nunavik Inuktitut */ {"ikt", HB_TAG('I','N','U',' ')}, /* Inuinnaqtun -> Inuktitut */ - {"ikt", HB_TAG('I','N','U','K')}, /* Inuinnaqtun -> Nunavik Inuktitut */ /*{"ilo", HB_TAG('I','L','O',' ')},*/ /* Iloko -> Ilokano */ {"in", HB_TAG('I','N','D',' ')}, /* Indonesian (retired code) */ {"in", HB_TAG('M','L','Y',' ')}, /* Indonesian (retired code) -> Malay */ @@ -1044,7 +1043,6 @@ static const LangTag ot_languages[] = { {"nln", HB_TAG('N','A','H',' ')}, /* Durango Nahuatl (retired code) -> Nahuatl */ {"nlv", HB_TAG('N','A','H',' ')}, /* Orizaba Nahuatl -> Nahuatl */ {"nn", HB_TAG('N','Y','N',' ')}, /* Norwegian Nynorsk (Nynorsk, Norwegian) */ - {"nn", HB_TAG('N','O','R',' ')}, /* Norwegian Nynorsk -> Norwegian */ {"nnh", HB_TAG('B','M','L',' ')}, /* Ngiemboon -> Bamileke */ {"nnz", HB_TAG('B','M','L',' ')}, /* Nda'nda' -> Bamileke */ {"no", HB_TAG('N','O','R',' ')}, /* Norwegian [macrolanguage] */ @@ -2615,14 +2613,8 @@ hb_ot_tags_from_complex_language (const char *lang_str, if (0 == strcmp (&lang_str[1], "o-nyn")) { /* Norwegian Nynorsk (retired code) */ - unsigned int i; - hb_tag_t possible_tags[] = { - HB_TAG('N','Y','N',' '), /* Norwegian Nynorsk (Nynorsk, Norwegian) */ - HB_TAG('N','O','R',' '), /* Norwegian */ - }; - for (i = 0; i < 2 && i < *count; i++) - tags[i] = possible_tags[i]; - *count = i; + tags[0] = HB_TAG('N','Y','N',' '); /* Norwegian Nynorsk (Nynorsk, Norwegian) */ + *count = 1; return true; } break;