From 0e31595e0d2e214262c4cf0d4136215bc4c89a0a Mon Sep 17 00:00:00 2001 From: David Corbett Date: Fri, 28 Jan 2022 22:26:38 -0500 Subject: [PATCH] Infer tag mappings for unregistered macrolanguages Every macrolanguage not mentioned in the OT language system tag registry is mapped to every tag of its individual languages, if those have registered tags. --- src/gen-tag-table.py | 34 +++++++++++++++++++--------------- src/hb-ot-tag-table.hh | 9 +++++++++ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index cb612b982..fa98d29de 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser): from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` inverted. Its values start as unsorted sets; ``sort_languages`` converts them to sorted lists. + from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]): + A copy of ``from_bcp_47``. It starts as ``None`` and is + populated at the beginning of the first call to + ``inherit_from_macrolanguages``. """ def __init__ (self): @@ -338,6 +342,7 @@ class OpenTypeRegistryParser (HTMLParser): self.ranks = collections.defaultdict (int) self.to_bcp_47 = collections.defaultdict (set) self.from_bcp_47 = collections.defaultdict (set) + self.from_bcp_47_uninherited = None # Whether the parser is in a element self._td = False # Whether the parser is after a
element within the current element @@ -463,29 +468,24 @@ class OpenTypeRegistryParser (HTMLParser): to SQI. If a BCP 47 tag for a macrolanguage has no OpenType mapping but - all of its individual languages do and they all map to the same - tags, the mapping is copied to the macrolanguage. + some of its individual languages do, their mappings are copied + to the macrolanguage. """ global bcp_47 - original_ot_from_bcp_47 = dict (self.from_bcp_47) + first_time = self.from_bcp_47_uninherited is None + if first_time: + self.from_bcp_47_uninherited = dict (self.from_bcp_47) for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): - ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) + ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ())) if ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages: for language in languages: self.add_language (language, ot_macrolanguage) self.ranks[ot_macrolanguage] += 1 - else: + elif first_time: for language in languages: - if language in original_ot_from_bcp_47: - if ot_macrolanguages: - ml = original_ot_from_bcp_47[language] - if ml: - ot_macrolanguages &= ml - else: - pass - else: - ot_macrolanguages |= original_ot_from_bcp_47[language] + if language in self.from_bcp_47_uninherited: + ot_macrolanguages |= self.from_bcp_47_uninherited[language] else: ot_macrolanguages.clear () if not ot_macrolanguages: @@ -1121,7 +1121,11 @@ def verify_disambiguation_dict (): elif len (primary_tags) == 0: expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) else: - macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]') + original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')] + if len (original_languages) == 1: + macrolanguages = original_languages + else: + macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]'] if len (macrolanguages) != 1: macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]') if len (macrolanguages) != 1: diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index c5ec1518b..f6e9238c7 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -256,6 +256,8 @@ static const LangTag ot_languages[] = { {"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */ {"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */ {"chk", HB_TAG('C','H','K','0')}, /* Chuukese */ + {"chm", HB_TAG('H','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> High Mari */ + {"chm", HB_TAG('L','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> Low Mari */ {"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */ /*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */ {"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */ @@ -1311,6 +1313,9 @@ static const LangTag ot_languages[] = { {"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */ /*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */ {"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */ + {"sh", HB_TAG('B','O','S',' ')}, /* Serbo-Croatian [macrolanguage] -> Bosnian */ + {"sh", HB_TAG('H','R','V',' ')}, /* Serbo-Croatian [macrolanguage] -> Croatian */ + {"sh", HB_TAG('S','R','B',' ')}, /* Serbo-Croatian [macrolanguage] -> Serbian */ {"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */ {"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */ {"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */ @@ -2841,6 +2846,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */ case HB_TAG('G','O','N',' '): /* Gondi */ return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */ + case HB_TAG('H','M','A',' '): /* High Mari */ + return hb_language_from_string ("mrj", -1); /* Western Mari */ case HB_TAG('H','M','N',' '): /* Hmong */ return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */ case HB_TAG('H','N','D',' '): /* Hindko */ @@ -2881,6 +2888,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag) return hb_language_from_string ("uki", -1); /* Kui (India) */ case HB_TAG('K','U','R',' '): /* Kurdish */ return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */ + case HB_TAG('L','M','A',' '): /* Low Mari */ + return hb_language_from_string ("mhr", -1); /* Eastern Mari */ case HB_TAG('L','U','H',' '): /* Luyia */ return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */ case HB_TAG('L','V','I',' '): /* Latvian */