Infer tag mappings for unregistered macrolanguages

Every macrolanguage not mentioned in the OpenType language system tag
registry is mapped to every tag of its individual languages, if those
have registered tags.
This commit is contained in:
David Corbett 2022-01-28 22:26:38 -05:00
parent 5a6545940a
commit 0e31595e0d
2 changed files with 28 additions and 15 deletions

View File

@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser):
from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47`` from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
inverted. Its values start as unsorted sets; inverted. Its values start as unsorted sets;
``sort_languages`` converts them to sorted lists. ``sort_languages`` converts them to sorted lists.
from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
A copy of ``from_bcp_47``. It starts as ``None`` and is
populated at the beginning of the first call to
``inherit_from_macrolanguages``.
""" """
def __init__ (self): def __init__ (self):
@ -338,6 +342,7 @@ class OpenTypeRegistryParser (HTMLParser):
self.ranks = collections.defaultdict (int) self.ranks = collections.defaultdict (int)
self.to_bcp_47 = collections.defaultdict (set) self.to_bcp_47 = collections.defaultdict (set)
self.from_bcp_47 = collections.defaultdict (set) self.from_bcp_47 = collections.defaultdict (set)
self.from_bcp_47_uninherited = None
# Whether the parser is in a <td> element # Whether the parser is in a <td> element
self._td = False self._td = False
# Whether the parser is after a <br> element within the current <tr> element # Whether the parser is after a <br> element within the current <tr> element
@ -463,29 +468,24 @@ class OpenTypeRegistryParser (HTMLParser):
to SQI. to SQI.
If a BCP 47 tag for a macrolanguage has no OpenType mapping but If a BCP 47 tag for a macrolanguage has no OpenType mapping but
all of its individual languages do and they all map to the same some of its individual languages do, their mappings are copied
tags, the mapping is copied to the macrolanguage. to the macrolanguage.
""" """
global bcp_47 global bcp_47
original_ot_from_bcp_47 = dict (self.from_bcp_47) first_time = self.from_bcp_47_uninherited is None
if first_time:
self.from_bcp_47_uninherited = dict (self.from_bcp_47)
for macrolanguage, languages in dict (bcp_47.macrolanguages).items (): for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ())) ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ()))
if ot_macrolanguages: if ot_macrolanguages:
for ot_macrolanguage in ot_macrolanguages: for ot_macrolanguage in ot_macrolanguages:
for language in languages: for language in languages:
self.add_language (language, ot_macrolanguage) self.add_language (language, ot_macrolanguage)
self.ranks[ot_macrolanguage] += 1 self.ranks[ot_macrolanguage] += 1
else: elif first_time:
for language in languages: for language in languages:
if language in original_ot_from_bcp_47: if language in self.from_bcp_47_uninherited:
if ot_macrolanguages: ot_macrolanguages |= self.from_bcp_47_uninherited[language]
ml = original_ot_from_bcp_47[language]
if ml:
ot_macrolanguages &= ml
else:
pass
else:
ot_macrolanguages |= original_ot_from_bcp_47[language]
else: else:
ot_macrolanguages.clear () ot_macrolanguages.clear ()
if not ot_macrolanguages: if not ot_macrolanguages:
@ -1121,7 +1121,11 @@ def verify_disambiguation_dict ():
elif len (primary_tags) == 0: elif len (primary_tags) == 0:
expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag) expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
else: else:
macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]') original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
if len (original_languages) == 1:
macrolanguages = original_languages
else:
macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
if len (macrolanguages) != 1: if len (macrolanguages) != 1:
macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]') macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
if len (macrolanguages) != 1: if len (macrolanguages) != 1:

View File

@ -256,6 +256,8 @@ static const LangTag ot_languages[] = {
{"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */ {"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */
{"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */ {"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */
{"chk", HB_TAG('C','H','K','0')}, /* Chuukese */ {"chk", HB_TAG('C','H','K','0')}, /* Chuukese */
{"chm", HB_TAG('H','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> High Mari */
{"chm", HB_TAG('L','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> Low Mari */
{"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */ {"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */
/*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */ /*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */
{"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */ {"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */
@ -1311,6 +1313,9 @@ static const LangTag ot_languages[] = {
{"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */ {"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */
/*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */ /*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */
{"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */ {"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */
{"sh", HB_TAG('B','O','S',' ')}, /* Serbo-Croatian [macrolanguage] -> Bosnian */
{"sh", HB_TAG('H','R','V',' ')}, /* Serbo-Croatian [macrolanguage] -> Croatian */
{"sh", HB_TAG('S','R','B',' ')}, /* Serbo-Croatian [macrolanguage] -> Serbian */
{"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */ {"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */
{"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */ {"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */
{"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */ {"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */
@ -2841,6 +2846,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */ return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */
case HB_TAG('G','O','N',' '): /* Gondi */ case HB_TAG('G','O','N',' '): /* Gondi */
return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */ return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */
case HB_TAG('H','M','A',' '): /* High Mari */
return hb_language_from_string ("mrj", -1); /* Western Mari */
case HB_TAG('H','M','N',' '): /* Hmong */ case HB_TAG('H','M','N',' '): /* Hmong */
return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */ return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */
case HB_TAG('H','N','D',' '): /* Hindko */ case HB_TAG('H','N','D',' '): /* Hindko */
@ -2881,6 +2888,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
return hb_language_from_string ("uki", -1); /* Kui (India) */ return hb_language_from_string ("uki", -1); /* Kui (India) */
case HB_TAG('K','U','R',' '): /* Kurdish */ case HB_TAG('K','U','R',' '): /* Kurdish */
return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */ return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */
case HB_TAG('L','M','A',' '): /* Low Mari */
return hb_language_from_string ("mhr", -1); /* Eastern Mari */
case HB_TAG('L','U','H',' '): /* Luyia */ case HB_TAG('L','U','H',' '): /* Luyia */
return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */ return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */
case HB_TAG('L','V','I',' '): /* Latvian */ case HB_TAG('L','V','I',' '): /* Latvian */