Infer tag mappings for unregistered macrolanguages
Every macrolanguage not mentioned in the OT language system tag registry is mapped to every tag of its individual languages, if those have registered tags.
This commit is contained in:
parent
5a6545940a
commit
0e31595e0d
|
@ -329,6 +329,10 @@ class OpenTypeRegistryParser (HTMLParser):
|
|||
from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
|
||||
inverted. Its values start as unsorted sets;
|
||||
``sort_languages`` converts them to sorted lists.
|
||||
from_bcp_47_uninherited (Optional[Dict[str, AbstractSet[str]]]):
|
||||
A copy of ``from_bcp_47``. It starts as ``None`` and is
|
||||
populated at the beginning of the first call to
|
||||
``inherit_from_macrolanguages``.
|
||||
|
||||
"""
|
||||
def __init__ (self):
|
||||
|
@ -338,6 +342,7 @@ class OpenTypeRegistryParser (HTMLParser):
|
|||
self.ranks = collections.defaultdict (int)
|
||||
self.to_bcp_47 = collections.defaultdict (set)
|
||||
self.from_bcp_47 = collections.defaultdict (set)
|
||||
self.from_bcp_47_uninherited = None
|
||||
# Whether the parser is in a <td> element
|
||||
self._td = False
|
||||
# Whether the parser is after a <br> element within the current <tr> element
|
||||
|
@ -463,29 +468,24 @@ class OpenTypeRegistryParser (HTMLParser):
|
|||
to SQI.
|
||||
|
||||
If a BCP 47 tag for a macrolanguage has no OpenType mapping but
|
||||
all of its individual languages do and they all map to the same
|
||||
tags, the mapping is copied to the macrolanguage.
|
||||
some of its individual languages do, their mappings are copied
|
||||
to the macrolanguage.
|
||||
"""
|
||||
global bcp_47
|
||||
original_ot_from_bcp_47 = dict (self.from_bcp_47)
|
||||
first_time = self.from_bcp_47_uninherited is None
|
||||
if first_time:
|
||||
self.from_bcp_47_uninherited = dict (self.from_bcp_47)
|
||||
for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
|
||||
ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
|
||||
ot_macrolanguages = set (self.from_bcp_47_uninherited.get (macrolanguage, set ()))
|
||||
if ot_macrolanguages:
|
||||
for ot_macrolanguage in ot_macrolanguages:
|
||||
for language in languages:
|
||||
self.add_language (language, ot_macrolanguage)
|
||||
self.ranks[ot_macrolanguage] += 1
|
||||
else:
|
||||
elif first_time:
|
||||
for language in languages:
|
||||
if language in original_ot_from_bcp_47:
|
||||
if ot_macrolanguages:
|
||||
ml = original_ot_from_bcp_47[language]
|
||||
if ml:
|
||||
ot_macrolanguages &= ml
|
||||
else:
|
||||
pass
|
||||
else:
|
||||
ot_macrolanguages |= original_ot_from_bcp_47[language]
|
||||
if language in self.from_bcp_47_uninherited:
|
||||
ot_macrolanguages |= self.from_bcp_47_uninherited[language]
|
||||
else:
|
||||
ot_macrolanguages.clear ()
|
||||
if not ot_macrolanguages:
|
||||
|
@ -1121,7 +1121,11 @@ def verify_disambiguation_dict ():
|
|||
elif len (primary_tags) == 0:
|
||||
expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
|
||||
else:
|
||||
macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]')
|
||||
original_languages = [t for t in primary_tags if t in ot.from_bcp_47_uninherited and 'retired code' not in bcp_47.scopes.get (t, '')]
|
||||
if len (original_languages) == 1:
|
||||
macrolanguages = original_languages
|
||||
else:
|
||||
macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
|
||||
if len (macrolanguages) != 1:
|
||||
macrolanguages = list (t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]')
|
||||
if len (macrolanguages) != 1:
|
||||
|
|
|
@ -256,6 +256,8 @@ static const LangTag ot_languages[] = {
|
|||
{"chh", HB_TAG_NONE }, /* Chinook != Chattisgarhi */
|
||||
{"chj", HB_TAG('C','C','H','N')}, /* Ojitlán Chinantec -> Chinantec */
|
||||
{"chk", HB_TAG('C','H','K','0')}, /* Chuukese */
|
||||
{"chm", HB_TAG('H','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> High Mari */
|
||||
{"chm", HB_TAG('L','M','A',' ')}, /* Mari (Russia) [macrolanguage] -> Low Mari */
|
||||
{"chn", HB_TAG('C','P','P',' ')}, /* Chinook jargon -> Creoles */
|
||||
/*{"cho", HB_TAG('C','H','O',' ')},*/ /* Choctaw */
|
||||
{"chp", HB_TAG('C','H','P',' ')}, /* Chipewyan */
|
||||
|
@ -1311,6 +1313,9 @@ static const LangTag ot_languages[] = {
|
|||
{"sgo", HB_TAG_NONE }, /* Songa (retired code) != Sango */
|
||||
/*{"sgs", HB_TAG('S','G','S',' ')},*/ /* Samogitian */
|
||||
{"sgw", HB_TAG('C','H','G',' ')}, /* Sebat Bet Gurage -> Chaha Gurage */
|
||||
{"sh", HB_TAG('B','O','S',' ')}, /* Serbo-Croatian [macrolanguage] -> Bosnian */
|
||||
{"sh", HB_TAG('H','R','V',' ')}, /* Serbo-Croatian [macrolanguage] -> Croatian */
|
||||
{"sh", HB_TAG('S','R','B',' ')}, /* Serbo-Croatian [macrolanguage] -> Serbian */
|
||||
{"shi", HB_TAG('S','H','I',' ')}, /* Tachelhit */
|
||||
{"shi", HB_TAG('B','B','R',' ')}, /* Tachelhit -> Berber */
|
||||
{"shl", HB_TAG('Q','I','N',' ')}, /* Shendu -> Chin */
|
||||
|
@ -2841,6 +2846,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
|
|||
return hb_language_from_string ("fa", -1); /* Persian [macrolanguage] */
|
||||
case HB_TAG('G','O','N',' '): /* Gondi */
|
||||
return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */
|
||||
case HB_TAG('H','M','A',' '): /* High Mari */
|
||||
return hb_language_from_string ("mrj", -1); /* Western Mari */
|
||||
case HB_TAG('H','M','N',' '): /* Hmong */
|
||||
return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */
|
||||
case HB_TAG('H','N','D',' '): /* Hindko */
|
||||
|
@ -2881,6 +2888,8 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
|
|||
return hb_language_from_string ("uki", -1); /* Kui (India) */
|
||||
case HB_TAG('K','U','R',' '): /* Kurdish */
|
||||
return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */
|
||||
case HB_TAG('L','M','A',' '): /* Low Mari */
|
||||
return hb_language_from_string ("mhr", -1); /* Eastern Mari */
|
||||
case HB_TAG('L','U','H',' '): /* Luyia */
|
||||
return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */
|
||||
case HB_TAG('L','V','I',' '): /* Latvian */
|
||||
|
|
Loading…
Reference in New Issue