Consistently emit BCP 47 subtag scope suffixes

This commit is contained in:
David Corbett 2020-10-10 14:15:16 -04:00 committed by Behdad Esfahbod
parent 1c05f6789b
commit 916c5a9007
2 changed files with 69 additions and 56 deletions

View File

@ -591,7 +591,9 @@ class BCP47Parser (object):
elif not has_preferred_value and line.startswith ('Macrolanguage: '):
self._add_macrolanguage (line.split (' ')[1], subtag)
elif subtag_type == 'variant':
if line.startswith ('Prefix: '):
if line.startswith ('Deprecated: '):
self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
elif line.startswith ('Prefix: '):
self.prefixes[subtag].add (line.split (' ')[1])
elif line.startswith ('File-Date: '):
self.header = line
@ -622,6 +624,17 @@ class BCP47Parser (object):
for macrolanguage in macrolanguages:
self._add_macrolanguage (biggest_macrolanguage, macrolanguage)
def _get_name_piece (self, subtag):
"""Return the first name of a subtag plus its scope suffix.
Args:
subtag (str): A BCP 47 subtag.
Returns:
The name form of ``subtag``.
"""
return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')
def get_name (self, lt):
"""Return the names of the subtags in a language tag.
@ -631,13 +644,13 @@ class BCP47Parser (object):
Returns:
The name form of ``lt``.
"""
name = self.names[lt.language].split ('\n')[0]
name = self._get_name_piece (lt.language)
if lt.script:
name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
name += '; ' + self._get_name_piece (lt.script.title ())
if lt.region:
name += '; ' + self.names[lt.region.upper ()].split ('\n')[0]
name += '; ' + self._get_name_piece (lt.region.upper ())
if lt.variant:
name += '; ' + self.names[lt.variant].split ('\n')[0]
name += '; ' + self._get_name_piece (lt.variant)
return name
bcp_47 = BCP47Parser ()

View File

@ -7,7 +7,7 @@
* on files with these headers:
*
* <meta name="updated_at" content="2019-05-22 06:05 PM" />
* File-Date: 2020-07-17
* File-Date: 2020-09-29
*/
#ifndef HB_OT_TAG_TABLE_HH
@ -1137,7 +1137,7 @@ hb_ot_tags_from_complex_language (const char *lang_str,
case 'a':
if (0 == strcmp (&lang_str[1], "rt-lojban"))
{
/* Lojban */
/* Lojban (retired code) */
tags[0] = HB_TAG('J','B','O',' '); /* Lojban */
*count = 1;
return true;
@ -1731,7 +1731,7 @@ hb_ot_tags_from_complex_language (const char *lang_str,
case 'i':
if (0 == strcmp (&lang_str[1], "-navajo"))
{
/* Navajo */
/* Navajo (retired code) */
unsigned int i;
hb_tag_t possible_tags[] = {
HB_TAG('N','A','V',' '), /* Navajo */
@ -1744,14 +1744,14 @@ hb_ot_tags_from_complex_language (const char *lang_str,
}
if (0 == strcmp (&lang_str[1], "-hak"))
{
/* Hakka */
/* Hakka (retired code) */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (0 == strcmp (&lang_str[1], "-lux"))
{
/* Luxembourgish */
/* Luxembourgish (retired code) */
tags[0] = HB_TAG('L','T','Z',' '); /* Luxembourgish */
*count = 1;
return true;
@ -1875,14 +1875,14 @@ hb_ot_tags_from_complex_language (const char *lang_str,
}
if (0 == strcmp (&lang_str[1], "o-bok"))
{
/* Norwegian Bokmal */
/* Norwegian Bokmal (retired code) */
tags[0] = HB_TAG('N','O','R',' '); /* Norwegian */
*count = 1;
return true;
}
if (0 == strcmp (&lang_str[1], "o-nyn"))
{
/* Norwegian Nynorsk */
/* Norwegian Nynorsk (retired code) */
tags[0] = HB_TAG('N','Y','N',' '); /* Norwegian Nynorsk (Nynorsk, Norwegian) */
*count = 1;
return true;
@ -1964,42 +1964,42 @@ hb_ot_tags_from_complex_language (const char *lang_str,
case 'z':
if (lang_matches (&lang_str[1], "h-hant-hk"))
{
/* Chinese */
/* Chinese [macrolanguage] */
tags[0] = HB_TAG('Z','H','H',' '); /* Chinese, Hong Kong SAR */
*count = 1;
return true;
}
if (lang_matches (&lang_str[1], "h-hant-mo"))
{
/* Chinese */
/* Chinese [macrolanguage] */
tags[0] = HB_TAG('Z','H','H',' '); /* Chinese, Hong Kong SAR */
*count = 1;
return true;
}
if (0 == strcmp (&lang_str[1], "h-min-nan"))
{
/* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo */
/* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo (retired code) */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (lang_matches (&lang_str[1], "h-hans"))
{
/* Chinese */
/* Chinese [macrolanguage] */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (lang_matches (&lang_str[1], "h-hant"))
{
/* Chinese */
/* Chinese [macrolanguage] */
tags[0] = HB_TAG('Z','H','T',' '); /* Chinese Traditional */
*count = 1;
return true;
}
if (0 == strcmp (&lang_str[1], "h-min"))
{
/* Min, Fuzhou, Hokkien, Amoy, or Taiwanese */
/* Min, Fuzhou, Hokkien, Amoy, or Taiwanese (retired code) */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
@ -2007,7 +2007,7 @@ hb_ot_tags_from_complex_language (const char *lang_str,
if (0 == strncmp (&lang_str[1], "h-", 2)
&& subtag_matches (lang_str, limit, "-hk"))
{
/* Chinese; Hong Kong */
/* Chinese [macrolanguage]; Hong Kong */
tags[0] = HB_TAG('Z','H','H',' '); /* Chinese, Hong Kong SAR */
*count = 1;
return true;
@ -2015,7 +2015,7 @@ hb_ot_tags_from_complex_language (const char *lang_str,
if (0 == strncmp (&lang_str[1], "h-", 2)
&& subtag_matches (lang_str, limit, "-mo"))
{
/* Chinese; Macao */
/* Chinese [macrolanguage]; Macao */
tags[0] = HB_TAG('Z','H','H',' '); /* Chinese, Hong Kong SAR */
*count = 1;
return true;
@ -2023,7 +2023,7 @@ hb_ot_tags_from_complex_language (const char *lang_str,
if (0 == strncmp (&lang_str[1], "h-", 2)
&& subtag_matches (lang_str, limit, "-tw"))
{
/* Chinese; Taiwan, Province of China */
/* Chinese [macrolanguage]; Taiwan, Province of China */
tags[0] = HB_TAG('Z','H','T',' '); /* Chinese Traditional */
*count = 1;
return true;
@ -2055,83 +2055,83 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
case HB_TAG('A','P','P','H'): /* Phonetic transcription—Americanist conventions */
return hb_language_from_string ("und-fonnapa", -1); /* Undetermined; North American Phonetic Alphabet */
case HB_TAG('A','R','A',' '): /* Arabic */
return hb_language_from_string ("ar", -1); /* Arabic */
return hb_language_from_string ("ar", -1); /* Arabic [macrolanguage] */
case HB_TAG('A','R','K',' '): /* Rakhine */
return hb_language_from_string ("rki", -1); /* Rakhine */
case HB_TAG('A','T','H',' '): /* Athapaskan */
return hb_language_from_string ("ath", -1); /* Athapascan */
return hb_language_from_string ("ath", -1); /* Athapascan [family] */
case HB_TAG('B','I','K',' '): /* Bikol */
return hb_language_from_string ("bik", -1); /* Bikol */
return hb_language_from_string ("bik", -1); /* Bikol [macrolanguage] */
case HB_TAG('C','P','P',' '): /* Creoles */
return hb_language_from_string ("crp", -1); /* Creoles and pidgins */
return hb_language_from_string ("crp", -1); /* Creoles and pidgins [family] */
case HB_TAG('C','R','R',' '): /* Carrier */
return hb_language_from_string ("crx", -1); /* Carrier */
case HB_TAG('D','N','K',' '): /* Dinka */
return hb_language_from_string ("din", -1); /* Dinka */
return hb_language_from_string ("din", -1); /* Dinka [macrolanguage] */
case HB_TAG('D','R','I',' '): /* Dari */
return hb_language_from_string ("prs", -1); /* Dari */
case HB_TAG('D','Z','N',' '): /* Dzongkha */
return hb_language_from_string ("dz", -1); /* Dzongkha */
case HB_TAG('E','T','I',' '): /* Estonian */
return hb_language_from_string ("et", -1); /* Estonian */
return hb_language_from_string ("et", -1); /* Estonian [macrolanguage] */
case HB_TAG('G','O','N',' '): /* Gondi */
return hb_language_from_string ("gon", -1); /* Gondi */
return hb_language_from_string ("gon", -1); /* Gondi [macrolanguage] */
case HB_TAG('H','M','N',' '): /* Hmong */
return hb_language_from_string ("hmn", -1); /* Hmong */
return hb_language_from_string ("hmn", -1); /* Hmong [macrolanguage] */
case HB_TAG('H','N','D',' '): /* Hindko */
return hb_language_from_string ("hnd", -1); /* Southern Hindko */
case HB_TAG('H','Y','E',' '): /* Armenian */
return hb_language_from_string ("hyw", -1); /* Western Armenian */
case HB_TAG('I','J','O',' '): /* Ijo */
return hb_language_from_string ("ijo", -1); /* Ijo */
return hb_language_from_string ("ijo", -1); /* Ijo [family] */
case HB_TAG('I','N','U',' '): /* Inuktitut */
return hb_language_from_string ("iu", -1); /* Inuktitut */
return hb_language_from_string ("iu", -1); /* Inuktitut [macrolanguage] */
case HB_TAG('I','P','K',' '): /* Inupiat */
return hb_language_from_string ("ik", -1); /* Inupiaq */
return hb_language_from_string ("ik", -1); /* Inupiaq [macrolanguage] */
case HB_TAG('I','P','P','H'): /* Phonetic transcription—IPA conventions */
return hb_language_from_string ("und-fonipa", -1); /* Undetermined; International Phonetic Alphabet */
case HB_TAG('I','R','T',' '): /* Irish Traditional */
return hb_language_from_string ("ga-Latg", -1); /* Irish; Latin (Gaelic variant) */
case HB_TAG('J','I','I',' '): /* Yiddish */
return hb_language_from_string ("yi", -1); /* Yiddish */
return hb_language_from_string ("yi", -1); /* Yiddish [macrolanguage] */
case HB_TAG('K','A','L',' '): /* Kalenjin */
return hb_language_from_string ("kln", -1); /* Kalenjin */
return hb_language_from_string ("kln", -1); /* Kalenjin [macrolanguage] */
case HB_TAG('K','G','E',' '): /* Khutsuri Georgian */
return hb_language_from_string ("und-Geok", -1); /* Undetermined; Khutsuri (Asomtavruli and Nuskhuri) */
case HB_TAG('K','N','R',' '): /* Kanuri */
return hb_language_from_string ("kr", -1); /* Kanuri */
return hb_language_from_string ("kr", -1); /* Kanuri [macrolanguage] */
case HB_TAG('K','O','K',' '): /* Konkani */
return hb_language_from_string ("kok", -1); /* Konkani */
return hb_language_from_string ("kok", -1); /* Konkani [macrolanguage] */
case HB_TAG('K','U','I',' '): /* Kui */
return hb_language_from_string ("uki", -1); /* Kui (India) */
case HB_TAG('K','U','R',' '): /* Kurdish */
return hb_language_from_string ("ku", -1); /* Kurdish */
return hb_language_from_string ("ku", -1); /* Kurdish [macrolanguage] */
case HB_TAG('L','U','H',' '): /* Luyia */
return hb_language_from_string ("luy", -1); /* Luyia */
return hb_language_from_string ("luy", -1); /* Luyia [macrolanguage] */
case HB_TAG('L','V','I',' '): /* Latvian */
return hb_language_from_string ("lv", -1); /* Latvian */
return hb_language_from_string ("lv", -1); /* Latvian [macrolanguage] */
case HB_TAG('M','A','W',' '): /* Marwari */
return hb_language_from_string ("mwr", -1); /* Marwari */
return hb_language_from_string ("mwr", -1); /* Marwari [macrolanguage] */
case HB_TAG('M','L','G',' '): /* Malagasy */
return hb_language_from_string ("mg", -1); /* Malagasy */
return hb_language_from_string ("mg", -1); /* Malagasy [macrolanguage] */
case HB_TAG('M','L','Y',' '): /* Malay */
return hb_language_from_string ("ms", -1); /* Malay */
return hb_language_from_string ("ms", -1); /* Malay [macrolanguage] */
case HB_TAG('M','N','G',' '): /* Mongolian */
return hb_language_from_string ("mn", -1); /* Mongolian */
return hb_language_from_string ("mn", -1); /* Mongolian [macrolanguage] */
case HB_TAG('M','O','L',' '): /* Moldavian */
return hb_language_from_string ("ro-MD", -1); /* Romanian; Moldova */
case HB_TAG('N','E','P',' '): /* Nepali */
return hb_language_from_string ("ne", -1); /* Nepali */
return hb_language_from_string ("ne", -1); /* Nepali [macrolanguage] */
case HB_TAG('N','I','S',' '): /* Nisi */
return hb_language_from_string ("njz", -1); /* Nyishi */
case HB_TAG('N','O','R',' '): /* Norwegian */
return hb_language_from_string ("no", -1); /* Norwegian */
return hb_language_from_string ("no", -1); /* Norwegian [macrolanguage] */
case HB_TAG('O','J','B',' '): /* Ojibway */
return hb_language_from_string ("oj", -1); /* Ojibwa */
return hb_language_from_string ("oj", -1); /* Ojibwa [macrolanguage] */
case HB_TAG('O','R','O',' '): /* Oromo */
return hb_language_from_string ("om", -1); /* Oromo */
return hb_language_from_string ("om", -1); /* Oromo [macrolanguage] */
case HB_TAG('P','A','S',' '): /* Pashto */
return hb_language_from_string ("ps", -1); /* Pashto */
return hb_language_from_string ("ps", -1); /* Pashto [macrolanguage] */
case HB_TAG('P','G','R',' '): /* Polytonic Greek */
return hb_language_from_string ("el-polyton", -1); /* Modern Greek (1453-); Polytonic Greek */
case HB_TAG('P','R','O',' '): /* Provençal / Old Provençal */
@ -2143,13 +2143,13 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
case HB_TAG('Q','W','H',' '): /* Quechua (Peru) */
return hb_language_from_string ("qwh", -1); /* Huaylas Ancash Quechua */
case HB_TAG('R','A','J',' '): /* Rajasthani */
return hb_language_from_string ("raj", -1); /* Rajasthani */
return hb_language_from_string ("raj", -1); /* Rajasthani [macrolanguage] */
case HB_TAG('R','O','Y',' '): /* Romany */
return hb_language_from_string ("rom", -1); /* Romany */
return hb_language_from_string ("rom", -1); /* Romany [macrolanguage] */
case HB_TAG('S','Q','I',' '): /* Albanian */
return hb_language_from_string ("sq", -1); /* Albanian */
return hb_language_from_string ("sq", -1); /* Albanian [macrolanguage] */
case HB_TAG('S','Y','R',' '): /* Syriac */
return hb_language_from_string ("syr", -1); /* Syriac */
return hb_language_from_string ("syr", -1); /* Syriac [macrolanguage] */
case HB_TAG('S','Y','R','E'): /* Syriac, Estrangela script-variant (equivalent to ISO 15924 'Syre') */
return hb_language_from_string ("und-Syre", -1); /* Undetermined; Syriac (Estrangelo variant) */
case HB_TAG('S','Y','R','J'): /* Syriac, Western script-variant (equivalent to ISO 15924 'Syrj') */
@ -2157,7 +2157,7 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
case HB_TAG('S','Y','R','N'): /* Syriac, Eastern script-variant (equivalent to ISO 15924 'Syrn') */
return hb_language_from_string ("und-Syrn", -1); /* Undetermined; Syriac (Eastern variant) */
case HB_TAG('T','M','H',' '): /* Tamashek */
return hb_language_from_string ("tmh", -1); /* Tamashek */
return hb_language_from_string ("tmh", -1); /* Tamashek [macrolanguage] */
case HB_TAG('T','N','E',' '): /* Tundra Nenets */
return hb_language_from_string ("yrk", -1); /* Nenets */
case HB_TAG('T','O','D',' '): /* Todo */
@ -2165,11 +2165,11 @@ hb_ot_ambiguous_tag_to_language (hb_tag_t tag)
case HB_TAG('T','W','I',' '): /* Twi */
return hb_language_from_string ("tw", -1); /* Twi */
case HB_TAG('Z','H','H',' '): /* Chinese, Hong Kong SAR */
return hb_language_from_string ("zh-HK", -1); /* Chinese; Hong Kong */
return hb_language_from_string ("zh-HK", -1); /* Chinese [macrolanguage]; Hong Kong */
case HB_TAG('Z','H','S',' '): /* Chinese Simplified */
return hb_language_from_string ("zh-Hans", -1); /* Chinese; Han (Simplified variant) */
return hb_language_from_string ("zh-Hans", -1); /* Chinese [macrolanguage]; Han (Simplified variant) */
case HB_TAG('Z','H','T',' '): /* Chinese Traditional */
return hb_language_from_string ("zh-Hant", -1); /* Chinese; Han (Traditional variant) */
return hb_language_from_string ("zh-Hant", -1); /* Chinese [macrolanguage]; Han (Traditional variant) */
default:
return HB_LANGUAGE_INVALID;
}