Match extlang subtags

If the second subtag of a BCP 47 tag is three letters long, it denotes an extended language. The tag converter ignores the language subtag and uses the extended language instead. There are some grandfathered exceptions, which are handled earlier.
2018-01-20 15:53:09 -05:00 · 2018-01-20 15:53:09 -05:00 · 7c7cb2a989
parent 2f1f961cc0
commit 7c7cb2a989
4 changed files with 38 additions and 2 deletions
--- a/src/gen-tag-table.py
+++ b/src/gen-tag-table.py
@@ -884,7 +884,7 @@ def print_subtag_matches (subtag):
 for language, tags in sorted (ot.from_bcp_47.items (), key=lambda i: (-len (i[0]), i[0])):
 	lt = LanguageTag (language)
-	if len (lt.subtags) == 1 or lt.grandfathered and ot.from_bcp_47[lt.subtags[0]] == tags:
+	if len (lt.subtags) == 1 or lt.grandfathered and len (lt.subtags[1]) != 3 and ot.from_bcp_47[lt.subtags[0]] == tags:
 		continue
 	print ('  if (', end='')
 	if (lt.language == 'und' or
--- a/src/hb-ot-tag-table.hh
+++ b/src/hb-ot-tag-table.hh
@@ -1279,6 +1279,13 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
    *count = 1;
    return true;
  }
  if (0 == strcmp (lang_str, "zh-min-nan"))
  {
    /* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo */
    tags[0] = HB_TAG('Z','H','S',' ');  /* Chinese Simplified */
    *count = 1;
    return true;
  }
  if (lang_matches (lang_str, "cdo-hans"))
  {
    /* Min Dong Chinese; Han (Simplified variant) */
@@ -1791,6 +1798,13 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
    *count = 1;
    return true;
  }
  if (0 == strcmp (lang_str, "no-bok"))
  {
    /* Norwegian Bokmal */
    tags[0] = HB_TAG('N','O','R',' ');  /* Norwegian */
    *count = 1;
    return true;
  }
  if (0 == strcmp (lang_str, "no-nyn"))
  {
    /* Norwegian Nynorsk */
@@ -1822,6 +1836,13 @@ hb_ot_tags_from_complex_language (const char   *lang_str,
    *count = 1;
    return true;
  }
  if (0 == strcmp (lang_str, "zh-min"))
  {
    /* Min, Fuzhou, Hokkien, Amoy, or Taiwanese */
    tags[0] = HB_TAG('Z','H','S',' ');  /* Chinese Simplified */
    *count = 1;
    return true;
  }
  if (0 == strcmp (lang_str, "i-hak"))
  {
    /* Hakka */
--- a/src/hb-ot-tag.cc
+++ b/src/hb-ot-tag.cc
@@ -249,8 +249,17 @@ hb_ot_tags_from_language (const char   *lang_str,
    return;
  /* Find a language matching in the first component. */
  s = strchr (lang_str, '-');
  {
    const LangTag *lang_tag;
    if (s && limit - lang_str >= 6)
    {
      const char *extlang_end = strchr (s + 1, '-');
      /* If there is an extended language tag, use it. */
      if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) &&
 	  ISALPHA (s[1]))
 	lang_str = s + 1;
    }
    lang_tag = (LangTag *) bsearch (lang_str, ot_languages,
 				    ARRAY_LENGTH (ot_languages), sizeof (LangTag),
 				    lang_compare_first_component);
@@ -264,7 +273,6 @@ hb_ot_tags_from_language (const char   *lang_str,
    }
  }
  s = strchr (lang_str, '-');
  if (!s)
    s = lang_str + strlen (lang_str);
  if (s - lang_str == 3) {
--- a/test/api/test-ot-tag.c
+++ b/test/api/test-ot-tag.c
@@ -369,9 +369,13 @@ test_ot_tag_language (void)
  test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc");
  /* Unnormalized BCP 47 tags */
  test_tag_from_language ("ARA", "ar-aao");
  test_tag_from_language ("JBO", "art-lojban");
  test_tag_from_language ("KOK", "kok-gom");
  test_tag_from_language ("LTZ", "i-lux");
  test_tag_from_language ("MNG", "drh");
  test_tag_from_language ("MOR", "ar-ary");
  test_tag_from_language ("MOR", "ar-ary-DZ");
  test_tag_from_language ("NOR", "no-bok");
  test_tag_from_language ("NYN", "no-nyn");
  test_tag_from_language ("ZHS", "i-hak");
@@ -379,6 +383,9 @@ test_ot_tag_language (void)
  test_tag_from_language ("ZHS", "zh-min");
  test_tag_from_language ("ZHS", "zh-min-nan");
  test_tag_from_language ("ZHS", "zh-xiang");
  /* A UN M.49 region code, not an extended language subtag */
  test_tag_from_language ("ARA", "ar-001");
 }
 static void