Improve Unicode property abbreviation support

2022-01-06 17:18:15 +00:00 · 2022-01-06 17:18:15 +00:00 · e3a16626ae
parent 14dbc6e6ec
commit e3a16626ae
3 changed files with 309 additions and 151 deletions
--- a/maint/GenerateCommon.py
+++ b/maint/GenerateCommon.py
@ -58,45 +58,6 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille
  'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
 ]

-script_abbrevs = [
-  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
-  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
-  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
-  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
-  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
-  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
-  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
-#New for Unicode 5.0
-  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
-#New for Unicode 5.1
-  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
-  'Sund', 'Vaii',
-#New for Unicode 5.2
-  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
-  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
-#New for Unicode 6.0.0
-  'Batk', 'Brah', 'Mand',
-#New for Unicode 6.1.0
-  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
-#New for Unicode 7.0.0
-  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
-  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
-  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
-#New for Unicode 8.0.0
-  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
-#New for Unicode 10.0.0
-  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
-  'Zanb',
-#New for Unicode 11.0.0
-  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
-#New for Unicode 12.0.0
-  'Elym', 'Nand', 'Hmnp', 'Wcho',
-#New for Unicode 13.0.0
-  'Chrs', 'Diak', 'Kits', 'Yezi',
-#New for Unicode 14.0.0
-  'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith'
- ]
-
 # BIDI classes in the DerivedBidiClass.txt file, with comments.

 bidi_classes = [
@ -184,16 +145,208 @@ break_properties = [
  'Extended_Pictographic', '14'
  ]

+# Abbreviations for property values: full name -> abbreviation string, or a tuple of abbreviations (empty tuple if none)
+
+abbreviations = {
+# Script abbreviations
+  'Unknown': 'Zzzz',
+  'Arabic': 'Arab',
+  'Armenian': 'Armn',
+  'Bengali': 'Beng',
+  'Bopomofo': 'Bopo',
+  'Braille': 'Brai',
+  'Buginese': 'Bugi',
+  'Buhid': 'Buhd',
+  'Canadian_Aboriginal': 'Cans',
+  'Cherokee': 'Cher',
+  'Common': 'Zyyy',
+  'Coptic': ('Copt', 'Qaac'),
+  'Cypriot': 'Cprt',
+  'Cyrillic': 'Cyrl',
+  'Deseret': 'Dsrt',
+  'Devanagari': 'Deva',
+  'Ethiopic': 'Ethi',
+  'Georgian': 'Geor',
+  'Glagolitic': 'Glag',
+  'Gothic': 'Goth',
+  'Greek': 'Grek',
+  'Gujarati': 'Gujr',
+  'Gurmukhi': 'Guru',
+  'Han': 'Hani',
+  'Hangul': 'Hang',
+  'Hanunoo': 'Hano',
+  'Hebrew': 'Hebr',
+  'Hiragana': 'Hira',
+  'Inherited': ('Zinh', 'Qaai'),
+  'Kannada': 'Knda',
+  'Katakana': 'Kana',
+  'Kharoshthi': 'Khar',
+  'Khmer': 'Khmr',
+  'Lao': 'Laoo',
+  'Latin': 'Latn',
+  'Limbu': 'Limb',
+  'Linear_B': 'Linb',
+  'Malayalam': 'Mlym',
+  'Mongolian': 'Mong',
+  'Myanmar': 'Mymr',
+  'New_Tai_Lue': 'Talu',
+  'Ogham': 'Ogam',
+  'Old_Italic': 'Ital',
+  'Old_Persian': 'Xpeo',
+  'Oriya': 'Orya',
+  'Osmanya': 'Osma',
+  'Runic': 'Runr',
+  'Shavian': 'Shaw',
+  'Sinhala': 'Sinh',
+  'Syloti_Nagri': 'Sylo',
+  'Syriac': 'Syrc',
+  'Tagalog': 'Tglg',
+  'Tagbanwa': 'Tagb',
+  'Tai_Le': 'Tale',
+  'Tamil': 'Taml',
+  'Telugu': 'Telu',
+  'Thaana': 'Thaa',
+  'Thai': (),
+  'Tibetan': 'Tibt',
+  'Tifinagh': 'Tfng',
+  'Ugaritic': 'Ugar',
+  'Yi': 'Yiii',
+# New for Unicode 5.0
+  'Balinese': 'Bali',
+  'Cuneiform': 'Xsux',
+  'Nko': 'Nkoo',
+  'Phags_Pa': 'Phag',
+  'Phoenician': 'Phnx',
+# New for Unicode 5.1
+  'Carian': 'Cari',
+  'Cham': (),
+  'Kayah_Li': 'Kali',
+  'Lepcha': 'Lepc',
+  'Lycian': 'Lyci',
+  'Lydian': 'Lydi',
+  'Ol_Chiki': 'Olck',
+  'Rejang': 'Rjng',
+  'Saurashtra': 'Saur',
+  'Sundanese': 'Sund',
+  'Vai': 'Vaii',
+# New for Unicode 5.2
+  'Avestan': 'Avst',
+  'Bamum': 'Bamu',
+  'Egyptian_Hieroglyphs': 'Egyp',
+  'Imperial_Aramaic': 'Armi',
+  'Inscriptional_Pahlavi': 'Phli',
+  'Inscriptional_Parthian': 'Prti',
+  'Javanese': 'Java',
+  'Kaithi': 'Kthi',
+  'Lisu': (),
+  'Meetei_Mayek': 'Mtei',
+  'Old_South_Arabian': 'Sarb',
+  'Old_Turkic': 'Orkh',
+  'Samaritan': 'Samr',
+  'Tai_Tham': 'Lana',
+  'Tai_Viet': 'Tavt',
+# New for Unicode 6.0.0
+  'Batak': 'Batk',
+  'Brahmi': 'Brah',
+  'Mandaic': 'Mand',
+# New for Unicode 6.1.0
+  'Chakma': 'Cakm',
+  'Meroitic_Cursive': 'Merc',
+  'Meroitic_Hieroglyphs': 'Mero',
+  'Miao': 'Plrd',
+  'Sharada': 'Shrd',
+  'Sora_Sompeng': 'Sora',
+  'Takri': 'Takr',
+# New for Unicode 7.0.0
+  'Bassa_Vah': 'Bass',
+  'Caucasian_Albanian': 'Aghb',
+  'Duployan': 'Dupl',
+  'Elbasan': 'Elba',
+  'Grantha': 'Gran',
+  'Khojki': 'Khoj',
+  'Khudawadi': 'Sind',
+  'Linear_A': 'Lina',
+  'Mahajani': 'Mahj',
+  'Manichaean': 'Mani',
+  'Mende_Kikakui': 'Mend',
+  'Modi': (),
+  'Mro': 'Mroo',
+  'Nabataean': 'Nbat',
+  'Old_North_Arabian': 'Narb',
+  'Old_Permic': 'Perm',
+  'Pahawh_Hmong': 'Hmng',
+  'Palmyrene': 'Palm',
+  'Psalter_Pahlavi': 'Phlp',
+  'Pau_Cin_Hau': 'Pauc',
+  'Siddham': 'Sidd',
+  'Tirhuta': 'Tirh',
+  'Warang_Citi': 'Wara',
+# New for Unicode 8.0.0
+  'Ahom': (),
+  'Anatolian_Hieroglyphs': 'Hluw',
+  'Hatran': 'Hatr',
+  'Multani': 'Mult',
+  'Old_Hungarian': 'Hung',
+  'SignWriting': 'Sgnw',
+# New for Unicode 10.0.0
+  'Adlam': 'Adlm',
+  'Bhaiksuki': 'Bhks',
+  'Marchen': 'Marc',
+  'Newa': (),
+  'Osage': 'Osge',
+  'Tangut': 'Tang',
+  'Masaram_Gondi': 'Gonm',
+  'Nushu': 'Nshu',
+  'Soyombo': 'Soyo',
+  'Zanabazar_Square': 'Zanb',
+# New for Unicode 11.0.0
+  'Dogra': 'Dogr',
+  'Gunjala_Gondi': 'Gong',
+  'Hanifi_Rohingya': 'Rohg',
+  'Makasar': 'Maka',
+  'Medefaidrin': 'Medf',
+  'Old_Sogdian': 'Sogo',
+  'Sogdian': 'Sogd',
+# New for Unicode 12.0.0
+  'Elymaic': 'Elym',
+  'Nandinagari': 'Nand',
+  'Nyiakeng_Puachue_Hmong': 'Hmnp',
+  'Wancho': 'Wcho',
+# New for Unicode 13.0.0
+  'Chorasmian': 'Chrs',
+  'Dives_Akuru': 'Diak',
+  'Khitan_Small_Script': 'Kits',
+  'Yezidi': 'Yezi',
+# New for Unicode 14.0.0
+  'Cypro_Minoan': 'Cpmn',
+  'Old_Uyghur': 'Ougr',
+  'Tangsa': 'Tngs',
+  'Toto': (),
+  'Vithkuqi': 'Vith',
+  }
+
+# Convert string abbreviations to tuples
+for key in abbreviations:
+  value = abbreviations[key]
+  if isinstance(value, str):
+    abbreviations[key] = (value,)
+
 # ---------------------------------------------------------------------------
 #                      REORDERING SCRIPT NAMES
 # ---------------------------------------------------------------------------

 import re

+script_abbrevs = []
+
 def reorder_scripts():
  global script_names
  global script_abbrevs

+  for name in script_names:
+    abbrevs = abbreviations[name]
+    script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
+
  extended_script_abbrevs = set()
  with open("Unicode.tables/ScriptExtensions.txt") as f:
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
--- a/maint/GenerateUcpTables.py
+++ b/maint/GenerateUcpTables.py
@ -48,10 +48,10 @@
 # Import common data lists and functions

 from GenerateCommon import \
+  abbreviations, \
  bidi_classes, \
  category_names, \
  general_category_names, \
-  script_abbrevs, \
  script_names, \
  open_output

@ -75,14 +75,15 @@ category_names = category_names[::2]
 # Create standardized versions of the names by lowercasing and removing
 # underscores.

+def stdname(x):
+  return x.lower().replace('_', '')
+
 def stdnames(x):
  y = [''] * len(x)
  for i in range(len(x)):
-    y[i] = x[i].lower().replace('_', '')
+    y[i] = stdname(x[i])
  return y

-std_script_names = stdnames(script_names)
-std_script_abbrevs = stdnames(script_abbrevs)
 std_category_names = stdnames(category_names)
 std_general_category_names = stdnames(general_category_names)
 std_bidi_class_names = stdnames(bidi_class_names)
@ -92,18 +93,16 @@ std_bidi_class_names = stdnames(bidi_class_names)
 # latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
 # still use the full original names.

+utt_table = []
+
 scx_end = script_names.index('Unknown')

-utt_table  = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
-utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
-utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
-utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
+for idx, name in enumerate(script_names):
+  pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'

-# At lease one script abbreviation is the same as the full name of the script,
-# so we must remove duplicates. It doesn't matter if this operation changes the
-# order, because we are going to sort the list later.
-
-utt_table = list(set(utt_table))
+  utt_table.append((stdname(name), name, pt_type))
+  for abbrev in abbreviations[name]:
+    utt_table.append((stdname(abbrev), name, pt_type))

 # Add the remaining property lists

--- a/src/pcre2_ucptables.c
+++ b/src/pcre2_ucptables.c
@ -340,6 +340,8 @@ the "loose matching" rules that Unicode advises and Perl uses. */
 #define STRING_prti0 STR_p STR_r STR_t STR_i "\0"
 #define STRING_ps0 STR_p STR_s "\0"
 #define STRING_psalterpahlavi0 STR_p STR_s STR_a STR_l STR_t STR_e STR_r STR_p STR_a STR_h STR_l STR_a STR_v STR_i "\0"
+#define STRING_qaac0 STR_q STR_a STR_a STR_c "\0"
+#define STRING_qaai0 STR_q STR_a STR_a STR_i "\0"
 #define STRING_rejang0 STR_r STR_e STR_j STR_a STR_n STR_g "\0"
 #define STRING_rjng0 STR_r STR_j STR_n STR_g "\0"
 #define STRING_rohg0 STR_r STR_o STR_h STR_g "\0"
@ -729,6 +731,8 @@ const char PRIV(utt_names)[] =
  STRING_prti0
  STRING_ps0
  STRING_psalterpahlavi0
+  STRING_qaac0
+  STRING_qaai0
  STRING_rejang0
  STRING_rjng0
  STRING_rohg0
@ -1118,106 +1122,108 @@ const ucp_type_table PRIV(utt)[] = {
  { 1962, PT_SC, ucp_Inscriptional_Parthian },
  { 1967, PT_PC, ucp_Ps },
  { 1970, PT_SCX, ucp_Psalter_Pahlavi },
-  { 1985, PT_SC, ucp_Rejang },
-  { 1992, PT_SC, ucp_Rejang },
-  { 1997, PT_SCX, ucp_Hanifi_Rohingya },
-  { 2002, PT_SC, ucp_Runic },
-  { 2008, PT_SC, ucp_Runic },
-  { 2013, PT_GC, ucp_S },
-  { 2015, PT_SC, ucp_Samaritan },
+  { 1985, PT_SCX, ucp_Coptic },
+  { 1990, PT_SC, ucp_Inherited },
+  { 1995, PT_SC, ucp_Rejang },
+  { 2002, PT_SC, ucp_Rejang },
+  { 2007, PT_SCX, ucp_Hanifi_Rohingya },
+  { 2012, PT_SC, ucp_Runic },
+  { 2018, PT_SC, ucp_Runic },
+  { 2023, PT_GC, ucp_S },
  { 2025, PT_SC, ucp_Samaritan },
-  { 2030, PT_SC, ucp_Old_South_Arabian },
-  { 2035, PT_SC, ucp_Saurashtra },
-  { 2040, PT_SC, ucp_Saurashtra },
-  { 2051, PT_PC, ucp_Sc },
-  { 2054, PT_SC, ucp_SignWriting },
-  { 2059, PT_SCX, ucp_Sharada },
-  { 2067, PT_SC, ucp_Shavian },
-  { 2075, PT_SC, ucp_Shavian },
-  { 2080, PT_SCX, ucp_Sharada },
-  { 2085, PT_SC, ucp_Siddham },
-  { 2090, PT_SC, ucp_Siddham },
-  { 2098, PT_SC, ucp_SignWriting },
-  { 2110, PT_SCX, ucp_Khudawadi },
-  { 2115, PT_SCX, ucp_Sinhala },
-  { 2120, PT_SCX, ucp_Sinhala },
-  { 2128, PT_PC, ucp_Sk },
-  { 2131, PT_PC, ucp_Sm },
-  { 2134, PT_PC, ucp_So },
-  { 2137, PT_SCX, ucp_Sogdian },
-  { 2142, PT_SCX, ucp_Sogdian },
-  { 2150, PT_SC, ucp_Old_Sogdian },
-  { 2155, PT_SC, ucp_Sora_Sompeng },
-  { 2160, PT_SC, ucp_Sora_Sompeng },
-  { 2172, PT_SC, ucp_Soyombo },
-  { 2177, PT_SC, ucp_Soyombo },
-  { 2185, PT_SC, ucp_Sundanese },
-  { 2190, PT_SC, ucp_Sundanese },
-  { 2200, PT_SCX, ucp_Syloti_Nagri },
-  { 2205, PT_SCX, ucp_Syloti_Nagri },
-  { 2217, PT_SCX, ucp_Syriac },
-  { 2222, PT_SCX, ucp_Syriac },
-  { 2229, PT_SCX, ucp_Tagalog },
-  { 2237, PT_SCX, ucp_Tagbanwa },
-  { 2242, PT_SCX, ucp_Tagbanwa },
-  { 2251, PT_SCX, ucp_Tai_Le },
-  { 2257, PT_SC, ucp_Tai_Tham },
-  { 2265, PT_SC, ucp_Tai_Viet },
-  { 2273, PT_SCX, ucp_Takri },
-  { 2278, PT_SCX, ucp_Takri },
-  { 2284, PT_SCX, ucp_Tai_Le },
-  { 2289, PT_SC, ucp_New_Tai_Lue },
-  { 2294, PT_SCX, ucp_Tamil },
-  { 2300, PT_SCX, ucp_Tamil },
-  { 2305, PT_SC, ucp_Tangut },
-  { 2310, PT_SC, ucp_Tangsa },
-  { 2317, PT_SC, ucp_Tangut },
-  { 2324, PT_SC, ucp_Tai_Viet },
-  { 2329, PT_SCX, ucp_Telugu },
-  { 2334, PT_SCX, ucp_Telugu },
-  { 2341, PT_SC, ucp_Tifinagh },
-  { 2346, PT_SCX, ucp_Tagalog },
-  { 2351, PT_SCX, ucp_Thaana },
-  { 2356, PT_SCX, ucp_Thaana },
-  { 2363, PT_SC, ucp_Thai },
-  { 2368, PT_SC, ucp_Tibetan },
-  { 2376, PT_SC, ucp_Tibetan },
-  { 2381, PT_SC, ucp_Tifinagh },
-  { 2390, PT_SCX, ucp_Tirhuta },
-  { 2395, PT_SCX, ucp_Tirhuta },
-  { 2403, PT_SC, ucp_Tangsa },
-  { 2408, PT_SC, ucp_Toto },
-  { 2413, PT_SC, ucp_Ugaritic },
-  { 2418, PT_SC, ucp_Ugaritic },
-  { 2427, PT_SC, ucp_Unknown },
-  { 2435, PT_SC, ucp_Vai },
-  { 2439, PT_SC, ucp_Vai },
-  { 2444, PT_SC, ucp_Vithkuqi },
-  { 2449, PT_SC, ucp_Vithkuqi },
-  { 2458, PT_SC, ucp_Wancho },
-  { 2465, PT_SC, ucp_Warang_Citi },
-  { 2470, PT_SC, ucp_Warang_Citi },
-  { 2481, PT_SC, ucp_Wancho },
-  { 2486, PT_ALNUM, 0 },
-  { 2490, PT_SC, ucp_Old_Persian },
-  { 2495, PT_PXSPACE, 0 },
-  { 2499, PT_SPACE, 0 },
-  { 2503, PT_SC, ucp_Cuneiform },
-  { 2508, PT_UCNC, 0 },
-  { 2512, PT_WORD, 0 },
-  { 2516, PT_SCX, ucp_Yezidi },
-  { 2521, PT_SCX, ucp_Yezidi },
-  { 2528, PT_SCX, ucp_Yi },
-  { 2531, PT_SCX, ucp_Yi },
-  { 2536, PT_GC, ucp_Z },
-  { 2538, PT_SC, ucp_Zanabazar_Square },
-  { 2554, PT_SC, ucp_Zanabazar_Square },
-  { 2559, PT_SC, ucp_Inherited },
-  { 2564, PT_PC, ucp_Zl },
-  { 2567, PT_PC, ucp_Zp },
-  { 2570, PT_PC, ucp_Zs },
-  { 2573, PT_SC, ucp_Common },
-  { 2578, PT_SC, ucp_Unknown }
+  { 2035, PT_SC, ucp_Samaritan },
+  { 2040, PT_SC, ucp_Old_South_Arabian },
+  { 2045, PT_SC, ucp_Saurashtra },
+  { 2050, PT_SC, ucp_Saurashtra },
+  { 2061, PT_PC, ucp_Sc },
+  { 2064, PT_SC, ucp_SignWriting },
+  { 2069, PT_SCX, ucp_Sharada },
+  { 2077, PT_SC, ucp_Shavian },
+  { 2085, PT_SC, ucp_Shavian },
+  { 2090, PT_SCX, ucp_Sharada },
+  { 2095, PT_SC, ucp_Siddham },
+  { 2100, PT_SC, ucp_Siddham },
+  { 2108, PT_SC, ucp_SignWriting },
+  { 2120, PT_SCX, ucp_Khudawadi },
+  { 2125, PT_SCX, ucp_Sinhala },
+  { 2130, PT_SCX, ucp_Sinhala },
+  { 2138, PT_PC, ucp_Sk },
+  { 2141, PT_PC, ucp_Sm },
+  { 2144, PT_PC, ucp_So },
+  { 2147, PT_SCX, ucp_Sogdian },
+  { 2152, PT_SCX, ucp_Sogdian },
+  { 2160, PT_SC, ucp_Old_Sogdian },
+  { 2165, PT_SC, ucp_Sora_Sompeng },
+  { 2170, PT_SC, ucp_Sora_Sompeng },
+  { 2182, PT_SC, ucp_Soyombo },
+  { 2187, PT_SC, ucp_Soyombo },
+  { 2195, PT_SC, ucp_Sundanese },
+  { 2200, PT_SC, ucp_Sundanese },
+  { 2210, PT_SCX, ucp_Syloti_Nagri },
+  { 2215, PT_SCX, ucp_Syloti_Nagri },
+  { 2227, PT_SCX, ucp_Syriac },
+  { 2232, PT_SCX, ucp_Syriac },
+  { 2239, PT_SCX, ucp_Tagalog },
+  { 2247, PT_SCX, ucp_Tagbanwa },
+  { 2252, PT_SCX, ucp_Tagbanwa },
+  { 2261, PT_SCX, ucp_Tai_Le },
+  { 2267, PT_SC, ucp_Tai_Tham },
+  { 2275, PT_SC, ucp_Tai_Viet },
+  { 2283, PT_SCX, ucp_Takri },
+  { 2288, PT_SCX, ucp_Takri },
+  { 2294, PT_SCX, ucp_Tai_Le },
+  { 2299, PT_SC, ucp_New_Tai_Lue },
+  { 2304, PT_SCX, ucp_Tamil },
+  { 2310, PT_SCX, ucp_Tamil },
+  { 2315, PT_SC, ucp_Tangut },
+  { 2320, PT_SC, ucp_Tangsa },
+  { 2327, PT_SC, ucp_Tangut },
+  { 2334, PT_SC, ucp_Tai_Viet },
+  { 2339, PT_SCX, ucp_Telugu },
+  { 2344, PT_SCX, ucp_Telugu },
+  { 2351, PT_SC, ucp_Tifinagh },
+  { 2356, PT_SCX, ucp_Tagalog },
+  { 2361, PT_SCX, ucp_Thaana },
+  { 2366, PT_SCX, ucp_Thaana },
+  { 2373, PT_SC, ucp_Thai },
+  { 2378, PT_SC, ucp_Tibetan },
+  { 2386, PT_SC, ucp_Tibetan },
+  { 2391, PT_SC, ucp_Tifinagh },
+  { 2400, PT_SCX, ucp_Tirhuta },
+  { 2405, PT_SCX, ucp_Tirhuta },
+  { 2413, PT_SC, ucp_Tangsa },
+  { 2418, PT_SC, ucp_Toto },
+  { 2423, PT_SC, ucp_Ugaritic },
+  { 2428, PT_SC, ucp_Ugaritic },
+  { 2437, PT_SC, ucp_Unknown },
+  { 2445, PT_SC, ucp_Vai },
+  { 2449, PT_SC, ucp_Vai },
+  { 2454, PT_SC, ucp_Vithkuqi },
+  { 2459, PT_SC, ucp_Vithkuqi },
+  { 2468, PT_SC, ucp_Wancho },
+  { 2475, PT_SC, ucp_Warang_Citi },
+  { 2480, PT_SC, ucp_Warang_Citi },
+  { 2491, PT_SC, ucp_Wancho },
+  { 2496, PT_ALNUM, 0 },
+  { 2500, PT_SC, ucp_Old_Persian },
+  { 2505, PT_PXSPACE, 0 },
+  { 2509, PT_SPACE, 0 },
+  { 2513, PT_SC, ucp_Cuneiform },
+  { 2518, PT_UCNC, 0 },
+  { 2522, PT_WORD, 0 },
+  { 2526, PT_SCX, ucp_Yezidi },
+  { 2531, PT_SCX, ucp_Yezidi },
+  { 2538, PT_SCX, ucp_Yi },
+  { 2541, PT_SCX, ucp_Yi },
+  { 2546, PT_GC, ucp_Z },
+  { 2548, PT_SC, ucp_Zanabazar_Square },
+  { 2564, PT_SC, ucp_Zanabazar_Square },
+  { 2569, PT_SC, ucp_Inherited },
+  { 2574, PT_PC, ucp_Zl },
+  { 2577, PT_PC, ucp_Zp },
+  { 2580, PT_PC, ucp_Zs },
+  { 2583, PT_SC, ucp_Common },
+  { 2588, PT_SC, ucp_Unknown }
 };

 const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);