Auto-generate script names

2022-01-07 06:47:07 +00:00 · 2022-01-07 06:47:07 +00:00 · ccbd3052dc
parent e3a16626ae
commit ccbd3052dc
7 changed files with 5155 additions and 3725 deletions
--- a/maint/GenerateCommon.py
+++ b/maint/GenerateCommon.py
@ -13,51 +13,6 @@
 #                             DATA LISTS
 # ---------------------------------------------------------------------------

-# The lists of script names and script abbreviations must be kept in step. Note
-# that the pcre2pattern and pcre2syntax documentation has lists of scripts.
-
-script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
- 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
- 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
- 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
- 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
- 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
- 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
- # New for Unicode 5.0
- 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
- # New for Unicode 5.1
- 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
- # New for Unicode 5.2
- 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
- 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
- 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
- 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
- # New for Unicode 6.0.0
- 'Batak', 'Brahmi', 'Mandaic', \
-# New for Unicode 6.1.0
- 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
-# New for Unicode 7.0.0
- 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
- 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
- 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
- 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
-# New for Unicode 8.0.0
- 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
- 'SignWriting',
-# New for Unicode 10.0.0
- 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
- 'Nushu', 'Soyombo', 'Zanabazar_Square',
-# New for Unicode 11.0.0
-  'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
-  'Old_Sogdian', 'Sogdian',
-# New for Unicode 12.0.0
-  'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
-# New for Unicode 13.0.0
-  'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
-# New for Unicode 14.0.0
-  'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
- ]
-
 # BIDI classes in the DerivedBidiClass.txt file, with comments.

 bidi_classes = [
@ -145,203 +100,63 @@ break_properties = [
  'Extended_Pictographic', '14'
  ]

-# List of abbreviations for various properties
+# ---------------------------------------------------------------------------
+#                     COLLECTING PROPERTY NAMES
+# ---------------------------------------------------------------------------

-abbreviations = {
-# Script abbreviations
-  'Unknown': 'Zzzz',
-  'Arabic': 'Arab',
-  'Armenian': 'Armn',
-  'Bengali': 'Beng',
-  'Bopomofo': 'Bopo',
-  'Braille': 'Brai',
-  'Buginese': 'Bugi',
-  'Buhid': 'Buhd',
-  'Canadian_Aboriginal': 'Cans',
-  'Cherokee': 'Cher',
-  'Common': 'Zyyy',
-  'Coptic': ('Copt', 'Qaac'),
-  'Cypriot': 'Cprt',
-  'Cyrillic': 'Cyrl',
-  'Deseret': 'Dsrt',
-  'Devanagari': 'Deva',
-  'Ethiopic': 'Ethi',
-  'Georgian': 'Geor',
-  'Glagolitic': 'Glag',
-  'Gothic': 'Goth',
-  'Greek': 'Grek',
-  'Gujarati': 'Gujr',
-  'Gurmukhi': 'Guru',
-  'Han': 'Hani',
-  'Hangul': 'Hang',
-  'Hanunoo': 'Hano',
-  'Hebrew': 'Hebr',
-  'Hiragana': 'Hira',
-  'Inherited': ('Zinh', 'Qaai'),
-  'Kannada': 'Knda',
-  'Katakana': 'Kana',
-  'Kharoshthi': 'Khar',
-  'Khmer': 'Khmr',
-  'Lao': 'Laoo',
-  'Latin': 'Latn',
-  'Limbu': 'Limb',
-  'Linear_B': 'Linb',
-  'Malayalam': 'Mlym',
-  'Mongolian': 'Mong',
-  'Myanmar': 'Mymr',
-  'New_Tai_Lue': 'Talu',
-  'Ogham': 'Ogam',
-  'Old_Italic': 'Ital',
-  'Old_Persian': 'Xpeo',
-  'Oriya': 'Orya',
-  'Osmanya': 'Osma',
-  'Runic': 'Runr',
-  'Shavian': 'Shaw',
-  'Sinhala': 'Sinh',
-  'Syloti_Nagri': 'Sylo',
-  'Syriac': 'Syrc',
-  'Tagalog': 'Tglg',
-  'Tagbanwa': 'Tagb',
-  'Tai_Le': 'Tale',
-  'Tamil': 'Taml',
-  'Telugu': 'Telu',
-  'Thaana': 'Thaa',
-  'Thai': (),
-  'Tibetan': 'Tibt',
-  'Tifinagh': 'Tfng',
-  'Ugaritic': 'Ugar',
-  'Yi': 'Yiii',
-# New for Unicode 5.0
-  'Balinese': 'Bali',
-  'Cuneiform': 'Xsux',
-  'Nko': 'Nkoo',
-  'Phags_Pa': 'Phag',
-  'Phoenician': 'Phnx',
-# New for Unicode 5.1
-  'Carian': 'Cari',
-  'Cham': (),
-  'Kayah_Li': 'Kali',
-  'Lepcha': 'Lepc',
-  'Lycian': 'Lyci',
-  'Lydian': 'Lydi',
-  'Ol_Chiki': 'Olck',
-  'Rejang': 'Rjng',
-  'Saurashtra': 'Saur',
-  'Sundanese': 'Sund',
-  'Vai': 'Vaii',
-# New for Unicode 5.2
-  'Avestan': 'Avst',
-  'Bamum': 'Bamu',
-  'Egyptian_Hieroglyphs': 'Egyp',
-  'Imperial_Aramaic': 'Armi',
-  'Inscriptional_Pahlavi': 'Phli',
-  'Inscriptional_Parthian': 'Prti',
-  'Javanese': 'Java',
-  'Kaithi': 'Kthi',
-  'Lisu': (),
-  'Meetei_Mayek': 'Mtei',
-  'Old_South_Arabian': 'Sarb',
-  'Old_Turkic': 'Orkh',
-  'Samaritan': 'Samr',
-  'Tai_Tham': 'Lana',
-  'Tai_Viet': 'Tavt',
-# New for Unicode 6.0.0
-  'Batak': 'Batk',
-  'Brahmi': 'Brah',
-  'Mandaic': 'Mand',
-# New for Unicode 6.1.0
-  'Chakma': 'Cakm',
-  'Meroitic_Cursive': 'Merc',
-  'Meroitic_Hieroglyphs': 'Mero',
-  'Miao': 'Plrd',
-  'Sharada': 'Shrd',
-  'Sora_Sompeng': 'Sora',
-  'Takri': 'Takr',
-# New for Unicode 7.0.0
-  'Bassa_Vah': 'Bass',
-  'Caucasian_Albanian': 'Aghb',
-  'Duployan': 'Dupl',
-  'Elbasan': 'Elba',
-  'Grantha': 'Gran',
-  'Khojki': 'Khoj',
-  'Khudawadi': 'Sind',
-  'Linear_A': 'Lina',
-  'Mahajani': 'Mahj',
-  'Manichaean': 'Mani',
-  'Mende_Kikakui': 'Mend',
-  'Modi': (),
-  'Mro': 'Mroo',
-  'Nabataean': 'Nbat',
-  'Old_North_Arabian': 'Narb',
-  'Old_Permic': 'Perm',
-  'Pahawh_Hmong': 'Hmng',
-  'Palmyrene': 'Palm',
-  'Psalter_Pahlavi': 'Phlp',
-  'Pau_Cin_Hau': 'Pauc',
-  'Siddham': 'Sidd',
-  'Tirhuta': 'Tirh',
-  'Warang_Citi': 'Wara',
-# New for Unicode 8.0.0
-  'Ahom': (),
-  'Anatolian_Hieroglyphs': 'Hluw',
-  'Hatran': 'Hatr',
-  'Multani': 'Mult',
-  'Old_Hungarian': 'Hung',
-  'SignWriting': 'Sgnw',
-# New for Unicode 10.0.0
-  'Adlam': 'Adlm',
-  'Bhaiksuki': 'Bhks',
-  'Marchen': 'Marc',
-  'Newa': (),
-  'Osage': 'Osge',
-  'Tangut': 'Tang',
-  'Masaram_Gondi': 'Gonm',
-  'Nushu': 'Nshu',
-  'Soyombo': 'Soyo',
-  'Zanabazar_Square': 'Zanb',
-# New for Unicode 11.0.0
-  'Dogra': 'Dogr',
-  'Gunjala_Gondi': 'Gong',
-  'Hanifi_Rohingya': 'Rohg',
-  'Makasar': 'Maka',
-  'Medefaidrin': 'Medf',
-  'Old_Sogdian': 'Sogo',
-  'Sogdian': 'Sogd',
-# New for Unicode 12.0.0
-  'Elymaic': 'Elym',
-  'Nandinagari': 'Nand',
-  'Nyiakeng_Puachue_Hmong': 'Hmnp',
-  'Wancho': 'Wcho',
-# New for Unicode 13.0.0
-  'Chorasmian': 'Chrs',
-  'Dives_Akuru': 'Diak',
-  'Khitan_Small_Script': 'Kits',
-  'Yezidi': 'Yezi',
-# New for Unicode 14.0.0
-  'Cypro_Minoan': 'Cpmn',
-  'Old_Uyghur': 'Ougr',
-  'Tangsa': 'Tngs',
-  'Toto': (),
-  'Vithkuqi': 'Vith',
-  }
+import re

-# Convert string abbreviations to tuples
-for key in abbreviations:
-  value = abbreviations[key]
-  if isinstance(value, str):
-    abbreviations[key] = (value,)
+script_names = ['Unknown']
+abbreviations = {}
+
+def collect_property_names():
+  """Populate script_names and abbreviations from the Unicode data files.
+
+  Reads Unicode.tables/Scripts.txt and appends every script name found there
+  to the global script_names list, in order of first appearance. Then reads
+  Unicode.tables/PropertyValueAliases.txt and, for every "sc" record, stores
+  a tuple of abbreviations in the global abbreviations dictionary, keyed by
+  the long script name:
+
+    ()              when the short and the long name are identical
+    (short,)        when no further alias field follows the long name
+    (short, extra)  when a further alias field follows the long name
+
+  Both paths are relative to the current working directory; open() raises
+  OSError if either file cannot be opened.
+  """
+  global script_names
+  global abbreviations
+
+  # A data line is a code point or a code point range, a semicolon, the
+  # script name and then a '#' comment. Other lines do not match.
+  names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
+
+  # Track every name collected so far, so that a script is listed only once
+  # even if its data lines are not all adjacent in the file.
+  seen = set(script_names)
+  with open("Unicode.tables/Scripts.txt") as f:
+    for line in f:
+      match_obj = names_re.match(line)
+      if match_obj is None:
+        continue
+
+      name = match_obj.group(1)
+      if name not in seen:
+        seen.add(name)
+        script_names.append(name)
+
+  # A record may carry a trailing comment, so splitting the line around
+  # semicolons is not enough.
+  value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
+
+  with open("Unicode.tables/PropertyValueAliases.txt") as f:
+    for line in f:
+      match_obj = value_alias_re.match(line)
+
+      # Only script ("sc") records are of interest here
+      if match_obj is None or match_obj.group(1) != "sc":
+        continue
+
+      short_name, long_name, extra = match_obj.group(2, 3, 4)
+
+      # The pattern for the optional fourth field admits spaces, so trim it.
+      # A field that is blank after trimming is treated as absent.
+      extra = extra.strip() if extra is not None else ""
+
+      if short_name == long_name:
+        abbreviations[long_name] = ()
+      elif not extra:
+        abbreviations[long_name] = (short_name,)
+      else:
+        abbreviations[long_name] = (short_name, extra)
+
+collect_property_names()

 # ---------------------------------------------------------------------------
 #                      REORDERING SCRIPT NAMES
 # ---------------------------------------------------------------------------

-import re
-
 script_abbrevs = []

 def reorder_scripts():
  global script_names
  global script_abbrevs
+  global abbreviations

  for name in script_names:
    abbrevs = abbreviations[name]
--- a/maint/Unicode.tables/PropertyValueAliases.txt
+++ b/maint/Unicode.tables/PropertyValueAliases.txt
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
--- a/src/pcre2_ucp.h
+++ b/src/pcre2_ucp.h
@ -153,45 +153,45 @@ enum {

 enum {
  /* Scripts which has characters in other scripts. */
-  ucp_Arabic,
-  ucp_Bengali,
-  ucp_Bopomofo,
-  ucp_Buginese,
-  ucp_Buhid,
-  ucp_Coptic,
-  ucp_Cypriot,
-  ucp_Cyrillic,
-  ucp_Devanagari,
-  ucp_Georgian,
-  ucp_Glagolitic,
-  ucp_Greek,
-  ucp_Gujarati,
-  ucp_Gurmukhi,
-  ucp_Han,
-  ucp_Hangul,
-  ucp_Hanunoo,
-  ucp_Hiragana,
-  ucp_Kannada,
-  ucp_Katakana,
  ucp_Latin,
-  ucp_Limbu,
-  ucp_Linear_B,
-  ucp_Malayalam,
-  ucp_Mongolian,
-  ucp_Myanmar,
-  ucp_Oriya,
-  ucp_Sinhala,
-  ucp_Syloti_Nagri,
+  ucp_Greek,
+  ucp_Cyrillic,
+  ucp_Arabic,
  ucp_Syriac,
-  ucp_Tagalog,
-  ucp_Tagbanwa,
-  ucp_Tai_Le,
+  ucp_Thaana,
+  ucp_Devanagari,
+  ucp_Bengali,
+  ucp_Gurmukhi,
+  ucp_Gujarati,
+  ucp_Oriya,
  ucp_Tamil,
  ucp_Telugu,
-  ucp_Thaana,
+  ucp_Kannada,
+  ucp_Malayalam,
+  ucp_Sinhala,
+  ucp_Myanmar,
+  ucp_Georgian,
+  ucp_Hangul,
+  ucp_Mongolian,
+  ucp_Hiragana,
+  ucp_Katakana,
+  ucp_Bopomofo,
+  ucp_Han,
  ucp_Yi,
-  ucp_Nko,
+  ucp_Tagalog,
+  ucp_Hanunoo,
+  ucp_Buhid,
+  ucp_Tagbanwa,
+  ucp_Limbu,
+  ucp_Tai_Le,
+  ucp_Linear_B,
+  ucp_Cypriot,
+  ucp_Buginese,
+  ucp_Coptic,
+  ucp_Glagolitic,
+  ucp_Syloti_Nagri,
  ucp_Phags_Pa,
+  ucp_Nko,
  ucp_Kayah_Li,
  ucp_Javanese,
  ucp_Kaithi,
@ -202,13 +202,13 @@ enum {
  ucp_Duployan,
  ucp_Grantha,
  ucp_Khojki,
-  ucp_Khudawadi,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
  ucp_Modi,
  ucp_Old_Permic,
  ucp_Psalter_Pahlavi,
+  ucp_Khudawadi,
  ucp_Tirhuta,
  ucp_Multani,
  ucp_Adlam,
@ -224,70 +224,70 @@ enum {

  /* Scripts which has no characters in other scripts. */
  ucp_Unknown,
-  ucp_Armenian,
-  ucp_Braille,
-  ucp_Canadian_Aboriginal,
-  ucp_Cherokee,
  ucp_Common,
-  ucp_Deseret,
-  ucp_Ethiopic,
-  ucp_Gothic,
+  ucp_Armenian,
  ucp_Hebrew,
-  ucp_Inherited,
-  ucp_Kharoshthi,
-  ucp_Khmer,
-  ucp_Lao,
-  ucp_New_Tai_Lue,
-  ucp_Ogham,
-  ucp_Old_Italic,
-  ucp_Old_Persian,
-  ucp_Osmanya,
-  ucp_Runic,
-  ucp_Shavian,
  ucp_Thai,
+  ucp_Lao,
  ucp_Tibetan,
-  ucp_Tifinagh,
+  ucp_Ethiopic,
+  ucp_Cherokee,
+  ucp_Canadian_Aboriginal,
+  ucp_Ogham,
+  ucp_Runic,
+  ucp_Khmer,
+  ucp_Old_Italic,
+  ucp_Gothic,
+  ucp_Deseret,
+  ucp_Inherited,
  ucp_Ugaritic,
+  ucp_Shavian,
+  ucp_Osmanya,
+  ucp_Braille,
+  ucp_New_Tai_Lue,
+  ucp_Tifinagh,
+  ucp_Old_Persian,
+  ucp_Kharoshthi,
  ucp_Balinese,
  ucp_Cuneiform,
  ucp_Phoenician,
-  ucp_Carian,
-  ucp_Cham,
-  ucp_Lepcha,
-  ucp_Lycian,
-  ucp_Lydian,
-  ucp_Ol_Chiki,
-  ucp_Rejang,
-  ucp_Saurashtra,
  ucp_Sundanese,
+  ucp_Lepcha,
+  ucp_Ol_Chiki,
  ucp_Vai,
-  ucp_Avestan,
-  ucp_Bamum,
-  ucp_Egyptian_Hieroglyphs,
-  ucp_Imperial_Aramaic,
-  ucp_Inscriptional_Pahlavi,
-  ucp_Inscriptional_Parthian,
-  ucp_Lisu,
-  ucp_Meetei_Mayek,
-  ucp_Old_South_Arabian,
-  ucp_Old_Turkic,
-  ucp_Samaritan,
+  ucp_Saurashtra,
+  ucp_Rejang,
+  ucp_Lycian,
+  ucp_Carian,
+  ucp_Lydian,
+  ucp_Cham,
  ucp_Tai_Tham,
  ucp_Tai_Viet,
+  ucp_Avestan,
+  ucp_Egyptian_Hieroglyphs,
+  ucp_Samaritan,
+  ucp_Lisu,
+  ucp_Bamum,
+  ucp_Meetei_Mayek,
+  ucp_Imperial_Aramaic,
+  ucp_Old_South_Arabian,
+  ucp_Inscriptional_Parthian,
+  ucp_Inscriptional_Pahlavi,
+  ucp_Old_Turkic,
  ucp_Batak,
  ucp_Brahmi,
  ucp_Meroitic_Cursive,
  ucp_Meroitic_Hieroglyphs,
  ucp_Miao,
  ucp_Sora_Sompeng,
-  ucp_Bassa_Vah,
  ucp_Caucasian_Albanian,
+  ucp_Bassa_Vah,
  ucp_Elbasan,
+  ucp_Pahawh_Hmong,
  ucp_Mende_Kikakui,
  ucp_Mro,
-  ucp_Nabataean,
  ucp_Old_North_Arabian,
-  ucp_Pahawh_Hmong,
+  ucp_Nabataean,
  ucp_Palmyrene,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
--- a/src/pcre2_ucptables.c
+++ b/src/pcre2_ucptables.c
@ -409,7 +409,7 @@ the "loose matching" rules that Unicode advises and Perl uses. */
 #define STRING_tifinagh0 STR_t STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
 #define STRING_tirh0 STR_t STR_i STR_r STR_h "\0"
 #define STRING_tirhuta0 STR_t STR_i STR_r STR_h STR_u STR_t STR_a "\0"
-#define STRING_tngs0 STR_t STR_n STR_g STR_s "\0"
+#define STRING_tnsa0 STR_t STR_n STR_s STR_a "\0"
 #define STRING_toto0 STR_t STR_o STR_t STR_o "\0"
 #define STRING_ugar0 STR_u STR_g STR_a STR_r "\0"
 #define STRING_ugaritic0 STR_u STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
@ -800,7 +800,7 @@ const char PRIV(utt_names)[] =
  STRING_tifinagh0
  STRING_tirh0
  STRING_tirhuta0
-  STRING_tngs0
+  STRING_tnsa0
  STRING_toto0
  STRING_ugar0
  STRING_ugaritic0
--- a/testdata/testinput26
+++ b/testdata/testinput26
--- a/testdata/testoutput26
+++ b/testdata/testoutput26