Auto-generate script names

This commit is contained in:
Zoltan Herczeg 2022-01-07 06:47:07 +00:00
parent e3a16626ae
commit ccbd3052dc
7 changed files with 5155 additions and 3725 deletions

View File

@ -13,51 +13,6 @@
# DATA LISTS
# ---------------------------------------------------------------------------
# The lists of script names and script abbreviations must be kept in step. Note
# that the pcre2pattern and pcre2syntax documentation has lists of scripts.
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
# New for Unicode 5.0
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
# New for Unicode 5.1
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
# New for Unicode 5.2
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
# New for Unicode 6.0.0
'Batak', 'Brahmi', 'Mandaic', \
# New for Unicode 6.1.0
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
# New for Unicode 7.0.0
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
# New for Unicode 8.0.0
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
'SignWriting',
# New for Unicode 10.0.0
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
'Nushu', 'Soyombo', 'Zanabazar_Square',
# New for Unicode 11.0.0
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
'Old_Sogdian', 'Sogdian',
# New for Unicode 12.0.0
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
# New for Unicode 13.0.0
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
# New for Unicode 14.0.0
'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
]
# BIDI classes in the DerivedBidiClass.txt file, with comments.
bidi_classes = [
@ -145,203 +100,63 @@ break_properties = [
'Extended_Pictographic', '14'
]
# List of abbreviations for various properties
# ---------------------------------------------------------------------------
# COLLECTING PROPERTY NAMES
# ---------------------------------------------------------------------------
abbreviations = {
# Script abbreviations
'Unknown': 'Zzzz',
'Arabic': 'Arab',
'Armenian': 'Armn',
'Bengali': 'Beng',
'Bopomofo': 'Bopo',
'Braille': 'Brai',
'Buginese': 'Bugi',
'Buhid': 'Buhd',
'Canadian_Aboriginal': 'Cans',
'Cherokee': 'Cher',
'Common': 'Zyyy',
'Coptic': ('Copt', 'Qaac'),
'Cypriot': 'Cprt',
'Cyrillic': 'Cyrl',
'Deseret': 'Dsrt',
'Devanagari': 'Deva',
'Ethiopic': 'Ethi',
'Georgian': 'Geor',
'Glagolitic': 'Glag',
'Gothic': 'Goth',
'Greek': 'Grek',
'Gujarati': 'Gujr',
'Gurmukhi': 'Guru',
'Han': 'Hani',
'Hangul': 'Hang',
'Hanunoo': 'Hano',
'Hebrew': 'Hebr',
'Hiragana': 'Hira',
'Inherited': ('Zinh', 'Qaai'),
'Kannada': 'Knda',
'Katakana': 'Kana',
'Kharoshthi': 'Khar',
'Khmer': 'Khmr',
'Lao': 'Laoo',
'Latin': 'Latn',
'Limbu': 'Limb',
'Linear_B': 'Linb',
'Malayalam': 'Mlym',
'Mongolian': 'Mong',
'Myanmar': 'Mymr',
'New_Tai_Lue': 'Talu',
'Ogham': 'Ogam',
'Old_Italic': 'Ital',
'Old_Persian': 'Xpeo',
'Oriya': 'Orya',
'Osmanya': 'Osma',
'Runic': 'Runr',
'Shavian': 'Shaw',
'Sinhala': 'Sinh',
'Syloti_Nagri': 'Sylo',
'Syriac': 'Syrc',
'Tagalog': 'Tglg',
'Tagbanwa': 'Tagb',
'Tai_Le': 'Tale',
'Tamil': 'Taml',
'Telugu': 'Telu',
'Thaana': 'Thaa',
'Thai': (),
'Tibetan': 'Tibt',
'Tifinagh': 'Tfng',
'Ugaritic': 'Ugar',
'Yi': 'Yiii',
# New for Unicode 5.0
'Balinese': 'Bali',
'Cuneiform': 'Xsux',
'Nko': 'Nkoo',
'Phags_Pa': 'Phag',
'Phoenician': 'Phnx',
# New for Unicode 5.1
'Carian': 'Cari',
'Cham': (),
'Kayah_Li': 'Kali',
'Lepcha': 'Lepc',
'Lycian': 'Lyci',
'Lydian': 'Lydi',
'Ol_Chiki': 'Olck',
'Rejang': 'Rjng',
'Saurashtra': 'Saur',
'Sundanese': 'Sund',
'Vai': 'Vaii',
# New for Unicode 5.2
'Avestan': 'Avst',
'Bamum': 'Bamu',
'Egyptian_Hieroglyphs': 'Egyp',
'Imperial_Aramaic': 'Armi',
'Inscriptional_Pahlavi': 'Phli',
'Inscriptional_Parthian': 'Prti',
'Javanese': 'Java',
'Kaithi': 'Kthi',
'Lisu': (),
'Meetei_Mayek': 'Mtei',
'Old_South_Arabian': 'Sarb',
'Old_Turkic': 'Orkh',
'Samaritan': 'Samr',
'Tai_Tham': 'Lana',
'Tai_Viet': 'Tavt',
# New for Unicode 6.0.0
'Batak': 'Batk',
'Brahmi': 'Brah',
'Mandaic': 'Mand',
# New for Unicode 6.1.0
'Chakma': 'Cakm',
'Meroitic_Cursive': 'Merc',
'Meroitic_Hieroglyphs': 'Mero',
'Miao': 'Plrd',
'Sharada': 'Shrd',
'Sora_Sompeng': 'Sora',
'Takri': 'Takr',
# New for Unicode 7.0.0
'Bassa_Vah': 'Bass',
'Caucasian_Albanian': 'Aghb',
'Duployan': 'Dupl',
'Elbasan': 'Elba',
'Grantha': 'Gran',
'Khojki': 'Khoj',
'Khudawadi': 'Sind',
'Linear_A': 'Lina',
'Mahajani': 'Mahj',
'Manichaean': 'Mani',
'Mende_Kikakui': 'Mend',
'Modi': (),
'Mro': 'Mroo',
'Nabataean': 'Nbat',
'Old_North_Arabian': 'Narb',
'Old_Permic': 'Perm',
'Pahawh_Hmong': 'Hmng',
'Palmyrene': 'Palm',
'Psalter_Pahlavi': 'Phlp',
'Pau_Cin_Hau': 'Pauc',
'Siddham': 'Sidd',
'Tirhuta': 'Tirh',
'Warang_Citi': 'Wara',
# New for Unicode 8.0.0
'Ahom': (),
'Anatolian_Hieroglyphs': 'Hluw',
'Hatran': 'Hatr',
'Multani': 'Mult',
'Old_Hungarian': 'Hung',
'SignWriting': 'Sgnw',
# New for Unicode 10.0.0
'Adlam': 'Adlm',
'Bhaiksuki': 'Bhks',
'Marchen': 'Marc',
'Newa': (),
'Osage': 'Osge',
'Tangut': 'Tang',
'Masaram_Gondi': 'Gonm',
'Nushu': 'Nshu',
'Soyombo': 'Soyo',
'Zanabazar_Square': 'Zanb',
# New for Unicode 11.0.0
'Dogra': 'Dogr',
'Gunjala_Gondi': 'Gong',
'Hanifi_Rohingya': 'Rohg',
'Makasar': 'Maka',
'Medefaidrin': 'Medf',
'Old_Sogdian': 'Sogo',
'Sogdian': 'Sogd',
# New for Unicode 12.0.0
'Elymaic': 'Elym',
'Nandinagari': 'Nand',
'Nyiakeng_Puachue_Hmong': 'Hmnp',
'Wancho': 'Wcho',
# New for Unicode 13.0.0
'Chorasmian': 'Chrs',
'Dives_Akuru': 'Diak',
'Khitan_Small_Script': 'Kits',
'Yezidi': 'Yezi',
# New for Unicode 14.0.0
'Cypro_Minoan': 'Cpmn',
'Old_Uyghur': 'Ougr',
'Tangsa': 'Tngs',
'Toto': (),
'Vithkuqi': 'Vith',
}
import re
# Convert string abbreviations to tuples
for key in abbreviations:
value = abbreviations[key]
if isinstance(value, str):
abbreviations[key] = (value,)
script_names = ['Unknown']
abbreviations = {}
def collect_property_names():
    """Fill the module-level script_names and abbreviations from the data files.

    Scripts.txt: the property value of every matching range line is appended
    to script_names, skipping a value that merely repeats the one on the
    previous matching line.

    PropertyValueAliases.txt: for every entry whose first field is "sc", the
    third field becomes a key of abbreviations. The value is a tuple built
    from the second field and the optional fourth field:
      ()              when the second field equals the third field
      (second,)       when there is no fourth field
      (second, fourth) otherwise
    """
    global script_names
    global abbreviations

    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    # Unicode data files are distributed as UTF-8; do not rely on the
    # locale's default encoding when reading them.
    with open("Unicode.tables/Scripts.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)
            if match_obj is None or match_obj.group(1) == last_script_name:
                continue
            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is a comment in the line,
    # so splitting around semicolons is not enough
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = value_alias_re.match(line)
            if match_obj is None:
                continue
            if match_obj.group(1) != "sc":
                continue

            short_name = match_obj.group(2)
            long_name = match_obj.group(3)

            # The character class of the fourth field admits blanks, so the
            # capture may carry trailing spaces (e.g. before a '#' comment)
            # or consist of blanks only; normalise it before storing.
            extra_name = match_obj.group(4)
            if extra_name is not None:
                extra_name = extra_name.strip()

            if short_name == long_name:
                abbreviations[long_name] = ()
            elif not extra_name:
                abbreviations[long_name] = (short_name,)
            else:
                abbreviations[long_name] = (short_name, extra_name)
collect_property_names()
# ---------------------------------------------------------------------------
# REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------
import re
script_abbrevs = []
def reorder_scripts():
global script_names
global script_abbrevs
global abbreviations
for name in script_names:
abbrevs = abbreviations[name]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -153,45 +153,45 @@ enum {
enum {
/* Scripts which have characters in other scripts. */
ucp_Arabic,
ucp_Bengali,
ucp_Bopomofo,
ucp_Buginese,
ucp_Buhid,
ucp_Coptic,
ucp_Cypriot,
ucp_Cyrillic,
ucp_Devanagari,
ucp_Georgian,
ucp_Glagolitic,
ucp_Greek,
ucp_Gujarati,
ucp_Gurmukhi,
ucp_Han,
ucp_Hangul,
ucp_Hanunoo,
ucp_Hiragana,
ucp_Kannada,
ucp_Katakana,
ucp_Latin,
ucp_Limbu,
ucp_Linear_B,
ucp_Malayalam,
ucp_Mongolian,
ucp_Myanmar,
ucp_Oriya,
ucp_Sinhala,
ucp_Syloti_Nagri,
ucp_Greek,
ucp_Cyrillic,
ucp_Arabic,
ucp_Syriac,
ucp_Tagalog,
ucp_Tagbanwa,
ucp_Tai_Le,
ucp_Thaana,
ucp_Devanagari,
ucp_Bengali,
ucp_Gurmukhi,
ucp_Gujarati,
ucp_Oriya,
ucp_Tamil,
ucp_Telugu,
ucp_Thaana,
ucp_Kannada,
ucp_Malayalam,
ucp_Sinhala,
ucp_Myanmar,
ucp_Georgian,
ucp_Hangul,
ucp_Mongolian,
ucp_Hiragana,
ucp_Katakana,
ucp_Bopomofo,
ucp_Han,
ucp_Yi,
ucp_Nko,
ucp_Tagalog,
ucp_Hanunoo,
ucp_Buhid,
ucp_Tagbanwa,
ucp_Limbu,
ucp_Tai_Le,
ucp_Linear_B,
ucp_Cypriot,
ucp_Buginese,
ucp_Coptic,
ucp_Glagolitic,
ucp_Syloti_Nagri,
ucp_Phags_Pa,
ucp_Nko,
ucp_Kayah_Li,
ucp_Javanese,
ucp_Kaithi,
@ -202,13 +202,13 @@ enum {
ucp_Duployan,
ucp_Grantha,
ucp_Khojki,
ucp_Khudawadi,
ucp_Linear_A,
ucp_Mahajani,
ucp_Manichaean,
ucp_Modi,
ucp_Old_Permic,
ucp_Psalter_Pahlavi,
ucp_Khudawadi,
ucp_Tirhuta,
ucp_Multani,
ucp_Adlam,
@ -224,70 +224,70 @@ enum {
/* Scripts which have no characters in other scripts. */
ucp_Unknown,
ucp_Armenian,
ucp_Braille,
ucp_Canadian_Aboriginal,
ucp_Cherokee,
ucp_Common,
ucp_Deseret,
ucp_Ethiopic,
ucp_Gothic,
ucp_Armenian,
ucp_Hebrew,
ucp_Inherited,
ucp_Kharoshthi,
ucp_Khmer,
ucp_Lao,
ucp_New_Tai_Lue,
ucp_Ogham,
ucp_Old_Italic,
ucp_Old_Persian,
ucp_Osmanya,
ucp_Runic,
ucp_Shavian,
ucp_Thai,
ucp_Lao,
ucp_Tibetan,
ucp_Tifinagh,
ucp_Ethiopic,
ucp_Cherokee,
ucp_Canadian_Aboriginal,
ucp_Ogham,
ucp_Runic,
ucp_Khmer,
ucp_Old_Italic,
ucp_Gothic,
ucp_Deseret,
ucp_Inherited,
ucp_Ugaritic,
ucp_Shavian,
ucp_Osmanya,
ucp_Braille,
ucp_New_Tai_Lue,
ucp_Tifinagh,
ucp_Old_Persian,
ucp_Kharoshthi,
ucp_Balinese,
ucp_Cuneiform,
ucp_Phoenician,
ucp_Carian,
ucp_Cham,
ucp_Lepcha,
ucp_Lycian,
ucp_Lydian,
ucp_Ol_Chiki,
ucp_Rejang,
ucp_Saurashtra,
ucp_Sundanese,
ucp_Lepcha,
ucp_Ol_Chiki,
ucp_Vai,
ucp_Avestan,
ucp_Bamum,
ucp_Egyptian_Hieroglyphs,
ucp_Imperial_Aramaic,
ucp_Inscriptional_Pahlavi,
ucp_Inscriptional_Parthian,
ucp_Lisu,
ucp_Meetei_Mayek,
ucp_Old_South_Arabian,
ucp_Old_Turkic,
ucp_Samaritan,
ucp_Saurashtra,
ucp_Rejang,
ucp_Lycian,
ucp_Carian,
ucp_Lydian,
ucp_Cham,
ucp_Tai_Tham,
ucp_Tai_Viet,
ucp_Avestan,
ucp_Egyptian_Hieroglyphs,
ucp_Samaritan,
ucp_Lisu,
ucp_Bamum,
ucp_Meetei_Mayek,
ucp_Imperial_Aramaic,
ucp_Old_South_Arabian,
ucp_Inscriptional_Parthian,
ucp_Inscriptional_Pahlavi,
ucp_Old_Turkic,
ucp_Batak,
ucp_Brahmi,
ucp_Meroitic_Cursive,
ucp_Meroitic_Hieroglyphs,
ucp_Miao,
ucp_Sora_Sompeng,
ucp_Bassa_Vah,
ucp_Caucasian_Albanian,
ucp_Bassa_Vah,
ucp_Elbasan,
ucp_Pahawh_Hmong,
ucp_Mende_Kikakui,
ucp_Mro,
ucp_Nabataean,
ucp_Old_North_Arabian,
ucp_Pahawh_Hmong,
ucp_Nabataean,
ucp_Palmyrene,
ucp_Pau_Cin_Hau,
ucp_Siddham,

View File

@ -409,7 +409,7 @@ the "loose matching" rules that Unicode advises and Perl uses. */
#define STRING_tifinagh0 STR_t STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_tirh0 STR_t STR_i STR_r STR_h "\0"
#define STRING_tirhuta0 STR_t STR_i STR_r STR_h STR_u STR_t STR_a "\0"
#define STRING_tngs0 STR_t STR_n STR_g STR_s "\0"
#define STRING_tnsa0 STR_t STR_n STR_s STR_a "\0"
#define STRING_toto0 STR_t STR_o STR_t STR_o "\0"
#define STRING_ugar0 STR_u STR_g STR_a STR_r "\0"
#define STRING_ugaritic0 STR_u STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
@ -800,7 +800,7 @@ const char PRIV(utt_names)[] =
STRING_tifinagh0
STRING_tirh0
STRING_tirhuta0
STRING_tngs0
STRING_tnsa0
STRING_toto0
STRING_ugar0
STRING_ugaritic0

2250
testdata/testinput26 vendored

File diff suppressed because it is too large Load Diff

2882
testdata/testoutput26 vendored

File diff suppressed because it is too large Load Diff