Improve unicode property abbreviation support (#74)
* Improve unicode property abbreviation support * Auto-generate script names Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
This commit is contained in:
parent
14dbc6e6ec
commit
f90542a209
|
@ -13,90 +13,6 @@
|
||||||
# DATA LISTS
|
# DATA LISTS
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
# The lists of script names and script abbreviations must be kept in step. Note
|
|
||||||
# that the pcre2pattern and pcre2syntax documentation has lists of scripts.
|
|
||||||
|
|
||||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
|
||||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
|
||||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
|
||||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
|
||||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
|
||||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
|
||||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
|
||||||
# New for Unicode 5.0
|
|
||||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
|
||||||
# New for Unicode 5.1
|
|
||||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
|
||||||
# New for Unicode 5.2
|
|
||||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
|
||||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
|
||||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
|
||||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
|
||||||
# New for Unicode 6.0.0
|
|
||||||
'Batak', 'Brahmi', 'Mandaic', \
|
|
||||||
# New for Unicode 6.1.0
|
|
||||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
|
||||||
# New for Unicode 7.0.0
|
|
||||||
'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
|
|
||||||
'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
|
|
||||||
'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
|
|
||||||
'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
|
|
||||||
# New for Unicode 8.0.0
|
|
||||||
'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
|
|
||||||
'SignWriting',
|
|
||||||
# New for Unicode 10.0.0
|
|
||||||
'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
|
|
||||||
'Nushu', 'Soyombo', 'Zanabazar_Square',
|
|
||||||
# New for Unicode 11.0.0
|
|
||||||
'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
|
|
||||||
'Old_Sogdian', 'Sogdian',
|
|
||||||
# New for Unicode 12.0.0
|
|
||||||
'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
|
|
||||||
# New for Unicode 13.0.0
|
|
||||||
'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
|
|
||||||
# New for Unicode 14.0.0
|
|
||||||
'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
|
|
||||||
]
|
|
||||||
|
|
||||||
script_abbrevs = [
|
|
||||||
'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
|
|
||||||
'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
|
|
||||||
'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
|
|
||||||
'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
|
|
||||||
'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
|
|
||||||
'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
|
|
||||||
'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
|
|
||||||
#New for Unicode 5.0
|
|
||||||
'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
|
|
||||||
#New for Unicode 5.1
|
|
||||||
'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
|
|
||||||
'Sund', 'Vaii',
|
|
||||||
#New for Unicode 5.2
|
|
||||||
'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
|
|
||||||
'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
|
|
||||||
#New for Unicode 6.0.0
|
|
||||||
'Batk', 'Brah', 'Mand',
|
|
||||||
#New for Unicode 6.1.0
|
|
||||||
'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
|
|
||||||
#New for Unicode 7.0.0
|
|
||||||
'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
|
|
||||||
'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
|
|
||||||
'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
|
|
||||||
#New for Unicode 8.0.0
|
|
||||||
'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
|
|
||||||
#New for Unicode 10.0.0
|
|
||||||
'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
|
|
||||||
'Zanb',
|
|
||||||
#New for Unicode 11.0.0
|
|
||||||
'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
|
|
||||||
#New for Unicode 12.0.0
|
|
||||||
'Elym', 'Nand', 'Hmnp', 'Wcho',
|
|
||||||
#New for Unicode 13.0.0
|
|
||||||
'Chrs', 'Diak', 'Kits', 'Yezi',
|
|
||||||
#New for Unicode 14.0.0
|
|
||||||
'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith'
|
|
||||||
]
|
|
||||||
|
|
||||||
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
# BIDI classes in the DerivedBidiClass.txt file, with comments.
|
||||||
|
|
||||||
bidi_classes = [
|
bidi_classes = [
|
||||||
|
@ -185,14 +101,66 @@ break_properties = [
|
||||||
]
|
]
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# REORDERING SCRIPT NAMES
|
# COLLECTING PROPERTY NAMES
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
script_names = ['Unknown']
|
||||||
|
abbreviations = {}
|
||||||
|
|
||||||
|
def collect_property_names():
|
||||||
|
global script_names
|
||||||
|
global abbreviations
|
||||||
|
|
||||||
|
names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')
|
||||||
|
|
||||||
|
last_script_name = ""
|
||||||
|
with open("Unicode.tables/Scripts.txt") as f:
|
||||||
|
for line in f:
|
||||||
|
match_obj = names_re.match(line)
|
||||||
|
|
||||||
|
if match_obj == None or match_obj.group(1) == last_script_name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
last_script_name = match_obj.group(1)
|
||||||
|
script_names.append(last_script_name)
|
||||||
|
|
||||||
|
# Sometimes there is a comment in the line
|
||||||
|
# so splitting around semicolon is not enough
|
||||||
|
value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')
|
||||||
|
|
||||||
|
with open("Unicode.tables/PropertyValueAliases.txt") as f:
|
||||||
|
for line in f:
|
||||||
|
match_obj = value_alias_re.match(line)
|
||||||
|
|
||||||
|
if match_obj == None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if match_obj.group(1) == "sc":
|
||||||
|
if match_obj.group(2) == match_obj.group(3):
|
||||||
|
abbreviations[match_obj.group(3)] = ()
|
||||||
|
elif match_obj.group(4) == None:
|
||||||
|
abbreviations[match_obj.group(3)] = (match_obj.group(2),)
|
||||||
|
else:
|
||||||
|
abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4))
|
||||||
|
|
||||||
|
collect_property_names()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# REORDERING SCRIPT NAMES
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
script_abbrevs = []
|
||||||
|
|
||||||
def reorder_scripts():
|
def reorder_scripts():
|
||||||
global script_names
|
global script_names
|
||||||
global script_abbrevs
|
global script_abbrevs
|
||||||
|
global abbreviations
|
||||||
|
|
||||||
|
for name in script_names:
|
||||||
|
abbrevs = abbreviations[name]
|
||||||
|
script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])
|
||||||
|
|
||||||
extended_script_abbrevs = set()
|
extended_script_abbrevs = set()
|
||||||
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
with open("Unicode.tables/ScriptExtensions.txt") as f:
|
||||||
|
|
|
@ -48,10 +48,10 @@
|
||||||
# Import common data lists and functions
|
# Import common data lists and functions
|
||||||
|
|
||||||
from GenerateCommon import \
|
from GenerateCommon import \
|
||||||
|
abbreviations, \
|
||||||
bidi_classes, \
|
bidi_classes, \
|
||||||
category_names, \
|
category_names, \
|
||||||
general_category_names, \
|
general_category_names, \
|
||||||
script_abbrevs, \
|
|
||||||
script_names, \
|
script_names, \
|
||||||
open_output
|
open_output
|
||||||
|
|
||||||
|
@ -75,14 +75,15 @@ category_names = category_names[::2]
|
||||||
# Create standardized versions of the names by lowercasing and removing
|
# Create standardized versions of the names by lowercasing and removing
|
||||||
# underscores.
|
# underscores.
|
||||||
|
|
||||||
|
def stdname(x):
|
||||||
|
return x.lower().replace('_', '')
|
||||||
|
|
||||||
def stdnames(x):
|
def stdnames(x):
|
||||||
y = [''] * len(x)
|
y = [''] * len(x)
|
||||||
for i in range(len(x)):
|
for i in range(len(x)):
|
||||||
y[i] = x[i].lower().replace('_', '')
|
y[i] = stdname(x[i])
|
||||||
return y
|
return y
|
||||||
|
|
||||||
std_script_names = stdnames(script_names)
|
|
||||||
std_script_abbrevs = stdnames(script_abbrevs)
|
|
||||||
std_category_names = stdnames(category_names)
|
std_category_names = stdnames(category_names)
|
||||||
std_general_category_names = stdnames(general_category_names)
|
std_general_category_names = stdnames(general_category_names)
|
||||||
std_bidi_class_names = stdnames(bidi_class_names)
|
std_bidi_class_names = stdnames(bidi_class_names)
|
||||||
|
@ -92,18 +93,16 @@ std_bidi_class_names = stdnames(bidi_class_names)
|
||||||
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
|
||||||
# still use the full original names.
|
# still use the full original names.
|
||||||
|
|
||||||
|
utt_table = []
|
||||||
|
|
||||||
scx_end = script_names.index('Unknown')
|
scx_end = script_names.index('Unknown')
|
||||||
|
|
||||||
utt_table = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
|
for idx, name in enumerate(script_names):
|
||||||
utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
|
pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC'
|
||||||
utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
|
|
||||||
utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
|
|
||||||
|
|
||||||
# At least one script abbreviation is the same as the full name of the script,
|
utt_table.append((stdname(name), name, pt_type))
|
||||||
# so we must remove duplicates. It doesn't matter if this operation changes the
|
for abbrev in abbreviations[name]:
|
||||||
# order, because we are going to sort the list later.
|
utt_table.append((stdname(abbrev), name, pt_type))
|
||||||
|
|
||||||
utt_table = list(set(utt_table))
|
|
||||||
|
|
||||||
# Add the remaining property lists
|
# Add the remaining property lists
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
1704
src/pcre2_ucd.c
1704
src/pcre2_ucd.c
File diff suppressed because it is too large
Load Diff
150
src/pcre2_ucp.h
150
src/pcre2_ucp.h
|
@ -153,45 +153,45 @@ enum {
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
/* Scripts which have characters in other scripts. */
|
/* Scripts which have characters in other scripts. */
|
||||||
ucp_Arabic,
|
|
||||||
ucp_Bengali,
|
|
||||||
ucp_Bopomofo,
|
|
||||||
ucp_Buginese,
|
|
||||||
ucp_Buhid,
|
|
||||||
ucp_Coptic,
|
|
||||||
ucp_Cypriot,
|
|
||||||
ucp_Cyrillic,
|
|
||||||
ucp_Devanagari,
|
|
||||||
ucp_Georgian,
|
|
||||||
ucp_Glagolitic,
|
|
||||||
ucp_Greek,
|
|
||||||
ucp_Gujarati,
|
|
||||||
ucp_Gurmukhi,
|
|
||||||
ucp_Han,
|
|
||||||
ucp_Hangul,
|
|
||||||
ucp_Hanunoo,
|
|
||||||
ucp_Hiragana,
|
|
||||||
ucp_Kannada,
|
|
||||||
ucp_Katakana,
|
|
||||||
ucp_Latin,
|
ucp_Latin,
|
||||||
ucp_Limbu,
|
ucp_Greek,
|
||||||
ucp_Linear_B,
|
ucp_Cyrillic,
|
||||||
ucp_Malayalam,
|
ucp_Arabic,
|
||||||
ucp_Mongolian,
|
|
||||||
ucp_Myanmar,
|
|
||||||
ucp_Oriya,
|
|
||||||
ucp_Sinhala,
|
|
||||||
ucp_Syloti_Nagri,
|
|
||||||
ucp_Syriac,
|
ucp_Syriac,
|
||||||
ucp_Tagalog,
|
ucp_Thaana,
|
||||||
ucp_Tagbanwa,
|
ucp_Devanagari,
|
||||||
ucp_Tai_Le,
|
ucp_Bengali,
|
||||||
|
ucp_Gurmukhi,
|
||||||
|
ucp_Gujarati,
|
||||||
|
ucp_Oriya,
|
||||||
ucp_Tamil,
|
ucp_Tamil,
|
||||||
ucp_Telugu,
|
ucp_Telugu,
|
||||||
ucp_Thaana,
|
ucp_Kannada,
|
||||||
|
ucp_Malayalam,
|
||||||
|
ucp_Sinhala,
|
||||||
|
ucp_Myanmar,
|
||||||
|
ucp_Georgian,
|
||||||
|
ucp_Hangul,
|
||||||
|
ucp_Mongolian,
|
||||||
|
ucp_Hiragana,
|
||||||
|
ucp_Katakana,
|
||||||
|
ucp_Bopomofo,
|
||||||
|
ucp_Han,
|
||||||
ucp_Yi,
|
ucp_Yi,
|
||||||
ucp_Nko,
|
ucp_Tagalog,
|
||||||
|
ucp_Hanunoo,
|
||||||
|
ucp_Buhid,
|
||||||
|
ucp_Tagbanwa,
|
||||||
|
ucp_Limbu,
|
||||||
|
ucp_Tai_Le,
|
||||||
|
ucp_Linear_B,
|
||||||
|
ucp_Cypriot,
|
||||||
|
ucp_Buginese,
|
||||||
|
ucp_Coptic,
|
||||||
|
ucp_Glagolitic,
|
||||||
|
ucp_Syloti_Nagri,
|
||||||
ucp_Phags_Pa,
|
ucp_Phags_Pa,
|
||||||
|
ucp_Nko,
|
||||||
ucp_Kayah_Li,
|
ucp_Kayah_Li,
|
||||||
ucp_Javanese,
|
ucp_Javanese,
|
||||||
ucp_Kaithi,
|
ucp_Kaithi,
|
||||||
|
@ -202,13 +202,13 @@ enum {
|
||||||
ucp_Duployan,
|
ucp_Duployan,
|
||||||
ucp_Grantha,
|
ucp_Grantha,
|
||||||
ucp_Khojki,
|
ucp_Khojki,
|
||||||
ucp_Khudawadi,
|
|
||||||
ucp_Linear_A,
|
ucp_Linear_A,
|
||||||
ucp_Mahajani,
|
ucp_Mahajani,
|
||||||
ucp_Manichaean,
|
ucp_Manichaean,
|
||||||
ucp_Modi,
|
ucp_Modi,
|
||||||
ucp_Old_Permic,
|
ucp_Old_Permic,
|
||||||
ucp_Psalter_Pahlavi,
|
ucp_Psalter_Pahlavi,
|
||||||
|
ucp_Khudawadi,
|
||||||
ucp_Tirhuta,
|
ucp_Tirhuta,
|
||||||
ucp_Multani,
|
ucp_Multani,
|
||||||
ucp_Adlam,
|
ucp_Adlam,
|
||||||
|
@ -224,70 +224,70 @@ enum {
|
||||||
|
|
||||||
/* Scripts which have no characters in other scripts. */
|
/* Scripts which have no characters in other scripts. */
|
||||||
ucp_Unknown,
|
ucp_Unknown,
|
||||||
ucp_Armenian,
|
|
||||||
ucp_Braille,
|
|
||||||
ucp_Canadian_Aboriginal,
|
|
||||||
ucp_Cherokee,
|
|
||||||
ucp_Common,
|
ucp_Common,
|
||||||
ucp_Deseret,
|
ucp_Armenian,
|
||||||
ucp_Ethiopic,
|
|
||||||
ucp_Gothic,
|
|
||||||
ucp_Hebrew,
|
ucp_Hebrew,
|
||||||
ucp_Inherited,
|
|
||||||
ucp_Kharoshthi,
|
|
||||||
ucp_Khmer,
|
|
||||||
ucp_Lao,
|
|
||||||
ucp_New_Tai_Lue,
|
|
||||||
ucp_Ogham,
|
|
||||||
ucp_Old_Italic,
|
|
||||||
ucp_Old_Persian,
|
|
||||||
ucp_Osmanya,
|
|
||||||
ucp_Runic,
|
|
||||||
ucp_Shavian,
|
|
||||||
ucp_Thai,
|
ucp_Thai,
|
||||||
|
ucp_Lao,
|
||||||
ucp_Tibetan,
|
ucp_Tibetan,
|
||||||
ucp_Tifinagh,
|
ucp_Ethiopic,
|
||||||
|
ucp_Cherokee,
|
||||||
|
ucp_Canadian_Aboriginal,
|
||||||
|
ucp_Ogham,
|
||||||
|
ucp_Runic,
|
||||||
|
ucp_Khmer,
|
||||||
|
ucp_Old_Italic,
|
||||||
|
ucp_Gothic,
|
||||||
|
ucp_Deseret,
|
||||||
|
ucp_Inherited,
|
||||||
ucp_Ugaritic,
|
ucp_Ugaritic,
|
||||||
|
ucp_Shavian,
|
||||||
|
ucp_Osmanya,
|
||||||
|
ucp_Braille,
|
||||||
|
ucp_New_Tai_Lue,
|
||||||
|
ucp_Tifinagh,
|
||||||
|
ucp_Old_Persian,
|
||||||
|
ucp_Kharoshthi,
|
||||||
ucp_Balinese,
|
ucp_Balinese,
|
||||||
ucp_Cuneiform,
|
ucp_Cuneiform,
|
||||||
ucp_Phoenician,
|
ucp_Phoenician,
|
||||||
ucp_Carian,
|
|
||||||
ucp_Cham,
|
|
||||||
ucp_Lepcha,
|
|
||||||
ucp_Lycian,
|
|
||||||
ucp_Lydian,
|
|
||||||
ucp_Ol_Chiki,
|
|
||||||
ucp_Rejang,
|
|
||||||
ucp_Saurashtra,
|
|
||||||
ucp_Sundanese,
|
ucp_Sundanese,
|
||||||
|
ucp_Lepcha,
|
||||||
|
ucp_Ol_Chiki,
|
||||||
ucp_Vai,
|
ucp_Vai,
|
||||||
ucp_Avestan,
|
ucp_Saurashtra,
|
||||||
ucp_Bamum,
|
ucp_Rejang,
|
||||||
ucp_Egyptian_Hieroglyphs,
|
ucp_Lycian,
|
||||||
ucp_Imperial_Aramaic,
|
ucp_Carian,
|
||||||
ucp_Inscriptional_Pahlavi,
|
ucp_Lydian,
|
||||||
ucp_Inscriptional_Parthian,
|
ucp_Cham,
|
||||||
ucp_Lisu,
|
|
||||||
ucp_Meetei_Mayek,
|
|
||||||
ucp_Old_South_Arabian,
|
|
||||||
ucp_Old_Turkic,
|
|
||||||
ucp_Samaritan,
|
|
||||||
ucp_Tai_Tham,
|
ucp_Tai_Tham,
|
||||||
ucp_Tai_Viet,
|
ucp_Tai_Viet,
|
||||||
|
ucp_Avestan,
|
||||||
|
ucp_Egyptian_Hieroglyphs,
|
||||||
|
ucp_Samaritan,
|
||||||
|
ucp_Lisu,
|
||||||
|
ucp_Bamum,
|
||||||
|
ucp_Meetei_Mayek,
|
||||||
|
ucp_Imperial_Aramaic,
|
||||||
|
ucp_Old_South_Arabian,
|
||||||
|
ucp_Inscriptional_Parthian,
|
||||||
|
ucp_Inscriptional_Pahlavi,
|
||||||
|
ucp_Old_Turkic,
|
||||||
ucp_Batak,
|
ucp_Batak,
|
||||||
ucp_Brahmi,
|
ucp_Brahmi,
|
||||||
ucp_Meroitic_Cursive,
|
ucp_Meroitic_Cursive,
|
||||||
ucp_Meroitic_Hieroglyphs,
|
ucp_Meroitic_Hieroglyphs,
|
||||||
ucp_Miao,
|
ucp_Miao,
|
||||||
ucp_Sora_Sompeng,
|
ucp_Sora_Sompeng,
|
||||||
ucp_Bassa_Vah,
|
|
||||||
ucp_Caucasian_Albanian,
|
ucp_Caucasian_Albanian,
|
||||||
|
ucp_Bassa_Vah,
|
||||||
ucp_Elbasan,
|
ucp_Elbasan,
|
||||||
|
ucp_Pahawh_Hmong,
|
||||||
ucp_Mende_Kikakui,
|
ucp_Mende_Kikakui,
|
||||||
ucp_Mro,
|
ucp_Mro,
|
||||||
ucp_Nabataean,
|
|
||||||
ucp_Old_North_Arabian,
|
ucp_Old_North_Arabian,
|
||||||
ucp_Pahawh_Hmong,
|
ucp_Nabataean,
|
||||||
ucp_Palmyrene,
|
ucp_Palmyrene,
|
||||||
ucp_Pau_Cin_Hau,
|
ucp_Pau_Cin_Hau,
|
||||||
ucp_Siddham,
|
ucp_Siddham,
|
||||||
|
|
|
@ -340,6 +340,8 @@ the "loose matching" rules that Unicode advises and Perl uses. */
|
||||||
#define STRING_prti0 STR_p STR_r STR_t STR_i "\0"
|
#define STRING_prti0 STR_p STR_r STR_t STR_i "\0"
|
||||||
#define STRING_ps0 STR_p STR_s "\0"
|
#define STRING_ps0 STR_p STR_s "\0"
|
||||||
#define STRING_psalterpahlavi0 STR_p STR_s STR_a STR_l STR_t STR_e STR_r STR_p STR_a STR_h STR_l STR_a STR_v STR_i "\0"
|
#define STRING_psalterpahlavi0 STR_p STR_s STR_a STR_l STR_t STR_e STR_r STR_p STR_a STR_h STR_l STR_a STR_v STR_i "\0"
|
||||||
|
#define STRING_qaac0 STR_q STR_a STR_a STR_c "\0"
|
||||||
|
#define STRING_qaai0 STR_q STR_a STR_a STR_i "\0"
|
||||||
#define STRING_rejang0 STR_r STR_e STR_j STR_a STR_n STR_g "\0"
|
#define STRING_rejang0 STR_r STR_e STR_j STR_a STR_n STR_g "\0"
|
||||||
#define STRING_rjng0 STR_r STR_j STR_n STR_g "\0"
|
#define STRING_rjng0 STR_r STR_j STR_n STR_g "\0"
|
||||||
#define STRING_rohg0 STR_r STR_o STR_h STR_g "\0"
|
#define STRING_rohg0 STR_r STR_o STR_h STR_g "\0"
|
||||||
|
@ -407,7 +409,7 @@ the "loose matching" rules that Unicode advises and Perl uses. */
|
||||||
#define STRING_tifinagh0 STR_t STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
|
#define STRING_tifinagh0 STR_t STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
|
||||||
#define STRING_tirh0 STR_t STR_i STR_r STR_h "\0"
|
#define STRING_tirh0 STR_t STR_i STR_r STR_h "\0"
|
||||||
#define STRING_tirhuta0 STR_t STR_i STR_r STR_h STR_u STR_t STR_a "\0"
|
#define STRING_tirhuta0 STR_t STR_i STR_r STR_h STR_u STR_t STR_a "\0"
|
||||||
#define STRING_tngs0 STR_t STR_n STR_g STR_s "\0"
|
#define STRING_tnsa0 STR_t STR_n STR_s STR_a "\0"
|
||||||
#define STRING_toto0 STR_t STR_o STR_t STR_o "\0"
|
#define STRING_toto0 STR_t STR_o STR_t STR_o "\0"
|
||||||
#define STRING_ugar0 STR_u STR_g STR_a STR_r "\0"
|
#define STRING_ugar0 STR_u STR_g STR_a STR_r "\0"
|
||||||
#define STRING_ugaritic0 STR_u STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
|
#define STRING_ugaritic0 STR_u STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
|
||||||
|
@ -729,6 +731,8 @@ const char PRIV(utt_names)[] =
|
||||||
STRING_prti0
|
STRING_prti0
|
||||||
STRING_ps0
|
STRING_ps0
|
||||||
STRING_psalterpahlavi0
|
STRING_psalterpahlavi0
|
||||||
|
STRING_qaac0
|
||||||
|
STRING_qaai0
|
||||||
STRING_rejang0
|
STRING_rejang0
|
||||||
STRING_rjng0
|
STRING_rjng0
|
||||||
STRING_rohg0
|
STRING_rohg0
|
||||||
|
@ -796,7 +800,7 @@ const char PRIV(utt_names)[] =
|
||||||
STRING_tifinagh0
|
STRING_tifinagh0
|
||||||
STRING_tirh0
|
STRING_tirh0
|
||||||
STRING_tirhuta0
|
STRING_tirhuta0
|
||||||
STRING_tngs0
|
STRING_tnsa0
|
||||||
STRING_toto0
|
STRING_toto0
|
||||||
STRING_ugar0
|
STRING_ugar0
|
||||||
STRING_ugaritic0
|
STRING_ugaritic0
|
||||||
|
@ -1118,106 +1122,108 @@ const ucp_type_table PRIV(utt)[] = {
|
||||||
{ 1962, PT_SC, ucp_Inscriptional_Parthian },
|
{ 1962, PT_SC, ucp_Inscriptional_Parthian },
|
||||||
{ 1967, PT_PC, ucp_Ps },
|
{ 1967, PT_PC, ucp_Ps },
|
||||||
{ 1970, PT_SCX, ucp_Psalter_Pahlavi },
|
{ 1970, PT_SCX, ucp_Psalter_Pahlavi },
|
||||||
{ 1985, PT_SC, ucp_Rejang },
|
{ 1985, PT_SCX, ucp_Coptic },
|
||||||
{ 1992, PT_SC, ucp_Rejang },
|
{ 1990, PT_SC, ucp_Inherited },
|
||||||
{ 1997, PT_SCX, ucp_Hanifi_Rohingya },
|
{ 1995, PT_SC, ucp_Rejang },
|
||||||
{ 2002, PT_SC, ucp_Runic },
|
{ 2002, PT_SC, ucp_Rejang },
|
||||||
{ 2008, PT_SC, ucp_Runic },
|
{ 2007, PT_SCX, ucp_Hanifi_Rohingya },
|
||||||
{ 2013, PT_GC, ucp_S },
|
{ 2012, PT_SC, ucp_Runic },
|
||||||
{ 2015, PT_SC, ucp_Samaritan },
|
{ 2018, PT_SC, ucp_Runic },
|
||||||
|
{ 2023, PT_GC, ucp_S },
|
||||||
{ 2025, PT_SC, ucp_Samaritan },
|
{ 2025, PT_SC, ucp_Samaritan },
|
||||||
{ 2030, PT_SC, ucp_Old_South_Arabian },
|
{ 2035, PT_SC, ucp_Samaritan },
|
||||||
{ 2035, PT_SC, ucp_Saurashtra },
|
{ 2040, PT_SC, ucp_Old_South_Arabian },
|
||||||
{ 2040, PT_SC, ucp_Saurashtra },
|
{ 2045, PT_SC, ucp_Saurashtra },
|
||||||
{ 2051, PT_PC, ucp_Sc },
|
{ 2050, PT_SC, ucp_Saurashtra },
|
||||||
{ 2054, PT_SC, ucp_SignWriting },
|
{ 2061, PT_PC, ucp_Sc },
|
||||||
{ 2059, PT_SCX, ucp_Sharada },
|
{ 2064, PT_SC, ucp_SignWriting },
|
||||||
{ 2067, PT_SC, ucp_Shavian },
|
{ 2069, PT_SCX, ucp_Sharada },
|
||||||
{ 2075, PT_SC, ucp_Shavian },
|
{ 2077, PT_SC, ucp_Shavian },
|
||||||
{ 2080, PT_SCX, ucp_Sharada },
|
{ 2085, PT_SC, ucp_Shavian },
|
||||||
{ 2085, PT_SC, ucp_Siddham },
|
{ 2090, PT_SCX, ucp_Sharada },
|
||||||
{ 2090, PT_SC, ucp_Siddham },
|
{ 2095, PT_SC, ucp_Siddham },
|
||||||
{ 2098, PT_SC, ucp_SignWriting },
|
{ 2100, PT_SC, ucp_Siddham },
|
||||||
{ 2110, PT_SCX, ucp_Khudawadi },
|
{ 2108, PT_SC, ucp_SignWriting },
|
||||||
{ 2115, PT_SCX, ucp_Sinhala },
|
{ 2120, PT_SCX, ucp_Khudawadi },
|
||||||
{ 2120, PT_SCX, ucp_Sinhala },
|
{ 2125, PT_SCX, ucp_Sinhala },
|
||||||
{ 2128, PT_PC, ucp_Sk },
|
{ 2130, PT_SCX, ucp_Sinhala },
|
||||||
{ 2131, PT_PC, ucp_Sm },
|
{ 2138, PT_PC, ucp_Sk },
|
||||||
{ 2134, PT_PC, ucp_So },
|
{ 2141, PT_PC, ucp_Sm },
|
||||||
{ 2137, PT_SCX, ucp_Sogdian },
|
{ 2144, PT_PC, ucp_So },
|
||||||
{ 2142, PT_SCX, ucp_Sogdian },
|
{ 2147, PT_SCX, ucp_Sogdian },
|
||||||
{ 2150, PT_SC, ucp_Old_Sogdian },
|
{ 2152, PT_SCX, ucp_Sogdian },
|
||||||
{ 2155, PT_SC, ucp_Sora_Sompeng },
|
{ 2160, PT_SC, ucp_Old_Sogdian },
|
||||||
{ 2160, PT_SC, ucp_Sora_Sompeng },
|
{ 2165, PT_SC, ucp_Sora_Sompeng },
|
||||||
{ 2172, PT_SC, ucp_Soyombo },
|
{ 2170, PT_SC, ucp_Sora_Sompeng },
|
||||||
{ 2177, PT_SC, ucp_Soyombo },
|
{ 2182, PT_SC, ucp_Soyombo },
|
||||||
{ 2185, PT_SC, ucp_Sundanese },
|
{ 2187, PT_SC, ucp_Soyombo },
|
||||||
{ 2190, PT_SC, ucp_Sundanese },
|
{ 2195, PT_SC, ucp_Sundanese },
|
||||||
{ 2200, PT_SCX, ucp_Syloti_Nagri },
|
{ 2200, PT_SC, ucp_Sundanese },
|
||||||
{ 2205, PT_SCX, ucp_Syloti_Nagri },
|
{ 2210, PT_SCX, ucp_Syloti_Nagri },
|
||||||
{ 2217, PT_SCX, ucp_Syriac },
|
{ 2215, PT_SCX, ucp_Syloti_Nagri },
|
||||||
{ 2222, PT_SCX, ucp_Syriac },
|
{ 2227, PT_SCX, ucp_Syriac },
|
||||||
{ 2229, PT_SCX, ucp_Tagalog },
|
{ 2232, PT_SCX, ucp_Syriac },
|
||||||
{ 2237, PT_SCX, ucp_Tagbanwa },
|
{ 2239, PT_SCX, ucp_Tagalog },
|
||||||
{ 2242, PT_SCX, ucp_Tagbanwa },
|
{ 2247, PT_SCX, ucp_Tagbanwa },
|
||||||
{ 2251, PT_SCX, ucp_Tai_Le },
|
{ 2252, PT_SCX, ucp_Tagbanwa },
|
||||||
{ 2257, PT_SC, ucp_Tai_Tham },
|
{ 2261, PT_SCX, ucp_Tai_Le },
|
||||||
{ 2265, PT_SC, ucp_Tai_Viet },
|
{ 2267, PT_SC, ucp_Tai_Tham },
|
||||||
{ 2273, PT_SCX, ucp_Takri },
|
{ 2275, PT_SC, ucp_Tai_Viet },
|
||||||
{ 2278, PT_SCX, ucp_Takri },
|
{ 2283, PT_SCX, ucp_Takri },
|
||||||
{ 2284, PT_SCX, ucp_Tai_Le },
|
{ 2288, PT_SCX, ucp_Takri },
|
||||||
{ 2289, PT_SC, ucp_New_Tai_Lue },
|
{ 2294, PT_SCX, ucp_Tai_Le },
|
||||||
{ 2294, PT_SCX, ucp_Tamil },
|
{ 2299, PT_SC, ucp_New_Tai_Lue },
|
||||||
{ 2300, PT_SCX, ucp_Tamil },
|
{ 2304, PT_SCX, ucp_Tamil },
|
||||||
{ 2305, PT_SC, ucp_Tangut },
|
{ 2310, PT_SCX, ucp_Tamil },
|
||||||
{ 2310, PT_SC, ucp_Tangsa },
|
{ 2315, PT_SC, ucp_Tangut },
|
||||||
{ 2317, PT_SC, ucp_Tangut },
|
{ 2320, PT_SC, ucp_Tangsa },
|
||||||
{ 2324, PT_SC, ucp_Tai_Viet },
|
{ 2327, PT_SC, ucp_Tangut },
|
||||||
{ 2329, PT_SCX, ucp_Telugu },
|
{ 2334, PT_SC, ucp_Tai_Viet },
|
||||||
{ 2334, PT_SCX, ucp_Telugu },
|
{ 2339, PT_SCX, ucp_Telugu },
|
||||||
{ 2341, PT_SC, ucp_Tifinagh },
|
{ 2344, PT_SCX, ucp_Telugu },
|
||||||
{ 2346, PT_SCX, ucp_Tagalog },
|
{ 2351, PT_SC, ucp_Tifinagh },
|
||||||
{ 2351, PT_SCX, ucp_Thaana },
|
{ 2356, PT_SCX, ucp_Tagalog },
|
||||||
{ 2356, PT_SCX, ucp_Thaana },
|
{ 2361, PT_SCX, ucp_Thaana },
|
||||||
{ 2363, PT_SC, ucp_Thai },
|
{ 2366, PT_SCX, ucp_Thaana },
|
||||||
{ 2368, PT_SC, ucp_Tibetan },
|
{ 2373, PT_SC, ucp_Thai },
|
||||||
{ 2376, PT_SC, ucp_Tibetan },
|
{ 2378, PT_SC, ucp_Tibetan },
|
||||||
{ 2381, PT_SC, ucp_Tifinagh },
|
{ 2386, PT_SC, ucp_Tibetan },
|
||||||
{ 2390, PT_SCX, ucp_Tirhuta },
|
{ 2391, PT_SC, ucp_Tifinagh },
|
||||||
{ 2395, PT_SCX, ucp_Tirhuta },
|
{ 2400, PT_SCX, ucp_Tirhuta },
|
||||||
{ 2403, PT_SC, ucp_Tangsa },
|
{ 2405, PT_SCX, ucp_Tirhuta },
|
||||||
{ 2408, PT_SC, ucp_Toto },
|
{ 2413, PT_SC, ucp_Tangsa },
|
||||||
{ 2413, PT_SC, ucp_Ugaritic },
|
{ 2418, PT_SC, ucp_Toto },
|
||||||
{ 2418, PT_SC, ucp_Ugaritic },
|
{ 2423, PT_SC, ucp_Ugaritic },
|
||||||
{ 2427, PT_SC, ucp_Unknown },
|
{ 2428, PT_SC, ucp_Ugaritic },
|
||||||
{ 2435, PT_SC, ucp_Vai },
|
{ 2437, PT_SC, ucp_Unknown },
|
||||||
{ 2439, PT_SC, ucp_Vai },
|
{ 2445, PT_SC, ucp_Vai },
|
||||||
{ 2444, PT_SC, ucp_Vithkuqi },
|
{ 2449, PT_SC, ucp_Vai },
|
||||||
{ 2449, PT_SC, ucp_Vithkuqi },
|
{ 2454, PT_SC, ucp_Vithkuqi },
|
||||||
{ 2458, PT_SC, ucp_Wancho },
|
{ 2459, PT_SC, ucp_Vithkuqi },
|
||||||
{ 2465, PT_SC, ucp_Warang_Citi },
|
{ 2468, PT_SC, ucp_Wancho },
|
||||||
{ 2470, PT_SC, ucp_Warang_Citi },
|
{ 2475, PT_SC, ucp_Warang_Citi },
|
||||||
{ 2481, PT_SC, ucp_Wancho },
|
{ 2480, PT_SC, ucp_Warang_Citi },
|
||||||
{ 2486, PT_ALNUM, 0 },
|
{ 2491, PT_SC, ucp_Wancho },
|
||||||
{ 2490, PT_SC, ucp_Old_Persian },
|
{ 2496, PT_ALNUM, 0 },
|
||||||
{ 2495, PT_PXSPACE, 0 },
|
{ 2500, PT_SC, ucp_Old_Persian },
|
||||||
{ 2499, PT_SPACE, 0 },
|
{ 2505, PT_PXSPACE, 0 },
|
||||||
{ 2503, PT_SC, ucp_Cuneiform },
|
{ 2509, PT_SPACE, 0 },
|
||||||
{ 2508, PT_UCNC, 0 },
|
{ 2513, PT_SC, ucp_Cuneiform },
|
||||||
{ 2512, PT_WORD, 0 },
|
{ 2518, PT_UCNC, 0 },
|
||||||
{ 2516, PT_SCX, ucp_Yezidi },
|
{ 2522, PT_WORD, 0 },
|
||||||
{ 2521, PT_SCX, ucp_Yezidi },
|
{ 2526, PT_SCX, ucp_Yezidi },
|
||||||
{ 2528, PT_SCX, ucp_Yi },
|
{ 2531, PT_SCX, ucp_Yezidi },
|
||||||
{ 2531, PT_SCX, ucp_Yi },
|
{ 2538, PT_SCX, ucp_Yi },
|
||||||
{ 2536, PT_GC, ucp_Z },
|
{ 2541, PT_SCX, ucp_Yi },
|
||||||
{ 2538, PT_SC, ucp_Zanabazar_Square },
|
{ 2546, PT_GC, ucp_Z },
|
||||||
{ 2554, PT_SC, ucp_Zanabazar_Square },
|
{ 2548, PT_SC, ucp_Zanabazar_Square },
|
||||||
{ 2559, PT_SC, ucp_Inherited },
|
{ 2564, PT_SC, ucp_Zanabazar_Square },
|
||||||
{ 2564, PT_PC, ucp_Zl },
|
{ 2569, PT_SC, ucp_Inherited },
|
||||||
{ 2567, PT_PC, ucp_Zp },
|
{ 2574, PT_PC, ucp_Zl },
|
||||||
{ 2570, PT_PC, ucp_Zs },
|
{ 2577, PT_PC, ucp_Zp },
|
||||||
{ 2573, PT_SC, ucp_Common },
|
{ 2580, PT_PC, ucp_Zs },
|
||||||
{ 2578, PT_SC, ucp_Unknown }
|
{ 2583, PT_SC, ucp_Common },
|
||||||
|
{ 2588, PT_SC, ucp_Unknown }
|
||||||
};
|
};
|
||||||
|
|
||||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue