From f90542a20904776107a322d0add8e87731f59b04 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Fri, 7 Jan 2022 11:01:18 +0100 Subject: [PATCH] Improve unicode property abbreviation support (#74) * Improve unicode property abbreviation support * Auto-generate script names Co-authored-by: Zoltan Herczeg --- maint/GenerateCommon.py | 138 +- maint/GenerateUcpTables.py | 25 +- maint/Unicode.tables/PropertyValueAliases.txt | 1615 +++++++++ src/pcre2_ucd.c | 1704 +++++----- src/pcre2_ucp.h | 150 +- src/pcre2_ucptables.c | 208 +- testdata/testinput26 | 2250 ++++++------- testdata/testoutput26 | 2882 ++++++++--------- 8 files changed, 5280 insertions(+), 3692 deletions(-) create mode 100644 maint/Unicode.tables/PropertyValueAliases.txt diff --git a/maint/GenerateCommon.py b/maint/GenerateCommon.py index c7e3d83..bea523f 100644 --- a/maint/GenerateCommon.py +++ b/maint/GenerateCommon.py @@ -13,90 +13,6 @@ # DATA LISTS # --------------------------------------------------------------------------- -# The lists of script names and script abbreviations must be kept in step. Note -# that the pcre2pattern and pcre2syntax documentation has lists of scripts. - -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ - # New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ - # New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ - # New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ - 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ - # New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', \ -# New for Unicode 6.1.0 - 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', -# New for Unicode 7.0.0 - 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', - 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', - 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', - 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', -# New for Unicode 8.0.0 - 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', - 'SignWriting', -# New for Unicode 10.0.0 - 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', - 'Nushu', 'Soyombo', 'Zanabazar_Square', -# New for Unicode 11.0.0 - 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', - 'Old_Sogdian', 'Sogdian', -# New for Unicode 12.0.0 - 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', -# New for Unicode 13.0.0 - 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', -# New for Unicode 14.0.0 - 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' - ] - -script_abbrevs = [ - 'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans', - 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor', - 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr', - 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb', - 'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', - 'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', - 'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii', -#New for Unicode 5.0 - 'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx', -#New for Unicode 5.1 - 'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur', - 'Sund', 'Vaii', -#New for Unicode 5.2 - 'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu', - 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt', -#New for Unicode 6.0.0 - 'Batk', 'Brah', 'Mand', -#New for Unicode 6.1.0 - 'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr', -#New for Unicode 7.0.0 - 'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj', - 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm', - 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara', -#New for Unicode 8.0.0 - 'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw', -#New for Unicode 10.0.0 - 'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo', - 'Zanb', -#New for Unicode 11.0.0 - 'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd', -#New for Unicode 12.0.0 - 'Elym', 'Nand', 'Hmnp', 'Wcho', -#New for Unicode 13.0.0 - 'Chrs', 'Diak', 'Kits', 'Yezi', -#New for Unicode 14.0.0 - 'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith' - ] - # BIDI classes in the DerivedBidiClass.txt file, with comments. bidi_classes = [ @@ -185,14 +101,66 @@ break_properties = [ ] # --------------------------------------------------------------------------- -# REORDERING SCRIPT NAMES +# COLLECTING PROPERTY NAMES # --------------------------------------------------------------------------- import re +script_names = ['Unknown'] +abbreviations = {} + +def collect_property_names(): + global script_names + global abbreviations + + names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #') + + last_script_name = "" + with open("Unicode.tables/Scripts.txt") as f: + for line in f: + match_obj = names_re.match(line) + + if match_obj == None or match_obj.group(1) == last_script_name: + continue + + last_script_name = match_obj.group(1) + script_names.append(last_script_name) + + # Sometimes there is comment in the line + # so splitting around semicolon is not enough + value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?') + + with open("Unicode.tables/PropertyValueAliases.txt") as f: + for line in f: + match_obj = value_alias_re.match(line) + + if match_obj == None: + continue + + if match_obj.group(1) == "sc": + if match_obj.group(2) == match_obj.group(3): + abbreviations[match_obj.group(3)] = () + elif match_obj.group(4) == None: + abbreviations[match_obj.group(3)] = (match_obj.group(2),) + else: + abbreviations[match_obj.group(3)] = (match_obj.group(2), match_obj.group(4)) + +collect_property_names() + +# --------------------------------------------------------------------------- +# REORDERING SCRIPT NAMES +# --------------------------------------------------------------------------- + +script_abbrevs = [] + def reorder_scripts(): global script_names global script_abbrevs + global abbreviations + + for name in script_names: + abbrevs = abbreviations[name] + script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0]) extended_script_abbrevs = set() with open("Unicode.tables/ScriptExtensions.txt") as f: diff --git a/maint/GenerateUcpTables.py b/maint/GenerateUcpTables.py index 7d2292c..e1678ec 100755 --- a/maint/GenerateUcpTables.py +++ b/maint/GenerateUcpTables.py @@ -48,10 +48,10 @@ # Import common data lists and functions from GenerateCommon import \ + abbreviations, \ bidi_classes, \ category_names, \ general_category_names, \ - script_abbrevs, \ script_names, \ open_output @@ -75,14 +75,15 @@ category_names = category_names[::2] # Create standardized versions of the names by lowercasing and removing # underscores. +def stdname(x): + return x.lower().replace('_', '') + def stdnames(x): y = [''] * len(x) for i in range(len(x)): - y[i] = x[i].lower().replace('_', '') + y[i] = stdname(x[i]) return y -std_script_names = stdnames(script_names) -std_script_abbrevs = stdnames(script_abbrevs) std_category_names = stdnames(category_names) std_general_category_names = stdnames(general_category_names) std_bidi_class_names = stdnames(bidi_class_names) @@ -92,18 +93,16 @@ std_bidi_class_names = stdnames(bidi_class_names) # latter is used for the ucp_xx names. NOTE: for the script abbreviations, we # still use the full original names. +utt_table = [] + scx_end = script_names.index('Unknown') -utt_table = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end)) -utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end))) -utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end)) -utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end))) +for idx, name in enumerate(script_names): + pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC' -# At lease one script abbreviation is the same as the full name of the script, -# so we must remove duplicates. It doesn't matter if this operation changes the -# order, because we are going to sort the list later. - -utt_table = list(set(utt_table)) + utt_table.append((stdname(name), name, pt_type)) + for abbrev in abbreviations[name]: + utt_table.append((stdname(abbrev), name, pt_type)) # Add the remaining property lists diff --git a/maint/Unicode.tables/PropertyValueAliases.txt b/maint/Unicode.tables/PropertyValueAliases.txt new file mode 100644 index 0000000..f0cb26b --- /dev/null +++ b/maint/Unicode.tables/PropertyValueAliases.txt @@ -0,0 +1,1615 @@ +# PropertyValueAliases-14.0.0.txt +# Date: 2021-05-10, 21:08:53 GMT +# © 2021 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ +# +# This file contains aliases for property values used in the UCD. +# These names can be used for XML formats of UCD data, for regular-expression +# property tests, and other programmatic textual descriptions of Unicode data. +# +# The names may be translated in appropriate environments, and additional +# aliases may be useful. +# +# FORMAT +# +# Each line describes a property value name. +# This consists of three or more fields, separated by semicolons. +# +# First Field: The first field describes the property for which that +# property value name is used. +# +# Second Field: The second field is the short name for the property value. +# It is typically an abbreviation, but in a number of cases it is simply +# a duplicate of the "long name" in the third field. +# +# Third Field: The third field is the long name for the property value, +# typically the formal name used in documentation about the property value. +# +# In the case of Canonical_Combining_Class (ccc), there are 4 fields: +# The second field is numeric, the third is the short name, and the fourth is the long name. +# +# The above are the preferred aliases. Other aliases may be listed in additional fields. +# +# Loose matching should be applied to all property names and property values, with +# the exception of String Property values. With loose matching of property names and +# values, the case distinctions, whitespace, hyphens, and '_' are ignored. +# For Numeric Property values, numeric equivalence is applied: thus "01.00" +# is equivalent to "1". +# +# NOTE: Property value names are NOT unique across properties. For example: +# +# AL means Arabic Letter for the Bidi_Class property, and +# AL means Above_Left for the Canonical_Combining_Class property, and +# AL means Alphabetic for the Line_Break property. +# +# In addition, some property names may be the same as some property value names. +# For example: +# +# sc means the Script property, and +# Sc means the General_Category property value Currency_Symbol (Sc) +# +# The combination of property value and property name is, however, unique. +# +# For more information, see UAX #44, Unicode Character Database, and +# UTS #18, Unicode Regular Expressions. +# ================================================ + + +# ASCII_Hex_Digit (AHex) + +AHex; N ; No ; F ; False +AHex; Y ; Yes ; T ; True + +# Age (age) + +age; 1.1 ; V1_1 +age; 2.0 ; V2_0 +age; 2.1 ; V2_1 +age; 3.0 ; V3_0 +age; 3.1 ; V3_1 +age; 3.2 ; V3_2 +age; 4.0 ; V4_0 +age; 4.1 ; V4_1 +age; 5.0 ; V5_0 +age; 5.1 ; V5_1 +age; 5.2 ; V5_2 +age; 6.0 ; V6_0 +age; 6.1 ; V6_1 +age; 6.2 ; V6_2 +age; 6.3 ; V6_3 +age; 7.0 ; V7_0 +age; 8.0 ; V8_0 +age; 9.0 ; V9_0 +age; 10.0 ; V10_0 +age; 11.0 ; V11_0 +age; 12.0 ; V12_0 +age; 12.1 ; V12_1 +age; 13.0 ; V13_0 +age; 14.0 ; V14_0 +age; NA ; Unassigned + +# Alphabetic (Alpha) + +Alpha; N ; No ; F ; False +Alpha; Y ; Yes ; T ; True + +# Bidi_Class (bc) + +bc ; AL ; Arabic_Letter +bc ; AN ; Arabic_Number +bc ; B ; Paragraph_Separator +bc ; BN ; Boundary_Neutral +bc ; CS ; Common_Separator +bc ; EN ; European_Number +bc ; ES ; European_Separator +bc ; ET ; European_Terminator +bc ; FSI ; First_Strong_Isolate +bc ; L ; Left_To_Right +bc ; LRE ; Left_To_Right_Embedding +bc ; LRI ; Left_To_Right_Isolate +bc ; LRO ; Left_To_Right_Override +bc ; NSM ; Nonspacing_Mark +bc ; ON ; Other_Neutral +bc ; PDF ; Pop_Directional_Format +bc ; PDI ; Pop_Directional_Isolate +bc ; R ; Right_To_Left +bc ; RLE ; Right_To_Left_Embedding +bc ; RLI ; Right_To_Left_Isolate +bc ; RLO ; Right_To_Left_Override +bc ; S ; Segment_Separator +bc ; WS ; White_Space + +# Bidi_Control (Bidi_C) + +Bidi_C; N ; No ; F ; False +Bidi_C; Y ; Yes ; T ; True + +# Bidi_Mirrored (Bidi_M) + +Bidi_M; N ; No ; F ; False +Bidi_M; Y ; Yes ; T ; True + +# Bidi_Mirroring_Glyph (bmg) + +# @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; + +# Bidi_Paired_Bracket (bpb) + +# @missing: 0000..10FFFF; Bidi_Paired_Bracket; + +# Bidi_Paired_Bracket_Type (bpt) + +bpt; c ; Close +bpt; n ; None +bpt; o ; Open +# @missing: 0000..10FFFF; Bidi_Paired_Bracket_Type; n + +# Block (blk) + +blk; Adlam ; Adlam +blk; Aegean_Numbers ; Aegean_Numbers +blk; Ahom ; Ahom +blk; Alchemical ; Alchemical_Symbols +blk; Alphabetic_PF ; Alphabetic_Presentation_Forms +blk; Anatolian_Hieroglyphs ; Anatolian_Hieroglyphs +blk; Ancient_Greek_Music ; Ancient_Greek_Musical_Notation +blk; Ancient_Greek_Numbers ; Ancient_Greek_Numbers +blk; Ancient_Symbols ; Ancient_Symbols +blk; Arabic ; Arabic +blk; Arabic_Ext_A ; Arabic_Extended_A +blk; Arabic_Ext_B ; Arabic_Extended_B +blk; Arabic_Math ; Arabic_Mathematical_Alphabetic_Symbols +blk; Arabic_PF_A ; Arabic_Presentation_Forms_A ; Arabic_Presentation_Forms-A +blk; Arabic_PF_B ; Arabic_Presentation_Forms_B +blk; Arabic_Sup ; Arabic_Supplement +blk; Armenian ; Armenian +blk; Arrows ; Arrows +blk; ASCII ; Basic_Latin +blk; Avestan ; Avestan +blk; Balinese ; Balinese +blk; Bamum ; Bamum +blk; Bamum_Sup ; Bamum_Supplement +blk; Bassa_Vah ; Bassa_Vah +blk; Batak ; Batak +blk; Bengali ; Bengali +blk; Bhaiksuki ; Bhaiksuki +blk; Block_Elements ; Block_Elements +blk; Bopomofo ; Bopomofo +blk; Bopomofo_Ext ; Bopomofo_Extended +blk; Box_Drawing ; Box_Drawing +blk; Brahmi ; Brahmi +blk; Braille ; Braille_Patterns +blk; Buginese ; Buginese +blk; Buhid ; Buhid +blk; Byzantine_Music ; Byzantine_Musical_Symbols +blk; Carian ; Carian +blk; Caucasian_Albanian ; Caucasian_Albanian +blk; Chakma ; Chakma +blk; Cham ; Cham +blk; Cherokee ; Cherokee +blk; Cherokee_Sup ; Cherokee_Supplement +blk; Chess_Symbols ; Chess_Symbols +blk; Chorasmian ; Chorasmian +blk; CJK ; CJK_Unified_Ideographs +blk; CJK_Compat ; CJK_Compatibility +blk; CJK_Compat_Forms ; CJK_Compatibility_Forms +blk; CJK_Compat_Ideographs ; CJK_Compatibility_Ideographs +blk; CJK_Compat_Ideographs_Sup ; CJK_Compatibility_Ideographs_Supplement +blk; CJK_Ext_A ; CJK_Unified_Ideographs_Extension_A +blk; CJK_Ext_B ; CJK_Unified_Ideographs_Extension_B +blk; CJK_Ext_C ; CJK_Unified_Ideographs_Extension_C +blk; CJK_Ext_D ; CJK_Unified_Ideographs_Extension_D +blk; CJK_Ext_E ; CJK_Unified_Ideographs_Extension_E +blk; CJK_Ext_F ; CJK_Unified_Ideographs_Extension_F +blk; CJK_Ext_G ; CJK_Unified_Ideographs_Extension_G +blk; CJK_Radicals_Sup ; CJK_Radicals_Supplement +blk; CJK_Strokes ; CJK_Strokes +blk; CJK_Symbols ; CJK_Symbols_And_Punctuation +blk; Compat_Jamo ; Hangul_Compatibility_Jamo +blk; Control_Pictures ; Control_Pictures +blk; Coptic ; Coptic +blk; Coptic_Epact_Numbers ; Coptic_Epact_Numbers +blk; Counting_Rod ; Counting_Rod_Numerals +blk; Cuneiform ; Cuneiform +blk; Cuneiform_Numbers ; Cuneiform_Numbers_And_Punctuation +blk; Currency_Symbols ; Currency_Symbols +blk; Cypriot_Syllabary ; Cypriot_Syllabary +blk; Cypro_Minoan ; Cypro_Minoan +blk; Cyrillic ; Cyrillic +blk; Cyrillic_Ext_A ; Cyrillic_Extended_A +blk; Cyrillic_Ext_B ; Cyrillic_Extended_B +blk; Cyrillic_Ext_C ; Cyrillic_Extended_C +blk; Cyrillic_Sup ; Cyrillic_Supplement ; Cyrillic_Supplementary +blk; Deseret ; Deseret +blk; Devanagari ; Devanagari +blk; Devanagari_Ext ; Devanagari_Extended +blk; Diacriticals ; Combining_Diacritical_Marks +blk; Diacriticals_Ext ; Combining_Diacritical_Marks_Extended +blk; Diacriticals_For_Symbols ; Combining_Diacritical_Marks_For_Symbols; Combining_Marks_For_Symbols +blk; Diacriticals_Sup ; Combining_Diacritical_Marks_Supplement +blk; Dingbats ; Dingbats +blk; Dives_Akuru ; Dives_Akuru +blk; Dogra ; Dogra +blk; Domino ; Domino_Tiles +blk; Duployan ; Duployan +blk; Early_Dynastic_Cuneiform ; Early_Dynastic_Cuneiform +blk; Egyptian_Hieroglyph_Format_Controls; Egyptian_Hieroglyph_Format_Controls +blk; Egyptian_Hieroglyphs ; Egyptian_Hieroglyphs +blk; Elbasan ; Elbasan +blk; Elymaic ; Elymaic +blk; Emoticons ; Emoticons +blk; Enclosed_Alphanum ; Enclosed_Alphanumerics +blk; Enclosed_Alphanum_Sup ; Enclosed_Alphanumeric_Supplement +blk; Enclosed_CJK ; Enclosed_CJK_Letters_And_Months +blk; Enclosed_Ideographic_Sup ; Enclosed_Ideographic_Supplement +blk; Ethiopic ; Ethiopic +blk; Ethiopic_Ext ; Ethiopic_Extended +blk; Ethiopic_Ext_A ; Ethiopic_Extended_A +blk; Ethiopic_Ext_B ; Ethiopic_Extended_B +blk; Ethiopic_Sup ; Ethiopic_Supplement +blk; Geometric_Shapes ; Geometric_Shapes +blk; Geometric_Shapes_Ext ; Geometric_Shapes_Extended +blk; Georgian ; Georgian +blk; Georgian_Ext ; Georgian_Extended +blk; Georgian_Sup ; Georgian_Supplement +blk; Glagolitic ; Glagolitic +blk; Glagolitic_Sup ; Glagolitic_Supplement +blk; Gothic ; Gothic +blk; Grantha ; Grantha +blk; Greek ; Greek_And_Coptic +blk; Greek_Ext ; Greek_Extended +blk; Gujarati ; Gujarati +blk; Gunjala_Gondi ; Gunjala_Gondi +blk; Gurmukhi ; Gurmukhi +blk; Half_And_Full_Forms ; Halfwidth_And_Fullwidth_Forms +blk; Half_Marks ; Combining_Half_Marks +blk; Hangul ; Hangul_Syllables +blk; Hanifi_Rohingya ; Hanifi_Rohingya +blk; Hanunoo ; Hanunoo +blk; Hatran ; Hatran +blk; Hebrew ; Hebrew +blk; High_PU_Surrogates ; High_Private_Use_Surrogates +blk; High_Surrogates ; High_Surrogates +blk; Hiragana ; Hiragana +blk; IDC ; Ideographic_Description_Characters +blk; Ideographic_Symbols ; Ideographic_Symbols_And_Punctuation +blk; Imperial_Aramaic ; Imperial_Aramaic +blk; Indic_Number_Forms ; Common_Indic_Number_Forms +blk; Indic_Siyaq_Numbers ; Indic_Siyaq_Numbers +blk; Inscriptional_Pahlavi ; Inscriptional_Pahlavi +blk; Inscriptional_Parthian ; Inscriptional_Parthian +blk; IPA_Ext ; IPA_Extensions +blk; Jamo ; Hangul_Jamo +blk; Jamo_Ext_A ; Hangul_Jamo_Extended_A +blk; Jamo_Ext_B ; Hangul_Jamo_Extended_B +blk; Javanese ; Javanese +blk; Kaithi ; Kaithi +blk; Kana_Ext_A ; Kana_Extended_A +blk; Kana_Ext_B ; Kana_Extended_B +blk; Kana_Sup ; Kana_Supplement +blk; Kanbun ; Kanbun +blk; Kangxi ; Kangxi_Radicals +blk; Kannada ; Kannada +blk; Katakana ; Katakana +blk; Katakana_Ext ; Katakana_Phonetic_Extensions +blk; Kayah_Li ; Kayah_Li +blk; Kharoshthi ; Kharoshthi +blk; Khitan_Small_Script ; Khitan_Small_Script +blk; Khmer ; Khmer +blk; Khmer_Symbols ; Khmer_Symbols +blk; Khojki ; Khojki +blk; Khudawadi ; Khudawadi +blk; Lao ; Lao +blk; Latin_1_Sup ; Latin_1_Supplement ; Latin_1 +blk; Latin_Ext_A ; Latin_Extended_A +blk; Latin_Ext_Additional ; Latin_Extended_Additional +blk; Latin_Ext_B ; Latin_Extended_B +blk; Latin_Ext_C ; Latin_Extended_C +blk; Latin_Ext_D ; Latin_Extended_D +blk; Latin_Ext_E ; Latin_Extended_E +blk; Latin_Ext_F ; Latin_Extended_F +blk; Latin_Ext_G ; Latin_Extended_G +blk; Lepcha ; Lepcha +blk; Letterlike_Symbols ; Letterlike_Symbols +blk; Limbu ; Limbu +blk; Linear_A ; Linear_A +blk; Linear_B_Ideograms ; Linear_B_Ideograms +blk; Linear_B_Syllabary ; Linear_B_Syllabary +blk; Lisu ; Lisu +blk; Lisu_Sup ; Lisu_Supplement +blk; Low_Surrogates ; Low_Surrogates +blk; Lycian ; Lycian +blk; Lydian ; Lydian +blk; Mahajani ; Mahajani +blk; Mahjong ; Mahjong_Tiles +blk; Makasar ; Makasar +blk; Malayalam ; Malayalam +blk; Mandaic ; Mandaic +blk; Manichaean ; Manichaean +blk; Marchen ; Marchen +blk; Masaram_Gondi ; Masaram_Gondi +blk; Math_Alphanum ; Mathematical_Alphanumeric_Symbols +blk; Math_Operators ; Mathematical_Operators +blk; Mayan_Numerals ; Mayan_Numerals +blk; Medefaidrin ; Medefaidrin +blk; Meetei_Mayek ; Meetei_Mayek +blk; Meetei_Mayek_Ext ; Meetei_Mayek_Extensions +blk; Mende_Kikakui ; Mende_Kikakui +blk; Meroitic_Cursive ; Meroitic_Cursive +blk; Meroitic_Hieroglyphs ; Meroitic_Hieroglyphs +blk; Miao ; Miao +blk; Misc_Arrows ; Miscellaneous_Symbols_And_Arrows +blk; Misc_Math_Symbols_A ; Miscellaneous_Mathematical_Symbols_A +blk; Misc_Math_Symbols_B ; Miscellaneous_Mathematical_Symbols_B +blk; Misc_Pictographs ; Miscellaneous_Symbols_And_Pictographs +blk; Misc_Symbols ; Miscellaneous_Symbols +blk; Misc_Technical ; Miscellaneous_Technical +blk; Modi ; Modi +blk; Modifier_Letters ; Spacing_Modifier_Letters +blk; Modifier_Tone_Letters ; Modifier_Tone_Letters +blk; Mongolian ; Mongolian +blk; Mongolian_Sup ; Mongolian_Supplement +blk; Mro ; Mro +blk; Multani ; Multani +blk; Music ; Musical_Symbols +blk; Myanmar ; Myanmar +blk; Myanmar_Ext_A ; Myanmar_Extended_A +blk; Myanmar_Ext_B ; Myanmar_Extended_B +blk; Nabataean ; Nabataean +blk; Nandinagari ; Nandinagari +blk; NB ; No_Block +blk; New_Tai_Lue ; New_Tai_Lue +blk; Newa ; Newa +blk; NKo ; NKo +blk; Number_Forms ; Number_Forms +blk; Nushu ; Nushu +blk; Nyiakeng_Puachue_Hmong ; Nyiakeng_Puachue_Hmong +blk; OCR ; Optical_Character_Recognition +blk; Ogham ; Ogham +blk; Ol_Chiki ; Ol_Chiki +blk; Old_Hungarian ; Old_Hungarian +blk; Old_Italic ; Old_Italic +blk; Old_North_Arabian ; Old_North_Arabian +blk; Old_Permic ; Old_Permic +blk; Old_Persian ; Old_Persian +blk; Old_Sogdian ; Old_Sogdian +blk; Old_South_Arabian ; Old_South_Arabian +blk; Old_Turkic ; Old_Turkic +blk; Old_Uyghur ; Old_Uyghur +blk; Oriya ; Oriya +blk; Ornamental_Dingbats ; Ornamental_Dingbats +blk; Osage ; Osage +blk; Osmanya ; Osmanya +blk; Ottoman_Siyaq_Numbers ; Ottoman_Siyaq_Numbers +blk; Pahawh_Hmong ; Pahawh_Hmong +blk; Palmyrene ; Palmyrene +blk; Pau_Cin_Hau ; Pau_Cin_Hau +blk; Phags_Pa ; Phags_Pa +blk; Phaistos ; Phaistos_Disc +blk; Phoenician ; Phoenician +blk; Phonetic_Ext ; Phonetic_Extensions +blk; Phonetic_Ext_Sup ; Phonetic_Extensions_Supplement +blk; Playing_Cards ; Playing_Cards +blk; Psalter_Pahlavi ; Psalter_Pahlavi +blk; PUA ; Private_Use_Area ; Private_Use +blk; Punctuation ; General_Punctuation +blk; Rejang ; Rejang +blk; Rumi ; Rumi_Numeral_Symbols +blk; Runic ; Runic +blk; Samaritan ; Samaritan +blk; Saurashtra ; Saurashtra +blk; Sharada ; Sharada +blk; Shavian ; Shavian +blk; Shorthand_Format_Controls ; Shorthand_Format_Controls +blk; Siddham ; Siddham +blk; Sinhala ; Sinhala +blk; Sinhala_Archaic_Numbers ; Sinhala_Archaic_Numbers +blk; Small_Forms ; Small_Form_Variants +blk; Small_Kana_Ext ; Small_Kana_Extension +blk; Sogdian ; Sogdian +blk; Sora_Sompeng ; Sora_Sompeng +blk; Soyombo ; Soyombo +blk; Specials ; Specials +blk; Sundanese ; Sundanese +blk; Sundanese_Sup ; Sundanese_Supplement +blk; Sup_Arrows_A ; Supplemental_Arrows_A +blk; Sup_Arrows_B ; Supplemental_Arrows_B +blk; Sup_Arrows_C ; Supplemental_Arrows_C +blk; Sup_Math_Operators ; Supplemental_Mathematical_Operators +blk; Sup_PUA_A ; Supplementary_Private_Use_Area_A +blk; Sup_PUA_B ; Supplementary_Private_Use_Area_B +blk; Sup_Punctuation ; Supplemental_Punctuation +blk; Sup_Symbols_And_Pictographs ; Supplemental_Symbols_And_Pictographs +blk; Super_And_Sub ; Superscripts_And_Subscripts +blk; Sutton_SignWriting ; Sutton_SignWriting +blk; Syloti_Nagri ; Syloti_Nagri +blk; Symbols_And_Pictographs_Ext_A ; Symbols_And_Pictographs_Extended_A +blk; Symbols_For_Legacy_Computing ; Symbols_For_Legacy_Computing +blk; Syriac ; Syriac +blk; Syriac_Sup ; Syriac_Supplement +blk; Tagalog ; Tagalog +blk; Tagbanwa ; Tagbanwa +blk; Tags ; Tags +blk; Tai_Le ; Tai_Le +blk; Tai_Tham ; Tai_Tham +blk; Tai_Viet ; Tai_Viet +blk; Tai_Xuan_Jing ; Tai_Xuan_Jing_Symbols +blk; Takri ; Takri +blk; Tamil ; Tamil +blk; Tamil_Sup ; Tamil_Supplement +blk; Tangsa ; Tangsa +blk; Tangut ; Tangut +blk; Tangut_Components ; Tangut_Components +blk; Tangut_Sup ; Tangut_Supplement +blk; Telugu ; Telugu +blk; Thaana ; Thaana +blk; Thai ; Thai +blk; Tibetan ; Tibetan +blk; Tifinagh ; Tifinagh +blk; Tirhuta ; Tirhuta +blk; Toto ; Toto +blk; Transport_And_Map ; Transport_And_Map_Symbols +blk; UCAS ; Unified_Canadian_Aboriginal_Syllabics; Canadian_Syllabics +blk; UCAS_Ext ; Unified_Canadian_Aboriginal_Syllabics_Extended +blk; UCAS_Ext_A ; Unified_Canadian_Aboriginal_Syllabics_Extended_A +blk; Ugaritic ; Ugaritic +blk; Vai ; Vai +blk; Vedic_Ext ; Vedic_Extensions +blk; Vertical_Forms ; Vertical_Forms +blk; Vithkuqi ; Vithkuqi +blk; VS ; Variation_Selectors +blk; VS_Sup ; Variation_Selectors_Supplement +blk; Wancho ; Wancho +blk; Warang_Citi ; Warang_Citi +blk; Yezidi ; Yezidi +blk; Yi_Radicals ; Yi_Radicals +blk; Yi_Syllables ; Yi_Syllables +blk; Yijing ; Yijing_Hexagram_Symbols +blk; Zanabazar_Square ; Zanabazar_Square +blk; Znamenny_Music ; Znamenny_Musical_Notation + +# Canonical_Combining_Class (ccc) + +ccc; 0; NR ; Not_Reordered +ccc; 1; OV ; Overlay +ccc; 6; HANR ; Han_Reading +ccc; 7; NK ; Nukta +ccc; 8; KV ; Kana_Voicing +ccc; 9; VR ; Virama +ccc; 10; CCC10 ; CCC10 +ccc; 11; CCC11 ; CCC11 +ccc; 12; CCC12 ; CCC12 +ccc; 13; CCC13 ; CCC13 +ccc; 14; CCC14 ; CCC14 +ccc; 15; CCC15 ; CCC15 +ccc; 16; CCC16 ; CCC16 +ccc; 17; CCC17 ; CCC17 +ccc; 18; CCC18 ; CCC18 +ccc; 19; CCC19 ; CCC19 +ccc; 20; CCC20 ; CCC20 +ccc; 21; CCC21 ; CCC21 +ccc; 22; CCC22 ; CCC22 +ccc; 23; CCC23 ; CCC23 +ccc; 24; CCC24 ; CCC24 +ccc; 25; CCC25 ; CCC25 +ccc; 26; CCC26 ; CCC26 +ccc; 27; CCC27 ; CCC27 +ccc; 28; CCC28 ; CCC28 +ccc; 29; CCC29 ; CCC29 +ccc; 30; CCC30 ; CCC30 +ccc; 31; CCC31 ; CCC31 +ccc; 32; CCC32 ; CCC32 +ccc; 33; CCC33 ; CCC33 +ccc; 34; CCC34 ; CCC34 +ccc; 35; CCC35 ; CCC35 +ccc; 36; CCC36 ; CCC36 +ccc; 84; CCC84 ; CCC84 +ccc; 91; CCC91 ; CCC91 +ccc; 103; CCC103 ; CCC103 +ccc; 107; CCC107 ; CCC107 +ccc; 118; CCC118 ; CCC118 +ccc; 122; CCC122 ; CCC122 +ccc; 129; CCC129 ; CCC129 +ccc; 130; CCC130 ; CCC130 +ccc; 132; CCC132 ; CCC132 +ccc; 133; CCC133 ; CCC133 # RESERVED +ccc; 200; ATBL ; Attached_Below_Left +ccc; 202; ATB ; Attached_Below +ccc; 214; ATA ; Attached_Above +ccc; 216; ATAR ; Attached_Above_Right +ccc; 218; BL ; Below_Left +ccc; 220; B ; Below +ccc; 222; BR ; Below_Right +ccc; 224; L ; Left +ccc; 226; R ; Right +ccc; 228; AL ; Above_Left +ccc; 230; A ; Above +ccc; 232; AR ; Above_Right +ccc; 233; DB ; Double_Below +ccc; 234; DA ; Double_Above +ccc; 240; IS ; Iota_Subscript + +# Case_Folding (cf) + +# @missing: 0000..10FFFF; Case_Folding; + +# Case_Ignorable (CI) + +CI ; N ; No ; F ; False +CI ; Y ; Yes ; T ; True + +# Cased (Cased) + +Cased; N ; No ; F ; False +Cased; Y ; Yes ; T ; True + +# Changes_When_Casefolded (CWCF) + +CWCF; N ; No ; F ; False +CWCF; Y ; Yes ; T ; True + +# Changes_When_Casemapped (CWCM) + +CWCM; N ; No ; F ; False +CWCM; Y ; Yes ; T ; True + +# Changes_When_Lowercased (CWL) + +CWL; N ; No ; F ; False +CWL; Y ; Yes ; T ; True + +# Changes_When_NFKC_Casefolded (CWKCF) + +CWKCF; N ; No ; F ; False +CWKCF; Y ; Yes ; T ; True + +# Changes_When_Titlecased (CWT) + +CWT; N ; No ; F ; False +CWT; Y ; Yes ; T ; True + +# Changes_When_Uppercased (CWU) + +CWU; N ; No ; F ; False +CWU; Y ; Yes ; T ; True + +# Composition_Exclusion (CE) + +CE ; N ; No ; F ; False +CE ; Y ; Yes ; T ; True + +# Dash (Dash) + +Dash; N ; No ; F ; False +Dash; Y ; Yes ; T ; True + +# Decomposition_Mapping (dm) + +# @missing: 0000..10FFFF; Decomposition_Mapping; + +# Decomposition_Type (dt) + +dt ; Can ; Canonical ; can +dt ; Com ; Compat ; com +dt ; Enc ; Circle ; enc +dt ; Fin ; Final ; fin +dt ; Font ; Font ; font +dt ; Fra ; Fraction ; fra +dt ; Init ; Initial ; init +dt ; Iso ; Isolated ; iso +dt ; Med ; Medial ; med +dt ; Nar ; Narrow ; nar +dt ; Nb ; Nobreak ; nb +dt ; None ; None ; none +dt ; Sml ; Small ; sml +dt ; Sqr ; Square ; sqr +dt ; Sub ; Sub ; sub +dt ; Sup ; Super ; sup +dt ; Vert ; Vertical ; vert +dt ; Wide ; Wide ; wide + +# Default_Ignorable_Code_Point (DI) + +DI ; N ; No ; F ; False +DI ; Y ; Yes ; T ; True + +# Deprecated (Dep) + +Dep; N ; No ; F ; False +Dep; Y ; Yes ; T ; True + +# Diacritic (Dia) + +Dia; N ; No ; F ; False +Dia; Y ; Yes ; T ; True + +# East_Asian_Width (ea) + +ea ; A ; Ambiguous +ea ; F ; Fullwidth +ea ; H ; Halfwidth +ea ; N ; Neutral +ea ; Na ; Narrow +ea ; W ; Wide + +# Emoji (Emoji) + +Emoji; N ; No ; F ; False +Emoji; Y ; Yes ; T ; True + +# Emoji_Component (EComp) + +EComp; N ; No ; F ; False +EComp; Y ; Yes ; T ; True + +# Emoji_Modifier (EMod) + +EMod; N ; No ; F ; False +EMod; Y ; Yes ; T ; True + +# Emoji_Modifier_Base (EBase) + +EBase; N ; No ; F ; False +EBase; Y ; Yes ; T ; True + +# Emoji_Presentation (EPres) + +EPres; N ; No ; F ; False +EPres; Y ; Yes ; T ; True + +# Equivalent_Unified_Ideograph (EqUIdeo) + +# @missing: 0000..10FFFF; Equivalent_Unified_Ideograph; + +# Expands_On_NFC (XO_NFC) + +XO_NFC; N ; No ; F ; False +XO_NFC; Y ; Yes ; T ; True + +# Expands_On_NFD (XO_NFD) + +XO_NFD; N ; No ; F ; False +XO_NFD; Y ; Yes ; T ; True + +# Expands_On_NFKC (XO_NFKC) + +XO_NFKC; N ; No ; F ; False +XO_NFKC; Y ; Yes ; T ; True + +# Expands_On_NFKD (XO_NFKD) + +XO_NFKD; N ; No ; F ; False +XO_NFKD; Y ; Yes ; T ; True + +# Extended_Pictographic (ExtPict) + +ExtPict; N ; No ; F ; False +ExtPict; Y ; Yes ; T ; True + +# Extender (Ext) + +Ext; N ; No ; F ; False +Ext; Y ; Yes ; T ; True + +# FC_NFKC_Closure (FC_NFKC) + +# @missing: 0000..10FFFF; FC_NFKC_Closure; + +# Full_Composition_Exclusion (Comp_Ex) + +Comp_Ex; N ; No ; F ; False +Comp_Ex; Y ; Yes ; T ; True + +# General_Category (gc) + +gc ; C ; Other # Cc | Cf | Cn | Co | Cs +gc ; Cc ; Control ; cntrl +gc ; Cf ; Format +gc ; Cn ; Unassigned +gc ; Co ; Private_Use +gc ; Cs ; Surrogate +gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu +gc ; LC ; Cased_Letter # Ll | Lt | Lu +gc ; Ll ; Lowercase_Letter +gc ; Lm ; Modifier_Letter +gc ; Lo ; Other_Letter +gc ; Lt ; Titlecase_Letter +gc ; Lu ; Uppercase_Letter +gc ; M ; Mark ; Combining_Mark # Mc | Me | Mn +gc ; Mc ; Spacing_Mark +gc ; Me ; Enclosing_Mark +gc ; Mn ; Nonspacing_Mark +gc ; N ; Number # Nd | Nl | No +gc ; Nd ; Decimal_Number ; digit +gc ; Nl ; Letter_Number +gc ; No ; Other_Number +gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps +gc ; Pc ; Connector_Punctuation +gc ; Pd ; Dash_Punctuation +gc ; Pe ; Close_Punctuation +gc ; Pf ; Final_Punctuation +gc ; Pi ; Initial_Punctuation +gc ; Po ; Other_Punctuation +gc ; Ps ; Open_Punctuation +gc ; S ; Symbol # Sc | Sk | Sm | So +gc ; Sc ; Currency_Symbol +gc ; Sk ; Modifier_Symbol +gc ; Sm ; Math_Symbol +gc ; So ; Other_Symbol +gc ; Z ; Separator # Zl | Zp | Zs +gc ; Zl ; Line_Separator +gc ; Zp ; Paragraph_Separator +gc ; Zs ; Space_Separator +# @missing: 0000..10FFFF; General_Category; Unassigned + +# Grapheme_Base (Gr_Base) + +Gr_Base; N ; No ; F ; False +Gr_Base; Y ; Yes ; T ; True + +# Grapheme_Cluster_Break (GCB) + +GCB; CN ; Control +GCB; CR ; CR +GCB; EB ; E_Base +GCB; EBG ; E_Base_GAZ +GCB; EM ; E_Modifier +GCB; EX ; Extend +GCB; GAZ ; Glue_After_Zwj +GCB; L ; L +GCB; LF ; LF +GCB; LV ; LV +GCB; LVT ; LVT +GCB; PP ; Prepend +GCB; RI ; Regional_Indicator +GCB; SM ; SpacingMark +GCB; T ; T +GCB; V ; V +GCB; XX ; Other +GCB; ZWJ ; ZWJ + +# Grapheme_Extend (Gr_Ext) + +Gr_Ext; N ; No ; F ; False +Gr_Ext; Y ; Yes ; T ; True + +# Grapheme_Link (Gr_Link) + +Gr_Link; N ; No ; F ; False +Gr_Link; Y ; Yes ; T ; True + +# Hangul_Syllable_Type (hst) + +hst; L ; Leading_Jamo +hst; LV ; LV_Syllable +hst; LVT ; LVT_Syllable +hst; NA ; Not_Applicable +hst; T ; Trailing_Jamo +hst; V ; Vowel_Jamo + +# Hex_Digit (Hex) + +Hex; N ; No ; F ; False +Hex; Y ; Yes ; T ; True + +# Hyphen (Hyphen) + +Hyphen; N ; No ; F ; False +Hyphen; Y ; Yes ; T ; True + +# IDS_Binary_Operator (IDSB) + +IDSB; N ; No ; F ; False +IDSB; Y ; Yes ; T ; True + +# IDS_Trinary_Operator (IDST) + +IDST; N ; No ; F ; False +IDST; Y ; Yes ; T ; True + +# ID_Continue (IDC) + +IDC; N ; No ; F ; False +IDC; Y ; Yes ; T ; True + +# ID_Start (IDS) + +IDS; N ; No ; F ; False +IDS; Y ; Yes ; T ; True + +# ISO_Comment (isc) + +# @missing: 0000..10FFFF; ISO_Comment; + +# Ideographic (Ideo) + +Ideo; N ; No ; F ; False +Ideo; Y ; Yes ; T ; True + +# Indic_Positional_Category (InPC) + +InPC; Bottom ; Bottom +InPC; Bottom_And_Left ; Bottom_And_Left +InPC; Bottom_And_Right ; Bottom_And_Right +InPC; Left ; Left +InPC; Left_And_Right ; Left_And_Right +InPC; NA ; NA +InPC; Overstruck ; Overstruck +InPC; Right ; Right +InPC; Top ; Top +InPC; Top_And_Bottom ; Top_And_Bottom +InPC; Top_And_Bottom_And_Left ; Top_And_Bottom_And_Left +InPC; Top_And_Bottom_And_Right ; Top_And_Bottom_And_Right +InPC; Top_And_Left ; Top_And_Left +InPC; Top_And_Left_And_Right ; Top_And_Left_And_Right +InPC; Top_And_Right ; Top_And_Right +InPC; Visual_Order_Left ; Visual_Order_Left + +# Indic_Syllabic_Category (InSC) + +InSC; Avagraha ; Avagraha +InSC; Bindu ; Bindu +InSC; Brahmi_Joining_Number ; Brahmi_Joining_Number +InSC; Cantillation_Mark ; Cantillation_Mark +InSC; Consonant ; Consonant +InSC; Consonant_Dead ; Consonant_Dead +InSC; Consonant_Final ; Consonant_Final +InSC; Consonant_Head_Letter ; Consonant_Head_Letter +InSC; Consonant_Initial_Postfixed ; Consonant_Initial_Postfixed +InSC; Consonant_Killer ; Consonant_Killer +InSC; Consonant_Medial ; Consonant_Medial +InSC; Consonant_Placeholder ; Consonant_Placeholder +InSC; Consonant_Preceding_Repha ; Consonant_Preceding_Repha +InSC; Consonant_Prefixed ; Consonant_Prefixed +InSC; Consonant_Subjoined ; Consonant_Subjoined +InSC; Consonant_Succeeding_Repha ; Consonant_Succeeding_Repha +InSC; Consonant_With_Stacker ; Consonant_With_Stacker +InSC; Gemination_Mark ; Gemination_Mark +InSC; Invisible_Stacker ; Invisible_Stacker +InSC; Joiner ; Joiner +InSC; Modifying_Letter ; Modifying_Letter +InSC; Non_Joiner ; Non_Joiner +InSC; Nukta ; Nukta +InSC; Number ; Number +InSC; Number_Joiner ; Number_Joiner +InSC; Other ; Other +InSC; Pure_Killer ; Pure_Killer +InSC; Register_Shifter ; Register_Shifter +InSC; Syllable_Modifier ; Syllable_Modifier +InSC; Tone_Letter ; Tone_Letter +InSC; Tone_Mark ; Tone_Mark +InSC; Virama ; Virama +InSC; Visarga ; Visarga +InSC; Vowel ; Vowel +InSC; Vowel_Dependent ; Vowel_Dependent +InSC; Vowel_Independent ; Vowel_Independent + +# Jamo_Short_Name (JSN) + +JSN; A ; A +JSN; AE ; AE +JSN; B ; B +JSN; BB ; BB +JSN; BS ; BS +JSN; C ; C +JSN; D ; D +JSN; DD ; DD +JSN; E ; E +JSN; EO ; EO +JSN; EU ; EU +JSN; G ; G +JSN; GG ; GG +JSN; GS ; GS +JSN; H ; H +JSN; I ; I +JSN; J ; J +JSN; JJ ; JJ +JSN; K ; K +JSN; L ; L +JSN; LB ; LB +JSN; LG ; LG +JSN; LH ; LH +JSN; LM ; LM +JSN; LP ; LP +JSN; LS ; LS +JSN; LT ; LT +JSN; M ; M +JSN; N ; N +JSN; NG ; NG +JSN; NH ; NH +JSN; NJ ; NJ +JSN; O ; O +JSN; OE ; OE +JSN; P ; P +JSN; R ; R +JSN; S ; S +JSN; SS ; SS +JSN; T ; T +JSN; U ; U +JSN; WA ; WA +JSN; WAE ; WAE +JSN; WE ; WE +JSN; WEO ; WEO +JSN; WI ; WI +JSN; YA ; YA +JSN; YAE ; YAE +JSN; YE ; YE +JSN; YEO ; YEO +JSN; YI ; YI +JSN; YO ; YO +JSN; YU ; YU +# @missing: 0000..10FFFF; Jamo_Short_Name; + +# Join_Control (Join_C) + +Join_C; N ; No ; F ; False +Join_C; Y ; Yes ; T ; True + +# Joining_Group (jg) + +jg ; African_Feh ; African_Feh +jg ; African_Noon ; African_Noon +jg ; African_Qaf ; African_Qaf +jg ; Ain ; Ain +jg ; Alaph ; Alaph +jg ; Alef ; Alef +jg ; Beh ; Beh +jg ; Beth ; Beth +jg ; Burushaski_Yeh_Barree ; Burushaski_Yeh_Barree +jg ; Dal ; Dal +jg ; Dalath_Rish ; Dalath_Rish +jg ; E ; E +jg ; Farsi_Yeh ; Farsi_Yeh +jg ; Fe ; Fe +jg ; Feh ; Feh +jg ; Final_Semkath ; Final_Semkath +jg ; Gaf ; Gaf +jg ; Gamal ; Gamal +jg ; Hah ; Hah +jg ; Hanifi_Rohingya_Kinna_Ya ; Hanifi_Rohingya_Kinna_Ya +jg ; Hanifi_Rohingya_Pa ; Hanifi_Rohingya_Pa +jg ; He ; He +jg ; Heh ; Heh +jg ; Heh_Goal ; Heh_Goal +jg ; Heth ; Heth +jg ; Kaf ; Kaf +jg ; Kaph ; Kaph +jg ; Khaph ; Khaph +jg ; Knotted_Heh ; Knotted_Heh +jg ; Lam ; Lam +jg ; Lamadh ; Lamadh +jg ; Malayalam_Bha ; Malayalam_Bha +jg ; Malayalam_Ja ; Malayalam_Ja +jg ; Malayalam_Lla ; Malayalam_Lla +jg ; Malayalam_Llla ; Malayalam_Llla +jg ; Malayalam_Nga ; Malayalam_Nga +jg ; Malayalam_Nna ; Malayalam_Nna +jg ; Malayalam_Nnna ; Malayalam_Nnna +jg ; Malayalam_Nya ; Malayalam_Nya +jg ; Malayalam_Ra ; Malayalam_Ra +jg ; Malayalam_Ssa ; Malayalam_Ssa +jg ; Malayalam_Tta ; Malayalam_Tta +jg ; Manichaean_Aleph ; Manichaean_Aleph +jg ; Manichaean_Ayin ; Manichaean_Ayin +jg ; Manichaean_Beth ; Manichaean_Beth +jg ; Manichaean_Daleth ; Manichaean_Daleth +jg ; Manichaean_Dhamedh ; Manichaean_Dhamedh +jg ; Manichaean_Five ; Manichaean_Five +jg ; Manichaean_Gimel ; Manichaean_Gimel +jg ; Manichaean_Heth ; Manichaean_Heth +jg ; Manichaean_Hundred ; Manichaean_Hundred +jg ; Manichaean_Kaph ; Manichaean_Kaph +jg ; Manichaean_Lamedh ; Manichaean_Lamedh +jg ; Manichaean_Mem ; Manichaean_Mem +jg ; Manichaean_Nun ; Manichaean_Nun +jg ; Manichaean_One ; Manichaean_One +jg ; Manichaean_Pe ; Manichaean_Pe +jg ; Manichaean_Qoph ; Manichaean_Qoph +jg ; Manichaean_Resh ; Manichaean_Resh +jg ; Manichaean_Sadhe ; Manichaean_Sadhe +jg ; Manichaean_Samekh ; Manichaean_Samekh +jg ; Manichaean_Taw ; Manichaean_Taw +jg ; Manichaean_Ten ; Manichaean_Ten +jg ; Manichaean_Teth ; Manichaean_Teth +jg ; Manichaean_Thamedh ; Manichaean_Thamedh +jg ; Manichaean_Twenty ; Manichaean_Twenty +jg ; Manichaean_Waw ; Manichaean_Waw +jg ; Manichaean_Yodh ; Manichaean_Yodh +jg ; Manichaean_Zayin ; Manichaean_Zayin +jg ; Meem ; Meem +jg ; Mim ; Mim +jg ; No_Joining_Group ; No_Joining_Group +jg ; Noon ; Noon +jg ; Nun ; Nun +jg ; Nya ; Nya +jg ; Pe ; Pe +jg ; Qaf ; Qaf +jg ; Qaph ; Qaph +jg ; Reh ; Reh +jg ; Reversed_Pe ; Reversed_Pe +jg ; Rohingya_Yeh ; Rohingya_Yeh +jg ; Sad ; Sad +jg ; Sadhe ; Sadhe +jg ; Seen ; Seen +jg ; Semkath ; Semkath +jg ; Shin ; Shin +jg ; Straight_Waw ; Straight_Waw +jg ; Swash_Kaf ; Swash_Kaf +jg ; Syriac_Waw ; Syriac_Waw +jg ; Tah ; Tah +jg ; Taw ; Taw +jg ; Teh_Marbuta ; Teh_Marbuta +jg ; Teh_Marbuta_Goal ; Hamza_On_Heh_Goal +jg ; Teth ; Teth +jg ; Thin_Yeh ; Thin_Yeh +jg ; Vertical_Tail ; Vertical_Tail +jg ; Waw ; Waw +jg ; Yeh ; Yeh +jg ; Yeh_Barree ; Yeh_Barree +jg ; Yeh_With_Tail ; Yeh_With_Tail +jg ; Yudh ; Yudh +jg ; Yudh_He ; Yudh_He +jg ; Zain ; Zain +jg ; Zhain ; Zhain + +# Joining_Type (jt) + +jt ; C ; Join_Causing +jt ; D ; Dual_Joining +jt ; L ; Left_Joining +jt ; R ; Right_Joining +jt ; T ; Transparent +jt ; U ; Non_Joining + +# Line_Break (lb) + +lb ; AI ; Ambiguous +lb ; AL ; Alphabetic +lb ; B2 ; Break_Both +lb ; BA ; Break_After +lb ; BB ; Break_Before +lb ; BK ; Mandatory_Break +lb ; CB ; Contingent_Break +lb ; CJ ; Conditional_Japanese_Starter +lb ; CL ; Close_Punctuation +lb ; CM ; Combining_Mark +lb ; CP ; Close_Parenthesis +lb ; CR ; Carriage_Return +lb ; EB ; E_Base +lb ; EM ; E_Modifier +lb ; EX ; Exclamation +lb ; GL ; Glue +lb ; H2 ; H2 +lb ; H3 ; H3 +lb ; HL ; Hebrew_Letter +lb ; HY ; Hyphen +lb ; ID ; Ideographic +lb ; IN ; Inseparable ; Inseperable +lb ; IS ; Infix_Numeric +lb ; JL ; JL +lb ; JT ; JT +lb ; JV ; JV +lb ; LF ; Line_Feed +lb ; NL ; Next_Line +lb ; NS ; Nonstarter +lb ; NU ; Numeric +lb ; OP ; Open_Punctuation +lb ; PO ; Postfix_Numeric +lb ; PR ; Prefix_Numeric +lb ; QU ; Quotation +lb ; RI ; Regional_Indicator +lb ; SA ; Complex_Context +lb ; SG ; Surrogate +lb ; SP ; Space +lb ; SY ; Break_Symbols +lb ; WJ ; Word_Joiner +lb ; XX ; Unknown +lb ; ZW ; ZWSpace +lb ; ZWJ ; ZWJ + +# Logical_Order_Exception (LOE) + +LOE; N ; No ; F ; False +LOE; Y ; Yes ; T ; True + +# Lowercase (Lower) + +Lower; N ; No ; F ; False +Lower; Y ; Yes ; T ; True + +# Lowercase_Mapping (lc) + +# @missing: 0000..10FFFF; Lowercase_Mapping; + +# Math (Math) + +Math; N ; No ; F ; False +Math; Y ; Yes ; T ; True + +# NFC_Quick_Check (NFC_QC) + +NFC_QC; M ; Maybe +NFC_QC; N ; No +NFC_QC; Y ; Yes + +# NFD_Quick_Check (NFD_QC) + +NFD_QC; N ; No +NFD_QC; Y ; Yes + +# NFKC_Casefold (NFKC_CF) + +# @missing: 0000..10FFFF; NFKC_Casefold; + +# NFKC_Quick_Check (NFKC_QC) + +NFKC_QC; M ; Maybe +NFKC_QC; N ; No +NFKC_QC; Y ; Yes + +# NFKD_Quick_Check (NFKD_QC) + +NFKD_QC; N ; No +NFKD_QC; Y ; Yes + +# Name (na) + +# @missing: 0000..10FFFF; Name; + +# Name_Alias (Name_Alias) + +# @missing: 0000..10FFFF; Name_Alias; + +# Noncharacter_Code_Point (NChar) + +NChar; N ; No ; F ; False +NChar; Y ; Yes ; T ; True + +# Numeric_Type (nt) + +nt ; De ; Decimal +nt ; Di ; Digit +nt ; None ; None +nt ; Nu ; Numeric + +# Numeric_Value (nv) + +# @missing: 0000..10FFFF; Numeric_Value; NaN + +# Other_Alphabetic (OAlpha) + +OAlpha; N ; No ; F ; False +OAlpha; Y ; Yes ; T ; True + +# Other_Default_Ignorable_Code_Point (ODI) + +ODI; N ; No ; F ; False +ODI; Y ; Yes ; T ; True + +# Other_Grapheme_Extend (OGr_Ext) + +OGr_Ext; N ; No ; F ; False +OGr_Ext; Y ; Yes ; T ; True + +# Other_ID_Continue (OIDC) + +OIDC; N ; No ; F ; False +OIDC; Y ; Yes ; T ; True + +# Other_ID_Start (OIDS) + +OIDS; N ; No ; F ; False +OIDS; Y ; Yes ; T ; True + +# Other_Lowercase (OLower) + +OLower; N ; No ; F ; False +OLower; Y ; Yes ; T ; True + +# Other_Math (OMath) + +OMath; N ; No ; F ; False +OMath; Y ; Yes ; T ; True + +# Other_Uppercase (OUpper) + +OUpper; N ; No ; F ; False +OUpper; Y ; Yes ; T ; True + +# Pattern_Syntax (Pat_Syn) + +Pat_Syn; N ; No ; F ; False +Pat_Syn; Y ; Yes ; T ; True + +# Pattern_White_Space (Pat_WS) + +Pat_WS; N ; No ; F ; False +Pat_WS; Y ; Yes ; T ; True + +# Prepended_Concatenation_Mark (PCM) + +PCM; N ; No ; F ; False +PCM; Y ; Yes ; T ; True + +# Quotation_Mark (QMark) + +QMark; N ; No ; F ; False +QMark; Y ; Yes ; T ; True + +# Radical (Radical) + +Radical; N ; No ; F ; False +Radical; Y ; Yes ; T ; True + +# Regional_Indicator (RI) + +RI ; N ; No ; F ; False +RI ; Y ; Yes ; T ; True + +# Script (sc) + +sc ; Adlm ; Adlam +sc ; Aghb ; Caucasian_Albanian +sc ; Ahom ; Ahom +sc ; Arab ; Arabic +sc ; Armi ; Imperial_Aramaic +sc ; Armn ; Armenian +sc ; Avst ; Avestan +sc ; Bali ; Balinese +sc ; Bamu ; Bamum +sc ; Bass ; Bassa_Vah +sc ; Batk ; Batak +sc ; Beng ; Bengali +sc ; Bhks ; Bhaiksuki +sc ; Bopo ; Bopomofo +sc ; Brah ; Brahmi +sc ; Brai ; Braille +sc ; Bugi ; Buginese +sc ; Buhd ; Buhid +sc ; Cakm ; Chakma +sc ; Cans ; Canadian_Aboriginal +sc ; Cari ; Carian +sc ; Cham ; Cham +sc ; Cher ; Cherokee +sc ; Chrs ; Chorasmian +sc ; Copt ; Coptic ; Qaac +sc ; Cpmn ; Cypro_Minoan +sc ; Cprt ; Cypriot +sc ; Cyrl ; Cyrillic +sc ; Deva ; Devanagari +sc ; Diak ; Dives_Akuru +sc ; Dogr ; Dogra +sc ; Dsrt ; Deseret +sc ; Dupl ; Duployan +sc ; Egyp ; Egyptian_Hieroglyphs +sc ; Elba ; Elbasan +sc ; Elym ; Elymaic +sc ; Ethi ; Ethiopic +sc ; Geor ; Georgian +sc ; Glag ; Glagolitic +sc ; Gong ; Gunjala_Gondi +sc ; Gonm ; Masaram_Gondi +sc ; Goth ; Gothic +sc ; Gran ; Grantha +sc ; Grek ; Greek +sc ; Gujr ; Gujarati +sc ; Guru ; Gurmukhi +sc ; Hang ; Hangul +sc ; Hani ; Han +sc ; Hano ; Hanunoo +sc ; Hatr ; Hatran +sc ; Hebr ; Hebrew +sc ; Hira ; Hiragana +sc ; Hluw ; Anatolian_Hieroglyphs +sc ; Hmng ; Pahawh_Hmong +sc ; Hmnp ; Nyiakeng_Puachue_Hmong +sc ; Hrkt ; Katakana_Or_Hiragana +sc ; Hung ; Old_Hungarian +sc ; Ital ; Old_Italic +sc ; Java ; Javanese +sc ; Kali ; Kayah_Li +sc ; Kana ; Katakana +sc ; Khar ; Kharoshthi +sc ; Khmr ; Khmer +sc ; Khoj ; Khojki +sc ; Kits ; Khitan_Small_Script +sc ; Knda ; Kannada +sc ; Kthi ; Kaithi +sc ; Lana ; Tai_Tham +sc ; Laoo ; Lao +sc ; Latn ; Latin +sc ; Lepc ; Lepcha +sc ; Limb ; Limbu +sc ; Lina ; Linear_A +sc ; Linb ; Linear_B +sc ; Lisu ; Lisu +sc ; Lyci ; Lycian +sc ; Lydi ; Lydian +sc ; Mahj ; Mahajani +sc ; Maka ; Makasar +sc ; Mand ; Mandaic +sc ; Mani ; Manichaean +sc ; Marc ; Marchen +sc ; Medf ; Medefaidrin +sc ; Mend ; Mende_Kikakui +sc ; Merc ; Meroitic_Cursive +sc ; Mero ; Meroitic_Hieroglyphs +sc ; Mlym ; Malayalam +sc ; Modi ; Modi +sc ; Mong ; Mongolian +sc ; Mroo ; Mro +sc ; Mtei ; Meetei_Mayek +sc ; Mult ; Multani +sc ; Mymr ; Myanmar +sc ; Nand ; Nandinagari +sc ; Narb ; Old_North_Arabian +sc ; Nbat ; Nabataean +sc ; Newa ; Newa +sc ; Nkoo ; Nko +sc ; Nshu ; Nushu +sc ; Ogam ; Ogham +sc ; Olck ; Ol_Chiki +sc ; Orkh ; Old_Turkic +sc ; Orya ; Oriya +sc ; Osge ; Osage +sc ; Osma ; Osmanya +sc ; Ougr ; Old_Uyghur +sc ; Palm ; Palmyrene +sc ; Pauc ; Pau_Cin_Hau +sc ; Perm ; Old_Permic +sc ; Phag ; Phags_Pa +sc ; Phli ; Inscriptional_Pahlavi +sc ; Phlp ; Psalter_Pahlavi +sc ; Phnx ; Phoenician +sc ; Plrd ; Miao +sc ; Prti ; Inscriptional_Parthian +sc ; Rjng ; Rejang +sc ; Rohg ; Hanifi_Rohingya +sc ; Runr ; Runic +sc ; Samr ; Samaritan +sc ; Sarb ; Old_South_Arabian +sc ; Saur ; Saurashtra +sc ; Sgnw ; SignWriting +sc ; Shaw ; Shavian +sc ; Shrd ; Sharada +sc ; Sidd ; Siddham +sc ; Sind ; Khudawadi +sc ; Sinh ; Sinhala +sc ; Sogd ; Sogdian +sc ; Sogo ; Old_Sogdian +sc ; Sora ; Sora_Sompeng +sc ; Soyo ; Soyombo +sc ; Sund ; Sundanese +sc ; Sylo ; Syloti_Nagri +sc ; Syrc ; Syriac +sc ; Tagb ; Tagbanwa +sc ; Takr ; Takri +sc ; Tale ; Tai_Le +sc ; Talu ; New_Tai_Lue +sc ; Taml ; Tamil +sc ; Tang ; Tangut +sc ; Tavt ; Tai_Viet +sc ; Telu ; Telugu +sc ; Tfng ; Tifinagh +sc ; Tglg ; Tagalog +sc ; Thaa ; Thaana +sc ; Thai ; Thai +sc ; Tibt ; Tibetan +sc ; Tirh ; Tirhuta +sc ; Tnsa ; Tangsa +sc ; Toto ; Toto +sc ; Ugar ; Ugaritic +sc ; Vaii ; Vai +sc ; Vith ; Vithkuqi +sc ; Wara ; Warang_Citi +sc ; Wcho ; Wancho +sc ; Xpeo ; Old_Persian +sc ; Xsux ; Cuneiform +sc ; Yezi ; Yezidi +sc ; Yiii ; Yi +sc ; Zanb ; Zanabazar_Square +sc ; Zinh ; Inherited ; Qaai +sc ; Zyyy ; Common +sc ; Zzzz ; Unknown + +# Script_Extensions (scx) + +# @missing: 0000..10FFFF; Script_Extensions;