Unicode property data records extended to 12 bytes to include a
ScriptExtensions property.
This commit is contained in:
parent
cda4780fb6
commit
04ba4bce0f
|
@ -61,26 +61,39 @@
|
|||
# property, which is used by PCRE2 as a grapheme breaking property. This was
|
||||
# done when updating to Unicode 11.0.0 (July 2018).
|
||||
#
|
||||
# Added code to add a Script Extensions field to records.
|
||||
#
|
||||
#
|
||||
# The main tables generated by this script are used by macros defined in
|
||||
# pcre2_internal.h. They look up Unicode character properties using short
|
||||
# sequences of code that contain no branches, which makes for greater speed.
|
||||
#
|
||||
# Conceptually, there is a table of records (of type ucd_record), containing a
|
||||
# script number, character type, grapheme break type, offset to caseless
|
||||
# matching set, and offset to the character's other case for every character.
|
||||
# However, a real table covering all Unicode characters would be far too big.
|
||||
# It can be efficiently compressed by observing that many characters have the
|
||||
# same record, and many blocks of characters (taking 128 characters in a block)
|
||||
# have the same set of records as other blocks. This leads to a 2-stage lookup
|
||||
# process.
|
||||
# script number, script extension value, character type, grapheme break type,
|
||||
# offset to caseless matching set, offset to the character's other case, for
|
||||
# every character. However, a real table covering all Unicode characters would
|
||||
# be far too big. It can be efficiently compressed by observing that many
|
||||
# characters have the same record, and many blocks of characters (taking 128
|
||||
# characters in a block) have the same set of records as other blocks. This
|
||||
# leads to a 2-stage lookup process.
|
||||
#
|
||||
# This script constructs four tables. The ucd_caseless_sets table contains
|
||||
# This script constructs six tables. The ucd_caseless_sets table contains
|
||||
# lists of characters that all match each other caselessly. Each list is
|
||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||
# any valid character. The first list is empty; this is used for characters
|
||||
# that are not part of any list.
|
||||
#
|
||||
# The ucd_digit_sets table contains the code points of the '9' characters in
|
||||
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
|
||||
# in script runs all come from the same set. The first element in the vector
|
||||
# contains the number of subsequent elements, which are in ascending order.
|
||||
#
|
||||
# The ucd_script_sets vector contains lists of script numbers that are the
|
||||
# Script Extensions properties of certain characters. Each list is terminated
|
||||
# by zero (ucp_Unknown). A character with more than one script listed for its
|
||||
# Script Extensions property has a negative value in its record. This is the
|
||||
# negated offset to the start of the relevant list.
|
||||
#
|
||||
# The ucd_records table contains one instance of every unique record that is
|
||||
# required. The ucd_stage1 table is indexed by a character's block number, and
|
||||
# yields what is in effect a "virtual" block number. The ucd_stage2 table is a
|
||||
|
@ -117,11 +130,8 @@
|
|||
# In these examples, no other blocks resolve to the same "virtual" block, as it
|
||||
# happens, but plenty of other blocks do share "virtual" blocks.
|
||||
#
|
||||
# There is a fourth table, maintained by hand, which translates from the
|
||||
# individual character types such as ucp_Cc to the general types like ucp_C.
|
||||
#
|
||||
# Philip Hazel, 03 July 2008
|
||||
# Last Updated: 07 July 2018
|
||||
# Last Updated: 03 October 2018
|
||||
#
|
||||
#
|
||||
# 01-March-2010: Updated list of scripts for Unicode 5.2.0
|
||||
|
@ -144,6 +154,7 @@
|
|||
# 07-July-2018: Added code to scan emoji-data.txt for the Extended
|
||||
# Pictographic property.
|
||||
# 01-October-2018: Added the 'Unknown' script name
|
||||
# 03-October-2018: Added new field for Script Extensions
|
||||
##############################################################################
|
||||
|
||||
|
||||
|
@ -165,6 +176,32 @@ def get_other_case(chardata):
|
|||
return int(chardata[2], 16) - int(chardata[0], 16)
|
||||
return 0
|
||||
|
||||
# Parse a line of ScriptExtensions.txt
|
||||
def get_script_extension(chardata):
    """Convert one parsed line of ScriptExtensions.txt into a record value.

    chardata[1] is read as the white-space-separated list of script
    abbreviations given for the code point(s) on that line, for example
    'Arab Copt'.

    Returns:
      * for a single abbreviation, its index in script_abbrevs (zero or
        positive);
      * for several abbreviations, the negated offset in script_lists at
        which a zero-terminated run of their script numbers starts.

    Side effect: when no matching run already exists in script_lists, the
    new zero-terminated run is appended to that module-level list.

    Raises ValueError if the field is empty or contains an abbreviation
    that is not present in script_abbrevs.
    """
    # split() without a separator ignores leading, trailing and repeated
    # white space, so a stray extra blank cannot produce an empty name.
    names = chardata[1].split()
    if not names:
        raise ValueError('Empty Script Extensions value')

    try:
        script_numbers = [script_abbrevs.index(name) for name in names]
    except ValueError:
        raise ValueError('Unknown script abbreviation in Script Extensions '
                         'value %r' % chardata[1]) from None

    # A single script is stored directly as its (non-negative) number.
    if len(script_numbers) == 1:
        return script_numbers[0]

    # Multiple scripts are stored as a zero-terminated run in script_lists.
    script_numbers.append(0)
    run_length = len(script_numbers)

    # Look for an identical run that is already present. Offset 0 is never
    # used (a negated 0 could not be told apart from script number 0), so
    # the search starts at 1. Because every run ends with 0, a match on the
    # tail of a longer run is itself a valid zero-terminated list and can be
    # shared.
    for offset in range(1, len(script_lists) - run_length + 1):
        if script_lists[offset:offset + run_length] == script_numbers:
            return -offset

    # Not found in existing lists: append the run and return where it starts.
    new_offset = len(script_lists)
    script_lists.extend(script_numbers)
    return -new_offset
|
||||
|
||||
# Read the whole table in memory, setting/checking the Unicode version
|
||||
def read_table(file_name, get_value, default_value):
|
||||
|
@ -330,24 +367,24 @@ def print_records(records, record_size):
|
|||
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
|
||||
print('};\n')
|
||||
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
||||
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
|
||||
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
|
||||
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
|
||||
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
|
||||
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
|
||||
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
|
||||
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
|
||||
# New for Unicode 5.0
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
||||
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
|
||||
# New for Unicode 5.1
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
|
||||
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
|
||||
# New for Unicode 5.2
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
|
||||
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
|
||||
'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
|
||||
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
|
||||
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
|
||||
# New for Unicode 6.0.0
|
||||
'Batak', 'Brahmi', 'Mandaic', \
|
||||
'Batak', 'Brahmi', 'Mandaic',
|
||||
# New for Unicode 6.1.0
|
||||
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
|
||||
# New for Unicode 7.0.0
|
||||
|
@ -366,6 +403,39 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille
|
|||
'Old_Sogdian', 'Sogdian'
|
||||
]
|
||||
|
||||
# Abbreviated script names, as they appear in ScriptExtensions.txt. The
# position of an abbreviation in this list is used as its script number, so
# new entries must only ever be added at the end, in step with script_names.
script_abbrevs = [
    'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd',
    'Cans', 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva',
    'Ethi', 'Geor', 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani',
    'Hang', 'Hano', 'Hebr', 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar',
    'Khmr', 'Laoo', 'Latn', 'Limb', 'Linb', 'Mlym', 'Mong', 'Mymr',
    'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', 'Osma', 'Runr', 'Shaw',
    'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', 'Taml', 'Telu',
    'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
    # New for Unicode 5.0
    'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
    # New for Unicode 5.1
    'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng',
    'Saur', 'Sund', 'Vaii',
    # New for Unicode 5.2
    'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi',
    'Lisu', 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
    # New for Unicode 6.0.0
    'Batk', 'Brah', 'Mand',
    # New for Unicode 6.1.0
    'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
    # New for Unicode 7.0.0
    'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina',
    'Mahj', 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm',
    'Hmng', 'Palm', 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
    # New for Unicode 8.0.0
    'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
    # New for Unicode 10.0.0
    'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu',
    'Soyo', 'Zanb',
    # New for Unicode 11.0.0
    'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
]
|
||||
|
||||
# Two-letter character type names, one major class (C, L, M, N, P, S, Z)
# per line. The order of the entries is significant and must not change.
category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
]
|
||||
|
@ -415,6 +485,28 @@ for line in file:
|
|||
break_props[i] = break_property_names.index('Extended_Pictographic')
|
||||
file.close()
|
||||
|
||||
# The Script Extensions property default value is the Script value. Parse the
|
||||
# file, setting 'Unknown' as the default (this will never be a Script Extension
|
||||
# value), then scan it and fill in the default from Scripts. Code added by PH
|
||||
# in October 2018. Positive values are used for just a single script for a
|
||||
# code point. Negative values are negated offsets in a list of lists of
|
||||
# multiple scripts. Initialize this list with a single entry, as the zeroth
|
||||
# element is never used.
|
||||
|
||||
script_lists = [0]
|
||||
script_abbrevs_default = script_abbrevs.index('Zzzz')
|
||||
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
|
||||
|
||||
for i in range(0, MAX_UNICODE):
|
||||
if scriptx[i] == script_abbrevs_default:
|
||||
scriptx[i] = script[i]
|
||||
|
||||
# With the addition of the new Script Extensions field, we need some padding
|
||||
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
|
||||
# greater than 255 to make the field 16 bits.
|
||||
|
||||
padding_dummy = [0] * MAX_UNICODE
|
||||
padding_dummy[0] = 256
|
||||
|
||||
# This block of code was added by PH in September 2012. I am not a Python
|
||||
# programmer, so the style is probably dreadful, but it does the job. It scans
|
||||
|
@ -427,7 +519,7 @@ file.close()
|
|||
# sets only one value, so first we go through the table and set "return"
|
||||
# offsets for those that are not already set.
|
||||
|
||||
for c in range(0x10ffff):
|
||||
for c in range(MAX_UNICODE):
|
||||
if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
|
||||
other_case[c + other_case[c]] = -other_case[c]
|
||||
|
||||
|
@ -435,7 +527,7 @@ for c in range(0x10ffff):
|
|||
|
||||
sets = []
|
||||
|
||||
for c in range(0x10ffff):
|
||||
for c in range(MAX_UNICODE):
|
||||
o = c + other_case[c]
|
||||
|
||||
# Trigger when this character's other case does not point back here. We
|
||||
|
@ -489,7 +581,7 @@ for s in sets:
|
|||
# Combine the tables
|
||||
|
||||
table, records = combine_tables(script, category, break_props,
|
||||
caseless_offsets, other_case)
|
||||
caseless_offsets, other_case, scriptx, padding_dummy)
|
||||
|
||||
record_size, record_struct = get_record_size_struct(list(records.keys()))
|
||||
|
||||
|
@ -537,7 +629,7 @@ print("a comment was received about space saving - maybe the guy linked")
|
|||
print("all the modules rather than using a library - so we include a")
|
||||
print("condition to cut out the tables when not needed. But don't leave")
|
||||
print("a totally empty module because some compilers barf at that.")
|
||||
print("Instead, just supply small dummy tables. */")
|
||||
print("Instead, just supply some small dummy tables. */")
|
||||
print()
|
||||
print("#ifndef SUPPORT_UNICODE")
|
||||
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
|
||||
|
@ -559,6 +651,8 @@ print(" ucp_Cn, /* type unassigned */")
|
|||
print(" ucp_gbOther, /* grapheme break property */")
|
||||
print(" 0, /* case set */")
|
||||
print(" 0, /* other case */")
|
||||
print(" ucp_Unknown, /* script extension */")
|
||||
print(" 0, /* dummy filler */")
|
||||
print(" }};")
|
||||
print("#endif")
|
||||
print()
|
||||
|
@ -609,8 +703,7 @@ digitsets.sort()
|
|||
|
||||
print("/* This table lists the code points for the '9' characters in each")
|
||||
print("set of decimal digits. It is used to ensure that all the digits in")
|
||||
print("a script run come from the same set. */")
|
||||
print()
|
||||
print("a script run come from the same set. */\n")
|
||||
print("const uint32_t PRIV(ucd_digit_sets)[] = {")
|
||||
|
||||
print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
|
||||
|
@ -621,12 +714,28 @@ for d in digitsets:
|
|||
count = 0
|
||||
print(" 0x%05x," % d, end='')
|
||||
count += 1
|
||||
print("\n};")
|
||||
print()
|
||||
print("\n};\n")
|
||||
|
||||
print("/* This vector is a list of lists of scripts for the Script Extension")
|
||||
print("property. Each sublist is zero-terminated. */\n")
|
||||
print("const uint8_t PRIV(ucd_script_sets)[] = {")
|
||||
|
||||
count = 0
|
||||
print(" /* 0 */", end='')
|
||||
for d in script_lists:
|
||||
print(" %3d," % d, end='')
|
||||
count += 1
|
||||
if d == 0:
|
||||
print("\n /* %3d */" % count, end='')
|
||||
print("\n};\n")
|
||||
|
||||
# Output the main UCD tables.
|
||||
|
||||
print("/* These are the main two-stage UCD tables. */\n")
|
||||
print("/* These are the main two-stage UCD tables. The fields in each record are:")
|
||||
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
|
||||
print("offset to multichar other cases or zero (8 bits), offset to other case")
|
||||
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
|
||||
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
|
||||
|
||||
print_records(records, record_size)
|
||||
print_table(min_stage1, 'PRIV(ucd_stage1)')
|
||||
|
|
|
@ -0,0 +1,531 @@
|
|||
# ScriptExtensions-11.0.0.txt
|
||||
# Date: 2018-02-04, 20:04:00 GMT
|
||||
# © 2018 Unicode®, Inc.
|
||||
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
#
|
||||
# Unicode Character Database
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# The Script_Extensions property indicates which characters are commonly used
|
||||
# with more than one script, but with a limited number of scripts.
|
||||
# For each code point, there is one or more property values. Each such value is a Script property value.
|
||||
# For more information, see:
|
||||
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
|
||||
# Especially the sections:
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_Script_Values
|
||||
# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
# Each Script_Extensions value in this file consists of a set
|
||||
# of one or more abbreviated Script property values. The ordering of the
|
||||
# values in that set is not material, but for stability in presentation
|
||||
# it is given here as alphabetical.
|
||||
#
|
||||
# The Script_Extensions values are presented in sorted order in the file.
|
||||
# They are sorted first by the number of Script property values in their sets,
|
||||
# and then alphabetically by first differing Script property value.
|
||||
#
|
||||
# Following each distinct Script_Extensions value is the list of code
|
||||
# points associated with that value, listed in code point order.
|
||||
#
|
||||
# All code points not explicitly listed for Script_Extensions
|
||||
# have as their value the corresponding Script property value
|
||||
#
|
||||
# @missing: 0000..10FFFF; <script>
|
||||
|
||||
# ================================================
|
||||
|
||||
# Property: Script_Extensions
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng
|
||||
|
||||
1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva
|
||||
|
||||
1CD1 ; Deva # Mn VEDIC TONE SHARA
|
||||
1CD4 ; Deva # Mn VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
|
||||
1CDB ; Deva # Mn VEDIC TONE TRIPLE SVARITA
|
||||
1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
|
||||
1CE2..1CE8 ; Deva # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
|
||||
1CE9 ; Deva # Lo VEDIC SIGN ANUSVARA ANTARGOMUKHA
|
||||
1CEB..1CEC ; Deva # Lo [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
|
||||
1CEE..1CF1 ; Deva # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
|
||||
|
||||
# Total code points: 19
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Dupl
|
||||
|
||||
1BCA0..1BCA3 ; Dupl # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Grek
|
||||
|
||||
0342 ; Grek # Mn COMBINING GREEK PERISPOMENI
|
||||
0345 ; Grek # Mn COMBINING GREEK YPOGEGRAMMENI
|
||||
1DC0..1DC1 ; Grek # Mn [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hani
|
||||
|
||||
3006 ; Hani # Lo IDEOGRAPHIC CLOSING MARK
|
||||
303E..303F ; Hani # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
|
||||
3190..3191 ; Hani # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
|
||||
3192..3195 ; Hani # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
|
||||
3196..319F ; Hani # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
|
||||
31C0..31E3 ; Hani # So [36] CJK STROKE T..CJK STROKE Q
|
||||
3220..3229 ; Hani # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
|
||||
322A..3247 ; Hani # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
|
||||
3280..3289 ; Hani # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
|
||||
328A..32B0 ; Hani # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
|
||||
32C0..32CB ; Hani # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
|
||||
3358..3370 ; Hani # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
|
||||
337B..337F ; Hani # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
|
||||
33E0..33FE ; Hani # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
|
||||
1D360..1D371 ; Hani # No [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
|
||||
1F250..1F251 ; Hani # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
|
||||
|
||||
# Total code points: 237
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Latn
|
||||
|
||||
0363..036F ; Latn # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
|
||||
|
||||
# Total code points: 13
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Copt
|
||||
|
||||
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
|
||||
102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
|
||||
|
||||
# Total code points: 28
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Rohg
|
||||
|
||||
06D4 ; Arab Rohg # Po ARABIC FULL STOP
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc
|
||||
|
||||
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
|
||||
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
|
||||
|
||||
# Total code points: 12
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Thaa
|
||||
|
||||
0660..0669 ; Arab Thaa # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
|
||||
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
|
||||
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
|
||||
|
||||
# Total code points: 12
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Armn Geor
|
||||
|
||||
0589 ; Armn Geor # Po ARMENIAN FULL STOP
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva
|
||||
|
||||
1CD5..1CD6 ; Beng Deva # Mn [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
|
||||
1CD8 ; Beng Deva # Mn VEDIC TONE CANDRA BELOW
|
||||
1CE1 ; Beng Deva # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
|
||||
1CEA ; Beng Deva # Lo VEDIC SIGN ANUSVARA BAHIRGOMUKHA
|
||||
1CED ; Beng Deva # Mn VEDIC SIGN TIRYAK
|
||||
1CF5..1CF6 ; Beng Deva # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
|
||||
A8F1 ; Beng Deva # Mn COMBINING DEVANAGARI SIGN AVAGRAHA
|
||||
|
||||
# Total code points: 9
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hani
|
||||
|
||||
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bugi Java
|
||||
|
||||
A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Linb
|
||||
|
||||
10100..10102 ; Cprt Linb # Po [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
|
||||
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
|
||||
|
||||
# Total code points: 12
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Glag
|
||||
|
||||
0484 ; Cyrl Glag # Mn COMBINING CYRILLIC PALATALIZATION
|
||||
0487 ; Cyrl Glag # Mn COMBINING CYRILLIC POKRYTIE
|
||||
2E43 ; Cyrl Glag # Po DASH WITH LEFT UPTURN
|
||||
A66F ; Cyrl Glag # Mn COMBINING CYRILLIC VZMET
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Latn
|
||||
|
||||
0485..0486 ; Cyrl Latn # Mn [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cyrl Perm
|
||||
|
||||
0483 ; Cyrl Perm # Mn COMBINING CYRILLIC TITLO
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran
|
||||
|
||||
1CD3 ; Deva Gran # Po VEDIC SIGN NIHSHVASA
|
||||
1CF2..1CF3 ; Deva Gran # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
|
||||
1CF8..1CF9 ; Deva Gran # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
|
||||
|
||||
# Total code points: 5
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Shrd
|
||||
|
||||
1CD7 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
|
||||
1CD9 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
|
||||
1CDC..1CDD ; Deva Shrd # Mn [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
|
||||
1CE0 ; Deva Shrd # Mn VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
|
||||
|
||||
# Total code points: 5
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Taml
|
||||
|
||||
A8F3 ; Deva Taml # Lo DEVANAGARI SIGN CANDRABINDU VIRAMA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Geor Latn
|
||||
|
||||
10FB ; Geor Latn # Po GEORGIAN PARAGRAPH SEPARATOR
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Gran Taml
|
||||
|
||||
0BE6..0BEF ; Gran Taml # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
|
||||
0BF0..0BF2 ; Gran Taml # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
|
||||
0BF3 ; Gran Taml # So TAMIL DAY SIGN
|
||||
11301 ; Gran Taml # Mn GRANTHA SIGN CANDRABINDU
|
||||
11303 ; Gran Taml # Mc GRANTHA SIGN VISARGA
|
||||
1133B..1133C ; Gran Taml # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
|
||||
|
||||
# Total code points: 18
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Gujr Khoj
|
||||
|
||||
0AE6..0AEF ; Gujr Khoj # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Guru Mult
|
||||
|
||||
0A66..0A6F ; Guru Mult # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hira Kana
|
||||
|
||||
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
|
||||
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
|
||||
# Total code points: 14
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Mong Phag
|
||||
|
||||
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
|
||||
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Syrc Thaa
|
||||
|
||||
061C ; Arab Syrc Thaa # Cf ARABIC LETTER MARK
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Cakm Sylo
|
||||
|
||||
09E6..09EF ; Beng Cakm Sylo # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cakm Mymr Tale
|
||||
|
||||
1040..1049 ; Cakm Mymr Tale # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Cprt Lina Linb
|
||||
|
||||
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
|
||||
|
||||
# Total code points: 45
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran Knda
|
||||
|
||||
1CF4 ; Deva Gran Knda # Mn VEDIC TONE CANDRA ABOVE
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Gran Latn
|
||||
|
||||
20F0 ; Deva Gran Latn # Mn COMBINING ASTERISK ABOVE
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Hani Hira Kana
|
||||
|
||||
303C ; Hani Hira Kana # Lo MASU MARK
|
||||
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Kali Latn Mymr
|
||||
|
||||
A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Arab Rohg Syrc Thaa
|
||||
|
||||
060C ; Arab Rohg Syrc Thaa # Po ARABIC COMMA
|
||||
061B ; Arab Rohg Syrc Thaa # Po ARABIC SEMICOLON
|
||||
061F ; Arab Rohg Syrc Thaa # Po ARABIC QUESTION MARK
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Knda
|
||||
|
||||
1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA
|
||||
1CD2 ; Beng Deva Gran Knda # Mn VEDIC TONE PRENKHA
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Buhd Hano Tagb Tglg
|
||||
|
||||
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
|
||||
|
||||
# Total code points: 2
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Kthi Mahj
|
||||
|
||||
0966..096F ; Deva Dogr Kthi Mahj # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana
|
||||
|
||||
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
|
||||
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
|
||||
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
|
||||
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
|
||||
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
|
||||
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
|
||||
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
|
||||
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
|
||||
|
||||
# Total code points: 10
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
|
||||
|
||||
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
|
||||
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
|
||||
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
|
||||
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
|
||||
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
|
||||
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
|
||||
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
|
||||
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
|
||||
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
|
||||
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
|
||||
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
|
||||
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
|
||||
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
|
||||
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
|
||||
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
|
||||
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
|
||||
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
|
||||
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
|
||||
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
|
||||
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
|
||||
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
|
||||
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
|
||||
|
||||
# Total code points: 26
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Knda Mlym Orya Taml Telu
|
||||
|
||||
1CDA ; Deva Knda Mlym Orya Taml Telu # Mn VEDIC TONE DOUBLE SVARITA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
|
||||
|
||||
0640 ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
|
||||
|
||||
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
|
||||
A838 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc NORTH INDIC RUPEE MARK
|
||||
A839 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So NORTH INDIC QUANTITY MARK
|
||||
|
||||
# Total code points: 4
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
|
||||
|
||||
0952 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN ANUDATTA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
|
||||
|
||||
A833..A835 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
|
||||
|
||||
0951 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN UDATTA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
|
||||
|
||||
A830..A832 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
|
||||
|
||||
# Total code points: 3
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
|
||||
0964 ; Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DANDA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# ================================================
|
||||
|
||||
# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
|
||||
0965 ; Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DOUBLE DANDA
|
||||
|
||||
# Total code points: 1
|
||||
|
||||
# EOF
|
390
maint/ucptest.c
390
maint/ucptest.c
|
@ -9,11 +9,12 @@
|
|||
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
|
||||
*/
|
||||
|
||||
/* The program expects to read commands on stdin, and it writes output
|
||||
to stdout. There is only one command, "findprop", followed by a list of Unicode
|
||||
code points as hex numbers (without any prefixes). The output is one line per
|
||||
character, giving its Unicode properties followed by its other case if there is
|
||||
one. */
|
||||
/* If there are arguments, they are a list of hexadecimal code points whose
|
||||
properties are to be output. Otherwise, the program expects to read commands on
|
||||
stdin, and it writes output to stdout. There is only one command, "findprop",
|
||||
followed by a list of Unicode code points as hex numbers (without any
|
||||
prefixes). The output is one line per character, giving its Unicode properties
|
||||
followed by its other case if there is one. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../src/config.h"
|
||||
|
@ -46,6 +47,183 @@ one. */
|
|||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find a script name *
|
||||
*************************************************/
|
||||
|
||||
static unsigned char *
|
||||
find_script_name(int script)
|
||||
{
|
||||
switch(script)
|
||||
{
|
||||
default: return US"??";
|
||||
case ucp_Unknown: return US"Unknown";
|
||||
case ucp_Arabic: return US"Arabic";
|
||||
case ucp_Armenian: return US"Armenian";
|
||||
case ucp_Balinese: return US"Balinese";
|
||||
case ucp_Bengali: return US"Bengali";
|
||||
case ucp_Bopomofo: return US"Bopomofo";
|
||||
case ucp_Braille: return US"Braille";
|
||||
case ucp_Buginese: return US"Buginese";
|
||||
case ucp_Buhid: return US"Buhid";
|
||||
case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
|
||||
case ucp_Cherokee: return US"Cherokee";
|
||||
case ucp_Common: return US"Common";
|
||||
case ucp_Coptic: return US"Coptic";
|
||||
case ucp_Cuneiform: return US"Cuneiform";
|
||||
case ucp_Cypriot: return US"Cypriot";
|
||||
case ucp_Cyrillic: return US"Cyrillic";
|
||||
case ucp_Deseret: return US"Deseret";
|
||||
case ucp_Devanagari: return US"Devanagari";
|
||||
case ucp_Ethiopic: return US"Ethiopic";
|
||||
case ucp_Georgian: return US"Georgian";
|
||||
case ucp_Glagolitic: return US"Glagolitic";
|
||||
case ucp_Gothic: return US"Gothic";
|
||||
case ucp_Greek: return US"Greek";
|
||||
case ucp_Gujarati: return US"Gujarati";
|
||||
case ucp_Gurmukhi: return US"Gurmukhi";
|
||||
case ucp_Han: return US"Han";
|
||||
case ucp_Hangul: return US"Hangul";
|
||||
case ucp_Hanunoo: return US"Hanunoo";
|
||||
case ucp_Hebrew: return US"Hebrew";
|
||||
case ucp_Hiragana: return US"Hiragana";
|
||||
case ucp_Inherited: return US"Inherited";
|
||||
case ucp_Kannada: return US"Kannada";
|
||||
case ucp_Katakana: return US"Katakana";
|
||||
case ucp_Kharoshthi: return US"Kharoshthi";
|
||||
case ucp_Khmer: return US"Khmer";
|
||||
case ucp_Lao: return US"Lao";
|
||||
case ucp_Latin: return US"Latin";
|
||||
case ucp_Limbu: return US"Limbu";
|
||||
case ucp_Linear_B: return US"Linear_B";
|
||||
case ucp_Malayalam: return US"Malayalam";
|
||||
case ucp_Mongolian: return US"Mongolian";
|
||||
case ucp_Myanmar: return US"Myanmar";
|
||||
case ucp_New_Tai_Lue: return US"New_Tai_Lue";
|
||||
case ucp_Nko: return US"Nko";
|
||||
case ucp_Ogham: return US"Ogham";
|
||||
case ucp_Old_Italic: return US"Old_Italic";
|
||||
case ucp_Old_Persian: return US"Old_Persian";
|
||||
case ucp_Oriya: return US"Oriya";
|
||||
case ucp_Osmanya: return US"Osmanya";
|
||||
case ucp_Phags_Pa: return US"Phags_Pa";
|
||||
case ucp_Phoenician: return US"Phoenician";
|
||||
case ucp_Runic: return US"Runic";
|
||||
case ucp_Shavian: return US"Shavian";
|
||||
case ucp_Sinhala: return US"Sinhala";
|
||||
case ucp_Syloti_Nagri: return US"Syloti_Nagri";
|
||||
case ucp_Syriac: return US"Syriac";
|
||||
case ucp_Tagalog: return US"Tagalog";
|
||||
case ucp_Tagbanwa: return US"Tagbanwa";
|
||||
case ucp_Tai_Le: return US"Tai_Le";
|
||||
case ucp_Tamil: return US"Tamil";
|
||||
case ucp_Telugu: return US"Telugu";
|
||||
case ucp_Thaana: return US"Thaana";
|
||||
case ucp_Thai: return US"Thai";
|
||||
case ucp_Tibetan: return US"Tibetan";
|
||||
case ucp_Tifinagh: return US"Tifinagh";
|
||||
case ucp_Ugaritic: return US"Ugaritic";
|
||||
case ucp_Yi: return US"Yi";
|
||||
/* New for Unicode 5.1: */
|
||||
case ucp_Carian: return US"Carian";
|
||||
case ucp_Cham: return US"Cham";
|
||||
case ucp_Kayah_Li: return US"Kayah_Li";
|
||||
case ucp_Lepcha: return US"Lepcha";
|
||||
case ucp_Lycian: return US"Lycian";
|
||||
case ucp_Lydian: return US"Lydian";
|
||||
case ucp_Ol_Chiki: return US"Ol_Chiki";
|
||||
case ucp_Rejang: return US"Rejang";
|
||||
case ucp_Saurashtra: return US"Saurashtra";
|
||||
case ucp_Sundanese: return US"Sundanese";
|
||||
case ucp_Vai: return US"Vai";
|
||||
/* New for Unicode 5.2: */
|
||||
case ucp_Avestan: return US"Avestan";
|
||||
case ucp_Bamum: return US"Bamum";
|
||||
case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
|
||||
case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
|
||||
case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
|
||||
case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
|
||||
case ucp_Javanese: return US"Javanese";
|
||||
case ucp_Kaithi: return US"Kaithi";
|
||||
case ucp_Lisu: return US"Lisu";
|
||||
case ucp_Meetei_Mayek: return US"Meetei_Mayek";
|
||||
case ucp_Old_South_Arabian: return US"Old_South_Arabian";
|
||||
case ucp_Old_Turkic: return US"Old_Turkic";
|
||||
case ucp_Samaritan: return US"Samaritan";
|
||||
case ucp_Tai_Tham: return US"Tai_Tham";
|
||||
case ucp_Tai_Viet: return US"Tai_Viet";
|
||||
/* New for Unicode 6.0.0 */
|
||||
case ucp_Batak: return US"Batak";
|
||||
case ucp_Brahmi: return US"Brahmi";
|
||||
case ucp_Mandaic: return US"Mandaic";
|
||||
|
||||
/* New for Unicode 6.1.0 */
|
||||
case ucp_Chakma: return US"Chakma";
|
||||
case ucp_Meroitic_Cursive: return US"Meroitic_Cursive";
|
||||
case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
|
||||
case ucp_Miao: return US"Miao";
|
||||
case ucp_Sharada: return US"Sharada";
|
||||
case ucp_Sora_Sompeng: return US"Sora Sompent";
|
||||
case ucp_Takri: return US"Takri";
|
||||
|
||||
/* New for Unicode 7.0.0 */
|
||||
case ucp_Bassa_Vah: return US"Bassa_Vah";
|
||||
case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
|
||||
case ucp_Duployan: return US"Duployan";
|
||||
case ucp_Elbasan: return US"Elbasan";
|
||||
case ucp_Grantha: return US"Grantha";
|
||||
case ucp_Khojki: return US"Khojki";
|
||||
case ucp_Khudawadi: return US"Khudawadi";
|
||||
case ucp_Linear_A: return US"Linear_A";
|
||||
case ucp_Mahajani: return US"Mahajani";
|
||||
case ucp_Manichaean: return US"Manichaean";
|
||||
case ucp_Mende_Kikakui: return US"Mende_Kikakui";
|
||||
case ucp_Modi: return US"Modi";
|
||||
case ucp_Mro: return US"Mro";
|
||||
case ucp_Nabataean: return US"Nabataean";
|
||||
case ucp_Old_North_Arabian: return US"Old_North_Arabian";
|
||||
case ucp_Old_Permic: return US"Old_Permic";
|
||||
case ucp_Pahawh_Hmong: return US"Pahawh_Hmong";
|
||||
case ucp_Palmyrene: return US"Palmyrene";
|
||||
case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi";
|
||||
case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau";
|
||||
case ucp_Siddham: return US"Siddham";
|
||||
case ucp_Tirhuta: return US"Tirhuta";
|
||||
case ucp_Warang_Citi: return US"Warang_Citi";
|
||||
|
||||
/* New for Unicode 8.0.0 */
|
||||
case ucp_Ahom: return US"Ahom";
|
||||
case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
|
||||
case ucp_Hatran: return US"Hatran";
|
||||
case ucp_Multani: return US"Multani";
|
||||
case ucp_Old_Hungarian: return US"Old_Hungarian";
|
||||
case ucp_SignWriting: return US"SignWriting";
|
||||
|
||||
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
||||
case ucp_Adlam: return US"Adlam";
|
||||
case ucp_Bhaiksuki: return US"Bhaiksuki";
|
||||
case ucp_Marchen: return US"Marchen";
|
||||
case ucp_Newa: return US"Newa";
|
||||
case ucp_Osage: return US"Osage";
|
||||
case ucp_Tangut: return US"Tangut";
|
||||
case ucp_Masaram_Gondi: return US"Masaram_Gondi";
|
||||
case ucp_Nushu: return US"Nushu";
|
||||
case ucp_Soyombo: return US"Soyombo";
|
||||
case ucp_Zanabazar_Square: return US"Zanabazar_Square";
|
||||
|
||||
/* New for Unicode 11.0.0 */
|
||||
case ucp_Dogra: return US"Dogra";
|
||||
case ucp_Gunjala_Gondi: return US"Gunjala_Gondi";
|
||||
case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya";
|
||||
case ucp_Makasar: return US"Makasar";
|
||||
case ucp_Medefaidrin: return US"Medefaidrin";
|
||||
case ucp_Old_Sogdian: return US"Old_Sogdian";
|
||||
case ucp_Sogdian: return US"Sogdian";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Print Unicode property info for a char *
|
||||
*************************************************/
|
||||
|
@ -56,15 +234,17 @@ print_prop(int c)
|
|||
int type = UCD_CATEGORY(c);
|
||||
int fulltype = UCD_CHARTYPE(c);
|
||||
int script = UCD_SCRIPT(c);
|
||||
int scriptx = UCD_SCRIPTX(c);
|
||||
int gbprop = UCD_GRAPHBREAK(c);
|
||||
int othercase = UCD_OTHERCASE(c);
|
||||
int caseset = UCD_CASESET(c);
|
||||
|
||||
unsigned char *fulltypename = US"??";
|
||||
unsigned char *typename = US"??";
|
||||
unsigned char *scriptname = US"??";
|
||||
unsigned char *graphbreak = US"??";
|
||||
|
||||
unsigned char *scriptname = find_script_name(script);
|
||||
|
||||
switch (type)
|
||||
{
|
||||
case ucp_C: typename = US"Control"; break;
|
||||
|
@ -132,172 +312,6 @@ switch(gbprop)
|
|||
default: graphbreak = US"Unknown"; break;
|
||||
}
|
||||
|
||||
switch(script)
|
||||
{
|
||||
case ucp_Unknown: scriptname = US"Unknown"; break;
|
||||
case ucp_Arabic: scriptname = US"Arabic"; break;
|
||||
case ucp_Armenian: scriptname = US"Armenian"; break;
|
||||
case ucp_Balinese: scriptname = US"Balinese"; break;
|
||||
case ucp_Bengali: scriptname = US"Bengali"; break;
|
||||
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
|
||||
case ucp_Braille: scriptname = US"Braille"; break;
|
||||
case ucp_Buginese: scriptname = US"Buginese"; break;
|
||||
case ucp_Buhid: scriptname = US"Buhid"; break;
|
||||
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
|
||||
case ucp_Cherokee: scriptname = US"Cherokee"; break;
|
||||
case ucp_Common: scriptname = US"Common"; break;
|
||||
case ucp_Coptic: scriptname = US"Coptic"; break;
|
||||
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
|
||||
case ucp_Cypriot: scriptname = US"Cypriot"; break;
|
||||
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
|
||||
case ucp_Deseret: scriptname = US"Deseret"; break;
|
||||
case ucp_Devanagari: scriptname = US"Devanagari"; break;
|
||||
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
|
||||
case ucp_Georgian: scriptname = US"Georgian"; break;
|
||||
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
|
||||
case ucp_Gothic: scriptname = US"Gothic"; break;
|
||||
case ucp_Greek: scriptname = US"Greek"; break;
|
||||
case ucp_Gujarati: scriptname = US"Gujarati"; break;
|
||||
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
|
||||
case ucp_Han: scriptname = US"Han"; break;
|
||||
case ucp_Hangul: scriptname = US"Hangul"; break;
|
||||
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
|
||||
case ucp_Hebrew: scriptname = US"Hebrew"; break;
|
||||
case ucp_Hiragana: scriptname = US"Hiragana"; break;
|
||||
case ucp_Inherited: scriptname = US"Inherited"; break;
|
||||
case ucp_Kannada: scriptname = US"Kannada"; break;
|
||||
case ucp_Katakana: scriptname = US"Katakana"; break;
|
||||
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
|
||||
case ucp_Khmer: scriptname = US"Khmer"; break;
|
||||
case ucp_Lao: scriptname = US"Lao"; break;
|
||||
case ucp_Latin: scriptname = US"Latin"; break;
|
||||
case ucp_Limbu: scriptname = US"Limbu"; break;
|
||||
case ucp_Linear_B: scriptname = US"Linear_B"; break;
|
||||
case ucp_Malayalam: scriptname = US"Malayalam"; break;
|
||||
case ucp_Mongolian: scriptname = US"Mongolian"; break;
|
||||
case ucp_Myanmar: scriptname = US"Myanmar"; break;
|
||||
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
|
||||
case ucp_Nko: scriptname = US"Nko"; break;
|
||||
case ucp_Ogham: scriptname = US"Ogham"; break;
|
||||
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
|
||||
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
|
||||
case ucp_Oriya: scriptname = US"Oriya"; break;
|
||||
case ucp_Osmanya: scriptname = US"Osmanya"; break;
|
||||
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
|
||||
case ucp_Phoenician: scriptname = US"Phoenician"; break;
|
||||
case ucp_Runic: scriptname = US"Runic"; break;
|
||||
case ucp_Shavian: scriptname = US"Shavian"; break;
|
||||
case ucp_Sinhala: scriptname = US"Sinhala"; break;
|
||||
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
|
||||
case ucp_Syriac: scriptname = US"Syriac"; break;
|
||||
case ucp_Tagalog: scriptname = US"Tagalog"; break;
|
||||
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
|
||||
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
|
||||
case ucp_Tamil: scriptname = US"Tamil"; break;
|
||||
case ucp_Telugu: scriptname = US"Telugu"; break;
|
||||
case ucp_Thaana: scriptname = US"Thaana"; break;
|
||||
case ucp_Thai: scriptname = US"Thai"; break;
|
||||
case ucp_Tibetan: scriptname = US"Tibetan"; break;
|
||||
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
|
||||
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
|
||||
case ucp_Yi: scriptname = US"Yi"; break;
|
||||
/* New for Unicode 5.1: */
|
||||
case ucp_Carian: scriptname = US"Carian"; break;
|
||||
case ucp_Cham: scriptname = US"Cham"; break;
|
||||
case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
|
||||
case ucp_Lepcha: scriptname = US"Lepcha"; break;
|
||||
case ucp_Lycian: scriptname = US"Lycian"; break;
|
||||
case ucp_Lydian: scriptname = US"Lydian"; break;
|
||||
case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
|
||||
case ucp_Rejang: scriptname = US"Rejang"; break;
|
||||
case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
|
||||
case ucp_Sundanese: scriptname = US"Sundanese"; break;
|
||||
case ucp_Vai: scriptname = US"Vai"; break;
|
||||
/* New for Unicode 5.2: */
|
||||
case ucp_Avestan: scriptname = US"Avestan"; break;
|
||||
case ucp_Bamum: scriptname = US"Bamum"; break;
|
||||
case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
|
||||
case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
|
||||
case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
|
||||
case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
|
||||
case ucp_Javanese: scriptname = US"Javanese"; break;
|
||||
case ucp_Kaithi: scriptname = US"Kaithi"; break;
|
||||
case ucp_Lisu: scriptname = US"Lisu"; break;
|
||||
case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
|
||||
case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
|
||||
case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
|
||||
case ucp_Samaritan: scriptname = US"Samaritan"; break;
|
||||
case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
|
||||
case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
|
||||
/* New for Unicode 6.0.0 */
|
||||
case ucp_Batak: scriptname = US"Batak"; break;
|
||||
case ucp_Brahmi: scriptname = US"Brahmi"; break;
|
||||
case ucp_Mandaic: scriptname = US"Mandaic"; break;
|
||||
|
||||
/* New for Unicode 6.1.0 */
|
||||
case ucp_Chakma: scriptname = US"Chakma"; break;
|
||||
case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
|
||||
case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
|
||||
case ucp_Miao: scriptname = US"Miao"; break;
|
||||
case ucp_Sharada: scriptname = US"Sharada"; break;
|
||||
case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
|
||||
case ucp_Takri: scriptname = US"Takri"; break;
|
||||
|
||||
/* New for Unicode 7.0.0 */
|
||||
case ucp_Bassa_Vah: scriptname = US"Bassa_Vah"; break;
|
||||
case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break;
|
||||
case ucp_Duployan: scriptname = US"Duployan"; break;
|
||||
case ucp_Elbasan: scriptname = US"Elbasan"; break;
|
||||
case ucp_Grantha: scriptname = US"Grantha"; break;
|
||||
case ucp_Khojki: scriptname = US"Khojki"; break;
|
||||
case ucp_Khudawadi: scriptname = US"Khudawadi"; break;
|
||||
case ucp_Linear_A: scriptname = US"Linear_A"; break;
|
||||
case ucp_Mahajani: scriptname = US"Mahajani"; break;
|
||||
case ucp_Manichaean: scriptname = US"Manichaean"; break;
|
||||
case ucp_Mende_Kikakui: scriptname = US"Mende_Kikakui"; break;
|
||||
case ucp_Modi: scriptname = US"Modi"; break;
|
||||
case ucp_Mro: scriptname = US"Mro"; break;
|
||||
case ucp_Nabataean: scriptname = US"Nabataean"; break;
|
||||
case ucp_Old_North_Arabian: scriptname = US"Old_North_Arabian"; break;
|
||||
case ucp_Old_Permic: scriptname = US"Old_Permic"; break;
|
||||
case ucp_Pahawh_Hmong: scriptname = US"Pahawh_Hmong"; break;
|
||||
case ucp_Palmyrene: scriptname = US"Palmyrene"; break;
|
||||
case ucp_Psalter_Pahlavi: scriptname = US"Psalter_Pahlavi"; break;
|
||||
case ucp_Pau_Cin_Hau: scriptname = US"Pau_Cin_Hau"; break;
|
||||
case ucp_Siddham: scriptname = US"Siddham"; break;
|
||||
case ucp_Tirhuta: scriptname = US"Tirhuta"; break;
|
||||
case ucp_Warang_Citi: scriptname = US"Warang_Citi"; break;
|
||||
|
||||
/* New for Unicode 8.0.0 */
|
||||
case ucp_Ahom: scriptname = US"Ahom"; break;
|
||||
case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break;
|
||||
case ucp_Hatran: scriptname = US"Hatran"; break;
|
||||
case ucp_Multani: scriptname = US"Multani"; break;
|
||||
case ucp_Old_Hungarian: scriptname = US"Old_Hungarian"; break;
|
||||
case ucp_SignWriting: scriptname = US"SignWriting"; break;
|
||||
|
||||
/* New for Unicode 10.0.0 (no update since 8.0.0) */
|
||||
case ucp_Adlam: scriptname = US"Adlam"; break;
|
||||
case ucp_Bhaiksuki: scriptname = US"Bhaiksuki"; break;
|
||||
case ucp_Marchen: scriptname = US"Marchen"; break;
|
||||
case ucp_Newa: scriptname = US"Newa"; break;
|
||||
case ucp_Osage: scriptname = US"Osage"; break;
|
||||
case ucp_Tangut: scriptname = US"Tangut"; break;
|
||||
case ucp_Masaram_Gondi: scriptname = US"Masaram_Gondi"; break;
|
||||
case ucp_Nushu: scriptname = US"Nushu"; break;
|
||||
case ucp_Soyombo: scriptname = US"Soyombo"; break;
|
||||
case ucp_Zanabazar_Square: scriptname = US"Zanabazar_Square"; break;
|
||||
|
||||
/* New for Unicode 11.0.0 */
|
||||
case ucp_Dogra: scriptname = US"Dogra"; break;
|
||||
case ucp_Gunjala_Gondi: scriptname = US"Gunjala_Gondi"; break;
|
||||
case ucp_Hanifi_Rohingya: scriptname = US"Hanifi_Rohingya"; break;
|
||||
case ucp_Makasar: scriptname = US"Makasar"; break;
|
||||
case ucp_Medefaidrin: scriptname = US"Medefaidrin"; break;
|
||||
case ucp_Old_Sogdian: scriptname = US"Old_Sogdian"; break;
|
||||
case ucp_Sogdian: scriptname = US"Sogdian"; break;
|
||||
}
|
||||
|
||||
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
|
||||
if (othercase != c)
|
||||
{
|
||||
|
@ -309,6 +323,23 @@ if (othercase != c)
|
|||
if (*p != othercase && *p != c) printf(", %04x", *p);
|
||||
}
|
||||
}
|
||||
|
||||
if (scriptx != script)
|
||||
{
|
||||
printf(", [");
|
||||
if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
|
||||
{
|
||||
char *sep = "";
|
||||
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
|
||||
while (*p != 0)
|
||||
{
|
||||
printf("%s%s", sep, find_script_name(*p++));
|
||||
sep = ", ";
|
||||
}
|
||||
}
|
||||
printf("]");
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -319,9 +350,22 @@ printf("\n");
|
|||
*************************************************/
|
||||
|
||||
int
|
||||
main(void)
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
unsigned char buffer[1024];
|
||||
|
||||
if (argc > 1)
|
||||
{
|
||||
int i;
|
||||
for (i = 1; i < argc; i++)
|
||||
{
|
||||
unsigned char *endptr;
|
||||
int c = strtoul(argv[i], CSS(&endptr), 16);
|
||||
print_prop(c);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
||||
{
|
||||
unsigned char name[24];
|
||||
|
|
|
@ -38,3 +38,5 @@ findprop 118a0 11ac7 16ad0
|
|||
findprop 11700 14400 108e0 11280 1d800
|
||||
|
||||
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
|
|
|
@ -289,7 +289,7 @@ ffe3 Symbol: Modifier symbol, Common, Other
|
|||
ffe4 Symbol: Other symbol, Common, Other
|
||||
ffe5 Symbol: Currency symbol, Common, Other
|
||||
ffe6 Symbol: Currency symbol, Common, Other
|
||||
ffe7 Control: Unassigned, Common, Other
|
||||
ffe7 Control: Unassigned, Unknown, Other
|
||||
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
|
||||
ffe8 Symbol: Other symbol, Common, Other
|
||||
ffe9 Symbol: Mathematical symbol, Common, Other
|
||||
|
@ -298,22 +298,22 @@ ffeb Symbol: Mathematical symbol, Common, Other
|
|||
ffec Symbol: Mathematical symbol, Common, Other
|
||||
ffed Symbol: Other symbol, Common, Other
|
||||
ffee Symbol: Other symbol, Common, Other
|
||||
ffef Control: Unassigned, Common, Other
|
||||
ffef Control: Unassigned, Unknown, Other
|
||||
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
|
||||
fff8 Control: Unassigned, Common, Control
|
||||
fff8 Control: Unassigned, Unknown, Control
|
||||
fff9 Control: Format, Common, Control
|
||||
fffa Control: Format, Common, Control
|
||||
fffb Control: Format, Common, Control
|
||||
fffc Symbol: Other symbol, Common, Other
|
||||
fffd Symbol: Other symbol, Common, Other
|
||||
fffe Control: Unassigned, Common, Other
|
||||
ffff Control: Unassigned, Common, Other
|
||||
fffe Control: Unassigned, Unknown, Other
|
||||
ffff Control: Unassigned, Unknown, Other
|
||||
findprop 10000 10001 e01ef f0000 100000
|
||||
10000 Letter: Other letter, Linear_B, Other
|
||||
10001 Letter: Other letter, Linear_B, Other
|
||||
e01ef Mark: Non-spacing mark, Inherited, Extend
|
||||
f0000 Control: Private use, Common, Other
|
||||
100000 Control: Private use, Common, Other
|
||||
f0000 Control: Private use, Unknown, Other
|
||||
100000 Control: Private use, Unknown, Other
|
||||
|
||||
findprop 1b00 12000 7c0 a840 10900
|
||||
1b00 Mark: Non-spacing mark, Balinese, Extend
|
||||
|
@ -379,3 +379,10 @@ findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
|
|||
16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
|
||||
10f27 Letter: Other letter, Old_Sogdian, Other
|
||||
10f30 Letter: Other letter, Sogdian, Other
|
||||
|
||||
findprop a836 a833 1cf4 20f0 1cd0
|
||||
a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
||||
a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
|
||||
1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
|
||||
20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
|
||||
1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
|
||||
|
|
|
@ -1778,6 +1778,8 @@ typedef struct {
|
|||
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
|
||||
uint8_t caseset; /* offset to multichar other cases or zero */
|
||||
int32_t other_case; /* offset to other case, or zero if none */
|
||||
int16_t scriptx; /* script extension value */
|
||||
int16_t dummy; /* spare - to round to multiple of 4 bytes */
|
||||
} ucd_record;
|
||||
|
||||
/* UCD access macros */
|
||||
|
@ -1800,6 +1802,7 @@ typedef struct {
|
|||
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
|
||||
#define UCD_CASESET(ch) GET_UCD(ch)->caseset
|
||||
#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
|
||||
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
|
||||
|
||||
/* Header for serialized pcre2 codes. */
|
||||
|
||||
|
@ -1858,6 +1861,7 @@ extern const uint8_t PRIV(utf8_table4)[];
|
|||
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
|
||||
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
|
||||
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
|
||||
#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
|
||||
#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_)
|
||||
#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_)
|
||||
#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_)
|
||||
|
@ -1880,6 +1884,7 @@ extern const uint32_t PRIV(hspace_list)[];
|
|||
extern const uint32_t PRIV(vspace_list)[];
|
||||
extern const uint32_t PRIV(ucd_caseless_sets)[];
|
||||
extern const uint32_t PRIV(ucd_digit_sets)[];
|
||||
extern const uint8_t PRIV(ucd_script_sets)[];
|
||||
extern const ucd_record PRIV(ucd_records)[];
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
extern const ucd_record PRIV(dummy_ucd_record)[];
|
||||
|
|
|
@ -4716,11 +4716,11 @@ struct sljit_jump *jump;
|
|||
#if defined SLJIT_DEBUG && SLJIT_DEBUG
|
||||
/* dummy_ucd_record */
|
||||
const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
|
||||
SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
|
||||
SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
|
||||
SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
|
||||
#endif
|
||||
|
||||
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
|
||||
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
|
||||
|
||||
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
|
||||
|
||||
|
@ -4756,11 +4756,11 @@ struct sljit_jump *jump;
|
|||
#if defined SLJIT_DEBUG && SLJIT_DEBUG
|
||||
/* dummy_ucd_record */
|
||||
const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
|
||||
SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
|
||||
SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
|
||||
SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
|
||||
#endif
|
||||
|
||||
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8);
|
||||
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
|
||||
|
||||
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
|
||||
|
||||
|
@ -4781,8 +4781,19 @@ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
|
|||
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
|
||||
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
|
||||
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
|
||||
|
||||
// PH hacking
|
||||
//fprintf(stderr, "~~A\n");
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
|
||||
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
||||
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
|
||||
|
||||
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
|
||||
|
||||
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
|
||||
}
|
||||
|
||||
|
@ -7775,8 +7786,18 @@ if (needstype || needsscript)
|
|||
/* Before anything else, we deal with scripts. */
|
||||
if (needsscript)
|
||||
{
|
||||
// PH hacking
|
||||
//fprintf(stderr, "~~B\n");
|
||||
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
|
||||
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
||||
|
||||
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
|
||||
|
||||
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
|
||||
ccbegin = cc;
|
||||
|
||||
|
@ -7820,12 +7841,30 @@ if (needstype || needsscript)
|
|||
{
|
||||
if (!needschar)
|
||||
{
|
||||
// PH hacking
|
||||
//fprintf(stderr, "~~C\n");
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
|
||||
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
||||
OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0);
|
||||
|
||||
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
|
||||
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
|
||||
|
||||
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
|
||||
}
|
||||
else
|
||||
{
|
||||
// PH hacking
|
||||
//fprintf(stderr, "~~D\n");
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
|
||||
|
||||
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
|
||||
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
||||
OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0);
|
||||
|
||||
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
|
||||
typereg = RETURN_ADDR;
|
||||
}
|
||||
|
@ -9155,10 +9194,19 @@ if (common->utf && *cc == OP_REFI)
|
|||
|
||||
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
|
||||
|
||||
// PH hacking
|
||||
//fprintf(stderr, "~~E\n");
|
||||
|
||||
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
|
||||
|
||||
add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));
|
||||
|
||||
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
|
||||
|
||||
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
|
||||
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
|
||||
|
||||
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records));
|
||||
|
||||
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case));
|
||||
|
|
7202
src/pcre2_ucd.c
7202
src/pcre2_ucd.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue