Unicode properties data records extended to 12-bytes to include a

ScriptExtensions property.
This commit is contained in:
Philip.Hazel 2018-10-06 17:39:52 +00:00
parent cda4780fb6
commit 04ba4bce0f
8 changed files with 4642 additions and 3752 deletions

View File

@ -61,26 +61,39 @@
# property, which is used by PCRE2 as a grapheme breaking property. This was # property, which is used by PCRE2 as a grapheme breaking property. This was
# done when updating to Unicode 11.0.0 (July 2018). # done when updating to Unicode 11.0.0 (July 2018).
# #
# Added code to add a Script Extensions field to records.
#
# #
# The main tables generated by this script are used by macros defined in # The main tables generated by this script are used by macros defined in
# pcre2_internal.h. They look up Unicode character properties using short # pcre2_internal.h. They look up Unicode character properties using short
# sequences of code that contains no branches, which makes for greater speed. # sequences of code that contains no branches, which makes for greater speed.
# #
# Conceptually, there is a table of records (of type ucd_record), containing a # Conceptually, there is a table of records (of type ucd_record), containing a
# script number, character type, grapheme break type, offset to caseless # script number, script extension value, character type, grapheme break type,
# matching set, and offset to the character's other case for every character. # offset to caseless matching set, offset to the character's other case, for
# However, a real table covering all Unicode characters would be far too big. # every character. However, a real table covering all Unicode characters would
# It can be efficiently compressed by observing that many characters have the # be far too big. It can be efficiently compressed by observing that many
# same record, and many blocks of characters (taking 128 characters in a block) # characters have the same record, and many blocks of characters (taking 128
# have the same set of records as other blocks. This leads to a 2-stage lookup # characters in a block) have the same set of records as other blocks. This
# process. # leads to a 2-stage lookup process.
# #
# This script constructs four tables. The ucd_caseless_sets table contains # This script constructs six tables. The ucd_caseless_sets table contains
# lists of characters that all match each other caselessly. Each list is # lists of characters that all match each other caselessly. Each list is
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
# any valid character. The first list is empty; this is used for characters # any valid character. The first list is empty; this is used for characters
# that are not part of any list. # that are not part of any list.
# #
# The ucd_digit_sets table contains the code points of the '9' characters in
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
# in script runs all come from the same set. The first element in the vector
# contains the number of subsequent elements, which are in ascending order.
#
# The ucd_script_sets vector contains lists of script numbers that are the
# Script Extensions properties of certain characters. Each list is terminated
# by zero (ucp_Unknown). A character with more than one script listed for its
# Script Extensions property has a negative value in the script extension
# field of its record. This is the negated offset to the start of the
# relevant list within ucd_script_sets.
#
# The ucd_records table contains one instance of every unique record that is # The ucd_records table contains one instance of every unique record that is
# required. The ucd_stage1 table is indexed by a character's block number, and # required. The ucd_stage1 table is indexed by a character's block number, and
# yields what is in effect a "virtual" block number. The ucd_stage2 table is a # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
@ -117,11 +130,8 @@
# In these examples, no other blocks resolve to the same "virtual" block, as it # In these examples, no other blocks resolve to the same "virtual" block, as it
# happens, but plenty of other blocks do share "virtual" blocks. # happens, but plenty of other blocks do share "virtual" blocks.
# #
# There is a fourth table, maintained by hand, which translates from the
# individual character types such as ucp_Cc to the general types like ucp_C.
#
# Philip Hazel, 03 July 2008 # Philip Hazel, 03 July 2008
# Last Updated: 07 July 2018 # Last Updated: 03 October 2018
# #
# #
# 01-March-2010: Updated list of scripts for Unicode 5.2.0 # 01-March-2010: Updated list of scripts for Unicode 5.2.0
@ -144,6 +154,7 @@
# 07-July-2018: Added code to scan emoji-data.txt for the Extended # 07-July-2018: Added code to scan emoji-data.txt for the Extended
# Pictographic property. # Pictographic property.
# 01-October-2018: Added the 'Unknown' script name # 01-October-2018: Added the 'Unknown' script name
# 03-October-2018: Added new field for Script Extensions
############################################################################## ##############################################################################
@ -165,6 +176,32 @@ def get_other_case(chardata):
return int(chardata[2], 16) - int(chardata[0], 16) return int(chardata[2], 16) - int(chardata[0], 16)
return 0 return 0
# Parse a line of ScriptExtensions.txt
def get_script_extension(chardata):
  """Parse one data line of ScriptExtensions.txt into a Script Extensions value.

  chardata is the list of fields of one line; chardata[1] holds the
  space-separated script abbreviations (for example 'Arab Copt').

  Returns the index of the abbreviation in script_abbrevs when only one script
  is listed. When several are listed, returns the negated offset in
  script_lists of a zero-terminated run of their script_abbrevs indexes; the
  run is appended to script_lists unless an identical run is already present.

  Raises ValueError for an abbreviation that is not in script_abbrevs, or when
  the field names no script at all.
  """
  # split() with no argument tolerates repeated or stray spaces; split(' ')
  # would turn them into empty names that fail the lookup with an opaque error.
  names = chardata[1].split()
  if not names:
    raise ValueError('no script names in %r' % (chardata[1],))
  if len(names) == 1:
    return script_abbrevs.index(names[0])

  # Build the zero-terminated list of script numbers for this code point.
  script_numbers = [script_abbrevs.index(name) for name in names]
  script_numbers.append(0)
  length = len(script_numbers)

  # The zeroth element of script_lists is never used, so searching starts at 1.
  # A match may be the tail of a longer stored list: reading from that offset
  # up to the terminating zero still yields exactly the wanted scripts.
  for i in range(1, len(script_lists) - length + 1):
    if script_lists[i:i + length] == script_numbers:
      return -i

  # Not found in existing lists: append and return the new (negated) offset.
  offset = len(script_lists)
  script_lists.extend(script_numbers)
  return -offset
# Read the whole table in memory, setting/checking the Unicode version # Read the whole table in memory, setting/checking the Unicode version
def read_table(file_name, get_value, default_value): def read_table(file_name, get_value, default_value):
@ -330,24 +367,24 @@ def print_records(records, record_size):
print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))) print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
print('};\n') print('};\n')
script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
# New for Unicode 5.0 # New for Unicode 5.0
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
# New for Unicode 5.1 # New for Unicode 5.1
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
# New for Unicode 5.2 # New for Unicode 5.2
'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
# New for Unicode 6.0.0 # New for Unicode 6.0.0
'Batak', 'Brahmi', 'Mandaic', \ 'Batak', 'Brahmi', 'Mandaic',
# New for Unicode 6.1.0 # New for Unicode 6.1.0
'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
# New for Unicode 7.0.0 # New for Unicode 7.0.0
@ -366,6 +403,39 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille
'Old_Sogdian', 'Sogdian' 'Old_Sogdian', 'Sogdian'
] ]
# Four-letter script abbreviations as they appear in ScriptExtensions.txt.
# get_script_extension() turns an abbreviation into a script number with
# script_abbrevs.index(), and the Script Extensions default is taken from the
# Script value, so the position of each abbreviation here has to be the
# position of the corresponding full name in script_names ('Zzzz' <->
# 'Unknown', 'Arab' <-> 'Arabic', ...). Add new scripts to both lists, in the
# same place.
script_abbrevs = [
'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
# New for Unicode 5.0
'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
# New for Unicode 5.1
'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
'Sund', 'Vaii',
# New for Unicode 5.2
'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
# New for Unicode 6.0.0
'Batk', 'Brah', 'Mand',
# New for Unicode 6.1.0
'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
# New for Unicode 7.0.0
'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
# New for Unicode 8.0.0
'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
# New for Unicode 10.0.0
'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
'Zanb',
# New for Unicode 11.0.0
'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd'
]
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
@ -415,6 +485,28 @@ for line in file:
break_props[i] = break_property_names.index('Extended_Pictographic') break_props[i] = break_property_names.index('Extended_Pictographic')
file.close() file.close()
# Script Extensions (code added by PH, October 2018).
#
# A code point that ScriptExtensions.txt does not mention has its Script value
# as its Script Extensions value. The file is therefore read with 'Unknown'
# ('Zzzz') as a placeholder default - this is never a real Script Extensions
# value - and every placeholder is then replaced from the Scripts table.
#
# A value >= 0 in scriptx is a single script number. A negative value is the
# negated offset of a zero-terminated list of scripts inside script_lists.
# script_lists must exist before the file is read, because
# get_script_extension() appends to it; it starts with one entry because the
# zeroth element is never used.

script_lists = [0]
script_abbrevs_default = script_abbrevs.index('Zzzz')
scriptx = read_table(
  'Unicode.tables/ScriptExtensions.txt',
  get_script_extension,
  script_abbrevs_default)

for i in range(MAX_UNICODE):
  if scriptx[i] != script_abbrevs_default:
    continue
  scriptx[i] = script[i]

# The extra Script Extensions field leaves the Unicode records short of 12
# bytes (a multiple of 4), so a padding field is added as well. Storing one
# value above 255 in it makes the field 16 bits wide.

padding_dummy = [0] * MAX_UNICODE
padding_dummy[0] = 256
# This block of code was added by PH in September 2012. I am not a Python # This block of code was added by PH in September 2012. I am not a Python
# programmer, so the style is probably dreadful, but it does the job. It scans # programmer, so the style is probably dreadful, but it does the job. It scans
@ -427,7 +519,7 @@ file.close()
# sets only one value, so first we go through the table and set "return" # sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set. # offsets for those that are not already set.
for c in range(0x10ffff): for c in range(MAX_UNICODE):
if other_case[c] != 0 and other_case[c + other_case[c]] == 0: if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
other_case[c + other_case[c]] = -other_case[c] other_case[c + other_case[c]] = -other_case[c]
@ -435,7 +527,7 @@ for c in range(0x10ffff):
sets = [] sets = []
for c in range(0x10ffff): for c in range(MAX_UNICODE):
o = c + other_case[c] o = c + other_case[c]
# Trigger when this character's other case does not point back here. We # Trigger when this character's other case does not point back here. We
@ -489,7 +581,7 @@ for s in sets:
# Combine the tables # Combine the tables
table, records = combine_tables(script, category, break_props, table, records = combine_tables(script, category, break_props,
caseless_offsets, other_case) caseless_offsets, other_case, scriptx, padding_dummy)
record_size, record_struct = get_record_size_struct(list(records.keys())) record_size, record_struct = get_record_size_struct(list(records.keys()))
@ -537,7 +629,7 @@ print("a comment was received about space saving - maybe the guy linked")
print("all the modules rather than using a library - so we include a") print("all the modules rather than using a library - so we include a")
print("condition to cut out the tables when not needed. But don't leave") print("condition to cut out the tables when not needed. But don't leave")
print("a totally empty module because some compilers barf at that.") print("a totally empty module because some compilers barf at that.")
print("Instead, just supply small dummy tables. */") print("Instead, just supply some small dummy tables. */")
print() print()
print("#ifndef SUPPORT_UNICODE") print("#ifndef SUPPORT_UNICODE")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};") print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};")
@ -559,6 +651,8 @@ print(" ucp_Cn, /* type unassigned */")
print(" ucp_gbOther, /* grapheme break property */") print(" ucp_gbOther, /* grapheme break property */")
print(" 0, /* case set */") print(" 0, /* case set */")
print(" 0, /* other case */") print(" 0, /* other case */")
print(" ucp_Unknown, /* script extension */")
print(" 0, /* dummy filler */")
print(" }};") print(" }};")
print("#endif") print("#endif")
print() print()
@ -609,8 +703,7 @@ digitsets.sort()
print("/* This table lists the code points for the '9' characters in each") print("/* This table lists the code points for the '9' characters in each")
print("set of decimal digits. It is used to ensure that all the digits in") print("set of decimal digits. It is used to ensure that all the digits in")
print("a script run come from the same set. */") print("a script run come from the same set. */\n")
print()
print("const uint32_t PRIV(ucd_digit_sets)[] = {") print("const uint32_t PRIV(ucd_digit_sets)[] = {")
print(" %d, /* Number of subsequent values */" % len(digitsets), end='') print(" %d, /* Number of subsequent values */" % len(digitsets), end='')
@ -621,12 +714,28 @@ for d in digitsets:
count = 0 count = 0
print(" 0x%05x," % d, end='') print(" 0x%05x," % d, end='')
count += 1 count += 1
print("\n};") print("\n};\n")
print()
# Emit the ucd_script_sets vector: every entry of script_lists, in order, as
# one flat C array of uint8_t.
print("/* This vector is a list of lists of scripts for the Script Extension")
print("property. Each sublist is zero-terminated. */\n")
print("const uint8_t PRIV(ucd_script_sets)[] = {")
# count is the number of entries printed so far, i.e. the offset of the next
# entry. Each output line starts with a comment giving the offset of its
# first entry.
count = 0
print(" /* 0 */", end='')
for d in script_lists:
print(" %3d," % d, end='')
count += 1
# A zero ends the current sublist: start a new output line labelled with the
# offset at which the next sublist begins. This also runs after the final
# terminator, so the last label printed has no entries after it.
if d == 0:
print("\n /* %3d */" % count, end='')
print("\n};\n")
# Output the main UCD tables. # Output the main UCD tables.
print("/* These are the main two-stage UCD tables. */\n") print("/* These are the main two-stage UCD tables. The fields in each record are:")
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
print("offset to multichar other cases or zero (8 bits), offset to other case")
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")
print_records(records, record_size) print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)') print_table(min_stage1, 'PRIV(ucd_stage1)')

View File

@ -0,0 +1,531 @@
# ScriptExtensions-11.0.0.txt
# Date: 2018-02-04, 20:04:00 GMT
# © 2018 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# The Script_Extensions property indicates which characters are commonly used
# with more than one script, but with a limited number of scripts.
# For each code point, there is one or more property values. Each such value is a Script property value.
# For more information, see:
# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/
# Especially the sections:
# http://www.unicode.org/reports/tr24/#Assignment_Script_Values
# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
#
# Each Script_Extensions value in this file consists of a set
# of one or more abbreviated Script property values. The ordering of the
# values in that set is not material, but for stability in presentation
# it is given here as alphabetical.
#
# The Script_Extensions values are presented in sorted order in the file.
# They are sorted first by the number of Script property values in their sets,
# and then alphabetically by first differing Script property value.
#
# Following each distinct Script_Extensions value is the list of code
# points associated with that value, listed in code point order.
#
# All code points not explicitly listed for Script_Extensions
# have as their value the corresponding Script property value
#
# @missing: 0000..10FFFF; <script>
# ================================================
# Property: Script_Extensions
# ================================================
# Script_Extensions=Beng
1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Deva
1CD1 ; Deva # Mn VEDIC TONE SHARA
1CD4 ; Deva # Mn VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
1CDB ; Deva # Mn VEDIC TONE TRIPLE SVARITA
1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
1CE2..1CE8 ; Deva # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CE9 ; Deva # Lo VEDIC SIGN ANUSVARA ANTARGOMUKHA
1CEB..1CEC ; Deva # Lo [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
1CEE..1CF1 ; Deva # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
# Total code points: 19
# ================================================
# Script_Extensions=Dupl
1BCA0..1BCA3 ; Dupl # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
# Total code points: 4
# ================================================
# Script_Extensions=Grek
0342 ; Grek # Mn COMBINING GREEK PERISPOMENI
0345 ; Grek # Mn COMBINING GREEK YPOGEGRAMMENI
1DC0..1DC1 ; Grek # Mn [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
# Total code points: 4
# ================================================
# Script_Extensions=Hani
3006 ; Hani # Lo IDEOGRAPHIC CLOSING MARK
303E..303F ; Hani # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
3190..3191 ; Hani # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
3192..3195 ; Hani # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
3196..319F ; Hani # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
31C0..31E3 ; Hani # So [36] CJK STROKE T..CJK STROKE Q
3220..3229 ; Hani # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
322A..3247 ; Hani # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
3280..3289 ; Hani # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
328A..32B0 ; Hani # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32C0..32CB ; Hani # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
3358..3370 ; Hani # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
337B..337F ; Hani # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
33E0..33FE ; Hani # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
1D360..1D371 ; Hani # No [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
1F250..1F251 ; Hani # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
# Total code points: 237
# ================================================
# Script_Extensions=Latn
0363..036F ; Latn # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
# Total code points: 13
# ================================================
# Script_Extensions=Arab Copt
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
# Total code points: 28
# ================================================
# Script_Extensions=Arab Rohg
06D4 ; Arab Rohg # Po ARABIC FULL STOP
# Total code points: 1
# ================================================
# Script_Extensions=Arab Syrc
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
# Total code points: 12
# ================================================
# Script_Extensions=Arab Thaa
0660..0669 ; Arab Thaa # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
# Total code points: 12
# ================================================
# Script_Extensions=Armn Geor
0589 ; Armn Geor # Po ARMENIAN FULL STOP
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva
1CD5..1CD6 ; Beng Deva # Mn [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
1CD8 ; Beng Deva # Mn VEDIC TONE CANDRA BELOW
1CE1 ; Beng Deva # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
1CEA ; Beng Deva # Lo VEDIC SIGN ANUSVARA BAHIRGOMUKHA
1CED ; Beng Deva # Mn VEDIC SIGN TIRYAK
1CF5..1CF6 ; Beng Deva # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
A8F1 ; Beng Deva # Mn COMBINING DEVANAGARI SIGN AVAGRAHA
# Total code points: 9
# ================================================
# Script_Extensions=Bopo Hani
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
# Total code points: 4
# ================================================
# Script_Extensions=Bugi Java
A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
# Total code points: 1
# ================================================
# Script_Extensions=Cprt Linb
10100..10102 ; Cprt Linb # Po [3] AEGEAN WORD SEPARATOR LINE..AEGEAN CHECK MARK
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
# Total code points: 12
# ================================================
# Script_Extensions=Cyrl Glag
0484 ; Cyrl Glag # Mn COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrl Glag # Mn COMBINING CYRILLIC POKRYTIE
2E43 ; Cyrl Glag # Po DASH WITH LEFT UPTURN
A66F ; Cyrl Glag # Mn COMBINING CYRILLIC VZMET
# Total code points: 4
# ================================================
# Script_Extensions=Cyrl Latn
0485..0486 ; Cyrl Latn # Mn [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
# Total code points: 2
# ================================================
# Script_Extensions=Cyrl Perm
0483 ; Cyrl Perm # Mn COMBINING CYRILLIC TITLO
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran
1CD3 ; Deva Gran # Po VEDIC SIGN NIHSHVASA
1CF2..1CF3 ; Deva Gran # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA
1CF8..1CF9 ; Deva Gran # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
# Total code points: 5
# ================================================
# Script_Extensions=Deva Shrd
1CD7 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
1CD9 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
1CDC..1CDD ; Deva Shrd # Mn [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
1CE0 ; Deva Shrd # Mn VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
# Total code points: 5
# ================================================
# Script_Extensions=Deva Taml
A8F3 ; Deva Taml # Lo DEVANAGARI SIGN CANDRABINDU VIRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Geor Latn
10FB ; Geor Latn # Po GEORGIAN PARAGRAPH SEPARATOR
# Total code points: 1
# ================================================
# Script_Extensions=Gran Taml
0BE6..0BEF ; Gran Taml # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
0BF0..0BF2 ; Gran Taml # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
0BF3 ; Gran Taml # So TAMIL DAY SIGN
11301 ; Gran Taml # Mn GRANTHA SIGN CANDRABINDU
11303 ; Gran Taml # Mc GRANTHA SIGN VISARGA
1133B..1133C ; Gran Taml # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
# Total code points: 18
# ================================================
# Script_Extensions=Gujr Khoj
0AE6..0AEF ; Gujr Khoj # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Guru Mult
0A66..0A6F ; Guru Mult # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Hira Kana
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
# Total code points: 14
# ================================================
# Script_Extensions=Mong Phag
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
# Total code points: 3
# ================================================
# Script_Extensions=Arab Syrc Thaa
061C ; Arab Syrc Thaa # Cf ARABIC LETTER MARK
# Total code points: 1
# ================================================
# Script_Extensions=Beng Cakm Sylo
09E6..09EF ; Beng Cakm Sylo # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cakm Mymr Tale
1040..1049 ; Cakm Mymr Tale # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cprt Lina Linb
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
# Total code points: 45
# ================================================
# Script_Extensions=Deva Gran Knda
1CF4 ; Deva Gran Knda # Mn VEDIC TONE CANDRA ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran Latn
20F0 ; Deva Gran Latn # Mn COMBINING ASTERISK ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Hani Hira Kana
303C ; Hani Hira Kana # Lo MASU MARK
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
# Total code points: 2
# ================================================
# Script_Extensions=Kali Latn Mymr
A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
# Total code points: 1
# ================================================
# Script_Extensions=Arab Rohg Syrc Thaa
060C ; Arab Rohg Syrc Thaa # Po ARABIC COMMA
061B ; Arab Rohg Syrc Thaa # Po ARABIC SEMICOLON
061F ; Arab Rohg Syrc Thaa # Po ARABIC QUESTION MARK
# Total code points: 3
# ================================================
# Script_Extensions=Beng Deva Gran Knda
1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA
1CD2 ; Beng Deva Gran Knda # Mn VEDIC TONE PRENKHA
# Total code points: 2
# ================================================
# Script_Extensions=Buhd Hano Tagb Tglg
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
# Total code points: 2
# ================================================
# Script_Extensions=Deva Dogr Kthi Mahj
0966..096F ; Deva Dogr Kthi Mahj # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 10
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
# Total code points: 26
# ================================================
# Script_Extensions=Deva Knda Mlym Orya Taml Telu
1CDA ; Deva Knda Mlym Orya Taml Telu # Mn VEDIC TONE DOUBLE SVARITA
# Total code points: 1
# ================================================
# Script_Extensions=Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
0640 ; Adlm Arab Mand Mani Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc NORTH INDIC RUPEE MARK
A839 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So NORTH INDIC QUANTITY MARK
# Total code points: 4
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
0952 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN ANUDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
A833..A835 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
# Total code points: 3
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
0951 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN UDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
A830..A832 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
# Total code points: 3
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
0964 ; Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DANDA
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
0965 ; Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DOUBLE DANDA
# Total code points: 1
# EOF

View File

@ -9,11 +9,12 @@
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
*/ */
/* The program expects to read commands on stdin, and it writes output /* If there are arguments, they are a list of hexadecimal code points whose
to stdout. There is only one command, "findprop", followed by a list of Unicode properties are to be output. Otherwise, the program expects to read commands on
code points as hex numbers (without any prefixes). The output is one line per stdin, and it writes output to stdout. There is only one command, "findprop",
character, giving its Unicode properties followed by its other case if there is followed by a list of Unicode code points as hex numbers (without any
one. */ prefixes). The output is one line per character, giving its Unicode properties
followed by its other case if there is one. */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#include "../src/config.h" #include "../src/config.h"
@ -46,6 +47,183 @@ one. */
/*************************************************
* Find a script name *
*************************************************/
/* Map a script number to a printable script name.

Argument:   script   a script value (one of the ucp_xxx script constants)
Returns:    pointer to a constant string holding the script's name, or "??"
            if the value is not one of the scripts listed below

The returned string is a string literal and must not be modified or freed. */

static unsigned char *
find_script_name(int script)
{
switch(script)
  {
  default: return US"??";
  case ucp_Unknown: return US"Unknown";
  case ucp_Arabic: return US"Arabic";
  case ucp_Armenian: return US"Armenian";
  case ucp_Balinese: return US"Balinese";
  case ucp_Bengali: return US"Bengali";
  case ucp_Bopomofo: return US"Bopomofo";
  case ucp_Braille: return US"Braille";
  case ucp_Buginese: return US"Buginese";
  case ucp_Buhid: return US"Buhid";
  case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal";
  case ucp_Cherokee: return US"Cherokee";
  case ucp_Common: return US"Common";
  case ucp_Coptic: return US"Coptic";
  case ucp_Cuneiform: return US"Cuneiform";
  case ucp_Cypriot: return US"Cypriot";
  case ucp_Cyrillic: return US"Cyrillic";
  case ucp_Deseret: return US"Deseret";
  case ucp_Devanagari: return US"Devanagari";
  case ucp_Ethiopic: return US"Ethiopic";
  case ucp_Georgian: return US"Georgian";
  case ucp_Glagolitic: return US"Glagolitic";
  case ucp_Gothic: return US"Gothic";
  case ucp_Greek: return US"Greek";
  case ucp_Gujarati: return US"Gujarati";
  case ucp_Gurmukhi: return US"Gurmukhi";
  case ucp_Han: return US"Han";
  case ucp_Hangul: return US"Hangul";
  case ucp_Hanunoo: return US"Hanunoo";
  case ucp_Hebrew: return US"Hebrew";
  case ucp_Hiragana: return US"Hiragana";
  case ucp_Inherited: return US"Inherited";
  case ucp_Kannada: return US"Kannada";
  case ucp_Katakana: return US"Katakana";
  case ucp_Kharoshthi: return US"Kharoshthi";
  case ucp_Khmer: return US"Khmer";
  case ucp_Lao: return US"Lao";
  case ucp_Latin: return US"Latin";
  case ucp_Limbu: return US"Limbu";
  case ucp_Linear_B: return US"Linear_B";
  case ucp_Malayalam: return US"Malayalam";
  case ucp_Mongolian: return US"Mongolian";
  case ucp_Myanmar: return US"Myanmar";
  case ucp_New_Tai_Lue: return US"New_Tai_Lue";
  case ucp_Nko: return US"Nko";
  case ucp_Ogham: return US"Ogham";
  case ucp_Old_Italic: return US"Old_Italic";
  case ucp_Old_Persian: return US"Old_Persian";
  case ucp_Oriya: return US"Oriya";
  case ucp_Osmanya: return US"Osmanya";
  case ucp_Phags_Pa: return US"Phags_Pa";
  case ucp_Phoenician: return US"Phoenician";
  case ucp_Runic: return US"Runic";
  case ucp_Shavian: return US"Shavian";
  case ucp_Sinhala: return US"Sinhala";
  case ucp_Syloti_Nagri: return US"Syloti_Nagri";
  case ucp_Syriac: return US"Syriac";
  case ucp_Tagalog: return US"Tagalog";
  case ucp_Tagbanwa: return US"Tagbanwa";
  case ucp_Tai_Le: return US"Tai_Le";
  case ucp_Tamil: return US"Tamil";
  case ucp_Telugu: return US"Telugu";
  case ucp_Thaana: return US"Thaana";
  case ucp_Thai: return US"Thai";
  case ucp_Tibetan: return US"Tibetan";
  case ucp_Tifinagh: return US"Tifinagh";
  case ucp_Ugaritic: return US"Ugaritic";
  case ucp_Yi: return US"Yi";

  /* New for Unicode 5.1: */
  case ucp_Carian: return US"Carian";
  case ucp_Cham: return US"Cham";
  case ucp_Kayah_Li: return US"Kayah_Li";
  case ucp_Lepcha: return US"Lepcha";
  case ucp_Lycian: return US"Lycian";
  case ucp_Lydian: return US"Lydian";
  case ucp_Ol_Chiki: return US"Ol_Chiki";
  case ucp_Rejang: return US"Rejang";
  case ucp_Saurashtra: return US"Saurashtra";
  case ucp_Sundanese: return US"Sundanese";
  case ucp_Vai: return US"Vai";

  /* New for Unicode 5.2: */
  case ucp_Avestan: return US"Avestan";
  case ucp_Bamum: return US"Bamum";
  case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs";
  case ucp_Imperial_Aramaic: return US"Imperial_Aramaic";
  case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi";
  case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian";
  case ucp_Javanese: return US"Javanese";
  case ucp_Kaithi: return US"Kaithi";
  case ucp_Lisu: return US"Lisu";
  case ucp_Meetei_Mayek: return US"Meetei_Mayek";
  case ucp_Old_South_Arabian: return US"Old_South_Arabian";
  case ucp_Old_Turkic: return US"Old_Turkic";
  case ucp_Samaritan: return US"Samaritan";
  case ucp_Tai_Tham: return US"Tai_Tham";
  case ucp_Tai_Viet: return US"Tai_Viet";

  /* New for Unicode 6.0.0 */
  case ucp_Batak: return US"Batak";
  case ucp_Brahmi: return US"Brahmi";
  case ucp_Mandaic: return US"Mandaic";

  /* New for Unicode 6.1.0 */
  case ucp_Chakma: return US"Chakma";
  case ucp_Meroitic_Cursive: return US"Meroitic_Cursive";
  case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs";
  case ucp_Miao: return US"Miao";
  case ucp_Sharada: return US"Sharada";
  /* Previously misspelled as "Sora Sompent"; now matches the constant name
  and the underscore convention used by every other entry. */
  case ucp_Sora_Sompeng: return US"Sora_Sompeng";
  case ucp_Takri: return US"Takri";

  /* New for Unicode 7.0.0 */
  case ucp_Bassa_Vah: return US"Bassa_Vah";
  case ucp_Caucasian_Albanian: return US"Caucasian_Albanian";
  case ucp_Duployan: return US"Duployan";
  case ucp_Elbasan: return US"Elbasan";
  case ucp_Grantha: return US"Grantha";
  case ucp_Khojki: return US"Khojki";
  case ucp_Khudawadi: return US"Khudawadi";
  case ucp_Linear_A: return US"Linear_A";
  case ucp_Mahajani: return US"Mahajani";
  case ucp_Manichaean: return US"Manichaean";
  case ucp_Mende_Kikakui: return US"Mende_Kikakui";
  case ucp_Modi: return US"Modi";
  case ucp_Mro: return US"Mro";
  case ucp_Nabataean: return US"Nabataean";
  case ucp_Old_North_Arabian: return US"Old_North_Arabian";
  case ucp_Old_Permic: return US"Old_Permic";
  case ucp_Pahawh_Hmong: return US"Pahawh_Hmong";
  case ucp_Palmyrene: return US"Palmyrene";
  case ucp_Psalter_Pahlavi: return US"Psalter_Pahlavi";
  case ucp_Pau_Cin_Hau: return US"Pau_Cin_Hau";
  case ucp_Siddham: return US"Siddham";
  case ucp_Tirhuta: return US"Tirhuta";
  case ucp_Warang_Citi: return US"Warang_Citi";

  /* New for Unicode 8.0.0 */
  case ucp_Ahom: return US"Ahom";
  case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs";
  case ucp_Hatran: return US"Hatran";
  case ucp_Multani: return US"Multani";
  case ucp_Old_Hungarian: return US"Old_Hungarian";
  case ucp_SignWriting: return US"SignWriting";

  /* New for Unicode 10.0.0 (no update since 8.0.0) */
  case ucp_Adlam: return US"Adlam";
  case ucp_Bhaiksuki: return US"Bhaiksuki";
  case ucp_Marchen: return US"Marchen";
  case ucp_Newa: return US"Newa";
  case ucp_Osage: return US"Osage";
  case ucp_Tangut: return US"Tangut";
  case ucp_Masaram_Gondi: return US"Masaram_Gondi";
  case ucp_Nushu: return US"Nushu";
  case ucp_Soyombo: return US"Soyombo";
  case ucp_Zanabazar_Square: return US"Zanabazar_Square";

  /* New for Unicode 11.0.0 */
  case ucp_Dogra: return US"Dogra";
  case ucp_Gunjala_Gondi: return US"Gunjala_Gondi";
  case ucp_Hanifi_Rohingya: return US"Hanifi_Rohingya";
  case ucp_Makasar: return US"Makasar";
  case ucp_Medefaidrin: return US"Medefaidrin";
  case ucp_Old_Sogdian: return US"Old_Sogdian";
  case ucp_Sogdian: return US"Sogdian";
  }
}
/************************************************* /*************************************************
* Print Unicode property info for a char * * Print Unicode property info for a char *
*************************************************/ *************************************************/
@ -56,15 +234,17 @@ print_prop(int c)
int type = UCD_CATEGORY(c); int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c); int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c); int script = UCD_SCRIPT(c);
int scriptx = UCD_SCRIPTX(c);
int gbprop = UCD_GRAPHBREAK(c); int gbprop = UCD_GRAPHBREAK(c);
int othercase = UCD_OTHERCASE(c); int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c); int caseset = UCD_CASESET(c);
unsigned char *fulltypename = US"??"; unsigned char *fulltypename = US"??";
unsigned char *typename = US"??"; unsigned char *typename = US"??";
unsigned char *scriptname = US"??";
unsigned char *graphbreak = US"??"; unsigned char *graphbreak = US"??";
unsigned char *scriptname = find_script_name(script);
switch (type) switch (type)
{ {
case ucp_C: typename = US"Control"; break; case ucp_C: typename = US"Control"; break;
@ -132,172 +312,6 @@ switch(gbprop)
default: graphbreak = US"Unknown"; break; default: graphbreak = US"Unknown"; break;
} }
switch(script)
{
case ucp_Unknown: scriptname = US"Unknown"; break;
case ucp_Arabic: scriptname = US"Arabic"; break;
case ucp_Armenian: scriptname = US"Armenian"; break;
case ucp_Balinese: scriptname = US"Balinese"; break;
case ucp_Bengali: scriptname = US"Bengali"; break;
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
case ucp_Braille: scriptname = US"Braille"; break;
case ucp_Buginese: scriptname = US"Buginese"; break;
case ucp_Buhid: scriptname = US"Buhid"; break;
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
case ucp_Cherokee: scriptname = US"Cherokee"; break;
case ucp_Common: scriptname = US"Common"; break;
case ucp_Coptic: scriptname = US"Coptic"; break;
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
case ucp_Cypriot: scriptname = US"Cypriot"; break;
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
case ucp_Deseret: scriptname = US"Deseret"; break;
case ucp_Devanagari: scriptname = US"Devanagari"; break;
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
case ucp_Georgian: scriptname = US"Georgian"; break;
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
case ucp_Gothic: scriptname = US"Gothic"; break;
case ucp_Greek: scriptname = US"Greek"; break;
case ucp_Gujarati: scriptname = US"Gujarati"; break;
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
case ucp_Han: scriptname = US"Han"; break;
case ucp_Hangul: scriptname = US"Hangul"; break;
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
case ucp_Hebrew: scriptname = US"Hebrew"; break;
case ucp_Hiragana: scriptname = US"Hiragana"; break;
case ucp_Inherited: scriptname = US"Inherited"; break;
case ucp_Kannada: scriptname = US"Kannada"; break;
case ucp_Katakana: scriptname = US"Katakana"; break;
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
case ucp_Khmer: scriptname = US"Khmer"; break;
case ucp_Lao: scriptname = US"Lao"; break;
case ucp_Latin: scriptname = US"Latin"; break;
case ucp_Limbu: scriptname = US"Limbu"; break;
case ucp_Linear_B: scriptname = US"Linear_B"; break;
case ucp_Malayalam: scriptname = US"Malayalam"; break;
case ucp_Mongolian: scriptname = US"Mongolian"; break;
case ucp_Myanmar: scriptname = US"Myanmar"; break;
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
case ucp_Nko: scriptname = US"Nko"; break;
case ucp_Ogham: scriptname = US"Ogham"; break;
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
case ucp_Oriya: scriptname = US"Oriya"; break;
case ucp_Osmanya: scriptname = US"Osmanya"; break;
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
case ucp_Phoenician: scriptname = US"Phoenician"; break;
case ucp_Runic: scriptname = US"Runic"; break;
case ucp_Shavian: scriptname = US"Shavian"; break;
case ucp_Sinhala: scriptname = US"Sinhala"; break;
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
case ucp_Syriac: scriptname = US"Syriac"; break;
case ucp_Tagalog: scriptname = US"Tagalog"; break;
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
case ucp_Tamil: scriptname = US"Tamil"; break;
case ucp_Telugu: scriptname = US"Telugu"; break;
case ucp_Thaana: scriptname = US"Thaana"; break;
case ucp_Thai: scriptname = US"Thai"; break;
case ucp_Tibetan: scriptname = US"Tibetan"; break;
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
case ucp_Yi: scriptname = US"Yi"; break;
/* New for Unicode 5.1: */
case ucp_Carian: scriptname = US"Carian"; break;
case ucp_Cham: scriptname = US"Cham"; break;
case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
case ucp_Lepcha: scriptname = US"Lepcha"; break;
case ucp_Lycian: scriptname = US"Lycian"; break;
case ucp_Lydian: scriptname = US"Lydian"; break;
case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
case ucp_Rejang: scriptname = US"Rejang"; break;
case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
case ucp_Sundanese: scriptname = US"Sundanese"; break;
case ucp_Vai: scriptname = US"Vai"; break;
/* New for Unicode 5.2: */
case ucp_Avestan: scriptname = US"Avestan"; break;
case ucp_Bamum: scriptname = US"Bamum"; break;
case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
case ucp_Javanese: scriptname = US"Javanese"; break;
case ucp_Kaithi: scriptname = US"Kaithi"; break;
case ucp_Lisu: scriptname = US"Lisu"; break;
case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
case ucp_Samaritan: scriptname = US"Samaritan"; break;
case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
/* New for Unicode 6.0.0 */
case ucp_Batak: scriptname = US"Batak"; break;
case ucp_Brahmi: scriptname = US"Brahmi"; break;
case ucp_Mandaic: scriptname = US"Mandaic"; break;
/* New for Unicode 6.1.0 */
case ucp_Chakma: scriptname = US"Chakma"; break;
case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
case ucp_Miao: scriptname = US"Miao"; break;
case ucp_Sharada: scriptname = US"Sharada"; break;
case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
case ucp_Takri: scriptname = US"Takri"; break;
/* New for Unicode 7.0.0 */
case ucp_Bassa_Vah: scriptname = US"Bassa_Vah"; break;
case ucp_Caucasian_Albanian: scriptname = US"Caucasian_Albanian"; break;
case ucp_Duployan: scriptname = US"Duployan"; break;
case ucp_Elbasan: scriptname = US"Elbasan"; break;
case ucp_Grantha: scriptname = US"Grantha"; break;
case ucp_Khojki: scriptname = US"Khojki"; break;
case ucp_Khudawadi: scriptname = US"Khudawadi"; break;
case ucp_Linear_A: scriptname = US"Linear_A"; break;
case ucp_Mahajani: scriptname = US"Mahajani"; break;
case ucp_Manichaean: scriptname = US"Manichaean"; break;
case ucp_Mende_Kikakui: scriptname = US"Mende_Kikakui"; break;
case ucp_Modi: scriptname = US"Modi"; break;
case ucp_Mro: scriptname = US"Mro"; break;
case ucp_Nabataean: scriptname = US"Nabataean"; break;
case ucp_Old_North_Arabian: scriptname = US"Old_North_Arabian"; break;
case ucp_Old_Permic: scriptname = US"Old_Permic"; break;
case ucp_Pahawh_Hmong: scriptname = US"Pahawh_Hmong"; break;
case ucp_Palmyrene: scriptname = US"Palmyrene"; break;
case ucp_Psalter_Pahlavi: scriptname = US"Psalter_Pahlavi"; break;
case ucp_Pau_Cin_Hau: scriptname = US"Pau_Cin_Hau"; break;
case ucp_Siddham: scriptname = US"Siddham"; break;
case ucp_Tirhuta: scriptname = US"Tirhuta"; break;
case ucp_Warang_Citi: scriptname = US"Warang_Citi"; break;
/* New for Unicode 8.0.0 */
case ucp_Ahom: scriptname = US"Ahom"; break;
case ucp_Anatolian_Hieroglyphs: scriptname = US"Anatolian_Hieroglyphs"; break;
case ucp_Hatran: scriptname = US"Hatran"; break;
case ucp_Multani: scriptname = US"Multani"; break;
case ucp_Old_Hungarian: scriptname = US"Old_Hungarian"; break;
case ucp_SignWriting: scriptname = US"SignWriting"; break;
/* New for Unicode 10.0.0 (no update since 8.0.0) */
case ucp_Adlam: scriptname = US"Adlam"; break;
case ucp_Bhaiksuki: scriptname = US"Bhaiksuki"; break;
case ucp_Marchen: scriptname = US"Marchen"; break;
case ucp_Newa: scriptname = US"Newa"; break;
case ucp_Osage: scriptname = US"Osage"; break;
case ucp_Tangut: scriptname = US"Tangut"; break;
case ucp_Masaram_Gondi: scriptname = US"Masaram_Gondi"; break;
case ucp_Nushu: scriptname = US"Nushu"; break;
case ucp_Soyombo: scriptname = US"Soyombo"; break;
case ucp_Zanabazar_Square: scriptname = US"Zanabazar_Square"; break;
/* New for Unicode 11.0.0 */
case ucp_Dogra: scriptname = US"Dogra"; break;
case ucp_Gunjala_Gondi: scriptname = US"Gunjala_Gondi"; break;
case ucp_Hanifi_Rohingya: scriptname = US"Hanifi_Rohingya"; break;
case ucp_Makasar: scriptname = US"Makasar"; break;
case ucp_Medefaidrin: scriptname = US"Medefaidrin"; break;
case ucp_Old_Sogdian: scriptname = US"Old_Sogdian"; break;
case ucp_Sogdian: scriptname = US"Sogdian"; break;
}
printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
if (othercase != c) if (othercase != c)
{ {
@ -309,6 +323,23 @@ if (othercase != c)
if (*p != othercase && *p != c) printf(", %04x", *p); if (*p != othercase && *p != c) printf(", %04x", *p);
} }
} }
if (scriptx != script)
{
printf(", [");
if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
{
char *sep = "";
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
while (*p != 0)
{
printf("%s%s", sep, find_script_name(*p++));
sep = ", ";
}
}
printf("]");
}
printf("\n"); printf("\n");
} }
@ -319,9 +350,22 @@ printf("\n");
*************************************************/ *************************************************/
int int
main(void) main(int argc, char **argv)
{ {
unsigned char buffer[1024]; unsigned char buffer[1024];
if (argc > 1)
{
int i;
for (i = 1; i < argc; i++)
{
unsigned char *endptr;
int c = strtoul(argv[i], CSS(&endptr), 16);
print_prop(c);
}
return 0;
}
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL) while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
{ {
unsigned char name[24]; unsigned char name[24];

View File

@ -38,3 +38,5 @@ findprop 118a0 11ac7 16ad0
findprop 11700 14400 108e0 11280 1d800 findprop 11700 14400 108e0 11280 1d800
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30 findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
findprop a836 a833 1cf4 20f0 1cd0

View File

@ -289,7 +289,7 @@ ffe3 Symbol: Modifier symbol, Common, Other
ffe4 Symbol: Other symbol, Common, Other ffe4 Symbol: Other symbol, Common, Other
ffe5 Symbol: Currency symbol, Common, Other ffe5 Symbol: Currency symbol, Common, Other
ffe6 Symbol: Currency symbol, Common, Other ffe6 Symbol: Currency symbol, Common, Other
ffe7 Control: Unassigned, Common, Other ffe7 Control: Unassigned, Unknown, Other
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
ffe8 Symbol: Other symbol, Common, Other ffe8 Symbol: Other symbol, Common, Other
ffe9 Symbol: Mathematical symbol, Common, Other ffe9 Symbol: Mathematical symbol, Common, Other
@ -298,22 +298,22 @@ ffeb Symbol: Mathematical symbol, Common, Other
ffec Symbol: Mathematical symbol, Common, Other ffec Symbol: Mathematical symbol, Common, Other
ffed Symbol: Other symbol, Common, Other ffed Symbol: Other symbol, Common, Other
ffee Symbol: Other symbol, Common, Other ffee Symbol: Other symbol, Common, Other
ffef Control: Unassigned, Common, Other ffef Control: Unassigned, Unknown, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
fff8 Control: Unassigned, Common, Control fff8 Control: Unassigned, Unknown, Control
fff9 Control: Format, Common, Control fff9 Control: Format, Common, Control
fffa Control: Format, Common, Control fffa Control: Format, Common, Control
fffb Control: Format, Common, Control fffb Control: Format, Common, Control
fffc Symbol: Other symbol, Common, Other fffc Symbol: Other symbol, Common, Other
fffd Symbol: Other symbol, Common, Other fffd Symbol: Other symbol, Common, Other
fffe Control: Unassigned, Common, Other fffe Control: Unassigned, Unknown, Other
ffff Control: Unassigned, Common, Other ffff Control: Unassigned, Unknown, Other
findprop 10000 10001 e01ef f0000 100000 findprop 10000 10001 e01ef f0000 100000
10000 Letter: Other letter, Linear_B, Other 10000 Letter: Other letter, Linear_B, Other
10001 Letter: Other letter, Linear_B, Other 10001 Letter: Other letter, Linear_B, Other
e01ef Mark: Non-spacing mark, Inherited, Extend e01ef Mark: Non-spacing mark, Inherited, Extend
f0000 Control: Private use, Common, Other f0000 Control: Private use, Unknown, Other
100000 Control: Private use, Common, Other 100000 Control: Private use, Unknown, Other
findprop 1b00 12000 7c0 a840 10900 findprop 1b00 12000 7c0 a840 10900
1b00 Mark: Non-spacing mark, Balinese, Extend 1b00 Mark: Non-spacing mark, Balinese, Extend
@ -379,3 +379,10 @@ findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68 16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
10f27 Letter: Other letter, Old_Sogdian, Other 10f27 Letter: Other letter, Old_Sogdian, Other
10f30 Letter: Other letter, Sogdian, Other 10f30 Letter: Other letter, Sogdian, Other
findprop a836 a833 1cf4 20f0 1cd0
a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]

View File

@ -1778,6 +1778,8 @@ typedef struct {
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */ uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
uint8_t caseset; /* offset to multichar other cases or zero */ uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */ int32_t other_case; /* offset to other case, or zero if none */
int16_t scriptx; /* script extension value */
int16_t dummy; /* spare - to round to multiple of 4 bytes */
} ucd_record; } ucd_record;
/* UCD access macros */ /* UCD access macros */
@ -1800,6 +1802,7 @@ typedef struct {
#define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop #define UCD_GRAPHBREAK(ch) GET_UCD(ch)->gbprop
#define UCD_CASESET(ch) GET_UCD(ch)->caseset #define UCD_CASESET(ch) GET_UCD(ch)->caseset
#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case))) #define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx
/* Header for serialized pcre2 codes. */ /* Header for serialized pcre2 codes. */
@ -1858,6 +1861,7 @@ extern const uint8_t PRIV(utf8_table4)[];
#define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_) #define _pcre2_vspace_list PCRE2_SUFFIX(_pcre2_vspace_list_)
#define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_) #define _pcre2_ucd_caseless_sets PCRE2_SUFFIX(_pcre2_ucd_caseless_sets_)
#define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_) #define _pcre2_ucd_digit_sets PCRE2_SUFFIX(_pcre2_ucd_digit_sets_)
#define _pcre2_ucd_script_sets PCRE2_SUFFIX(_pcre2_ucd_script_sets_)
#define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_) #define _pcre2_ucd_records PCRE2_SUFFIX(_pcre2_ucd_records_)
#define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_) #define _pcre2_ucd_stage1 PCRE2_SUFFIX(_pcre2_ucd_stage1_)
#define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_) #define _pcre2_ucd_stage2 PCRE2_SUFFIX(_pcre2_ucd_stage2_)
@ -1880,6 +1884,7 @@ extern const uint32_t PRIV(hspace_list)[];
extern const uint32_t PRIV(vspace_list)[]; extern const uint32_t PRIV(vspace_list)[];
extern const uint32_t PRIV(ucd_caseless_sets)[]; extern const uint32_t PRIV(ucd_caseless_sets)[];
extern const uint32_t PRIV(ucd_digit_sets)[]; extern const uint32_t PRIV(ucd_digit_sets)[];
extern const uint8_t PRIV(ucd_script_sets)[];
extern const ucd_record PRIV(ucd_records)[]; extern const ucd_record PRIV(ucd_records)[];
#if PCRE2_CODE_UNIT_WIDTH == 32 #if PCRE2_CODE_UNIT_WIDTH == 32
extern const ucd_record PRIV(dummy_ucd_record)[]; extern const ucd_record PRIV(dummy_ucd_record)[];

View File

@ -4716,11 +4716,11 @@ struct sljit_jump *jump;
#if defined SLJIT_DEBUG && SLJIT_DEBUG #if defined SLJIT_DEBUG && SLJIT_DEBUG
/* dummy_ucd_record */ /* dummy_ucd_record */
const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR); const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
#endif #endif
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8); SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
@ -4756,11 +4756,11 @@ struct sljit_jump *jump;
#if defined SLJIT_DEBUG && SLJIT_DEBUG #if defined SLJIT_DEBUG && SLJIT_DEBUG
/* dummy_ucd_record */ /* dummy_ucd_record */
const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR); const ucd_record *record = GET_UCD(UNASSIGNED_UTF_CHAR);
SLJIT_ASSERT(record->script == ucp_Common && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther); SLJIT_ASSERT(record->script == ucp_Unknown && record->chartype == ucp_Cn && record->gbprop == ucp_gbOther);
SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0); SLJIT_ASSERT(record->caseset == 0 && record->other_case == 0);
#endif #endif
SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 8); SLJIT_ASSERT(UCD_BLOCK_SIZE == 128 && sizeof(ucd_record) == 12);
sljit_emit_fast_enter(compiler, RETURN_ADDR, 0); sljit_emit_fast_enter(compiler, RETURN_ADDR, 0);
@ -4781,8 +4781,19 @@ OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, UCD_BLOCK_SHIFT);
OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP2, 0);
OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2));
OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1);
// PH hacking
//fprintf(stderr, "~~A\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0); sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
} }
@ -7775,8 +7786,18 @@ if (needstype || needsscript)
/* Before anything else, we deal with scripts. */ /* Before anything else, we deal with scripts. */
if (needsscript) if (needsscript)
{ {
// PH hacking
//fprintf(stderr, "~~B\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
ccbegin = cc; ccbegin = cc;
@ -7820,12 +7841,30 @@ if (needstype || needsscript)
{ {
if (!needschar) if (!needschar)
{ {
// PH hacking
//fprintf(stderr, "~~C\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP2(SLJIT_ADD, TMP1, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 0);
// OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM2(TMP1, TMP2), 3);
} }
else else
{ {
// PH hacking
//fprintf(stderr, "~~D\n");
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP1(SLJIT_MOV, TMP1, 0, RETURN_ADDR, 0);
OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); OP1(SLJIT_MOV_U8, RETURN_ADDR, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype));
typereg = RETURN_ADDR; typereg = RETURN_ADDR;
} }
@ -9155,10 +9194,19 @@ if (common->utf && *cc == OP_REFI)
CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop); CMPTO(SLJIT_EQUAL, TMP1, 0, char1_reg, 0, loop);
// PH hacking
//fprintf(stderr, "~~E\n");
OP1(SLJIT_MOV, TMP3, 0, TMP1, 0); OP1(SLJIT_MOV, TMP3, 0, TMP1, 0);
add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL)); add_jump(compiler, &common->getucd, JUMP(SLJIT_FAST_CALL));
OP2(SLJIT_SHL, TMP1, 0, TMP2, 0, SLJIT_IMM, 2);
OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3); OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 3);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, TMP1, 0);
OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records)); OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_records));
OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case)); OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(TMP2), SLJIT_OFFSETOF(ucd_record, other_case));

File diff suppressed because it is too large Load Diff