From 04ba4bce0fee3bd91251b5c29e73bb8b2120ce9c Mon Sep 17 00:00:00 2001 From: "Philip.Hazel" Date: Sat, 6 Oct 2018 17:39:52 +0000 Subject: [PATCH] Unicode properties data records extended to 12-bytes to include a ScriptExtensions property. --- maint/MultiStage2.py | 179 +- maint/Unicode.tables/ScriptExtensions.txt | 531 ++ maint/ucptest.c | 392 +- maint/ucptestdata/testinput1 | 2 + maint/ucptestdata/testoutput1 | 21 +- src/pcre2_internal.h | 5 + src/pcre2_jit_compile.c | 62 +- src/pcre2_ucd.c | 7202 +++++++++++---------- 8 files changed, 4642 insertions(+), 3752 deletions(-) create mode 100644 maint/Unicode.tables/ScriptExtensions.txt diff --git a/maint/MultiStage2.py b/maint/MultiStage2.py index cdb021a..2765a81 100755 --- a/maint/MultiStage2.py +++ b/maint/MultiStage2.py @@ -61,26 +61,39 @@ # property, which is used by PCRE2 as a grapheme breaking property. This was # done when updating to Unicode 11.0.0 (July 2018). # +# Added code to add a Script Extensions field to records. +# # # The main tables generated by this script are used by macros defined in # pcre2_internal.h. They look up Unicode character properties using short # sequences of code that contains no branches, which makes for greater speed. # # Conceptually, there is a table of records (of type ucd_record), containing a -# script number, character type, grapheme break type, offset to caseless -# matching set, and offset to the character's other case for every character. -# However, a real table covering all Unicode characters would be far too big. -# It can be efficiently compressed by observing that many characters have the -# same record, and many blocks of characters (taking 128 characters in a block) -# have the same set of records as other blocks. This leads to a 2-stage lookup -# process. +# script number, script extension value, character type, grapheme break type, +# offset to caseless matching set, offset to the character's other case, for +# every character. However, a real table covering all Unicode characters would +# be far too big. It can be efficiently compressed by observing that many +# characters have the same record, and many blocks of characters (taking 128 +# characters in a block) have the same set of records as other blocks. This +# leads to a 2-stage lookup process. # -# This script constructs four tables. The ucd_caseless_sets table contains +# This script constructs six tables. The ucd_caseless_sets table contains # lists of characters that all match each other caselessly. Each list is # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than # any valid character. The first list is empty; this is used for characters # that are not part of any list. # +# The ucd_digit_sets table contains the code points of the '9' characters in +# each set of 10 decimal digits in Unicode. This is used to ensure that digits +# in script runs all come from the same set. The first element in the vector +# contains the number of subsequent elements, which are in ascending order. +# +# The ucd_script_sets vector contains lists of script numbers that are the +# Script Extensions properties of certain characters. Each list is terminated +# by zero (ucp_Unknown). A character with more than one script listed for its +# Script Extension property has a negative value in its record. This is the +# negated offset to the start of the relevant list. +# # The ucd_records table contains one instance of every unique record that is # required. 
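The sign convention just described can be made concrete with a short sketch. This is illustrative only, not part of MultiStage2.py or of the generated C sources (which do the equivalent in C); the function and parameter names are invented for the example. Given a record's script-extension value and the generated ucd_script_sets vector, the full set of script numbers is recovered like this:

    def resolve_script_extensions(scriptx, ucd_script_sets):
        # A non-negative value is itself the single script number.
        if scriptx >= 0:
            return [scriptx]
        # A negative value is the negated offset of a zero-terminated
        # sublist of script numbers in ucd_script_sets.
        scripts = []
        i = -scriptx
        while ucd_script_sets[i] != 0:
            scripts.append(ucd_script_sets[i])
            i += 1
        return scripts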
The ucd_stage1 table is indexed by a character's block number, and # yields what is in effect a "virtual" block number. The ucd_stage2 table is a @@ -117,11 +130,8 @@ # In these examples, no other blocks resolve to the same "virtual" block, as it # happens, but plenty of other blocks do share "virtual" blocks. # -# There is a fourth table, maintained by hand, which translates from the -# individual character types such as ucp_Cc to the general types like ucp_C. -# # Philip Hazel, 03 July 2008 -# Last Updated: 07 July 2018 +# Last Updated: 03 October 2018 # # # 01-March-2010: Updated list of scripts for Unicode 5.2.0 @@ -144,6 +154,7 @@ # 07-July-2018: Added code to scan emoji-data.txt for the Extended # Pictographic property. # 01-October-2018: Added the 'Unknown' script name +# 03-October-2018: Added new field for Script Extensions ############################################################################## @@ -164,7 +175,33 @@ def get_other_case(chardata): if chardata[1] == 'C' or chardata[1] == 'S': return int(chardata[2], 16) - int(chardata[0], 16) return 0 + +# Parse a line of ScriptExtensions.txt +def get_script_extension(chardata): + this_script_list = list(chardata[1].split(' ')) + if len(this_script_list) == 1: + return script_abbrevs.index(this_script_list[0]) + + script_numbers = [] + for d in this_script_list: + script_numbers.append(script_abbrevs.index(d)) + script_numbers.append(0) + script_numbers_length = len(script_numbers) + for i in range(1, len(script_lists) - script_numbers_length + 1): + for j in range(0, script_numbers_length): + found = True + if script_lists[i+j] != script_numbers[j]: + found = False + break + if found: + return -i + + # Not found in existing lists + + return_value = len(script_lists) + script_lists.extend(script_numbers) + return -return_value # Read the whole table in memory, setting/checking the Unicode version def read_table(file_name, get_value, default_value): @@ -330,24 +367,24 @@ def print_records(records, record_size): print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))) print('};\n') -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ +script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', + 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', + 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', + 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', + 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', + 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', + 'Thai', 'Tibetan', 'Tifinagh', 
'Ugaritic', 'Yi', # New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ + 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', # New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ + 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', # New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ - 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ + 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', + 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', + 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', + 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', # New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', \ + 'Batak', 'Brahmi', 'Mandaic', # New for Unicode 6.1.0 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', # New for Unicode 7.0.0 @@ -365,6 +402,39 @@ script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', 'Old_Sogdian', 'Sogdian' ] + +script_abbrevs = [ + 'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans', + 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor', + 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr', + 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb', + 'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', + 'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', + 'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii', +#New for Unicode 5.0 + 'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx', +#New for Unicode 5.1 + 'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur', + 'Sund', 'Vaii', +#New for Unicode 5.2 + 'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu', + 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt', +#New for Unicode 6.0.0 + 'Batk', 'Brah', 'Mand', +#New for Unicode 6.1.0 + 'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr', +#New for Unicode 7.0.0 + 'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj', + 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm', + 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara', +#New for Unicode 8.0.0 + 'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw', +#New for Unicode 10.0.0 + 'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo', + 'Zanb', +#New for Unicode 11.0.0 + 'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd' + ] category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', @@ -415,6 +485,28 @@ for line in file: break_props[i] = break_property_names.index('Extended_Pictographic') file.close() +# The Script Extensions property default value is the Script value. Parse the +# file, setting 'Unknown' as the default (this will never be a Script Extension +# value), then scan it and fill in the default from Scripts. Code added by PH +# in October 2018. Positive values are used for just a single script for a +# code point. Negative values are negated offsets in a list of lists of +# multiple scripts. 
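As a worked illustration of how get_script_extension() above assigns these values (the data used here is made up for the example, not quoted from ScriptExtensions.txt): suppose the first multi-script set processed is "Arab Copt". 'Arab' is script_abbrevs[1] and 'Copt' is script_abbrevs[11], so the zero-terminated sublist [1, 11, 0] is appended to script_lists at offset 1 and -1 is returned; every code point whose Script Extensions set is exactly {Arab, Copt} then stores -1 in its record, and the generated ucd_script_sets vector begins 0, 1, 11, 0, ...

    script_lists = [0]       # fresh list, as initialised below
    value = get_script_extension(['0000', 'Arab Copt'])   # code point field is not used here
    assert value == -1
    assert script_lists == [0, 1, 11, 0]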
Initialize this list with a single entry, as the zeroth +# element is never used. + +script_lists = [0] +script_abbrevs_default = script_abbrevs.index('Zzzz') +scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default) + +for i in range(0, MAX_UNICODE): + if scriptx[i] == script_abbrevs_default: + scriptx[i] = script[i] + +# With the addition of the new Script Extensions field, we need some padding +# to get the Unicode records up to 12 bytes (multiple of 4). Set a value +# greater than 255 to make the field 16 bits. + +padding_dummy = [0] * MAX_UNICODE +padding_dummy[0] = 256 # This block of code was added by PH in September 2012. I am not a Python # programmer, so the style is probably dreadful, but it does the job. It scans @@ -427,7 +519,7 @@ file.close() # sets only one value, so first we go through the table and set "return" # offsets for those that are not already set. -for c in range(0x10ffff): +for c in range(MAX_UNICODE): if other_case[c] != 0 and other_case[c + other_case[c]] == 0: other_case[c + other_case[c]] = -other_case[c] @@ -435,7 +527,7 @@ for c in range(0x10ffff): sets = [] -for c in range(0x10ffff): +for c in range(MAX_UNICODE): o = c + other_case[c] # Trigger when this character's other case does not point back here. We @@ -489,7 +581,7 @@ for s in sets: # Combine the tables table, records = combine_tables(script, category, break_props, - caseless_offsets, other_case) + caseless_offsets, other_case, scriptx, padding_dummy) record_size, record_struct = get_record_size_struct(list(records.keys())) @@ -537,7 +629,7 @@ print("a comment was received about space saving - maybe the guy linked") print("all the modules rather than using a library - so we include a") print("condition to cut out the tables when not needed. But don't leave") print("a totally empty module because some compilers barf at that.") -print("Instead, just supply small dummy tables. */") +print("Instead, just supply some small dummy tables. */") print() print("#ifndef SUPPORT_UNICODE") print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};") @@ -559,6 +651,8 @@ print(" ucp_Cn, /* type unassigned */") print(" ucp_gbOther, /* grapheme break property */") print(" 0, /* case set */") print(" 0, /* other case */") +print(" ucp_Unknown, /* script extension */") +print(" 0, /* dummy filler */") print(" }};") print("#endif") print() @@ -609,8 +703,7 @@ digitsets.sort() print("/* This table lists the code points for the '9' characters in each") print("set of decimal digits. It is used to ensure that all the digits in") -print("a script run come from the same set. */") -print() +print("a script run come from the same set. */\n") print("const uint32_t PRIV(ucd_digit_sets)[] = {") print(" %d, /* Number of subsequent values */" % len(digitsets), end='') @@ -621,12 +714,28 @@ for d in digitsets: count = 0 print(" 0x%05x," % d, end='') count += 1 -print("\n};") -print() +print("\n};\n") + +print("/* This vector is a list of lists of scripts for the Script Extension") +print("property. Each sublist is zero-terminated. */\n") +print("const uint8_t PRIV(ucd_script_sets)[] = {") + +count = 0 +print(" /* 0 */", end='') +for d in script_lists: + print(" %3d," % d, end='') + count += 1 + if d == 0: + print("\n /* %3d */" % count, end='') +print("\n};\n") # Output the main UCD tables. -print("/* These are the main two-stage UCD tables. */\n") +print("/* These are the main two-stage UCD tables. 
The fields in each record are:") +print("script (8 bits), character type (8 bits), grapheme break property (8 bits),") +print("offset to multichar other cases or zero (8 bits), offset to other case") +print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy") +print("16-bit field to make the whole thing a multiple of 4 bytes. */\n") print_records(records, record_size) print_table(min_stage1, 'PRIV(ucd_stage1)') diff --git a/maint/Unicode.tables/ScriptExtensions.txt b/maint/Unicode.tables/ScriptExtensions.txt new file mode 100644 index 0000000..42959b1 --- /dev/null +++ b/maint/Unicode.tables/ScriptExtensions.txt @@ -0,0 +1,531 @@ +# ScriptExtensions-11.0.0.txt +# Date: 2018-02-04, 20:04:00 GMT +# © 2018 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ +# +# The Script_Extensions property indicates which characters are commonly used +# with more than one script, but with a limited number of scripts. +# For each code point, there is one or more property values. Each such value is a Script property value. +# For more information, see: +# UAX #24, Unicode Script Property: http://www.unicode.org/reports/tr24/ +# Especially the sections: +# http://www.unicode.org/reports/tr24/#Assignment_Script_Values +# http://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values +# +# Each Script_Extensions value in this file consists of a set +# of one or more abbreviated Script property values. The ordering of the +# values in that set is not material, but for stability in presentation +# it is given here as alphabetical. +# +# The Script_Extensions values are presented in sorted order in the file. +# They are sorted first by the number of Script property values in their sets, +# and then alphabetically by first differing Script property value. +# +# Following each distinct Script_Extensions value is the list of code +# points associated with that value, listed in code point order. +# +# All code points not explicitly listed for Script_Extensions +# have as their value the corresponding Script property value +# +# @missing: 0000..10FFFF;
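For reference, the new 12-byte record layout spelled out in the generated comment above can be sketched with Python's struct module. This is purely illustrative: the authoritative definition is the C ucd_record structure in the PCRE2 sources, and the exact padding and byte order there are whatever the C compiler produces.

    import struct

    # script, chartype, gbprop, caseset: one byte each; other_case: 32-bit
    # signed; scriptx: 16-bit signed; plus a 16-bit dummy filler = 12 bytes.
    UCD_RECORD = struct.Struct('<BBBBihH')
    assert UCD_RECORD.size == 12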