From 87571b5af345c569b6967810c45dffc5d8f767c3 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Mon, 10 Jan 2022 16:26:41 +0000 Subject: [PATCH] Update documentation and comments for UCD generation --- maint/GenerateUcd.py | 111 ++++++++++++++++++++----------------- maint/GenerateUcpTables.py | 8 +-- maint/README | 67 +++++++++------------- src/pcre2_ucd.c | 10 +++- 4 files changed, 95 insertions(+), 101 deletions(-) diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index 505dfab..6b9c720 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -10,20 +10,15 @@ # PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of # Unicode property support. A number of extensions have since been added. The # main difference in the 2021 upgrade (apart from comments and layout) is that -# the data tables (e.g. list of script names) are now held in a separate Python -# module that is shared with the other Generate scripts. +# the data tables (e.g. list of script names) are now listed in or generated by +# a separate Python module that is shared with the other Generate scripts. # -# This script must be run in the "maint" directory. It requires eight Unicode -# data tables: DerivedBidiClass.txt, DerivedGeneralCategory.txt, -# GraphemeBreakProperty.txt, PropList.txt, Scripts.txt, ScriptExtensions.txt, -# CaseFolding.txt, and emoji-data.txt. These must be in the Unicode.tables -# subdirectory. -# -# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" -# subdirectory of the Unicode database (UCD) on the Unicode web site; -# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. PropList.txt, -# Scripts.txt, ScriptExtensions.txt, and CaseFolding.txt are directly in the -# UCD directory. +# This script must be run in the "maint" directory. 
It requires the following +# Unicode data tables: BidiMirroring.txt, CaseFolding.txt, +# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt, +# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt, +# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and +# emoji-data.txt. These must be in the Unicode.tables subdirectory. # # The emoji-data.txt file is found in the "emoji" subdirectory even though it # is technically part of a different (but coordinated) standard as shown @@ -32,6 +27,11 @@ # # http://unicode.org/Public/emoji/13.0/ReadMe.txt # +# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" +# subdirectory of the Unicode database (UCD) on the Unicode web site; +# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files +# are in the top-level UCD directory. +# # ----------------------------------------------------------------------------- # Minor modifications made to the original script: # Added #! line at start @@ -106,6 +106,7 @@ # Changes to the refactored script: # # 26-December-2021: Refactoring completed +# 10-January-2022: Addition of general Boolean property support # # ---------------------------------------------------------------------------- # @@ -117,7 +118,8 @@ # Conceptually, there is a table of records (of type ucd_record), one for each # Unicode character. Each record contains the script number, script extension # value, character type, grapheme break type, offset to caseless matching set, -# offset to the character's other case, and the bidi class/control. +# offset to the character's other case, the bidi class, and offset to bitmap of +# Boolean properties. # # A real table covering all Unicode characters would be far too big. It can be # efficiently compressed by observing that many characters have the same @@ -125,7 +127,7 @@ # the same set of records as other blocks. This leads to a 2-stage lookup # process. # -# This script constructs six tables. 
The ucd_caseless_sets table contains +# This script constructs seven tables. The ucd_caseless_sets table contains # lists of characters that all match each other caselessly. Each list is # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than # any valid character. The first list is empty; this is used for characters @@ -136,13 +138,12 @@ # in script runs all come from the same set. The first element in the vector # contains the number of subsequent elements, which are in ascending order. # -# The lists of scripts in script_names and script_abbrevs are partitioned into -# two groups. Scripts that appear in at least one character's script extension -# list come first, follwed by "Unknown" and then all the rest. This sorting is -# done certain automatically in the GenerateCommon.py script. A script's number -# is its index in these lists. +# Scripts are partitioned into two groups. Scripts that appear in at least one +# character's script extension list come first, followed by "Unknown" and then +# all the rest. This sorting is done automatically in the GenerateCommon.py +# script. A script's number is its index in the script_names list. # -# The ucd_script_sets vector contains bitmaps that represent lists of scripts +# The ucd_script_sets table contains bitmaps that represent lists of scripts # for Script Extensions properties. Each bitmap consists of a fixed number of # unsigned 32-bit numbers, enough to allocate a bit for every script that is # used in any character's extension list, that is, enough for every script @@ -151,10 +152,15 @@ # bitmap has no bits set; characters that have no script extensions have zero # as their script extensions value so that they use this map. # -# The ucd_records table contains one instance of every unique record that is -# required. The ucd_stage1 table is indexed by a character's block number, -# which is the character's code point divided by 128, since 128 is the size -# of each block. 
The result of a lookup in ucd_stage1 a "virtual" block number. +# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean +# properties. Each bitmap consists of a fixed number of unsigned 32-bit +# numbers, enough to allocate a bit for each supported Boolean property. +# +# The ucd_records table contains one instance of every unique character record +# that is required. The ucd_stage1 table is indexed by a character's block +# number, which is the character's code point divided by 128, since 128 is the +# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block +# number. # # The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by # the offset of a character within its own block, and the result is the index @@ -165,15 +171,16 @@ # # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 -# lookup 97 (0x61) in the first table in stage2 yields 23 -# record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 } -# 20 = ucp_Latin => Latin script +# lookup 97 (0x61) in the first table in stage2 yields 35 +# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 } +# 0 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # -32 (-0x20) => Other case is U+0041 # 0 => No special Script Extension property # 9 = ucp_bidiL => Bidi class left-to-right +# 22 => Offset to Boolean properties # 0 => Dummy value, unused at present # # Almost all lowercase latin characters resolve to the same record. One or two @@ -181,36 +188,38 @@ # example, k, K and the Kelvin symbol are such a set). 
# # Example: hiragana letter A (U+3042) is in block 96 (0x60) -# lookup 96 in stage1 table yields 91 -# lookup 66 (0x42) in table 91 in stage2 yields 614 -# record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 } -# 17 = ucp_Hiragana => Hiragana script +# lookup 96 in stage1 table yields 93 +# lookup 66 (0x42) in table 93 in stage2 yields 819 +# record 819 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 } +# 20 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # 0 => No other case # 0 => No special Script Extension property # 9 = ucp_bidiL => Bidi class left-to-right +# 41 => Offset to Boolean properties # 0 => Dummy value, unused at present # # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # lookup 57 in stage1 table yields 55 -# lookup 80 (0x50) in table 55 in stage2 yields 486 -# record 485 is { 78, 12, 3, 0, 0, 138, 13, 0 } -# 78 = ucp_Inherited => Script inherited from predecessor +# lookup 80 (0x50) in table 55 in stage2 yields 621 +# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 } +# 84 = ucp_Inherited => Script inherited from predecessor # 12 = ucp_Mn => Non-spacing mark # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case # 138 => Script Extension list offset = 138 # 13 = ucp_bidiNSM => Bidi class non-spacing mark +# 48 => Offset to Boolean properties # 0 => Dummy value, unused at present # # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, # 18, and 47 set. This means that this character is expected to be used with # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # -# Philip Hazel, last updated 31 December 2021. +# Philip Hazel, last updated 10 January 2022. 
############################################################################## @@ -480,7 +489,7 @@ def write_bitsets(list, item_size): unicode_version = "" # Some of the tables imported from GenerateCommon.py have alternate comment -# strings for use by GenerateUcpHeader. The comments are now wanted here, so +# strings for use by GenerateUcpHeader. The comments are not wanted here, so # remove them. bidi_classes = bidi_classes[::2] @@ -522,22 +531,18 @@ for line in file: break_props[i] = break_properties.index('Extended_Pictographic') file.close() -# The Script Extensions property default value is the Script value. Parse the -# file, setting 'Unknown' as the default (this will never be a Script Extension -# value), then scan it and fill in the default from Scripts. Code added by PH -# in October 2018. Positive values are used for just a single script for a -# code point. Negative values are negated offsets in a list of bitsets of -# multiple scripts. Initialize this list with a single entry, as the zeroth -# element is never used. +# Handle script extensions. The get_script_extension() function maintains a +# list of unique bitmaps representing lists of scripts, returning the offset +# in that list. Initialize the list with an empty set, which is used for +# characters that have no script extensions. script_lists = [[]] - last_script_extension = "" scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0) # Find the Boolean properties of each character. This next bit of magic creates -# a list of empty lists. Just using [[]] * MAX_UNICODE gives a list of -# references to the *same* list, which is not what we want. +# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to +# the *same* list, which is not what we want. 
bprops = [[] for _ in range(MAX_UNICODE)] @@ -601,7 +606,6 @@ for line in file: file.close() - # Scan each character's boolean property list and created a list of unique # lists, at the same time, setting the index in that list for each property in # the bool_props vector. @@ -620,7 +624,6 @@ for c in range(MAX_UNICODE): bool_props[c] = i - # With the addition of the Script Extensions field, we needed some padding to # get the Unicode records up to 12 bytes (multiple of 4). Originally this was a # 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits @@ -793,7 +796,7 @@ const ucd_record PRIV(dummy_ucd_record)[] = {{ 0, /* other case */ ucp_Unknown, /* script extension */ ucp_bidiL, /* bidi class */ - 0, /* bool properties offset */ + 0, /* bool properties offset */ 0 /* dummy filler */ }}; #endif @@ -864,14 +867,18 @@ for d in digitsets: f.write("\n};\n\n") f.write("""\ -/* This vector is a list of script bitsets for the Script Extension property. */ +/* This vector is a list of script bitsets for the Script Extension property. +The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as +ucd_script_sets_item_size. */ const uint32_t PRIV(ucd_script_sets)[] = { """) write_bitsets(script_lists, script_list_item_size) f.write("""\ -/* This vector is a list of bitsets for Boolean properties. */ +/* This vector is a list of bitsets for Boolean properties. The number of +32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in +pcre2_ucp.h. */ const uint32_t PRIV(ucd_boolprop_sets)[] = { """) diff --git a/maint/GenerateUcpTables.py b/maint/GenerateUcpTables.py index 2286a76..528ff91 100755 --- a/maint/GenerateUcpTables.py +++ b/maint/GenerateUcpTables.py @@ -42,7 +42,7 @@ # Note subsequent changes here: # # 27-December-2021: Added support for 4-letter script abbreviations. 
-# xx-January-2022: Further updates for Boolean property support +# 10-January-2022: Further updates for Boolean property support # ----------------------------------------------------------------------------- @@ -112,20 +112,18 @@ utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(catego utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names))) utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names))) - for name in bool_properties: utt_table.append((stdname(name), name, 'PT_BOOL')) if name in abbreviations: for abbrev in abbreviations[name]: utt_table.append((stdname(abbrev), name, 'PT_BOOL')) - # Now add specials and synonyms. Note both the standardized and capitalized # forms are needed. utt_table.append(('any', 'Any', 'PT_ANY')) -utt_table.append(('l&', 'L&', 'PT_LAMP')) -utt_table.append(('lc', 'LC', 'PT_LAMP')) +utt_table.append(('l&', 'L&', 'PT_LAMP')) +utt_table.append(('lc', 'LC', 'PT_LAMP')) utt_table.append(('xan', 'Xan', 'PT_ALNUM')) utt_table.append(('xps', 'Xps', 'PT_PXSPACE')) utt_table.append(('xsp', 'Xsp', 'PT_SPACE')) diff --git a/maint/README b/maint/README index 7bb2314..f21ff87 100644 --- a/maint/README +++ b/maint/README @@ -22,13 +22,14 @@ GenerateCommon.py GenerateTest26.py A Python script that generates input and expected output test data for test - 26, which tests Unicode property support. + 26, which tests certain aspects of Unicode property support. GenerateUcd.py A Python script that generates the file pcre2_ucd.c from GenerateCommon.py and Unicode data files, which are themselves downloaded from the Unicode web site. The generated file contains the tables for a 2-stage lookup of Unicode - properties, along with some auxiliary tables. + properties, along with some auxiliary tables. The script starts with a long + comment that gives details of the tables it constructs. 
GenerateUcpHeader.py A Python script that generates the file pcre2_ucp.h from GenerateCommon.py @@ -38,7 +39,7 @@ GenerateUcpHeader.py GenerateUcpTables.py A Python script that generates the file pcre2_ucptables.c from GenerateCommon.py and Unicode data files. The generated file contains tables - for looking up Unicode properties. + for looking up Unicode property names. ManyConfigTests A shell script that runs "configure, make, test" a number of times with @@ -63,11 +64,11 @@ Unicode.tables ScriptExtensions.txt are where to look for script information. ucptest.c - A short C program for testing the Unicode property macros that do lookups in - the pcre2_ucd.c data, mainly useful after rebuilding the Unicode property - table. Compile and run this in the "maint" directory (see comments at its - head). This program can also be used to find characters with specific - properties. + A program for testing the Unicode property macros that do lookups in the + pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables. + Compile and run this in the "maint" directory (see comments at its head). + This program can also be used to find characters with specific properties and + to list which properties are supported. ucptestdata A directory containing four files, testinput{1,2} and testoutput{1,2}, for @@ -85,35 +86,18 @@ utf8.c Updating to a new Unicode release ================================= -**** WORK IN PROGRESS **** - -Work is going on in the area of Unicode property handling. What follows here is -now out-of-date. It will be updated once the current project is complete. -Updating to any new Unicode release is best left till then also. -06-January-2022 - - When there is a new release of Unicode, the files in Unicode.tables must be -refreshed from the web site. If the new version of Unicode adds new character -scripts, the lists in GenerateCommon.py must be updated. I have been adding -each new group at the end of the relevant list, with a comment. 
+refreshed from the web site. Once that is done, the four Python scripts that +generate files from the Unicode data can be run from within the "maint" +directory. -NOTE: Both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of -supported Unicode scripts that also have to be updated. These lists are in -alphabetical order. +Note: Previously, it was necessary to update lists of scripts and their +abbreviations by hand before running the Python scripts. This is no longer +necessary because the scripts have been upgraded to extract this information +themselves. Also, there used to be explicit lists of scripts in two of the man +pages. This is no longer the case. -There are two lists in GenerateCommon.py: the full names and the abbreviations -that are found in the ScriptExtensions.txt file. A list of script names and -their abbreviations can be found in the PropertyValueAliases.txt file on the -Unicode web site. There is also a Wikipedia page that lists them, and notes the -Unicode version in which they were introduced: - -https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts - -Once the script name lists have been updated, the three generator scripts can -be run from within the maint directory. If you get the error "ValueError: -list.index(x): x not in list", the cause is usually a missing (or misspelt) -name in one of the lists. 
You can give an output file name as an argument, but +You can give an output file name as an argument to the following scripts, but by default: GenerateUcd.py creates pcre2_ucd.c ) @@ -121,17 +105,18 @@ GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory GenerateUcpTables.py creates pcre2_ucptables.c ) These files can be compared against the existing versions in the src directory -to check on any changes before replacing the old files, but you can generate -directly into the final location by running +to check on any changes before replacing the old files, but you can also +generate directly into the final location by running: ./GenerateUcd.py ../src/pcre2_ucd.c ./GenerateUcpHeader.py ../src/pcre2_ucp.h ./GenerateUcpTables.py ../src/pcre2_ucptables.c -The ucptest program can be compiled and used to check that the new tables work -properly, using the data files in ucptestdata to check a number of test -characters. See the comments at the start of ucptest.c. If there are new -scripts, adding a few tests to the files in ucptestdata is a good idea. +Once the .c and .h files are in the ../src directory, the ucptest program can +be compiled and used to check that the new tables work properly. The data files +in ucptestdata are set up to check a number of test characters. See the +comments at the start of ucptest.c. If there are new scripts, adding a few +tests to the files in ucptestdata is a good idea. Finally, you should run the GenerateTest26.py script to regenerate new versions of the input and expected output from a series of Unicode property tests that @@ -476,4 +461,4 @@ years. 
Philip Hazel Email local part: Philip.Hazel Email domain: gmail.com -Last updated: 31 December 2021 +Last updated: 10 January 2022 diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index fdae767..0a598ca 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -107,7 +107,7 @@ const ucd_record PRIV(dummy_ucd_record)[] = {{ 0, /* other case */ ucp_Unknown, /* script extension */ ucp_bidiL, /* bidi class */ - 0, /* bool properties offset */ + 0, /* bool properties offset */ 0 /* dummy filler */ }}; #endif @@ -168,7 +168,9 @@ const uint32_t PRIV(ucd_digit_sets)[] = { 0x1e959, 0x1fbf9, }; -/* This vector is a list of script bitsets for the Script Extension property. */ +/* This vector is a list of script bitsets for the Script Extension property. +The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as +ucd_script_sets_item_size. */ const uint32_t PRIV(ucd_script_sets)[] = { 0x00000000u, 0x00000000u, 0x00000000u, @@ -236,7 +238,9 @@ const uint32_t PRIV(ucd_script_sets)[] = { 0x2000ffc0u, 0x3984a010u, 0x00000001u, }; -/* This vector is a list of bitsets for Boolean properties. */ +/* This vector is a list of bitsets for Boolean properties. The number of +32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in +pcre2_ucp.h. */ const uint32_t PRIV(ucd_boolprop_sets)[] = { 0x00000000u, 0x00000000u,