Update documentation and comments for UCD generation
This commit is contained in:
parent
838cdac4dc
commit
87571b5af3
|
@ -10,20 +10,15 @@
|
||||||
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
|
||||||
# Unicode property support. A number of extensions have since been added. The
|
# Unicode property support. A number of extensions have since been added. The
|
||||||
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
# main difference in the 2021 upgrade (apart from comments and layout) is that
|
||||||
# the data tables (e.g. list of script names) are now held in a separate Python
|
# the data tables (e.g. list of script names) are now listed in or generated by
|
||||||
# module that is shared with the other Generate scripts.
|
# a separate Python module that is shared with the other Generate scripts.
|
||||||
#
|
#
|
||||||
# This script must be run in the "maint" directory. It requires eight Unicode
|
# This script must be run in the "maint" directory. It requires the following
|
||||||
# data tables: DerivedBidiClass.txt, DerivedGeneralCategory.txt,
|
# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
|
||||||
# GraphemeBreakProperty.txt, PropList.txt, Scripts.txt, ScriptExtensions.txt,
|
# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
|
||||||
# CaseFolding.txt, and emoji-data.txt. These must be in the Unicode.tables
|
# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
|
||||||
# subdirectory.
|
# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
|
||||||
#
|
# emoji-data.txt. These must be in the Unicode.tables subdirectory.
|
||||||
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
|
||||||
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
|
||||||
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. PropList.txt,
|
|
||||||
# Scripts.txt, ScriptExtensions.txt, and CaseFolding.txt are directly in the
|
|
||||||
# UCD directory.
|
|
||||||
#
|
#
|
||||||
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
# The emoji-data.txt file is found in the "emoji" subdirectory even though it
|
||||||
# is technically part of a different (but coordinated) standard as shown
|
# is technically part of a different (but coordinated) standard as shown
|
||||||
|
@ -32,6 +27,11 @@
|
||||||
#
|
#
|
||||||
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
# http://unicode.org/Public/emoji/13.0/ReadMe.txt
|
||||||
#
|
#
|
||||||
|
# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
|
||||||
|
# subdirectory of the Unicode database (UCD) on the Unicode web site;
|
||||||
|
# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
|
||||||
|
# are in the top-level UCD directory.
|
||||||
|
#
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Minor modifications made to the original script:
|
# Minor modifications made to the original script:
|
||||||
# Added #! line at start
|
# Added #! line at start
|
||||||
|
@ -106,6 +106,7 @@
|
||||||
# Changes to the refactored script:
|
# Changes to the refactored script:
|
||||||
#
|
#
|
||||||
# 26-December-2021: Refactoring completed
|
# 26-December-2021: Refactoring completed
|
||||||
|
# 10-January-2022: Addition of general Boolean property support
|
||||||
#
|
#
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
|
@ -117,7 +118,8 @@
|
||||||
# Conceptually, there is a table of records (of type ucd_record), one for each
|
# Conceptually, there is a table of records (of type ucd_record), one for each
|
||||||
# Unicode character. Each record contains the script number, script extension
|
# Unicode character. Each record contains the script number, script extension
|
||||||
# value, character type, grapheme break type, offset to caseless matching set,
|
# value, character type, grapheme break type, offset to caseless matching set,
|
||||||
# offset to the character's other case, and the bidi class/control.
|
# offset to the character's other case, the bidi class, and offset to bitmap of
|
||||||
|
# Boolean properties.
|
||||||
#
|
#
|
||||||
# A real table covering all Unicode characters would be far too big. It can be
|
# A real table covering all Unicode characters would be far too big. It can be
|
||||||
# efficiently compressed by observing that many characters have the same
|
# efficiently compressed by observing that many characters have the same
|
||||||
|
@ -125,7 +127,7 @@
|
||||||
# the same set of records as other blocks. This leads to a 2-stage lookup
|
# the same set of records as other blocks. This leads to a 2-stage lookup
|
||||||
# process.
|
# process.
|
||||||
#
|
#
|
||||||
# This script constructs six tables. The ucd_caseless_sets table contains
|
# This script constructs seven tables. The ucd_caseless_sets table contains
|
||||||
# lists of characters that all match each other caselessly. Each list is
|
# lists of characters that all match each other caselessly. Each list is
|
||||||
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
|
||||||
# any valid character. The first list is empty; this is used for characters
|
# any valid character. The first list is empty; this is used for characters
|
||||||
|
@ -136,13 +138,12 @@
|
||||||
# in script runs all come from the same set. The first element in the vector
|
# in script runs all come from the same set. The first element in the vector
|
||||||
# contains the number of subsequent elements, which are in ascending order.
|
# contains the number of subsequent elements, which are in ascending order.
|
||||||
#
|
#
|
||||||
# The lists of scripts in script_names and script_abbrevs are partitioned into
|
# Scripts are partitioned into two groups. Scripts that appear in at least one
|
||||||
# two groups. Scripts that appear in at least one character's script extension
|
# character's script extension list come first, followed by "Unknown" and then
|
||||||
# list come first, follwed by "Unknown" and then all the rest. This sorting is
|
# all the rest. This sorting is done automatically in the GenerateCommon.py
|
||||||
# done certain automatically in the GenerateCommon.py script. A script's number
|
# script. A script's number is its index in the script_names list.
|
||||||
# is its index in these lists.
|
|
||||||
#
|
#
|
||||||
# The ucd_script_sets vector contains bitmaps that represent lists of scripts
|
# The ucd_script_sets table contains bitmaps that represent lists of scripts
|
||||||
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
# for Script Extensions properties. Each bitmap consists of a fixed number of
|
||||||
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
# unsigned 32-bit numbers, enough to allocate a bit for every script that is
|
||||||
# used in any character's extension list, that is, enough for every script
|
# used in any character's extension list, that is, enough for every script
|
||||||
|
@ -151,10 +152,15 @@
|
||||||
# bitmap has no bits set; characters that have no script extensions have zero
|
# bitmap has no bits set; characters that have no script extensions have zero
|
||||||
# as their script extensions value so that they use this map.
|
# as their script extensions value so that they use this map.
|
||||||
#
|
#
|
||||||
# The ucd_records table contains one instance of every unique record that is
|
# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
|
||||||
# required. The ucd_stage1 table is indexed by a character's block number,
|
# properties. Each bitmap consists of a fixed number of unsigned 32-bit
|
||||||
# which is the character's code point divided by 128, since 128 is the size
|
# numbers, enough to allocate a bit for each supported Boolean property.
|
||||||
# of each block. The result of a lookup in ucd_stage1 a "virtual" block number.
|
#
|
||||||
|
# The ucd_records table contains one instance of every unique character record
|
||||||
|
# that is required. The ucd_stage1 table is indexed by a character's block
|
||||||
|
# number, which is the character's code point divided by 128, since 128 is the
|
||||||
|
# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
|
||||||
|
# number.
|
||||||
#
|
#
|
||||||
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
|
||||||
# the offset of a character within its own block, and the result is the index
|
# the offset of a character within its own block, and the result is the index
|
||||||
|
@ -165,15 +171,16 @@
|
||||||
#
|
#
|
||||||
# Example: lowercase "a" (U+0061) is in block 0
|
# Example: lowercase "a" (U+0061) is in block 0
|
||||||
# lookup 0 in stage1 table yields 0
|
# lookup 0 in stage1 table yields 0
|
||||||
# lookup 97 (0x61) in the first table in stage2 yields 23
|
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||||
# record 23 is { 20, 5, 12, 0, -32, 0, 9, 0 }
|
# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 }
|
||||||
# 20 = ucp_Latin => Latin script
|
# 0 = ucp_Latin => Latin script
|
||||||
# 5 = ucp_Ll => Lower case letter
|
# 5 = ucp_Ll => Lower case letter
|
||||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# -32 (-0x20) => Other case is U+0041
|
# -32 (-0x20) => Other case is U+0041
|
||||||
# 0 => No special Script Extension property
|
# 0 => No special Script Extension property
|
||||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||||
|
# 22 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
# Almost all lowercase latin characters resolve to the same record. One or two
|
# Almost all lowercase latin characters resolve to the same record. One or two
|
||||||
|
@ -181,36 +188,38 @@
|
||||||
# example, k, K and the Kelvin symbol are such a set).
|
# example, k, K and the Kelvin symbol are such a set).
|
||||||
#
|
#
|
||||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||||
# lookup 96 in stage1 table yields 91
|
# lookup 96 in stage1 table yields 93
|
||||||
# lookup 66 (0x42) in table 91 in stage2 yields 614
|
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||||
# record 614 is { 17, 7, 12, 0, 0, 0, 9, 0 }
|
# record 819 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 }
|
||||||
# 17 = ucp_Hiragana => Hiragana script
|
# 20 = ucp_Hiragana => Hiragana script
|
||||||
# 7 = ucp_Lo => Other letter
|
# 7 = ucp_Lo => Other letter
|
||||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# 0 => No other case
|
# 0 => No other case
|
||||||
# 0 => No special Script Extension property
|
# 0 => No special Script Extension property
|
||||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||||
|
# 41 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||||
# lookup 57 in stage1 table yields 55
|
# lookup 57 in stage1 table yields 55
|
||||||
# lookup 80 (0x50) in table 55 in stage2 yields 486
|
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||||
# record 485 is { 78, 12, 3, 0, 0, 138, 13, 0 }
|
# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 }
|
||||||
# 78 = ucp_Inherited => Script inherited from predecessor
|
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||||
# 12 = ucp_Mn => Non-spacing mark
|
# 12 = ucp_Mn => Non-spacing mark
|
||||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# 0 => No other case
|
# 0 => No other case
|
||||||
# 138 => Script Extension list offset = 138
|
# 138 => Script Extension list offset = 138
|
||||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||||
|
# 48 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||||
# 18, and 47 set. This means that this character is expected to be used with
|
# 18, and 47 set. This means that this character is expected to be used with
|
||||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||||
#
|
#
|
||||||
# Philip Hazel, last updated 31 December 2021.
|
# Philip Hazel, last updated 10 January 2022.
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
@ -480,7 +489,7 @@ def write_bitsets(list, item_size):
|
||||||
unicode_version = ""
|
unicode_version = ""
|
||||||
|
|
||||||
# Some of the tables imported from GenerateCommon.py have alternate comment
|
# Some of the tables imported from GenerateCommon.py have alternate comment
|
||||||
# strings for use by GenerateUcpHeader. The comments are now wanted here, so
|
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
|
||||||
# remove them.
|
# remove them.
|
||||||
|
|
||||||
bidi_classes = bidi_classes[::2]
|
bidi_classes = bidi_classes[::2]
|
||||||
|
@ -522,22 +531,18 @@ for line in file:
|
||||||
break_props[i] = break_properties.index('Extended_Pictographic')
|
break_props[i] = break_properties.index('Extended_Pictographic')
|
||||||
file.close()
|
file.close()
|
||||||
|
|
||||||
# The Script Extensions property default value is the Script value. Parse the
|
# Handle script extensions. The get_script_extension() function maintains a
|
||||||
# file, setting 'Unknown' as the default (this will never be a Script Extension
|
# list of unique bitmaps representing lists of scripts, returning the offset
|
||||||
# value), then scan it and fill in the default from Scripts. Code added by PH
|
# in that list. Initialize the list with an empty set, which is used for
|
||||||
# in October 2018. Positive values are used for just a single script for a
|
# characters that have no script extensions.
|
||||||
# code point. Negative values are negated offsets in a list of bitsets of
|
|
||||||
# multiple scripts. Initialize this list with a single entry, as the zeroth
|
|
||||||
# element is never used.
|
|
||||||
|
|
||||||
script_lists = [[]]
|
script_lists = [[]]
|
||||||
|
|
||||||
last_script_extension = ""
|
last_script_extension = ""
|
||||||
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
|
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
|
||||||
|
|
||||||
# Find the Boolean properties of each character. This next bit of magic creates
|
# Find the Boolean properties of each character. This next bit of magic creates
|
||||||
# a list of empty lists. Just using [[]] * MAX_UNICODE gives a list of
|
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
|
||||||
# references to the *same* list, which is not what we want.
|
# the *same* list, which is not what we want.
|
||||||
|
|
||||||
bprops = [[] for _ in range(MAX_UNICODE)]
|
bprops = [[] for _ in range(MAX_UNICODE)]
|
||||||
|
|
||||||
|
@ -601,7 +606,6 @@ for line in file:
|
||||||
|
|
||||||
file.close()
|
file.close()
|
||||||
|
|
||||||
|
|
||||||
# Scan each character's boolean property list and create a list of unique
|
# Scan each character's boolean property list and create a list of unique
|
||||||
# lists, at the same time, setting the index in that list for each property in
|
# lists, at the same time, setting the index in that list for each property in
|
||||||
# the bool_props vector.
|
# the bool_props vector.
|
||||||
|
@ -620,7 +624,6 @@ for c in range(MAX_UNICODE):
|
||||||
|
|
||||||
bool_props[c] = i
|
bool_props[c] = i
|
||||||
|
|
||||||
|
|
||||||
# With the addition of the Script Extensions field, we needed some padding to
|
# With the addition of the Script Extensions field, we needed some padding to
|
||||||
# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a
|
# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a
|
||||||
# 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits
|
# 16-bit field and padding_dummy[0] was set to 256 to ensure this, but 8 bits
|
||||||
|
@ -793,7 +796,7 @@ const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||||
0, /* other case */
|
0, /* other case */
|
||||||
ucp_Unknown, /* script extension */
|
ucp_Unknown, /* script extension */
|
||||||
ucp_bidiL, /* bidi class */
|
ucp_bidiL, /* bidi class */
|
||||||
0, /* bool properties offset */
|
0, /* bool properties offset */
|
||||||
0 /* dummy filler */
|
0 /* dummy filler */
|
||||||
}};
|
}};
|
||||||
#endif
|
#endif
|
||||||
|
@ -864,14 +867,18 @@ for d in digitsets:
|
||||||
f.write("\n};\n\n")
|
f.write("\n};\n\n")
|
||||||
|
|
||||||
f.write("""\
|
f.write("""\
|
||||||
/* This vector is a list of script bitsets for the Script Extension property. */
|
/* This vector is a list of script bitsets for the Script Extension property.
|
||||||
|
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||||
|
ucd_script_sets_item_size. */
|
||||||
|
|
||||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||||
""")
|
""")
|
||||||
write_bitsets(script_lists, script_list_item_size)
|
write_bitsets(script_lists, script_list_item_size)
|
||||||
|
|
||||||
f.write("""\
|
f.write("""\
|
||||||
/* This vector is a list of bitsets for Boolean properties. */
|
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||||
|
32-bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||||
|
pcre2_ucp.h. */
|
||||||
|
|
||||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||||
""")
|
""")
|
||||||
|
|
|
@ -42,7 +42,7 @@
|
||||||
# Note subsequent changes here:
|
# Note subsequent changes here:
|
||||||
#
|
#
|
||||||
# 27-December-2021: Added support for 4-letter script abbreviations.
|
# 27-December-2021: Added support for 4-letter script abbreviations.
|
||||||
# xx-January-2022: Further updates for Boolean property support
|
# 10-January-2022: Further updates for Boolean property support
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -112,20 +112,18 @@ utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(catego
|
||||||
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names)))
|
||||||
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names)))
|
||||||
|
|
||||||
|
|
||||||
for name in bool_properties:
|
for name in bool_properties:
|
||||||
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
utt_table.append((stdname(name), name, 'PT_BOOL'))
|
||||||
if name in abbreviations:
|
if name in abbreviations:
|
||||||
for abbrev in abbreviations[name]:
|
for abbrev in abbreviations[name]:
|
||||||
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
utt_table.append((stdname(abbrev), name, 'PT_BOOL'))
|
||||||
|
|
||||||
|
|
||||||
# Now add specials and synonyms. Note both the standardized and capitalized
|
# Now add specials and synonyms. Note both the standardized and capitalized
|
||||||
# forms are needed.
|
# forms are needed.
|
||||||
|
|
||||||
utt_table.append(('any', 'Any', 'PT_ANY'))
|
utt_table.append(('any', 'Any', 'PT_ANY'))
|
||||||
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
utt_table.append(('l&', 'L&', 'PT_LAMP'))
|
||||||
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
utt_table.append(('lc', 'LC', 'PT_LAMP'))
|
||||||
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
utt_table.append(('xan', 'Xan', 'PT_ALNUM'))
|
||||||
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
utt_table.append(('xps', 'Xps', 'PT_PXSPACE'))
|
||||||
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
utt_table.append(('xsp', 'Xsp', 'PT_SPACE'))
|
||||||
|
|
67
maint/README
67
maint/README
|
@ -22,13 +22,14 @@ GenerateCommon.py
|
||||||
|
|
||||||
GenerateTest26.py
|
GenerateTest26.py
|
||||||
A Python script that generates input and expected output test data for test
|
A Python script that generates input and expected output test data for test
|
||||||
26, which tests Unicode property support.
|
26, which tests certain aspects of Unicode property support.
|
||||||
|
|
||||||
GenerateUcd.py
|
GenerateUcd.py
|
||||||
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
A Python script that generates the file pcre2_ucd.c from GenerateCommon.py
|
||||||
and Unicode data files, which are themselves downloaded from the Unicode web
|
and Unicode data files, which are themselves downloaded from the Unicode web
|
||||||
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
site. The generated file contains the tables for a 2-stage lookup of Unicode
|
||||||
properties, along with some auxiliary tables.
|
properties, along with some auxiliary tables. The script starts with a long
|
||||||
|
comment that gives details of the tables it constructs.
|
||||||
|
|
||||||
GenerateUcpHeader.py
|
GenerateUcpHeader.py
|
||||||
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
A Python script that generates the file pcre2_ucp.h from GenerateCommon.py
|
||||||
|
@ -38,7 +39,7 @@ GenerateUcpHeader.py
|
||||||
GenerateUcpTables.py
|
GenerateUcpTables.py
|
||||||
A Python script that generates the file pcre2_ucptables.c from
|
A Python script that generates the file pcre2_ucptables.c from
|
||||||
GenerateCommon.py and Unicode data files. The generated file contains tables
|
GenerateCommon.py and Unicode data files. The generated file contains tables
|
||||||
for looking up Unicode properties.
|
for looking up Unicode property names.
|
||||||
|
|
||||||
ManyConfigTests
|
ManyConfigTests
|
||||||
A shell script that runs "configure, make, test" a number of times with
|
A shell script that runs "configure, make, test" a number of times with
|
||||||
|
@ -63,11 +64,11 @@ Unicode.tables
|
||||||
ScriptExtensions.txt are where to look for script information.
|
ScriptExtensions.txt are where to look for script information.
|
||||||
|
|
||||||
ucptest.c
|
ucptest.c
|
||||||
A short C program for testing the Unicode property macros that do lookups in
|
A program for testing the Unicode property macros that do lookups in the
|
||||||
the pcre2_ucd.c data, mainly useful after rebuilding the Unicode property
|
pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables.
|
||||||
table. Compile and run this in the "maint" directory (see comments at its
|
Compile and run this in the "maint" directory (see comments at its head).
|
||||||
head). This program can also be used to find characters with specific
|
This program can also be used to find characters with specific properties and
|
||||||
properties.
|
to list which properties are supported.
|
||||||
|
|
||||||
ucptestdata
|
ucptestdata
|
||||||
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
A directory containing four files, testinput{1,2} and testoutput{1,2}, for
|
||||||
|
@ -85,35 +86,18 @@ utf8.c
|
||||||
Updating to a new Unicode release
|
Updating to a new Unicode release
|
||||||
=================================
|
=================================
|
||||||
|
|
||||||
**** WORK IN PROGRESS ****
|
|
||||||
|
|
||||||
Work is going on in the area of Unicode property handling. What follows here is
|
|
||||||
now out-of-date. It will be updated once the current project is complete.
|
|
||||||
Updating to any new Unicode release is best left till then also.
|
|
||||||
06-January-2022
|
|
||||||
|
|
||||||
|
|
||||||
When there is a new release of Unicode, the files in Unicode.tables must be
|
When there is a new release of Unicode, the files in Unicode.tables must be
|
||||||
refreshed from the web site. If the new version of Unicode adds new character
|
refreshed from the web site. Once that is done, the four Python scripts that
|
||||||
scripts, the lists in GenerateCommon.py must be updated. I have been adding
|
generate files from the Unicode data can be run from within the "maint"
|
||||||
each new group at the end of the relevant list, with a comment.
|
directory.
|
||||||
|
|
||||||
NOTE: Both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of
|
Note: Previously, it was necessary to update lists of scripts and their
|
||||||
supported Unicode scripts that also have to be updated. These lists are in
|
abbreviations by hand before running the Python scripts. This is no longer
|
||||||
alphabetical order.
|
necessary because the scripts have been upgraded to extract this information
|
||||||
|
themselves. Also, there used to be explicit lists of scripts in two of the man
|
||||||
|
pages. This is no longer the case.
|
||||||
|
|
||||||
There are two lists in GenerateCommon.py: the full names and the abbreviations
|
You can give an output file name as an argument to the following scripts, but
|
||||||
that are found in the ScriptExtensions.txt file. A list of script names and
|
|
||||||
their abbreviations can be found in the PropertyValueAliases.txt file on the
|
|
||||||
Unicode web site. There is also a Wikipedia page that lists them, and notes the
|
|
||||||
Unicode version in which they were introduced:
|
|
||||||
|
|
||||||
https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts
|
|
||||||
|
|
||||||
Once the script name lists have been updated, the three generator scripts can
|
|
||||||
be run from within the maint directory. If you get the error "ValueError:
|
|
||||||
list.index(x): x not in list", the cause is usually a missing (or misspelt)
|
|
||||||
name in one of the lists. You can give an output file name as an argument, but
|
|
||||||
by default:
|
by default:
|
||||||
|
|
||||||
GenerateUcd.py creates pcre2_ucd.c )
|
GenerateUcd.py creates pcre2_ucd.c )
|
||||||
|
@ -121,17 +105,18 @@ GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory
|
||||||
GenerateUcpTables.py creates pcre2_ucptables.c )
|
GenerateUcpTables.py creates pcre2_ucptables.c )
|
||||||
|
|
||||||
These files can be compared against the existing versions in the src directory
|
These files can be compared against the existing versions in the src directory
|
||||||
to check on any changes before replacing the old files, but you can generate
|
to check on any changes before replacing the old files, but you can also
|
||||||
directly into the final location by running
|
generate directly into the final location by running:
|
||||||
|
|
||||||
./GenerateUcd.py ../src/pcre2_ucd.c
|
./GenerateUcd.py ../src/pcre2_ucd.c
|
||||||
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
./GenerateUcpHeader.py ../src/pcre2_ucp.h
|
||||||
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
./GenerateUcpTables.py ../src/pcre2_ucptables.c
|
||||||
|
|
||||||
The ucptest program can be compiled and used to check that the new tables work
|
Once the .c and .h files are in the ../src directory, the ucptest program can
|
||||||
properly, using the data files in ucptestdata to check a number of test
|
be compiled and used to check that the new tables work properly. The data files
|
||||||
characters. See the comments at the start of ucptest.c. If there are new
|
in ucptestdata are set up to check a number of test characters. See the
|
||||||
scripts, adding a few tests to the files in ucptestdata is a good idea.
|
comments at the start of ucptest.c. If there are new scripts, adding a few
|
||||||
|
tests to the files in ucptestdata is a good idea.
|
||||||
|
|
||||||
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
Finally, you should run the GenerateTest26.py script to regenerate new versions
|
||||||
of the input and expected output from a series of Unicode property tests that
|
of the input and expected output from a series of Unicode property tests that
|
||||||
|
@ -476,4 +461,4 @@ years.
|
||||||
Philip Hazel
|
Philip Hazel
|
||||||
Email local part: Philip.Hazel
|
Email local part: Philip.Hazel
|
||||||
Email domain: gmail.com
|
Email domain: gmail.com
|
||||||
Last updated: 31 December 2021
|
Last updated: 10 January 2022
|
||||||
|
|
|
@ -107,7 +107,7 @@ const ucd_record PRIV(dummy_ucd_record)[] = {{
|
||||||
0, /* other case */
|
0, /* other case */
|
||||||
ucp_Unknown, /* script extension */
|
ucp_Unknown, /* script extension */
|
||||||
ucp_bidiL, /* bidi class */
|
ucp_bidiL, /* bidi class */
|
||||||
0, /* bool properties offset */
|
0, /* bool properties offset */
|
||||||
0 /* dummy filler */
|
0 /* dummy filler */
|
||||||
}};
|
}};
|
||||||
#endif
|
#endif
|
||||||
|
@ -168,7 +168,9 @@ const uint32_t PRIV(ucd_digit_sets)[] = {
|
||||||
0x1e959, 0x1fbf9,
|
0x1e959, 0x1fbf9,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* This vector is a list of script bitsets for the Script Extension property. */
|
/* This vector is a list of script bitsets for the Script Extension property.
|
||||||
|
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
|
||||||
|
ucd_script_sets_item_size. */
|
||||||
|
|
||||||
const uint32_t PRIV(ucd_script_sets)[] = {
|
const uint32_t PRIV(ucd_script_sets)[] = {
|
||||||
0x00000000u, 0x00000000u, 0x00000000u,
|
0x00000000u, 0x00000000u, 0x00000000u,
|
||||||
|
@ -236,7 +238,9 @@ const uint32_t PRIV(ucd_script_sets)[] = {
|
||||||
0x2000ffc0u, 0x3984a010u, 0x00000001u,
|
0x2000ffc0u, 0x3984a010u, 0x00000001u,
|
||||||
};
|
};
|
||||||
|
|
||||||
/* This vector is a list of bitsets for Boolean properties. */
|
/* This vector is a list of bitsets for Boolean properties. The number of
|
||||||
|
32-bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
|
||||||
|
pcre2_ucp.h. */
|
||||||
|
|
||||||
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||||
0x00000000u, 0x00000000u,
|
0x00000000u, 0x00000000u,
|
||||||
|
|
Loading…
Reference in New Issue