Update descriptive comments in UCD generation.

This commit is contained in:
Philip Hazel 2022-01-12 17:38:48 +00:00
parent 061e57695a
commit 360a84e80b
2 changed files with 32 additions and 16 deletions

View File

@ -107,6 +107,7 @@
# #
# 26-December-2021: Refactoring completed # 26-December-2021: Refactoring completed
# 10-January-2022: Addition of general Boolean property support # 10-January-2022: Addition of general Boolean property support
# 12-January-2022: Merge scriptx and bidiclass fields
# #
# ---------------------------------------------------------------------------- # ----------------------------------------------------------------------------
# #
@ -172,17 +173,22 @@
# Example: lowercase "a" (U+0061) is in block 0 # Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0 # lookup 0 in stage1 table yields 0
# lookup 97 (0x61) in the first table in stage2 yields 35 # lookup 97 (0x61) in the first table in stage2 yields 35
# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 } # record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 }
# 0 = ucp_Latin => Latin script # 0 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter # 5 = ucp_Ll => Lower case letter
# 12 = ucp_gbOther => Grapheme break property "Other" # 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set # 0 => Not part of a caseless set
# -32 (-0x20) => Other case is U+0041 # -32 (-0x20) => Other case is U+0041
# 0 => No special Script Extension property # 18432 = 0x4800 => Combined Bidi class + script extension values
# 9 = ucp_bidiL => Bidi class left-to-right
# 22 => Offset to Boolean properties # 22 => Offset to Boolean properties
# 0 => Dummy value, unused at present # 0 => Dummy value, unused at present
# #
# The sixth field is 16 bits wide: the top 5 bits are the Bidi class and the
# remaining 11 bits are the script extension value. Here 0x4800 >> 11 = 9 and
# 0x4800 & 0x07FF = 0, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Almost all lowercase Latin characters resolve to the same record. One or two # Almost all lowercase Latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for # are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set). # example, k, K and the Kelvin symbol are such a set).
@ -190,36 +196,46 @@
# Example: hiragana letter A (U+3042) is in block 96 (0x60) # Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 93 # lookup 96 in stage1 table yields 93
# lookup 66 (0x42) in table 93 in stage2 yields 819 # lookup 66 (0x42) in table 93 in stage2 yields 819
# record 819 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 } # record 819 is { 20, 7, 12, 0, 0, 18432, 41, 0 }
# 20 = ucp_Hiragana => Hiragana script # 20 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter # 7 = ucp_Lo => Other letter
# 12 = ucp_gbOther => Grapheme break property "Other" # 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set # 0 => Not part of a caseless set
# 0 => No other case # 0 => No other case
# 0 => No special Script Extension property # 18432 = 0x4800 => Combined Bidi class + script extension values
# 9 = ucp_bidiL => Bidi class left-to-right
# 41 => Offset to Boolean properties # 41 => Offset to Boolean properties
# 0 => Dummy value, unused at present # 0 => Dummy value, unused at present
# #
# The sixth field is 16 bits wide: the top 5 bits are the Bidi class and the
# remaining 11 bits are the script extension value. Here 0x4800 >> 11 = 9 and
# 0x4800 & 0x07FF = 0, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
# lookup 57 in stage1 table yields 55 # lookup 57 in stage1 table yields 55
# lookup 80 (0x50) in table 55 in stage2 yields 621 # lookup 80 (0x50) in table 55 in stage2 yields 621
# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 } # record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 }
# 84 = ucp_Inherited => Script inherited from predecessor # 84 = ucp_Inherited => Script inherited from predecessor
# 12 = ucp_Mn => Non-spacing mark # 12 = ucp_Mn => Non-spacing mark
# 3 = ucp_gbExtend => Grapheme break property "Extend" # 3 = ucp_gbExtend => Grapheme break property "Extend"
# 0 => Not part of a caseless set # 0 => Not part of a caseless set
# 0 => No other case # 0 => No other case
# 138 => Script Extension list offset = 138 # 26762 = 0x688A => Combined Bidi class + script extension values
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 48 => Offset to Boolean properties # 48 => Offset to Boolean properties
# 0 => Dummy value, unused at present # 0 => Dummy value, unused at present
# #
# The sixth field is 16 bits wide: the top 5 bits are the Bidi class and the
# remaining 11 bits are the script extension value. Here 0x688A >> 11 = 13 and
# 0x688A & 0x07FF = 138 (0x8A), giving:
#
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 138 => Script Extension list offset = 138
#
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
# 18, and 47 set. This means that this character is expected to be used with # 18, and 47 set. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
# #
# Philip Hazel, last updated 10 January 2022. # Philip Hazel, last updated 12 January 2022.
############################################################################## ##############################################################################
@ -894,9 +910,9 @@ f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are: /* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits), script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a into a 16-bit field, offset in binary properties table (8 bits), and a dummy
multiple of 4 bytes. */ 8-bit field to make the whole thing a multiple of 4 bytes. */
\n""") \n""")
write_records(records, record_size) write_records(records, record_size)

View File

@ -423,9 +423,9 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = {
/* These are the main two-stage UCD tables. The fields in each record are: /* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits), script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a into a 16-bit field, offset in binary properties table (8 bits), and a dummy
multiple of 4 bytes. */ 8-bit field to make the whole thing a multiple of 4 bytes. */
const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */ const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */
{ 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */ { 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */