Update descriptive comments in UCD generation.

This commit is contained in:
Philip Hazel 2022-01-12 17:38:48 +00:00
parent 061e57695a
commit 360a84e80b
2 changed files with 32 additions and 16 deletions

View File

@ -107,6 +107,7 @@
#
# 26-December-2021: Refactoring completed
# 10-January-2022: Addition of general Boolean property support
# 12-January-2022: Merge scriptx and bidiclass fields
#
# ----------------------------------------------------------------------------
#
@ -172,17 +173,22 @@
# Example: lowercase "a" (U+0061) is in block 0
# lookup 0 in stage1 table yields 0
# lookup 97 (0x61) in the first table in stage2 yields 35
# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 }
# record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 }
# 0 = ucp_Latin => Latin script
# 5 = ucp_Ll => Lower case letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# -32 (-0x20) => Other case is U+0041
# 0 => No special Script Extension property
# 9 = ucp_bidiL => Bidi class left-to-right
# 18432 = 0x4800 => Combined Bidi class + script extension values
# 22 => Offset to Boolean properties
# 0 => Dummy value, unused at present
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Almost all lowercase Latin characters resolve to the same record. One or two
# are different because they are part of a multi-character caseless set (for
# example, k, K and the Kelvin symbol are such a set).
@ -190,36 +196,46 @@
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
# lookup 96 in stage1 table yields 93
# lookup 66 (0x42) in table 93 in stage2 yields 819
# record 819 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 }
# record 819 is { 20, 7, 12, 0, 0, 18432, 41, 0 }
# 20 = ucp_Hiragana => Hiragana script
# 7 = ucp_Lo => Other letter
# 12 = ucp_gbOther => Grapheme break property "Other"
# 0 => Not part of a caseless set
# 0 => No other case
# 0 => No special Script Extension property
# 9 = ucp_bidiL => Bidi class left-to-right
# 18432 = 0x4800 => Combined Bidi class + script extension values
# 41 => Offset to Boolean properties
# 0 => Dummy value, unused at present
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value, giving:
#
# 9 = ucp_bidiL => Bidi class left-to-right
# 0 => No special script extension property
#
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
# lookup 57 in stage1 table yields 55
# lookup 80 (0x50) in table 55 in stage2 yields 621
# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 }
# record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 }
# 84 = ucp_Inherited => Script inherited from predecessor
# 12 = ucp_Mn => Non-spacing mark
# 3 = ucp_gbExtend => Grapheme break property "Extend"
# 0 => Not part of a caseless set
# 0 => No other case
# 138 => Script Extension list offset = 138
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 26762 = 0x688A => Combined Bidi class + script extension values
# 48 => Offset to Boolean properties
# 0 => Dummy value, unused at present
#
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
# script extension value (0x688A >> 11 = 13, 0x688A & 0x07FF = 138), giving:
#
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
# 138 => Script Extension list offset = 138
#
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
# 18, and 47 set. This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
#
# Philip Hazel, last updated 10 January 2022.
# Philip Hazel, last updated 12 January 2022.
##############################################################################
@ -894,9 +910,9 @@ f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
multiple of 4 bytes. */
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, offset in binary properties table (8 bits), and a dummy
8-bit field to make the whole thing a multiple of 4 bytes. */
\n""")
write_records(records, record_size)

View File

@ -423,9 +423,9 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = {
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
multiple of 4 bytes. */
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, offset in binary properties table (8 bits), and a dummy
8-bit field to make the whole thing a multiple of 4 bytes. */
const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */
{ 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */