Update descriptive comments in UCD generation.
This commit is contained in:
parent
061e57695a
commit
360a84e80b
|
@ -107,6 +107,7 @@
|
||||||
#
|
#
|
||||||
# 26-December-2021: Refactoring completed
|
# 26-December-2021: Refactoring completed
|
||||||
# 10-January-2022: Addition of general Boolean property support
|
# 10-January-2022: Addition of general Boolean property support
|
||||||
|
# 12-January-2022: Merge scriptx and bidiclass fields
|
||||||
#
|
#
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
|
@ -172,17 +173,22 @@
|
||||||
# Example: lowercase "a" (U+0061) is in block 0
|
# Example: lowercase "a" (U+0061) is in block 0
|
||||||
# lookup 0 in stage1 table yields 0
|
# lookup 0 in stage1 table yields 0
|
||||||
# lookup 97 (0x61) in the first table in stage2 yields 35
|
# lookup 97 (0x61) in the first table in stage2 yields 35
|
||||||
# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 }
|
# record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 }
|
||||||
# 0 = ucp_Latin => Latin script
|
# 0 = ucp_Latin => Latin script
|
||||||
# 5 = ucp_Ll => Lower case letter
|
# 5 = ucp_Ll => Lower case letter
|
||||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# -32 (-0x20) => Other case is U+0041
|
# -32 (-0x20) => Other case is U+0041
|
||||||
# 0 => No special Script Extension property
|
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
|
||||||
# 22 => Offset to Boolean properties
|
# 22 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
|
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||||
|
# script extension value (18432 >> 11 = 9, 18432 & 0x7FF = 0), giving:
|
||||||
|
#
|
||||||
|
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||||
|
# 0 => No special script extension property
|
||||||
|
#
|
||||||
# Almost all lowercase Latin characters resolve to the same record. One or two
|
# Almost all lowercase Latin characters resolve to the same record. One or two
|
||||||
# are different because they are part of a multi-character caseless set (for
|
# are different because they are part of a multi-character caseless set (for
|
||||||
# example, k, K and the Kelvin symbol are such a set).
|
# example, k, K and the Kelvin symbol are such a set).
|
||||||
|
@ -190,36 +196,46 @@
|
||||||
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
||||||
# lookup 96 in stage1 table yields 93
|
# lookup 96 in stage1 table yields 93
|
||||||
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
# lookup 66 (0x42) in table 93 in stage2 yields 819
|
||||||
# record 614 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 }
|
# record 819 is { 20, 7, 12, 0, 0, 18432, 41, 0 }
|
||||||
# 20 = ucp_Hiragana => Hiragana script
|
# 20 = ucp_Hiragana => Hiragana script
|
||||||
# 7 = ucp_Lo => Other letter
|
# 7 = ucp_Lo => Other letter
|
||||||
# 12 = ucp_gbOther => Grapheme break property "Other"
|
# 12 = ucp_gbOther => Grapheme break property "Other"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# 0 => No other case
|
# 0 => No other case
|
||||||
# 0 => No special Script Extension property
|
# 18432 = 0x4800 => Combined Bidi class + script extension values
|
||||||
# 9 = ucp_bidiL => Bidi class left-to-right
|
|
||||||
# 41 => Offset to Boolean properties
|
# 41 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
|
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||||
|
# script extension value (18432 >> 11 = 9, 18432 & 0x7FF = 0), giving:
|
||||||
|
#
|
||||||
|
# 9 = ucp_bidiL => Bidi class left-to-right
|
||||||
|
# 0 => No special script extension property
|
||||||
|
#
|
||||||
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
|
||||||
# lookup 57 in stage1 table yields 55
|
# lookup 57 in stage1 table yields 55
|
||||||
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
# lookup 80 (0x50) in table 55 in stage2 yields 621
|
||||||
# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 }
|
# record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 }
|
||||||
# 84 = ucp_Inherited => Script inherited from predecessor
|
# 84 = ucp_Inherited => Script inherited from predecessor
|
||||||
# 12 = ucp_Mn => Non-spacing mark
|
# 12 = ucp_Mn => Non-spacing mark
|
||||||
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
# 3 = ucp_gbExtend => Grapheme break property "Extend"
|
||||||
# 0 => Not part of a caseless set
|
# 0 => Not part of a caseless set
|
||||||
# 0 => No other case
|
# 0 => No other case
|
||||||
# 138 => Script Extension list offset = 138
|
# 26762 = 0x688A => Combined Bidi class + script extension values
|
||||||
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
|
||||||
# 48 => Offset to Boolean properties
|
# 48 => Offset to Boolean properties
|
||||||
# 0 => Dummy value, unused at present
|
# 0 => Dummy value, unused at present
|
||||||
#
|
#
|
||||||
|
# The top 5 bits of the sixth field are the Bidi class, with the rest being the
|
||||||
|
# script extension value (26762 >> 11 = 13, 26762 & 0x7FF = 138), giving:
|
||||||
|
#
|
||||||
|
# 13 = ucp_bidiNSM => Bidi class non-spacing mark
|
||||||
|
# 138 => Script Extension list offset = 138
|
||||||
|
#
|
||||||
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
|
||||||
# 18, and 47 set. This means that this character is expected to be used with
|
# 18, and 47 set. This means that this character is expected to be used with
|
||||||
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
|
||||||
#
|
#
|
||||||
# Philip Hazel, last updated 10 January 2022.
|
# Philip Hazel, last updated 12 January 2022.
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
|
||||||
|
@ -894,9 +910,9 @@ f.write("""\
|
||||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||||
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
|
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||||
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
|
into a 16-bit field, offset in binary properties table (8 bits), and a dummy
|
||||||
multiple of 4 bytes. */
|
8-bit field to make the whole thing a multiple of 4 bytes. */
|
||||||
\n""")
|
\n""")
|
||||||
|
|
||||||
write_records(records, record_size)
|
write_records(records, record_size)
|
||||||
|
|
|
@ -423,9 +423,9 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = {
|
||||||
/* These are the main two-stage UCD tables. The fields in each record are:
|
/* These are the main two-stage UCD tables. The fields in each record are:
|
||||||
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
script (8 bits), character type (8 bits), grapheme break property (8 bits),
|
||||||
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
offset to multichar other cases or zero (8 bits), offset to other case or zero
|
||||||
(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
|
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
|
||||||
properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
|
into a 16-bit field, offset in binary properties table (8 bits), and a dummy
|
||||||
multiple of 4 bytes. */
|
8-bit field to make the whole thing a multiple of 4 bytes. */
|
||||||
|
|
||||||
const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */
|
const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */
|
||||||
{ 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */
|
{ 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */
|
||||||
|
|
Loading…
Reference in New Issue