From 360a84e80bac572987e1c157dbf915b410d6ba28 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Wed, 12 Jan 2022 17:38:48 +0000 Subject: [PATCH] Update descriptive comments in UCD generation. --- maint/GenerateUcd.py | 42 +++++++++++++++++++++++++++++------------- src/pcre2_ucd.c | 6 +++--- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index 8529bf5..0aec810 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -107,6 +107,7 @@ # # 26-December-2021: Refactoring completed # 10-January-2022: Addition of general Boolean property support +# 12-January-2022: Merge scriptx and bidiclass fields # # ---------------------------------------------------------------------------- # @@ -172,17 +173,22 @@ # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 # lookup 97 (0x61) in the first table in stage2 yields 35 -# record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 } +# record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 } # 0 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # -32 (-0x20) => Other case is U+0041 -# 0 => No special Script Extension property -# 9 = ucp_bidiL => Bidi class left-to-right +# 18432 = 0x4800 => Combined Bidi class + script extension values # 22 => Offset to Boolean properties # 0 => Dummy value, unused at present # +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 9 = ucp_bidiL => Bidi class left-to-right +# 0 => No special script extension property +# # Almost all lowercase latin characters resolve to the same record. One or two # are different because they are part of a multi-character caseless set (for # example, k, K and the Kelvin symbol are such a set). 
@@ -190,36 +196,46 @@ # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 93 # lookup 66 (0x42) in table 93 in stage2 yields 819 -# record 614 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 } +# record 819 is { 20, 7, 12, 0, 0, 18432, 41, 0 } # 20 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # 0 => No other case -# 0 => No special Script Extension property -# 9 = ucp_bidiL => Bidi class left-to-right +# 18432 = 0x4800 => Combined Bidi class + script extension values # 41 => Offset to Boolean properties # 0 => Dummy value, unused at present # +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 9 = ucp_bidiL => Bidi class left-to-right +# 0 => No special script extension property +# # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # lookup 57 in stage1 table yields 55 # lookup 80 (0x50) in table 55 in stage2 yields 621 -# record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 } +# record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 } # 84 = ucp_Inherited => Script inherited from predecessor # 12 = ucp_Mn => Non-spacing mark # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case -# 138 => Script Extension list offset = 138 -# 13 = ucp_bidiNSM => Bidi class non-spacing mark +# 26762 = 0x688A => Combined Bidi class + script extension values # 48 => Offset to Boolean properties # 0 => Dummy value, unused at present # +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 13 = ucp_bidiNSM => Bidi class non-spacing mark +# 138 => Script Extension list offset = 138 +# # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, # 18, and 47 set. 
This means that this character is expected to be used with # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # -# Philip Hazel, last updated 10 January 2022. +# Philip Hazel, last updated 12 January 2022. ############################################################################## @@ -894,9 +910,9 @@ f.write("""\ /* These are the main two-stage UCD tables. The fields in each record are: script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero -(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool -properties offset (8 bits), and a dummy 8-bit field to make the whole thing a -multiple of 4 bytes. */ +(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed +into a 16-bit field, offset in binary properties table (8 bits), and a dummy +8-bit field to make the whole thing a multiple of 4 bytes. */ \n""") write_records(records, record_size) diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index 24d2b88..4a1bb59 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -423,9 +423,9 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = { /* These are the main two-stage UCD tables. The fields in each record are: script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero -(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool -properties offset (8 bits), and a dummy 8-bit field to make the whole thing a -multiple of 4 bytes. */ +(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed +into a 16-bit field, offset in binary properties table (8 bits), and a dummy +8-bit field to make the whole thing a multiple of 4 bytes. */ const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */ { 69, 0, 2, 0, 0, 6144, 1, 0, }, /* 0 */