diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py index d895542..8f7dc0d 100755 --- a/maint/GenerateUcd.py +++ b/maint/GenerateUcd.py @@ -108,6 +108,7 @@ # 26-December-2021: Refactoring completed # 10-January-2022: Addition of general Boolean property support # 12-January-2022: Merge scriptx and bidiclass fields +# 14-January-2022: Enlarge Boolean property offset to 12 bits # # ---------------------------------------------------------------------------- # @@ -173,14 +174,14 @@ # Example: lowercase "a" (U+0061) is in block 0 # lookup 0 in stage1 table yields 0 # lookup 97 (0x61) in the first table in stage2 yields 35 -# record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 } +# record 35 is { 0, 5, 12, 0, -32, 18432, 44 } # 0 = ucp_Latin => Latin script # 5 = ucp_Ll => Lower case letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # -32 (-0x20) => Other case is U+0041 # 18432 = 0x4800 => Combined Bidi class + script extension values -# 22 => Offset to Boolean properties +# 44 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: @@ -195,15 +196,14 @@ # Example: hiragana letter A (U+3042) is in block 96 (0x60) # lookup 96 in stage1 table yields 93 # lookup 66 (0x42) in table 93 in stage2 yields 819 -# record 614 is { 20, 7, 12, 0, 0, 18432, 41, 0 } +# record 819 is { 20, 7, 12, 0, 0, 18432, 82 } # 20 = ucp_Hiragana => Hiragana script # 7 = ucp_Lo => Other letter # 12 = ucp_gbOther => Grapheme break property "Other" # 0 => Not part of a caseless set # 0 => No other case # 18432 = 0x4800 => Combined Bidi class + script extension values -# 41 => Offset to Boolean properties -# 0 => Dummy value, unused at present +# 82 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: @@ -214,14 +214,14 @@ # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) # lookup 57 in stage1 table yields 55 # lookup 80 (0x50) in table 55 in stage2 yields 621 -# record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 } +# record 621 is { 84, 12, 3, 0, 0, 26762, 96 } # 84 = ucp_Inherited => Script inherited from predecessor # 12 = ucp_Mn => Non-spacing mark # 3 = ucp_gbExtend => Grapheme break property "Extend" # 0 => Not part of a caseless set # 0 => No other case # 26762 = 0x688A => Combined Bidi class + script extension values -# 48 => Offset to Boolean properties +# 96 => Offset to Boolean properties # # The top 5 bits of the sixth field are the Bidi class, with the rest being the # script extension value, giving: @@ -233,7 +233,7 @@ # 18, and 47 set. This means that this character is expected to be used with # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. # -# Philip Hazel, last updated 12 January 2022. +# Philip Hazel, last updated 14 January 2022. ############################################################################## @@ -900,8 +900,7 @@ f.write("""\ script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed -into a 16-bit field, offset in binary properties table (8 bits), and a dummy -8-bit field to make the whole thing a multiple of 4 bytes. */ +into a 16-bit field, and offset in binary properties table (16 bits). */ \n""") write_records(records, record_size) diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c index 7e4433d..a7d3dfc 100644 --- a/src/pcre2_ucd.c +++ b/src/pcre2_ucd.c @@ -422,8 +422,7 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = { script (8 bits), character type (8 bits), grapheme break property (8 bits), offset to multichar other cases or zero (8 bits), offset to other case or zero (32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed -into a 16-bit field, offset in binary properties table (8 bits), and a dummy -8-bit field to make the whole thing a multiple of 4 bytes. */ +into a 16-bit field, and offset in binary properties table (16 bits). */ const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */ { 69, 0, 2, 0, 0, 6144, 2, }, /* 0 */