From 360a84e80bac572987e1c157dbf915b410d6ba28 Mon Sep 17 00:00:00 2001
From: Philip Hazel <Philip.Hazel@gmail.com>
Date: Wed, 12 Jan 2022 17:38:48 +0000
Subject: [PATCH] Update descriptive comments in UCD generation.

---
 maint/GenerateUcd.py | 42 +++++++++++++++++++++++++++++-------------
 src/pcre2_ucd.c      |  6 +++---
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py
index 8529bf5..0aec810 100755
--- a/maint/GenerateUcd.py
+++ b/maint/GenerateUcd.py
@@ -107,6 +107,7 @@
 #
 # 26-December-2021:  Refactoring completed
 # 10-January-2022:   Addition of general Boolean property support
+# 12-January-2022:   Merge scriptx and bidiclass fields
 #
 # ----------------------------------------------------------------------------
 #
@@ -172,17 +173,22 @@
 # Example: lowercase "a" (U+0061) is in block 0
 #          lookup 0 in stage1 table yields 0
 #          lookup 97 (0x61) in the first table in stage2 yields 35
-#          record 35 is { 0, 5, 12, 0, -32, 0, 9, 22, 0 }
+#          record 35 is { 0, 5, 12, 0, -32, 18432, 22, 0 }
 #             0 = ucp_Latin   => Latin script
 #             5 = ucp_Ll      => Lower case letter
 #            12 = ucp_gbOther => Grapheme break property "Other"
 #             0               => Not part of a caseless set
 #           -32 (-0x20)       => Other case is U+0041
-#             0               => No special Script Extension property
-#             9 = ucp_bidiL   => Bidi class left-to-right
+#         18432 = 0x4800      => Combined Bidi class + script extension values
 #            22               => Offset to Boolean properties
 #             0               => Dummy value, unused at present
 #
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
 # Almost all lowercase latin characters resolve to the same record. One or two
 # are different because they are part of a multi-character caseless set (for
 # example, k, K and the Kelvin symbol are such a set).
@@ -190,36 +196,46 @@
 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
 #          lookup 96 in stage1 table yields 93
 #          lookup 66 (0x42) in table 93 in stage2 yields 819
-#          record 614 is { 20, 7, 12, 0, 0, 0, 9, 41, 0 }
+#          record 819 is { 20, 7, 12, 0, 0, 18432, 41, 0 }
 #            20 = ucp_Hiragana => Hiragana script
 #             7 = ucp_Lo       => Other letter
 #            12 = ucp_gbOther  => Grapheme break property "Other"
 #             0                => Not part of a caseless set
 #             0                => No other case
-#             0                => No special Script Extension property
-#             9 = ucp_bidiL    => Bidi class left-to-right
+#         18432 = 0x4800       => Combined Bidi class + script extension values
 #            41                => Offset to Boolean properties
 #             0                => Dummy value, unused at present
 #
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#             9 = ucp_bidiL   => Bidi class left-to-right
+#             0               => No special script extension property
+#
 # Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
 #          lookup 57 in stage1 table yields 55
 #          lookup 80 (0x50) in table 55 in stage2 yields 621
-#          record 621 is { 84, 12, 3, 0, 0, 138, 13, 48, 0 }
+#          record 621 is { 84, 12, 3, 0, 0, 26762, 48, 0 }
 #            84 = ucp_Inherited => Script inherited from predecessor
 #            12 = ucp_Mn        => Non-spacing mark
 #             3 = ucp_gbExtend  => Grapheme break property "Extend"
 #             0                 => Not part of a caseless set
 #             0                 => No other case
-#           138                 => Script Extension list offset = 138
-#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
+#         26762 = 0x688A        => Combined Bidi class + script extension values
 #            48                 => Offset to Boolean properties
 #             0                 => Dummy value, unused at present
 #
+# The top 5 bits of the sixth field are the Bidi class, with the rest being the
+# script extension value, giving:
+#
+#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
+#           138                 => Script Extension list offset = 138
+#
 # At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
 # 18, and 47 set. This means that this character is expected to be used with
 # any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
 #
-#  Philip Hazel, last updated 10 January 2022.
+#  Philip Hazel, last updated 12 January 2022.
 ##############################################################################
 
 
@@ -894,9 +910,9 @@ f.write("""\
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
-properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
-multiple of 4 bytes. */
+(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
+into a 16-bit field, offset in binary properties table (8 bits), and a dummy
+8-bit field to make the whole thing a multiple of 4 bytes. */
 \n""")
 
 write_records(records, record_size)
diff --git a/src/pcre2_ucd.c b/src/pcre2_ucd.c
index 24d2b88..4a1bb59 100644
--- a/src/pcre2_ucd.c
+++ b/src/pcre2_ucd.c
@@ -423,9 +423,9 @@ const uint32_t PRIV(ucd_boolprop_sets)[] = {
 /* These are the main two-stage UCD tables. The fields in each record are:
 script (8 bits), character type (8 bits), grapheme break property (8 bits),
 offset to multichar other cases or zero (8 bits), offset to other case or zero
-(32 bits, signed), script extension (8 bits), bidi class (8 bits), bool
-properties offset (8 bits), and a dummy 8-bit field to make the whole thing a
-multiple of 4 bytes. */
+(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
+into a 16-bit field, offset in binary properties table (8 bits), and a dummy
+8-bit field to make the whole thing a multiple of 4 bytes. */
 
 const ucd_record PRIV(ucd_records)[] = { /* 16908 bytes, record size 12 */
   {    69,      0,      2,      0,      0,   6144,      1,      0, }, /*   0 */