diff --git a/src/gen-ucd-table.py b/src/gen-ucd-table.py index 7dfad7b9c..d85ae4faa 100755 --- a/src/gen-ucd-table.py +++ b/src/gen-ucd-table.py @@ -25,14 +25,28 @@ hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2] logging.info('Preparing data tables...') + +# This is how the data is encoded: +# +# General_Category (gc), Canonical_Combining_Class (ccc), +# and Script (sc) are encoded as integers. +# +# Mirroring character (bmg) is encoded as difference from +# the original character. +# +# Composition & Decomposition (dm) are encoded elaborately, +# as discussed below. + gc = [u['gc'] for u in ucd] ccc = [int(u['ccc']) for u in ucd] bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)] -#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass) -#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr) - sc = [u['sc'] for u in ucd] + +# Prepare Compose / Decompose data +# +# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic. + dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd) if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)} ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'} @@ -63,6 +77,9 @@ dm_order = {None: 0} dm_order.update(dm1_order) dm_order.update(dm2_order) + +# Prepare General_Category / Script mapping arrays + gc_order = dict() for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', @@ -83,6 +100,9 @@ for line in open(hb_common_h): sc_order[i] = tag sc_array.append(name) + +# Write out main data + DEFAULT = 'DEFAULT' COMPACT = 'COMPACT' SLOPPY = 'SLOPPY' @@ -109,6 +129,9 @@ print() print('#include "hb.hh"') print() + +# Write mapping data + code = packTab.Code('_hb_ucd') sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array) dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array) @@ -125,6 +148,9 @@ datasets = [ ('dm', dm, None, dm_order), ] + +# Write main data + for step in (DEFAULT, COMPACT, SLOPPY): compression = compression_level[step] logging.info(' Compression=%d:' % compression) @@ -165,6 +191,7 @@ for step in (DEFAULT, COMPACT, SLOPPY): print() + print('#endif') print() diff --git a/src/hb-ucd.cc b/src/hb-ucd.cc index baea224a2..4c8b1ee5e 100644 --- a/src/hb-ucd.cc +++ b/src/hb-ucd.cc @@ -129,12 +129,16 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab, void *user_data HB_UNUSED) { + // Hangul is handled algorithmically. if (_hb_ucd_compose_hangul (a, b, ab)) return true; hb_codepoint_t u = 0; if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u) { + /* If "a" is small enough and "b" is in the U+0300 range, + * the composition data is encoded in a 32bit array sorted + * by "a,b" pair. */ uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0); const uint32_t *v = hb_bsearch (k, _hb_ucd_dm2_u32_map, @@ -146,6 +150,8 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED, } else { + /* Otherwise it is stored in a 64bit array sorted by + * "a,b" pair. */ uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0); const uint64_t *v = hb_bsearch (k, _hb_ucd_dm2_u64_map, @@ -170,15 +176,22 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, unsigned i = _hb_ucd_dm (ab); + /* If no data, there's no decomposition. */ if (likely (!i)) return false; i--; + /* Check if it's a single-character decomposition. */ if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map)) { + /* Single-character decompositions currently are only in plane 0 or plane 2. */ if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map)) + { + /* Plane 0. */ *a = _hb_ucd_dm1_p0_map[i]; + } else { + /* Plane 2. */ i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map); *a = 0x20000 | _hb_ucd_dm1_p2_map[i]; } @@ -187,8 +200,10 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, } i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map); + /* Otherwise they are encoded either in a 32bit array or a 64bit array. */ if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map)) { + /* 32bit array. */ uint32_t v = _hb_ucd_dm2_u32_map[i]; *a = HB_CODEPOINT_DECODE3_11_7_14_1 (v); *b = HB_CODEPOINT_DECODE3_11_7_14_2 (v); @@ -196,6 +211,7 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, } i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map); + /* 64bit array. */ uint64_t v = _hb_ucd_dm2_u64_map[i]; *a = HB_CODEPOINT_DECODE3_1 (v); *b = HB_CODEPOINT_DECODE3_2 (v);