[ucd] Document algorithms

This commit is contained in:
Behdad Esfahbod 2022-11-20 13:54:56 -07:00
parent ed43bc5118
commit 76ce390b5a
2 changed files with 46 additions and 3 deletions

View File

@ -25,14 +25,28 @@ hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
logging.info('Preparing data tables...')
# This is how the data is encoded:
#
# General_Category (gc), Canonical_Combining_Class (ccc),
# and Script (sc) are encoded as integers.
#
# Mirroring character (bmg) is encoded as difference from
# the original character.
#
# Composition & Decomposition (dm) are encoded elaborately,
# as discussed below.
gc = [u['gc'] for u in ucd]
ccc = [int(u['ccc']) for u in ucd]
bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
sc = [u['sc'] for u in ucd]
# Prepare Compose / Decompose data
#
# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
@ -63,6 +77,9 @@ dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
# Prepare General_Category / Script mapping arrays
gc_order = dict()
for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
@ -83,6 +100,9 @@ for line in open(hb_common_h):
sc_order[i] = tag
sc_array.append(name)
# Write out main data
DEFAULT = 'DEFAULT'
COMPACT = 'COMPACT'
SLOPPY = 'SLOPPY'
@ -109,6 +129,9 @@ print()
print('#include "hb.hh"')
print()
# Write mapping data
code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
@ -125,6 +148,9 @@ datasets = [
('dm', dm, None, dm_order),
]
# Write main data
for step in (DEFAULT, COMPACT, SLOPPY):
compression = compression_level[step]
logging.info(' Compression=%d:' % compression)
@ -165,6 +191,7 @@ for step in (DEFAULT, COMPACT, SLOPPY):
print()
print('#endif')
print()

View File

@ -129,12 +129,16 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab,
void *user_data HB_UNUSED)
{
// Hangul is handled algorithmically.
if (_hb_ucd_compose_hangul (a, b, ab)) return true;
hb_codepoint_t u = 0;
if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u)
{
/* If "a" is small enough and "b" is in the U+0300 range,
* the composition data is encoded in a 32bit array sorted
* by "a,b" pair. */
uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0);
const uint32_t *v = hb_bsearch (k,
_hb_ucd_dm2_u32_map,
@ -146,6 +150,8 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
}
else
{
/* Otherwise it is stored in a 64bit array sorted by
* "a,b" pair. */
uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0);
const uint64_t *v = hb_bsearch (k,
_hb_ucd_dm2_u64_map,
@ -170,15 +176,22 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
unsigned i = _hb_ucd_dm (ab);
/* If no data, there's no decomposition. */
if (likely (!i)) return false;
i--;
/* Check if it's a single-character decomposition. */
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map))
{
/* Single-character decompositions currently are only in plane 0 or plane 2. */
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map))
{
/* Plane 0. */
*a = _hb_ucd_dm1_p0_map[i];
}
else
{
/* Plane 2. */
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map);
*a = 0x20000 | _hb_ucd_dm1_p2_map[i];
}
@ -187,8 +200,10 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
}
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map);
/* Otherwise they are encoded either in a 32bit array or a 64bit array. */
if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map))
{
/* 32bit array. */
uint32_t v = _hb_ucd_dm2_u32_map[i];
*a = HB_CODEPOINT_DECODE3_11_7_14_1 (v);
*b = HB_CODEPOINT_DECODE3_11_7_14_2 (v);
@ -196,6 +211,7 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
}
i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map);
/* 64bit array. */
uint64_t v = _hb_ucd_dm2_u64_map[i];
*a = HB_CODEPOINT_DECODE3_1 (v);
*b = HB_CODEPOINT_DECODE3_2 (v);