[ucd] Document algorithms

This commit is contained in:
Behdad Esfahbod 2022-11-20 13:54:56 -07:00
parent ed43bc5118
commit 76ce390b5a
2 changed files with 46 additions and 3 deletions

View File

@ -25,14 +25,28 @@ hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
logging.info('Preparing data tables...') logging.info('Preparing data tables...')
# This is how the data is encoded:
#
# General_Category (gc), Canonical_Combining_Class (ccc),
# and Script (sc) are encoded as integers.
#
# Mirroring character (bmg) is encoded as the difference from
# the original character, or 0 when there is no mirroring character.
#
# Composition & Decomposition (dm) are encoded elaborately,
# as discussed below.
gc = [u['gc'] for u in ucd] gc = [u['gc'] for u in ucd]
ccc = [int(u['ccc']) for u in ucd] ccc = [int(u['ccc']) for u in ucd]
bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)] bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
sc = [u['sc'] for u in ucd] sc = [u['sc'] for u in ucd]
# Prepare Compose / Decompose data
#
# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd) dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)} if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'} ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
@ -63,6 +77,9 @@ dm_order = {None: 0}
dm_order.update(dm1_order) dm_order.update(dm1_order)
dm_order.update(dm2_order) dm_order.update(dm2_order)
# Prepare General_Category / Script mapping arrays
gc_order = dict() gc_order = dict()
for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
@ -83,6 +100,9 @@ for line in open(hb_common_h):
sc_order[i] = tag sc_order[i] = tag
sc_array.append(name) sc_array.append(name)
# Write out main data
DEFAULT = 'DEFAULT' DEFAULT = 'DEFAULT'
COMPACT = 'COMPACT' COMPACT = 'COMPACT'
SLOPPY = 'SLOPPY' SLOPPY = 'SLOPPY'
@ -109,6 +129,9 @@ print()
print('#include "hb.hh"') print('#include "hb.hh"')
print() print()
# Write mapping data
code = packTab.Code('_hb_ucd') code = packTab.Code('_hb_ucd')
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array) sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array) dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
@ -125,6 +148,9 @@ datasets = [
('dm', dm, None, dm_order), ('dm', dm, None, dm_order),
] ]
# Write main data
for step in (DEFAULT, COMPACT, SLOPPY): for step in (DEFAULT, COMPACT, SLOPPY):
compression = compression_level[step] compression = compression_level[step]
logging.info(' Compression=%d:' % compression) logging.info(' Compression=%d:' % compression)
@ -165,6 +191,7 @@ for step in (DEFAULT, COMPACT, SLOPPY):
print() print()
print('#endif') print('#endif')
print() print()

View File

@ -129,12 +129,16 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab, hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab,
void *user_data HB_UNUSED) void *user_data HB_UNUSED)
{ {
// Hangul is handled algorithmically.
if (_hb_ucd_compose_hangul (a, b, ab)) return true; if (_hb_ucd_compose_hangul (a, b, ab)) return true;
hb_codepoint_t u = 0; hb_codepoint_t u = 0;
if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u) if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u)
{ {
/* If "a" fits in 11 bits (below U+0800) and "b" is in the
 * U+0300..U+037F range, the composition data is encoded in
 * a 32bit array sorted by "a,b" pair. */
uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0); uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0);
const uint32_t *v = hb_bsearch (k, const uint32_t *v = hb_bsearch (k,
_hb_ucd_dm2_u32_map, _hb_ucd_dm2_u32_map,
@ -146,6 +150,8 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
} }
else else
{ {
/* Otherwise it is stored in a 64bit array sorted by
* "a,b" pair. */
uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0); uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0);
const uint64_t *v = hb_bsearch (k, const uint64_t *v = hb_bsearch (k,
_hb_ucd_dm2_u64_map, _hb_ucd_dm2_u64_map,
@ -170,15 +176,22 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
unsigned i = _hb_ucd_dm (ab); unsigned i = _hb_ucd_dm (ab);
/* If no data, there's no decomposition. */
if (likely (!i)) return false; if (likely (!i)) return false;
i--; i--;
/* Check if it's a single-character decomposition. */
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map)) if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map))
{ {
/* Single-character decompositions are currently only in plane 0 or plane 2. */
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map)) if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map))
{
/* Plane 0. */
*a = _hb_ucd_dm1_p0_map[i]; *a = _hb_ucd_dm1_p0_map[i];
}
else else
{ {
/* Plane 2. */
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map); i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map);
*a = 0x20000 | _hb_ucd_dm1_p2_map[i]; *a = 0x20000 | _hb_ucd_dm1_p2_map[i];
} }
@ -187,8 +200,10 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
} }
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map); i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map);
/* Otherwise it is a two-character decomposition, encoded either in a 32bit array or a 64bit array. */
if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map)) if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map))
{ {
/* 32bit array. */
uint32_t v = _hb_ucd_dm2_u32_map[i]; uint32_t v = _hb_ucd_dm2_u32_map[i];
*a = HB_CODEPOINT_DECODE3_11_7_14_1 (v); *a = HB_CODEPOINT_DECODE3_11_7_14_1 (v);
*b = HB_CODEPOINT_DECODE3_11_7_14_2 (v); *b = HB_CODEPOINT_DECODE3_11_7_14_2 (v);
@ -196,6 +211,7 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
} }
i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map); i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map);
/* 64bit array. */
uint64_t v = _hb_ucd_dm2_u64_map[i]; uint64_t v = _hb_ucd_dm2_u64_map[i];
*a = HB_CODEPOINT_DECODE3_1 (v); *a = HB_CODEPOINT_DECODE3_1 (v);
*b = HB_CODEPOINT_DECODE3_2 (v); *b = HB_CODEPOINT_DECODE3_2 (v);