[ucd] Document algorithms
This commit is contained in:
parent
ed43bc5118
commit
76ce390b5a
|
@ -25,14 +25,28 @@ hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
|
|||
|
||||
logging.info('Preparing data tables...')
|
||||
|
||||
|
||||
# This is how the data is encoded:
|
||||
#
|
||||
# General_Category (gc), Canonical_Combining_Class (ccc),
|
||||
# and Script (sc) are encoded as integers.
|
||||
#
|
||||
# Mirroring character (bmg) is encoded as difference from
|
||||
# the original character.
|
||||
#
|
||||
# Composition & Decomposition (dm) are encoded elaborately,
|
||||
# as discussed below.
|
||||
|
||||
gc = [u['gc'] for u in ucd]
|
||||
ccc = [int(u['ccc']) for u in ucd]
|
||||
bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
|
||||
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
|
||||
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
|
||||
|
||||
sc = [u['sc'] for u in ucd]
|
||||
|
||||
|
||||
# Prepare Compose / Decompose data
|
||||
#
|
||||
# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
|
||||
|
||||
dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
|
||||
if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
|
||||
ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
|
||||
|
@ -63,6 +77,9 @@ dm_order = {None: 0}
|
|||
dm_order.update(dm1_order)
|
||||
dm_order.update(dm2_order)
|
||||
|
||||
|
||||
# Prepare General_Category / Script mapping arrays
|
||||
|
||||
gc_order = dict()
|
||||
for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
||||
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
|
||||
|
@ -83,6 +100,9 @@ for line in open(hb_common_h):
|
|||
sc_order[i] = tag
|
||||
sc_array.append(name)
|
||||
|
||||
|
||||
# Write out main data
|
||||
|
||||
DEFAULT = 'DEFAULT'
|
||||
COMPACT = 'COMPACT'
|
||||
SLOPPY = 'SLOPPY'
|
||||
|
@ -109,6 +129,9 @@ print()
|
|||
print('#include "hb.hh"')
|
||||
print()
|
||||
|
||||
|
||||
# Write mapping data
|
||||
|
||||
code = packTab.Code('_hb_ucd')
|
||||
sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
|
||||
dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
|
||||
|
@ -125,6 +148,9 @@ datasets = [
|
|||
('dm', dm, None, dm_order),
|
||||
]
|
||||
|
||||
|
||||
# Write main data
|
||||
|
||||
for step in (DEFAULT, COMPACT, SLOPPY):
|
||||
compression = compression_level[step]
|
||||
logging.info(' Compression=%d:' % compression)
|
||||
|
@ -165,6 +191,7 @@ for step in (DEFAULT, COMPACT, SLOPPY):
|
|||
|
||||
print()
|
||||
|
||||
|
||||
print('#endif')
|
||||
print()
|
||||
|
||||
|
|
|
@ -129,12 +129,16 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
|||
hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab,
|
||||
void *user_data HB_UNUSED)
|
||||
{
|
||||
// Hangul is handled algorithmically.
|
||||
if (_hb_ucd_compose_hangul (a, b, ab)) return true;
|
||||
|
||||
hb_codepoint_t u = 0;
|
||||
|
||||
if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u)
|
||||
{
|
||||
/* If "a" is below U+0800 (fits in 11 bits) and "b" is in
|
||||
* U+0300..U+037F (7 bits), the composition data is encoded
|
||||
* in a 32bit array sorted by "a,b" pair. */
|
||||
uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0);
|
||||
const uint32_t *v = hb_bsearch (k,
|
||||
_hb_ucd_dm2_u32_map,
|
||||
|
@ -146,6 +150,8 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
|||
}
|
||||
else
|
||||
{
|
||||
/* Otherwise it is stored in a 64bit array sorted by
|
||||
* "a,b" pair. */
|
||||
uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0);
|
||||
const uint64_t *v = hb_bsearch (k,
|
||||
_hb_ucd_dm2_u64_map,
|
||||
|
@ -170,15 +176,22 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
|||
|
||||
unsigned i = _hb_ucd_dm (ab);
|
||||
|
||||
/* If no data, there's no decomposition. */
|
||||
if (likely (!i)) return false;
|
||||
i--;
|
||||
|
||||
/* Check if it's a single-character decomposition. */
|
||||
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map))
|
||||
{
|
||||
/* Single-character decompositions are currently found only in plane 0 or plane 2. */
|
||||
if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map))
|
||||
{
|
||||
/* Plane 0. */
|
||||
*a = _hb_ucd_dm1_p0_map[i];
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Plane 2. */
|
||||
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map);
|
||||
*a = 0x20000 | _hb_ucd_dm1_p2_map[i];
|
||||
}
|
||||
|
@ -187,8 +200,10 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
|||
}
|
||||
i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map);
|
||||
|
||||
/* Otherwise they are encoded either in a 32bit array or a 64bit array. */
|
||||
if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map))
|
||||
{
|
||||
/* 32bit array. */
|
||||
uint32_t v = _hb_ucd_dm2_u32_map[i];
|
||||
*a = HB_CODEPOINT_DECODE3_11_7_14_1 (v);
|
||||
*b = HB_CODEPOINT_DECODE3_11_7_14_2 (v);
|
||||
|
@ -196,6 +211,7 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
|
|||
}
|
||||
i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map);
|
||||
|
||||
/* 64bit array. */
|
||||
uint64_t v = _hb_ucd_dm2_u64_map[i];
|
||||
*a = HB_CODEPOINT_DECODE3_1 (v);
|
||||
*b = HB_CODEPOINT_DECODE3_2 (v);
|
||||
|
|
Loading…
Reference in New Issue