[ucd] Document algorithms
parent ed43bc5118
commit 76ce390b5a
@@ -25,14 +25,28 @@ hb_common_h = 'hb-common.h' if len (sys.argv) < 3 else sys.argv[2]
 logging.info('Preparing data tables...')
 
+# This is how the data is encoded:
+#
+# General_Category (gc), Canonical_Combining_Class (ccc),
+# and Script (sc) are encoded as integers.
+#
+# Mirroring character (bmg) is encoded as difference from
+# the original character.
+#
+# Composition & Decomposition (dm) are encoded elaborately,
+# as discussed below.
+
 gc = [u['gc'] for u in ucd]
 ccc = [int(u['ccc']) for u in ucd]
 bmg = [int(v, 16) - int(u) if v else 0 for u,v in enumerate(u['bmg'] for u in ucd)]
 #gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
 #gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
 
 sc = [u['sc'] for u in ucd]
 
+# Prepare Compose / Decompose data
+#
+# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
 
 dm = {i:tuple(int(v, 16) for v in u['dm'].split()) for i,u in enumerate(ucd)
       if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00+11172)}
 ce = {i for i,u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}
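A quick sketch of the bmg encoding the new comment block describes: the mirroring character is stored as a signed offset from the character itself, with 0 meaning "no mirror". This is illustrative only and not part of the commit; the helper names and example codepoints are made up.

```python
# Sketch of the bmg delta encoding; encode_bmg/decode_bmg are hypothetical helpers.
def encode_bmg(cp, mirror_cp):
    return mirror_cp - cp if mirror_cp is not None else 0   # 0 == "no mirroring character"

def decode_bmg(cp, delta):
    return cp + delta if delta else None

assert decode_bmg(0x0028, encode_bmg(0x0028, 0x0029)) == 0x0029  # '(' mirrors to ')'
assert decode_bmg(0x0041, encode_bmg(0x0041, None)) is None      # 'A' has no mirror
```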
@@ -63,6 +77,9 @@ dm_order = {None: 0}
 dm_order.update(dm1_order)
 dm_order.update(dm2_order)
 
+
+# Prepare General_Category / Script mapping arrays
+
 gc_order = dict()
 for i,v in enumerate(('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
                       'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
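The gc_order table built here maps each two-letter General_Category value to a small integer, in an order that appears to follow hb_unicode_general_category_t. A standalone sketch of that idea only; the real script may also build the reverse mapping, and the tuple is truncated here.

```python
# Sketch only: General_Category names -> small integers, in a fixed order.
GC_NAMES = ('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
            'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf')  # truncated
gc_order_sketch = {name: i for i, name in enumerate(GC_NAMES)}

assert gc_order_sketch['Cc'] == 0   # Control
assert gc_order_sketch['Lu'] == 9   # Uppercase_Letter
```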
@@ -83,6 +100,9 @@ for line in open(hb_common_h):
     sc_order[i] = tag
     sc_array.append(name)
 
+
+# Write out main data
+
 DEFAULT = 'DEFAULT'
 COMPACT = 'COMPACT'
 SLOPPY = 'SLOPPY'
@@ -109,6 +129,9 @@ print()
 print('#include "hb.hh"')
 print()
 
+
+# Write mapping data
+
 code = packTab.Code('_hb_ucd')
 sc_array, _ = code.addArray('hb_script_t', 'sc_map', sc_array)
 dm1_p0_array, _ = code.addArray('uint16_t', 'dm1_p0_map', dm1_p0_array)
@@ -125,6 +148,9 @@ datasets = [
     ('dm', dm, None, dm_order),
 ]
 
+
+# Write main data
+
 for step in (DEFAULT, COMPACT, SLOPPY):
     compression = compression_level[step]
     logging.info('  Compression=%d:' % compression)
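The DEFAULT / COMPACT / SLOPPY pass emits the same datasets several times at increasing compression, so the consumer can pick a size/speed trade-off. A rough sketch of that loop shape; the compression_level values and the pack() callback below are hypothetical stand-ins for the packTab machinery, not its real API.

```python
# Hypothetical stand-ins: the real script drives packTab here.
compression_level = {'DEFAULT': 1, 'COMPACT': 5, 'SLOPPY': 9}   # illustrative values

def emit_variants(datasets, pack):
    for step in ('DEFAULT', 'COMPACT', 'SLOPPY'):
        compression = compression_level[step]
        for name, data, default, mapping in datasets:   # e.g. ('dm', dm, None, dm_order)
            pack(name, data, default, mapping, compression)
```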
@@ -165,6 +191,7 @@ for step in (DEFAULT, COMPACT, SLOPPY):
 
     print()
 
+
 print('#endif')
 print()
 
@@ -129,12 +129,16 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 		hb_codepoint_t a, hb_codepoint_t b, hb_codepoint_t *ab,
 		void *user_data HB_UNUSED)
 {
+  // Hangul is handled algorithmically.
   if (_hb_ucd_compose_hangul (a, b, ab)) return true;
 
   hb_codepoint_t u = 0;
 
   if ((a & 0xFFFFF800u) == 0x0000u && (b & 0xFFFFFF80) == 0x0300u)
   {
+    /* If "a" is small enough and "b" is in the U+0300 range,
+     * the composition data is encoded in a 32bit array sorted
+     * by "a,b" pair. */
     uint32_t k = HB_CODEPOINT_ENCODE3_11_7_14 (a, b, 0);
     const uint32_t *v = hb_bsearch (k,
                                     _hb_ucd_dm2_u32_map,
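Since the comment added above is terse, here is a sketch of the 32-bit compose path in Python. The 11/7/14 bit split is inferred from the macro name HB_CODEPOINT_ENCODE3_11_7_14 and should be treated as an assumption rather than the verified HarfBuzz layout; bisect stands in for hb_bsearch.

```python
from bisect import bisect_left

def encode_11_7_14(a, b, ab):
    # a < 0x800 fits in 11 bits; b is in U+0300..U+037F, so (b & 0x7F) fits in 7 bits;
    # the composed codepoint goes in the low 14 bits.  Assumed layout, see lead-in.
    return ((a & 0x7FF) << 21) | ((b & 0x7F) << 14) | (ab & 0x3FFF)

def compose_u32(sorted_map, a, b):
    k = bisect_key = encode_11_7_14(a, b, 0)
    i = bisect_left(sorted_map, k)                 # binary search on the (a, b) key bits
    if i < len(sorted_map) and (sorted_map[i] >> 14) == (k >> 14):
        return sorted_map[i] & 0x3FFF              # low bits hold the composition
    return None

# Toy one-entry table: U+0041 'A' + U+0300 (combining grave) -> U+00C0 'À'.
table = sorted([encode_11_7_14(0x0041, 0x0300, 0x00C0)])
assert compose_u32(table, 0x0041, 0x0300) == 0x00C0
```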
@@ -146,6 +150,8 @@ hb_ucd_compose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   }
   else
   {
+    /* Otherwise it is stored in a 64bit array sorted by
+     * "a,b" pair. */
     uint64_t k = HB_CODEPOINT_ENCODE3 (a, b, 0);
     const uint64_t *v = hb_bsearch (k,
                                     _hb_ucd_dm2_u64_map,
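For pairs that don't fit the compact 32-bit form, the lookup falls back to a 64-bit key sorted the same way. A minimal sketch of such a key, assuming 21 bits per field (enough for any Unicode codepoint); the exact HB_CODEPOINT_ENCODE3 layout is not copied from HarfBuzz.

```python
def encode3(a, b, ab):
    # Assumed layout: three 21-bit fields packed into one sortable 64-bit key.
    return (a << 42) | (b << 21) | ab

# Keys order by (a, b), so a binary search for (a, b, 0) lands on the stored entry.
assert encode3(0x1100, 0x1161, 0) < encode3(0x1100, 0x1162, 0) < encode3(0x1101, 0x0000, 0)
```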
@@ -170,15 +176,22 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
 
   unsigned i = _hb_ucd_dm (ab);
 
+  /* If no data, there's no decomposition. */
   if (likely (!i)) return false;
   i--;
 
+  /* Check if it's a single-character decomposition. */
   if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map))
   {
+    /* Single-character decompositions currently are only in plane 0 or plane 2. */
     if (i < ARRAY_LENGTH (_hb_ucd_dm1_p0_map))
+    {
+      /* Plane 0. */
       *a = _hb_ucd_dm1_p0_map[i];
+    }
     else
     {
+      /* Plane 2. */
       i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map);
       *a = 0x20000 | _hb_ucd_dm1_p2_map[i];
     }
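The decompose side walks a single index across several tables in a fixed order. A sketch of the single-character dispatch documented in this hunk; the table contents below are toy values, not real UCD data.

```python
def decompose_single(i, dm1_p0_map, dm1_p2_map):
    if i < len(dm1_p0_map):
        return dm1_p0_map[i]             # plane-0 targets are stored as full codepoints
    i -= len(dm1_p0_map)
    return 0x20000 | dm1_p2_map[i]       # plane-2 targets store only the low 16 bits

dm1_p0_map = [0x212B]                    # toy entry
dm1_p2_map = [0xF800]                    # stands for U+2F800
assert decompose_single(0, dm1_p0_map, dm1_p2_map) == 0x212B
assert decompose_single(1, dm1_p0_map, dm1_p2_map) == 0x2F800
```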
@@ -187,8 +200,10 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   }
   i -= ARRAY_LENGTH (_hb_ucd_dm1_p0_map) + ARRAY_LENGTH (_hb_ucd_dm1_p2_map);
 
+  /* Otherwise they are encoded either in a 32bit array or a 64bit array. */
   if (i < ARRAY_LENGTH (_hb_ucd_dm2_u32_map))
   {
+    /* 32bit array. */
     uint32_t v = _hb_ucd_dm2_u32_map[i];
     *a = HB_CODEPOINT_DECODE3_11_7_14_1 (v);
     *b = HB_CODEPOINT_DECODE3_11_7_14_2 (v);
@@ -196,6 +211,7 @@ hb_ucd_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED,
   }
   i -= ARRAY_LENGTH (_hb_ucd_dm2_u32_map);
 
+  /* 64bit array. */
   uint64_t v = _hb_ucd_dm2_u64_map[i];
   *a = HB_CODEPOINT_DECODE3_1 (v);
   *b = HB_CODEPOINT_DECODE3_2 (v);