harfbuzz/src/gen-ucd-table.py

165 lines
5.3 KiB
Python
Raw Normal View History

2019-05-14 18:07:20 +02:00
#!/usr/bin/env python
from __future__ import print_function, division, absolute_import
2019-05-20 23:17:38 +02:00
import io, os.path, sys, re
2019-06-20 02:34:12 +02:00
import logging
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
2019-05-14 18:07:20 +02:00
# Exactly one command-line argument is expected: the path of the UCD XML file
# to read.  Anything else prints the usage line to stderr and exits with 1.
# The usage text names the script with its .py suffix, matching the command
# line that is echoed into the generated header.
if len(sys.argv) != 2:
    print("usage: ./gen-ucd-table.py ucd.nounihan.grouped.xml", file=sys.stderr)
    sys.exit(1)
2019-05-14 18:07:20 +02:00
2019-05-24 02:39:04 +02:00
# https://github.com/harfbuzz/packtab
import packTab
import packTab.ucdxml
2019-05-14 18:07:20 +02:00
2019-06-20 02:34:12 +02:00
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

logging.info('Preparing data tables...')

# Per-code-point property columns.  The position of an entry in `ucd` is used
# as its code point, so every list below is indexed by code point.
#   gc  - the 'gc' attribute, as a string
#   ccc - the 'ccc' attribute, as an int
#   bmg - the 'bmg' attribute (a hex code point) stored as a signed offset from
#         the code point itself, or 0 when the attribute is empty
#   sc  - the 'sc' attribute, as a string
gc, ccc, bmg, sc = [], [], [], []
for cp, props in enumerate(ucd):
    gc.append(props['gc'])
    ccc.append(int(props['ccc']))
    mirror = props['bmg']
    bmg.append(int(mirror, 16) - cp if mirror else 0)
    sc.append(props['sc'])

# Exploratory one-liners kept from the original for reference:
#gc_ccc_non0 = set((cat,klass) for cat,klass in zip(gc,ccc) if klass)
#gc_bmg_non0 = set((cat,mirr) for cat,mirr in zip(gc, bmg) if mirr)
2019-05-14 18:07:20 +02:00
# Canonical decomposition mappings, keyed by code point; each value is a tuple
# of code points parsed from the hex words of the 'dm' attribute.  Skipped are:
#   - entries whose 'dm' is '#' (UCDXML shorthand for "the code point itself"),
#   - entries whose decomposition type 'dt' is not 'can',
#   - the 11172 code points starting at U+AC00 (presumably the Hangul
#     syllables, which the consumer decomposes algorithmically - TODO confirm).
dm = {cp: tuple(int(word, 16) for word in props['dm'].split())
      for cp, props in enumerate(ucd)
      if props['dm'] != '#' and props['dt'] == 'can'
      and not (0xAC00 <= cp < 0xAC00 + 11172)}

# Code points whose 'Comp_Ex' attribute is 'Y'.
ce = {cp for cp, props in enumerate(ucd) if props['Comp_Ex'] == 'Y'}

# Every mapping must consist of one or two code points.  The check is made on
# the length itself: testing the tuple's truthiness would let an empty mapping
# through unnoticed, and it would later have no entry in dm_order.
assert all(len(seq) in (1, 2) for seq in dm.values())

# --- Single-code-point decompositions ---------------------------------------
# Sorted and de-duplicated.  Only planes 0 and 2 are representable: each plane
# gets its own array holding the low 16 bits as a C literal.
dm1 = sorted(set(seq for seq in dm.values() if len(seq) == 1))
assert all((seq[0] >> 16) in (0, 2) for seq in dm1)
dm1_p0_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (seq[0] & 0xFFFF) for seq in dm1 if (seq[0] >> 16) == 2]
# Index 0 is reserved for "no decomposition" (see dm_order below), so singles
# are numbered from 1.  Because dm1 is sorted, plane-0 entries precede plane-2
# entries, i.e. the numbering follows dm1_p0_array then dm1_p2_array.
dm1_order = {seq: idx + 1 for idx, seq in enumerate(dm1)}

# --- Two-code-point decompositions ------------------------------------------
# Each element is (triple, pair).  The triple is the pair plus a third value:
# the decomposing code point itself, or 0 when that code point is in `ce` or
# has a non-zero ccc (presumably meaning "do not recompose" - TODO confirm).
dm2 = sorted((pair + (cp if cp not in ce and not ccc[cp] else 0,), pair)
             for cp, pair in dm.items() if len(pair) == 2)


def filt(triple):
    """Return True when *triple* qualifies for the packed 11/7/14 encoding.

    The first value must be below 0x800, the second within 0x0300..0x037F.
    NOTE(review): the third mask (0xFFF0C000) leaves bits 16-19 unchecked, so
    it is not a strict 14-bit test; confirm whether 0xFFFFC000 was intended.
    """
    return ((triple[0] & 0xFFFFF800) == 0x0000 and
            (triple[1] & 0xFFFFFF80) == 0x0300 and
            (triple[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [entry for entry in dm2 if filt(entry[0])]
dm2_u64_array = [entry for entry in dm2 if not filt(entry[0])]
# The packed entries must form a prefix of the sorted list; otherwise the
# indices handed out below would not line up with "u32 array, then u64 array".
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % entry[0] for entry in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % entry[0] for entry in dm2_u64_array]

# Pair indices continue after the reserved 0 and all single-code-point entries.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {entry[1]: idx + dm2_base for idx, entry in enumerate(dm2)}

# Combined mapping from a decomposition tuple (or None for "none") to its index.
dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)
2019-05-20 21:47:49 +02:00
2019-05-20 22:57:04 +02:00
# General-category codes in the order their indices should be handed out.
# Looking a key up in the AutoMapping is what registers it, so the bare
# subscript in the loop is intentional.
gc_order = packTab.AutoMapping()
for category in ('Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
                 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
                 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',):
    gc_order[category]
2019-05-14 18:07:20 +02:00
2019-05-20 23:17:38 +02:00
# Build the script mapping from hb-common.h (read from the current directory).
# Every line that pairs an HB_SCRIPT_* name with a four-character HB_TAG
# contributes one entry: the tag is registered in sc_order and the enum name is
# appended to sc_array at the same index.
sc_order = packTab.AutoMapping()
sc_array = []
sc_re = re.compile(r" (HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Use a context manager so the header is closed as soon as it has been scanned.
with open('hb-common.h') as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = sc_order[tag]
        # A tag seen for the first time must receive the next free index;
        # otherwise sc_array and sc_order would fall out of step.
        assert i == len(sc_array)
        sc_array.append(name)
2019-05-21 19:02:54 +02:00
# TODO: Currently, if gc_order or sc_order does not capture all values, we get
# into trouble because they silently add new values. We should be able to
# "freeze" them, or just do the mapping ourselves.
2019-05-20 21:47:49 +02:00
# Compression levels passed to packTab.pack_table(); one table variant is
# generated per level:
#   DEFAULT - emitted under `#ifndef HB_OPTIMIZE_SIZE`
#   COMPACT - emitted under `#elif !defined(HB_NO_UCD_UNASSIGNED)`
#   SLOPPY  - emitted under `#else`, after unassigned gc/sc entries have been
#             overwritten with neighbouring values
DEFAULT, COMPACT, SLOPPY = 1, 3, 5
2019-05-22 21:46:19 +02:00
2019-06-20 02:34:12 +02:00
logging.info('Generating output...')

# Banner comment recording how the file was produced.
print("\n".join([
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-ucd-table.py ucd.nounihan.grouped.xml",
    " *",
]))
# Printed with two arguments so the description is converted by print() itself.
print(" * on file with this description:", ucdxml.description)
# End of banner, include guard and the single #include, each followed by a
# blank line (the trailing empty string yields the final blank line).
print("\n".join([
    " */",
    "",
    "#ifndef HB_UCD_TABLE_HH",
    "#define HB_UCD_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
]))
2019-05-22 21:46:19 +02:00
2019-05-20 23:29:13 +02:00
# Arrays printed once, ahead of the per-compression variants:
# (C element type, array name, values).
code = packTab.Code('_hb_ucd')
array_specs = (
    ('hb_script_t', 'sc_map', sc_array),
    ('uint16_t', 'dm1_p0_map', dm1_p0_array),
    ('uint16_t', 'dm1_p2_map', dm1_p2_array),
    ('uint32_t', 'dm2_u32_map', dm2_u32_array),
    ('uint64_t', 'dm2_u64_map', dm2_u64_array),
)
registered = []
for c_type, label, values in array_specs:
    # addArray() returns a pair; only its first item is kept.
    first, _ = code.addArray(c_type, label, values)
    registered.append(first)
# Rebind the Python names to what addArray() returned, as the original did.
sc_array, dm1_p0_array, dm1_p2_array, dm2_u32_array, dm2_u64_array = registered

code.print_c(linkage='static inline')
2019-06-20 02:34:12 +02:00
def _spread_into_defaults(values, default, block=128):
    """Overwrite *default* entries of *values* in place with a neighbour's value.

    A forward pass copies the preceding entry into every slot holding
    *default*; a backward pass then copies the following entry into any slot
    still holding it.  No copy is made into an index that is a multiple of
    *block* (forward) or sits just before one (backward), so values never
    cross a block boundary and an all-*default* block is left unchanged.
    """
    for pos in range(len(values)):
        if (pos % block) and values[pos] == default:
            values[pos] = values[pos - 1]
    for pos in range(len(values) - 2, -1, -1):
        if ((pos + 1) % block) and values[pos] == default:
            values[pos] = values[pos + 1]


# One row per generated lookup: (name, data, default value, value mapping).
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]

# Preprocessor line opening each variant; any level not listed gets '#else'.
variant_guards = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
}

for compression in (DEFAULT, COMPACT, SLOPPY):
    logging.info(' Compression=%d:' % compression)
    print()
    print(variant_guards.get(compression, '#else'))
    print()

    if compression == SLOPPY:
        # gc and sc are modified in place, and `datasets` holds the same list
        # objects, so this must stay the last variant that is generated.
        _spread_into_defaults(gc, 'Cn')
        _spread_into_defaults(sc, 'Zzzz')

    code = packTab.Code('_hb_ucd')
    for name, data, default, mapping in datasets:
        solution = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info(' Dataset=%-8s FullCost=%d' % (name, solution.fullCost))
        solution.genCode(code, name)
    code.print_c(linkage='static inline')
    print()

# Close the #ifndef / #elif / #else chain opened inside the loop.
print('#endif')
print()
2019-05-22 21:46:19 +02:00
# Blank line, end of the include guard, blank line, closing marker.
print("\n".join([
    "",
    "#endif /* HB_UCD_TABLE_HH */",
    "",
    "/* == End of generated table == */",
]))

logging.info('Done.')