Start of gen-ucd.py, to replace UCDN
This commit is contained in:
parent
02e5e5d939
commit
b4eff38397
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from __future__ import print_function, division, absolute_import
|
||||||
|
|
||||||
|
import io, os.path, sys
|
||||||
|
|
||||||
|
# The script takes exactly one command-line argument, which is later passed
# to the UCD XML loader.  Anything else prints the usage line on stderr and
# terminates with exit status 1.
if len(sys.argv) != 2:
    sys.stderr.write("usage: ./gen-ucd ucdxml-file\n")
    raise SystemExit(1)
|
||||||
|
|
||||||
|
import youseedy, packTab
|
||||||
|
|
||||||
|
# Load the records from the file named on the command line.  The code below
# iterates `ucd` several times and uses each record's position as its code
# point, so it assumes one record per code point, in order (TODO confirm
# against youseedy.load_ucdxml).
ucd = youseedy.load_ucdxml(sys.argv[1])

# Per-record property columns, in record order.  The keys look like the UCD
# XML short attribute names (gc, ccc, sc, bmg, dm, dt) -- see UAX #42.
gc = [rec['gc'] for rec in ucd]
ccc = [int(rec['ccc']) for rec in ucd]
sc = [rec['sc'] for rec in ucd]

# 'bmg' is a hex code point string or empty.  Store it as a signed delta from
# the record's own index, with 0 meaning "no value".
bmg = [
    int(mirror, 16) - cp if mirror else 0
    for cp, mirror in enumerate(rec['bmg'] for rec in ucd)
]

# Decomposition mappings, keyed by record index, as tuples of code points.
# Kept only when 'dm' is not the '#' placeholder and 'dt' is 'can'.  The
# 11172 indices starting at 0xAC00 are skipped (the precomposed Hangul
# syllable range; presumably decomposed algorithmically elsewhere).
dm = {
    cp: tuple(int(hexcp, 16) for hexcp in rec['dm'].split())
    for cp, rec in enumerate(ucd)
    if rec['dm'] != '#'
    and rec['dt'] == 'can'
    and (cp < 0xAC00 or cp >= 0xAC00 + 11172)
}
|
||||||
|
|
||||||
|
# Distinct values of each property, and distinct (gc, other) pairs where the
# other property is non-zero.
gc_set = set(gc)
gc_ccc_non0 = {(category, combining) for category, combining in zip(gc, ccc) if combining}
gc_bmg_non0 = {(category, delta) for category, delta in zip(gc, bmg) if delta}
sc_set = set(sc)

# Split the decomposition mappings by length.
dm2 = {seq for seq in dm.values() if len(seq) == 2}
dm2diff = {second - first for first, second in dm2}
dm1 = {seq[0] for seq in dm.values() if len(seq) == 1}

# Every retained mapping is expected to have exactly one or two code points;
# anything else lands in dmx and trips the assertion.
dmx = {seq for seq in dm.values() if not 1 <= len(seq) <= 2}
assert not dmx
|
||||||
|
|
||||||
|
# Print summary statistics on stdout.
#
# The counts are taken directly from the sets: sorting a set only to measure
# its length does needless work and can raise TypeError on unorderable
# elements, while the resulting number is the same.
#
# The "fit in one byte" lines are fixed messages; the script does not check
# the counts against 256.
print(len(gc_set))
print(len(gc_ccc_non0))
print(len(gc_bmg_non0))
print("GC, CCC, and BMG fit in one byte. Compress together.")
print()

print(len(sc_set))
print("SC fits in one byte. Compress separately.")
print()

# Decomposition statistics.  min()/max() raise ValueError if dm1 is empty.
print(len(dm))
print(len(dm1), min(dm1), max(dm1))
print(len(dm2))
# dm2diff is computed above but its dump is currently disabled:
#print(sorted(dm2diff))
# Number of distinct 512-wide buckets (value // 512) that the single code
# point decomposition targets fall into.
print(len({target // 512 for target in dm1}))
|
Loading…
Reference in New Issue