#!/usr/bin/env python3
#
# fontconfig/fc-lang/fc-lang.py
#
# Copyright © 2001-2002 Keith Packard
# Copyright © 2019 Tim-Philipp Müller
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of the author(s) not be used in
# advertising or publicity pertaining to distribution of the software without
# specific, written prior permission. The authors make no
# representations about the suitability of this software for any purpose. It
# is provided "as is" without express or implied warranty.
#
# THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
# EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
# fc-lang
#
# Read a set of language orthographies and build C declarations for
# charsets which can then be used to identify which languages are
# supported by a given font.
#
# TODO: this code is not very pythonic; a lot of it is a 1:1 translation
# of the C code and we could probably simplify it a bit
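#
# Typical invocation (illustrative; the file names are examples and the flags
# correspond to the argparse options declared in the main block below):
#
#   ./fc-lang.py --directory fc-lang \
#                --template fclang.tmpl.h \
#                --output fclang.h \
#                aa.orth ab.orth ... zu.orth
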
import argparse
import string
import sys
import os
# we just store the leaves in a dict, we can order the leaves later if needed
class CharSet:
    def __init__(self):
        self.leaves = {}  # leaf_number -> leaf data (256 bits = 8 uint32)

    def add_char(self, ucs4):
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
        else:
            leaf = [0, 0, 0, 0, 0, 0, 0, 0]  # 256/32 = 8
            self.leaves[leaf_num] = leaf
        leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f))
        #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))

    def del_char(self, ucs4):
        assert ucs4 < 0x01000000
        leaf_num = ucs4 >> 8
        if leaf_num in self.leaves:
            leaf = self.leaves[leaf_num]
            leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f))
            # We don't bother removing the leaf if it's empty
        #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf))

    def equals(self, other_cs):
        keys = sorted(self.leaves.keys())
        other_keys = sorted(other_cs.leaves.keys())
        if len(keys) != len(other_keys):
            return False
        for k1, k2 in zip(keys, other_keys):
            if k1 != k2:
                return False
            if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]):
                return False
        return True
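
# Illustrative sketch (not executed by this script): how one code point maps
# into the CharSet leaf bitmaps above, using U+1E0D as an example:
#
#   cs = CharSet()
#   cs.add_char(0x1E0D)
#   # leaf number:      0x1E0D >> 8          == 0x1e
#   # word in the leaf: (0x1E0D & 0xff) >> 5 == 0
#   # bit in the word:  0x1E0D & 0x1f        == 0x0d
#   assert cs.leaves[0x1e][0] == 1 << 0x0d
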
# Convert a file name into a name suitable for C declarations
def get_name(file_name):
    return file_name.split('.')[0]
# Convert a C name into a language name
def get_lang(c_name):
    return c_name.replace('_', '-').replace(' ', '').lower()
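
# Illustrative examples (file name chosen for illustration):
#
#   >>> get_name('pap_aw.orth')
#   'pap_aw'
#   >>> get_lang('pap_aw')
#   'pap-aw'
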
def read_orth_file(file_name):
    lines = []
    with open(file_name, 'r', encoding='utf-8') as orth_file:
        for num, line in enumerate(orth_file):
            if line.startswith('include '):
                include_fn = line[8:].strip()
                lines += read_orth_file(include_fn)
            else:
                # remove comments and strip whitespaces
                line = line.split('#')[0].strip()
                line = line.split('\t')[0].strip()
                # skip empty lines
                if line:
                    lines += [(file_name, num, line)]
    return lines
def leaves_equal(leaf1, leaf2):
    for v1, v2 in zip(leaf1, leaf2):
        if v1 != v2:
            return False
    return True
# Build a single charset from a source file
#
# The file format is simple: each line holds a single hex code point, a range
# written as "start-end" or "start..end", an "include <file>" directive, or a
# leading '-' to remove a code point or range again.
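#
# An illustrative fragment (not one of the real orthography files):
#
#   include latn.orth
#   0041-005a    # a range of code points (A-Z)
#   00e9         # a single code point
#   -00f0        # remove a code point again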
def parse_orth_file(file_name, lines):
    charset = CharSet()
    for fn, num, line in lines:
        delete_char = line.startswith('-')
        if delete_char:
            line = line[1:]
        if line.find('-') != -1:
            parts = line.split('-')
        elif line.find('..') != -1:
            parts = line.split('..')
        else:
            parts = [line]
        start = int(parts.pop(0), 16)
        end = start
        if parts:
            end = int(parts.pop(0), 16)
        if parts:
            print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num))
        for ucs4 in range(start, end+1):
            if delete_char:
                charset.del_char(ucs4)
            else:
                charset.add_char(ucs4)
    assert charset.equals(charset)  # sanity check for the equals function
    return charset
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('orth_files', nargs='+', help='List of .orth files')
    parser.add_argument('--directory', dest='directory', default=None)
    parser.add_argument('--template', dest='template_file', default=None)
    parser.add_argument('--output', dest='output_file', default=None)
    args = parser.parse_args()

    sets = []
    names = []
    langs = []
    country = []
    total_leaves = 0
    LangCountrySets = {}

    # Open output file
    if args.output_file:
        sys.stdout = open(args.output_file, 'w', encoding='utf-8')

    # Read the template file
    if args.template_file:
        tmpl_file = open(args.template_file, 'r', encoding='utf-8')
    else:
        tmpl_file = sys.stdin

    # Change into source dir if specified (after opening other files)
    if args.directory:
        os.chdir(args.directory)

    orth_entries = {}
    for i, fn in enumerate(args.orth_files):
        orth_entries[fn] = i
    for fn in sorted(orth_entries.keys()):
        lines = read_orth_file(fn)
        charset = parse_orth_file(fn, lines)
        sets.append(charset)
        name = get_name(fn)
        names.append(name)
        lang = get_lang(name)
        langs.append(lang)
        if lang.find('-') != -1:
            country.append(orth_entries[fn])  # maps to original index
            language_family = lang.split('-')[0]
            if language_family not in LangCountrySets:
                LangCountrySets[language_family] = []
            LangCountrySets[language_family] += [orth_entries[fn]]
        total_leaves += len(charset.leaves)
    # Find unique leaves
    leaves = []
    for s in sets:
        for leaf_num in sorted(s.leaves.keys()):
            leaf = s.leaves[leaf_num]
            is_unique = True
            for existing_leaf in leaves:
                if leaves_equal(leaf, existing_leaf):
                    is_unique = False
                    break
            #print('unique: ', is_unique)
            if is_unique:
                leaves.append(leaf)

    # Find duplicate charsets
    duplicate = []
    for i, s in enumerate(sets):
        dup_num = None
        if i >= 1:
            for j, s_cmp in enumerate(sets):
                if j >= i:
                    break
                if s_cmp.equals(s):
                    dup_num = j
                    break
        duplicate.append(dup_num)
    tn = 0
    off = {}
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        off[i] = tn
        tn += len(s.leaves)
    # Scan the input until the marker is found
    # FIXME: this is a bit silly really, might just as well hardcode
    # the license header in the script and drop the template
    for line in tmpl_file:
        if line.strip() == '@@@':
            break
        print(line, end='')

    print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves)))
    print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets)))
    print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves)))
    print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn))
    print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))')
    print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))')
    print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))')
    print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))')
    print('#define fcLangCharSets (fcLangData.langCharSets)')
    print('#define fcLangCharSetIndices (fcLangData.langIndices)')
    print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)')
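
    # Note on the macros above (derived from their definitions): SET(s) is the
    # byte offset of langCharSets[s].charset inside fcLangData, and OFF(s,o) /
    # NUM(s,n) are offsets from that member to leaf_offsets[o] / numbers[n],
    # so the emitted charsets reference their data by relative offset rather
    # than by pointer.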
    assert len(sets) < 65536  # FIXME: need to change index type to 32-bit below then

    print('''
static const struct {{
    FcLangCharSet langCharSets[{}];
    FcCharLeaf leaves[{}];
    uintptr_t leaf_offsets[{}];
    FcChar16 numbers[{}];
    {} langIndices[{}];
    {} langIndicesInv[{}];
}} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn,
                             'FcChar16 ', len(sets), 'FcChar16 ', len(sets)))
    # Dump sets
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            j = duplicate[i]
        else:
            j = i
        print(' {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format(
            langs[i], len(sets[j].leaves), i, off[j], i, off[j], i))
    print('},')
    # Dump leaves
    print('{')
    for l, leaf in enumerate(leaves):
        print(' {{ {{ /* {} */'.format(l), end='')
        for i in range(0, 8):  # 256/32 = 8
            if i % 4 == 0:
                print('\n ', end='')
            print(' 0x{:08x},'.format(leaf[i]), end='')
        print('\n } },')
    print('},')
    # Dump leaf offsets
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        print(' /* {} */'.format(names[i]))
        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 4 == 0:
                print(' ', end='')
            found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf, leaf)]
            assert found, "Couldn't find leaf in unique leaves list!"
            assert len(found) == 1
            print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='')
            if n % 4 == 3:
                print('')
        if len(s.leaves) % 4 != 0:
            print('')
    print('},')
    # Dump leaf numbers
    print('{')
    for i, s in enumerate(sets):
        if duplicate[i] is not None:
            continue
        print(' /* {} */'.format(names[i]))
        for n, leaf_num in enumerate(sorted(s.leaves.keys())):
            leaf = s.leaves[leaf_num]
            if n % 8 == 0:
                print(' ', end='')
            print(' 0x{:04x},'.format(leaf_num), end='')
            if n % 8 == 7:
                print('')
        if len(s.leaves) % 8 != 0:
            print('')
    print('},')
    # langIndices
    print('{')
    for i, s in enumerate(sets):
        fn = '{}.orth'.format(names[i])
        print(' {}, /* {} */'.format(orth_entries[fn], names[i]))
    print('},')

    # langIndicesInv
    print('{')
    for i, k in enumerate(orth_entries.keys()):
        name = get_name(k)
        idx = names.index(name)
        print(' {}, /* {} */'.format(idx, name))
    print('}')
    print('};\n')

    print('#define NUM_LANG_CHAR_SET {}'.format(len(sets)))
    num_lang_set_map = (len(sets) + 31) // 32
    print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map))
    # Dump indices with country codes
    assert len(country) > 0
    assert len(LangCountrySets) > 0
    print('')
    print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {')
    for k in sorted(LangCountrySets.keys()):
        langset_map = [0] * num_lang_set_map  # initialise all zeros
        for entries_id in LangCountrySets[k]:
            langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f))
        print(' {', end='')
        for v in langset_map:
            print(' 0x{:08x},'.format(v), end='')
        print(' }}, /* {} */'.format(k))
    print('};\n')
    print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets)))
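
    # Illustrative example of the bitmap built above: if a territory-specific
    # entry (say zh_tw.orth) ended up at index 37, it would set bit 5 of word 1
    # of its family's row, since 37 >> 5 == 1 and 37 & 0x1f == 5.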
    # Find ranges for each letter for faster searching
    # Dump sets start/finish for the fastpath
    print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n')
    for c in string.ascii_lowercase:  # a-z
        start = 9999
        stop = -1
        for i, s in enumerate(sets):
            if names[i].startswith(c):
                start = min(start, i)
                stop = max(stop, i)
        print(' {{ {}, {} }}, /* {} */'.format(start, stop, c))
    print('};\n')
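
    # Illustrative example: if the sets for languages starting with 'k' occupy
    # indices 100..120, a lookup for "kn" only has to scan that slice of
    # fcLangCharSets rather than the whole table.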
    # And flush out the rest of the input file
    for line in tmpl_file:
        print(line, end='')
    sys.stdout.flush()