287 lines
9.1 KiB
Python
287 lines
9.1 KiB
Python
|
#!/usr/bin/python
|
||
|
|
||
|
"""Generator of the function to prohibit certain vowel sequences.
|
||
|
|
||
|
It creates ``preprocess_text_vowel_constraints``, which inserts dotted
|
||
|
circles into sequences prohibited by the USE script development spec.
|
||
|
This function should be used as the ``preprocess_text`` of an
|
||
|
``hb_ot_complex_shaper_t``.
|
||
|
|
||
|
It also creates the helper function ``_output_with_dotted_circle``.
|
||
|
"""
|
||
|
|
||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||
|
|
||
|
import collections
|
||
|
# Pick the HTML parser class from whichever module provides it, and define
# ``write`` to match: it emits text to standard output as UTF-8 bytes without
# appending a newline.
try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser

    def write (s):
        """Write ``s`` to standard output as UTF-8 bytes.

        Args:
            s: The text to emit; it is encoded with ``s.encode ('utf-8')``.
        """
        # ``sys`` is imported further down in this file; it is only needed
        # once ``write`` is actually called.
        # Flush the text layer first so earlier ``print`` output stays ahead
        # of the bytes written straight to the underlying buffer.
        sys.stdout.flush ()
        sys.stdout.buffer.write (s.encode ('utf-8'))
else:
    def write (s):
        """Print ``s`` encoded as UTF-8, without a trailing newline.

        Args:
            s: The text to emit; it is encoded with ``s.encode ('utf-8')``.
        """
        print (s.encode ('utf-8'), end='')
|
||
|
import itertools
|
||
|
import io
|
||
|
import sys
|
||
|
|
||
|
# The script takes exactly two positional arguments (``sys.argv[1]`` and
# ``sys.argv[2]``); anything else prints the usage line to stderr and exits
# with status 1.
argument_count = len (sys.argv) - 1
if argument_count != 2:
    print ('usage: ./gen-vowel-constraints.py use Scripts.txt', file=sys.stderr)
    sys.exit (1)
|
||
|
|
||
|
# ``html_unescape`` decodes one character or entity reference.  It prefers the
# module-level ``html.unescape`` and falls back to the parser's own
# ``unescape`` method when that import is unavailable.
try:
    from html import unescape
except ImportError:
    def html_unescape (parser, entity):
        """Decode ``entity`` using ``parser.unescape``.

        Args:
            parser: The parser object whose ``unescape`` method is called.
            entity (str): The reference to decode, e.g. ``'&amp;'``.

        Returns:
            Whatever ``parser.unescape (entity)`` returns.
        """
        return parser.unescape (entity)
else:
    def html_unescape (parser, entity):
        """Decode ``entity`` using ``html.unescape``.

        Args:
            parser: Unused; kept so both variants share one signature.
            entity (str): The reference to decode, e.g. ``'&amp;'``.

        Returns:
            Whatever ``html.unescape (entity)`` returns.
        """
        return unescape (entity)
|
||
|
|
||
|
def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Args:
        condition: The value to check.
        message: Optional argument for the raised ``AssertionError``.  When
            it is ``None`` the exception is raised without arguments.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    if condition:
        return
    if message is None:
        raise AssertionError
    raise AssertionError (message)
|
||
|
|
||
|
# Read the file named by the second command-line argument and build:
#   scripts_header: its first two lines, kept verbatim;
#   scripts:        code point -> value of the second ``;``-separated field;
#   script_order:   that value -> start of the first range seen for it.
with io.open (sys.argv[2], encoding='utf-8') as f:
    scripts_header = [f.readline () for _ in range (2)]
    scripts = {}
    script_order = {}
    for line in f:
        # Everything from the first ``#`` onwards is a comment.
        content = line.split ('#', 1)[0]
        fields = [field.strip () for field in content.split (';')]
        if len (fields) == 1:
            # No ``;`` on the line, so it carries no data.
            continue
        # The first field is either one code point or ``start..end``, in hex.
        bounds = fields[0].split ('..')
        start = int (bounds[0], 16)
        end = start if len (bounds) == 1 else int (bounds[1], 16)
        script = fields[1]
        scripts.update (dict.fromkeys (range (start, end + 1), script))
        # Only the first range seen for a script determines its order.
        script_order.setdefault (script, start)
|
||
|
|
||
|
class ConstraintSet (object):
    """A set of prohibited code point sequences.

    The set is stored as a prefix tree: a node is either a single remaining
    sequence (a list) or a mapping from the next code point to the
    ``ConstraintSet`` of what may follow it (a dictionary).  ``str ()`` of a
    set renders the generated code that tests a buffer against it.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        A sequence that is a prefix of a stored one replaces it, and a
        sequence that extends a stored one is ignored, because the shorter
        sequence already covers the longer one.

        Args:
            constraint (List[int]): The prohibited code point sequence to
                add.  An empty sequence is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # The sequences diverge: turn this node into a branch.  (When
                # the stored sequence is a prefix of the new one, nothing
                # needs to change.)
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if first not in self._c:
                self._c[first] = ConstraintSet (rest)
            elif rest:
                self._c[first].add (rest)
            else:
                # The new sequence ends exactly at an existing branch, so it
                # is a prefix of everything stored below it.  Collapse that
                # subtree to "always matches" instead of silently dropping
                # the shorter sequence.
                self._c[first]._c = []

    def _indent (self, depth):
        """Return the leading whitespace for nesting level ``depth``."""
        # NOTE(review): as written, both literals are a single space, so the
        # result is ``depth`` tab characters.  If wider runs of spaces were
        # intended here (e.g. collapsed by a formatting tool), restore them
        # -- TODO confirm.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Render the generated code that tests for this set.

        Args:
            index (int): Offset, relative to the current buffer position, of
                the code point tested by this node.
            depth (int): Nesting level passed to ``_indent``.

        Returns:
            str: The generated source text, one statement per line.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                # Nothing left to compare: reaching this node is a match.
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index))
                s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), len (self._c)))
                last = len (self._c) - 1
                for i, cp in enumerate (self._c[1:], start=1):
                    # The final comparison closes the ``if`` condition.
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == last else ' &&'))
                s.append ('{}{{\n'.format (indent))
                # One ``next_glyph`` call per code point of the sequence.
                for _ in self._c:
                    s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}buffer->output_glyph (0x25CCu);\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group code points whose subtrees render identically so that
            # they can share one run of ``case`` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            # Order the groups by their smallest label.
            for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
                # Emit the labels four to a line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)
|
||
|
|
||
|
class USESpecParser (HTMLParser):
    """A parser for the USE script development spec.

    Attributes:
        header (str): The raw text of the ``<meta name="updated_at" ...>``
            start tag found in the spec, or ``''`` if none was seen.
        constraints (Mapping[str, ConstraintSet]): A map of script names
            to the scripts' prohibited sequences.
    """
    def __init__ (self):
        HTMLParser.__init__ (self)
        self.header = ''
        self.constraints = {}
        # Whether the next <code> contains the vowel constraints.
        self._primed = False
        # Whether the parser is in the <code> element with the constraints.
        self._in_constraints = False
        # The text of the constraints.
        self._constraints = ''

    def handle_starttag (self, tag, attrs):
        """Track the tags that locate the header and the constraints.

        Args:
            tag (str): The tag name.
            attrs (List[Tuple[str, str]]): The tag's attribute pairs.
        """
        if tag == 'meta':
            if ('name', 'updated_at') in attrs:
                self.header = self.get_starttag_text ()
        elif tag == 'a':
            # The anchor with this id precedes the <code> element that holds
            # the constraints.
            if ('id', 'ivdvconstraints') in attrs:
                self._primed = True
        elif self._primed and tag == 'code':
            self._primed = False
            self._in_constraints = True

    def handle_endtag (self, tag):
        """Stop collecting constraint text at any end tag."""
        self._in_constraints = False

    def handle_data (self, data):
        """Append ``data`` to the constraint text while inside the <code>."""
        if self._in_constraints:
            self._constraints += data

    def handle_charref (self, name):
        """Decode a numeric character reference and treat it as data."""
        self.handle_data (html_unescape (self, '&#%s;' % name))

    def handle_entityref (self, name):
        """Decode a named entity reference and treat it as data."""
        self.handle_data (html_unescape (self, '&%s;' % name))

    @staticmethod
    def _parse_sequence (line):
        """Return the hexadecimal code points before the first ``;`` of ``line``.

        The code points may be separated by any amount of whitespace.  A
        line with nothing before the ``;`` yields an empty list.
        """
        return [int (cp, 16) for cp in line.split (';')[0].split ()]

    def parse (self, filename):
        """Parse the USE script development spec.

        Fills ``header`` and ``constraints`` from the file.  Lines of the
        constraint text that contain no code points are skipped.

        Args:
            filename (str): The file name of the spec.

        Raises:
            AssertionError: If no header or no constraints are found, or if
                a prohibited sequence has fewer than two code points.
            KeyError: If the first code point of a sequence has no entry in
                the module-level ``scripts`` map.
        """
        with io.open (filename, encoding='utf-8') as f:
            self.feed (f.read ())
        expect (self.header, 'No header found')
        for line in self._constraints.splitlines ():
            constraint = self._parse_sequence (line)
            if not constraint:
                # Blank or comment-only line: nothing to record.
                continue
            expect (2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint))
            # A sequence is filed under the script of its first code point.
            script = scripts[constraint[0]]
            if script in self.constraints:
                self.constraints[script].add (constraint)
            else:
                self.constraints[script] = ConstraintSet (constraint)
        expect (self.constraints, 'No constraints found')
|
||
|
|
||
|
# Parse the spec named by the first command-line argument.
use_parser = USESpecParser ()
use_parser.parse (sys.argv[1])

# Opening banner: how the output was produced and from which inputs,
# followed by the include guard.  Each entry is printed on its own line; an
# empty entry prints a blank line.
banner = [
    '/* == Start of generated functions == */',
    '/*',
    ' * The following functions are generated by running:',
    ' *',
    ' * %s use Scripts.txt' % sys.argv[0],
    ' *',
    ' * on files with these headers:',
    ' *',
    ' * %s' % use_parser.header.strip (),
]
banner.extend (' * %s' % line.strip () for line in scripts_header)
banner.extend ([
    ' */',
    '',
    '#ifndef HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH',
    '#define HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH',
    '',
])
for banner_line in banner:
    print (banner_line)
|
||
|
|
||
|
# Emit the helper ``_output_with_dotted_circle``, one output line per entry
# (an empty entry prints a blank line).
# NOTE(review): the leading whitespace inside these literals only affects the
# layout of the emitted code; confirm it matches the intended indentation.
for helper_line in (
    'static void',
    '_output_with_dotted_circle (hb_buffer_t *buffer)',
    '{',
    ' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);',
    ' _hb_glyph_info_reset_continuation (&dottedcircle);',
    '',
    ' buffer->next_glyph ();',
    '}',
    '',
):
    print (helper_line)
|
||
|
|
||
|
# Emit ``preprocess_text_vowel_constraints``: a fixed prologue, one ``case``
# per script that has constraints, then a fixed epilogue with the closing
# include guard.  Each tuple entry is printed on its own line; an empty entry
# prints a blank line.
# NOTE(review): the leading whitespace inside these literals only affects the
# layout of the emitted code; confirm it matches the intended indentation.
for prologue_line in (
    'static void',
    'preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,',
    '\t\t\t\t hb_buffer_t *buffer,',
    '\t\t\t\t hb_font_t *font)',
    '{',
    ' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
    ' * vowel-sequences that look like another vowel. Data for each script',
    ' * collected from the USE script development spec.',
    ' *',
    ' * https://github.com/harfbuzz/harfbuzz/issues/1019',
    ' */',
    ' bool processed = false;',
    ' buffer->clear_output ();',
    ' unsigned int count = buffer->len;',
    ' switch ((unsigned) buffer->props.script)',
    ' {',
):
    print (prologue_line)

# Scripts are emitted in the order given by ``script_order``.
for script in sorted (use_parser.constraints, key=script_order.__getitem__):
    constraints = use_parser.constraints[script]
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    # The per-script matching code comes from the constraint set itself.
    write (str (constraints))
    for case_line in (
        '\tbuffer->next_glyph ();',
        '\tif (matched) _output_with_dotted_circle (buffer);',
        ' }',
        ' processed = true;',
        ' break;',
        '',
    ):
        print (case_line)

for epilogue_line in (
    ' default:',
    ' break;',
    ' }',
    ' if (processed)',
    ' {',
    ' if (buffer->idx < count)',
    ' buffer->next_glyph ();',
    ' if (likely (buffer->successful))',
    ' buffer->swap_buffers ();',
    ' }',
    '}',
    '',
    '#endif /* HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH */',
    '',
    '/* == End of generated functions == */',
):
    print (epilogue_line)
|