diff --git a/src/HBIndicVowelConstraints.txt b/src/HBIndicVowelConstraints.txt new file mode 100644 index 000000000..146ae1cb8 --- /dev/null +++ b/src/HBIndicVowelConstraints.txt @@ -0,0 +1,97 @@ +# Copied from https://docs.microsoft.com/en-us/typography/script-development/use +# On October 23, 2018; with document dated 02/07/2018. + + 0905 0946 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT E + 0905 093E ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AA + 0930 094D 0907 ; # DEVANAGARI LETTER RA, DEVANAGARI SIGN VIRAMA, DEVANAGARI LETTER I + 0909 0941 ; # DEVANAGARI LETTER U, DEVANAGARI VOWEL SIGN U + 090F 0945 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN CANDRA E + 090F 0946 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN SHORT E + 090F 0947 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN E + 0905 0949 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA O + 0906 0945 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN CANDRA E + 0905 094A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT O + 0906 0946 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN SHORT E + 0905 094B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN O + 0906 0947 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN E + 0905 094C ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AU + 0906 0948 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN AI + 0905 0945 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA E + 0905 093A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OE + 0905 093B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OOE + 0906 093A ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN OE + 0905 094F ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AW + 0905 0956 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UE + 0905 0957 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UUE + 0985 09BE ; # BENGALI LETTER A, BENGALI VOWEL SIGN AA + 098B 09C3 ; # BENGALI LETTER VOCALIC R, BENGALI VOWEL SIGN VOCALIC R + 098C 09E2 ; # BENGALI LETTER VOCALIC L, BENGALI VOWEL SIGN VOCALIC L + 0A05 0A3E ; # GURMUKHI 
LETTER A, GURMUKHI VOWEL SIGN AA + 0A72 0A3F ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN I + 0A72 0A40 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN II + 0A73 0A41 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN U + 0A73 0A42 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN UU + 0A72 0A47 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN EE + 0A05 0A48 ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AI + 0A73 0A4B ; # GURMUKHI URA, GURMUKHI VOWEL SIGN OO + 0A05 0A4C ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AU + 0A85 0ABE ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA + 0A85 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA E + 0A85 0AC7 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN E + 0A85 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AI + 0A85 0AC9 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA O + 0A85 0ACB ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN O + 0A85 0ABE 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN CANDRA E + 0A85 0ACC ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AU + 0A85 0ABE 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN AI + 0AC5 0ABE ; # GUJARATI VOWEL SIGN CANDRA E, GUJARATI VOWEL SIGN AA + 0B05 0B3E ; # ORIYA LETTER A, ORIYA VOWEL SIGN AA + 0B0F 0B57 ; # ORIYA LETTER E, ORIYA AU LENGTH MARK + 0B13 0B57 ; # ORIYA LETTER O, ORIYA AU LENGTH MARK + 0C12 0C55 ; # TELUGU LETTER O, TELUGU LENGTH MARK + 0C12 0C4C ; # TELUGU LETTER O, TELUGU VOWEL SIGN AU + 0C3F 0C55 ; # TELUGU VOWEL SIGN I, TELUGU LENGTH MARK + 0C46 0C55 ; # TELUGU VOWEL SIGN E, TELUGU LENGTH MARK + 0C4A 0C55 ; # TELUGU VOWEL SIGN O, TELUGU LENGTH MARK + 0C89 0CBE ; # KANNADA LETTER U, KANNADA VOWEL SIGN AA + 0C92 0CCC ; # KANNADA LETTER O, KANNADA VOWEL SIGN AU + 0C8B 0CBE ; # KANNADA LETTER VOCALIC R, KANNADA VOWEL SIGN AA + 0D07 0D57 ; # MALAYALAM LETTER I, MALAYALAM AU LENGTH MARK + 0D09 0D57 ; # MALAYALAM LETTER U, MALAYALAM AU LENGTH MARK + 0D0E 0D46 ; # MALAYALAM LETTER E, MALAYALAM VOWEL SIGN E + 0D12 0D3E ; # MALAYALAM LETTER O, MALAYALAM VOWEL SIGN AA + 0D12 0D57 ; # 
MALAYALAM LETTER O, MALAYALAM AU LENGTH MARK + 0D85 0DCF ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN AELA-PILLA + 0D85 0DD0 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN KETTI AEDA-PILLA + 0D85 0DD1 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN DIGA AEDA-PILLA + 0D8B 0DDF ; # SINHALA LETTER UYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 0D8D 0DD8 ; # SINHALA LETTER IRUYANNA, SINHALA VOWEL SIGN GAETTA-PILLA + 0D8F 0DDF ; # SINHALA LETTER ILUYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 0D91 0DCA ; # SINHALA LETTER EYANNA, SINHALA SIGN AL-LAKUNA + 0D91 0DD9 ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA + 0D91 0DDA ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN DIGA KOMBUVA + 0D91 0DDC ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA + 0D91 0DDD ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA + 0D91 0DDD ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA + 0D94 0DDF ; # SINHALA LETTER OYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 11005 11038 ; # BRAHMI LETTER A, BRAHMI VOWEL SIGN AA + 1100B 1103E ; # BRAHMI LETTER VOCALIC R, BRAHMI VOWEL SIGN VOCALIC R + 1100F 11042 ; # BRAHMI LETTER E, BRAHMI VOWEL SIGN E + 11680 116AD ; # TAKRI LETTER A, TAKRI VOWEL SIGN AA + 11686 116B2 ; # TAKRI LETTER E, TAKRI VOWEL SIGN E + 11680 116B4 ; # TAKRI LETTER A, TAKRI VOWEL SIGN O + 11680 116B5 ; # TAKRI LETTER A, TAKRI VOWEL SIGN AU + 112B0 112E0 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AA + 112B0 112E5 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN E + 112B0 112E6 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AI + 112B0 112E7 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN O + 112B0 112E8 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AU + 11481 114B0 ; # TIRHUTA LETTER A, TIRHUTA VOWEL SIGN AA + 114AA 114B5 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC R + 114AA 114B6 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC RR + 1148B 114BA ; # TIRHUTA LETTER E, TIRHUTA VOWEL SIGN SHORT E + 1148D 114BA ; # TIRHUTA LETTER O, 
TIRHUTA VOWEL SIGN SHORT E + 11600 11639 ; # MODI LETTER A, MODI VOWEL SIGN E + 11600 1163A ; # MODI LETTER A, MODI VOWEL SIGN AI + 11601 11639 ; # MODI LETTER AA, MODI VOWEL SIGN E + 11601 1163A ; # MODI LETTER AA, MODI VOWEL SIGN AI diff --git a/src/Makefile.am b/src/Makefile.am index 782992d1c..ac03890b3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -317,9 +317,9 @@ use-table: gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.tx $(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-use-table.cc \ || ($(RM) $(srcdir)/hb-ot-shape-complex-use-table.cc; false) -vowel-constraints: gen-vowel-constraints.py use Scripts.txt - $(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-vowel-constraints.hh \ - || ($(RM) $(srcdir)/hb-ot-shape-complex-vowel-constraints.hh; false) +vowel-constraints: gen-vowel-constraints.py HBIndicVowelConstraints.txt Scripts.txt + $(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-ot-shape-complex-vowel-constraints.cc \ + || ($(RM) $(srcdir)/hb-ot-shape-complex-vowel-constraints.cc; false) emoji-table: gen-emoji-table.py emoji-data.txt $(AM_V_GEN) $(builddir)/$^ > $(srcdir)/hb-unicode-emoji-table.hh \ diff --git a/src/Makefile.sources b/src/Makefile.sources index b30291054..72968aee9 100644 --- a/src/Makefile.sources +++ b/src/Makefile.sources @@ -142,6 +142,7 @@ HB_OT_sources = \ hb-ot-shape-complex-use.cc \ hb-ot-shape-complex-use.hh \ hb-ot-shape-complex-use-table.cc \ + hb-ot-shape-complex-vowel-constraints.cc \ hb-ot-shape-complex-vowel-constraints.hh \ hb-ot-shape-complex.hh \ hb-ot-shape-normalize.hh \ diff --git a/src/gen-vowel-constraints.py b/src/gen-vowel-constraints.py index bcb5d27bd..19629abeb 100755 --- a/src/gen-vowel-constraints.py +++ b/src/gen-vowel-constraints.py @@ -2,12 +2,10 @@ """Generator of the function to prohibit certain vowel sequences. 
-It creates ``preprocess_text_vowel_constraints``, which inserts dotted +It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted circles into sequences prohibited by the USE script development spec. This function should be used as the ``preprocess_text`` of an ``hb_ot_complex_shaper_t``. - -It also creates the helper function ``_output_with_dotted_circle``. """ from __future__ import absolute_import, division, print_function, unicode_literals @@ -27,23 +25,9 @@ import io import sys if len (sys.argv) != 3: - print ('usage: ./gen-vowel-constraints.py use Scripts.txt', file=sys.stderr) + print ('usage: ./gen-vowel-constraints.py HBIndicVowelConstraints.txt Scripts.txt', file=sys.stderr) sys.exit (1) -try: - from html import unescape - def html_unescape (parser, entity): - return unescape (entity) -except ImportError: - def html_unescape (parser, entity): - return parser.unescape (entity) - -def expect (condition, message=None): - if not condition: - if message is None: - raise AssertionError - raise AssertionError (message) - with io.open (sys.argv[2], encoding='utf-8') as f: scripts_header = [f.readline () for i in range (2)] scripts = {} @@ -142,74 +126,22 @@ class ConstraintSet (object): s.append ('{}}}\n'.format (indent)) return ''.join (s) -class USESpecParser (HTMLParser): - """A parser for the USE script development spec. - - Attributes: - header (str): The ``updated_at`` timestamp of the spec. - constraints (Mapping[str, ConstraintSet]): A map of script names - to the scripts' prohibited sequences. - """ - def __init__ (self): - HTMLParser.__init__ (self) - self.header = '' - self.constraints = {} - # Whether the next contains the vowel constraints. - self._primed = False - # Whether the parser is in the element with the constraints. - self._in_constraints = False - # The text of the constraints. 
- self._constraints = '' - - def handle_starttag (self, tag, attrs): - if tag == 'meta': - for attr, value in attrs: - if attr == 'name' and value == 'updated_at': - self.header = self.get_starttag_text () - break - elif tag == 'a': - for attr, value in attrs: - if attr == 'id' and value == 'ivdvconstraints': - self._primed = True - break - elif self._primed and tag == 'code': - self._primed = False - self._in_constraints = True - - def handle_endtag (self, tag): - self._in_constraints = False - - def handle_data (self, data): - if self._in_constraints: - self._constraints += data - - def handle_charref (self, name): - self.handle_data (html_unescape (self, '&#%s;' % name)) - - def handle_entityref (self, name): - self.handle_data (html_unescape (self, '&%s;' % name)) - - def parse (self, filename): - """Parse the USE script development spec. - - Args: - filename (str): The file name of the spec. - """ - with io.open (filename, encoding='utf-8') as f: - self.feed (f.read ()) - expect (self.header, 'No header found') - for line in self._constraints.splitlines (): - constraint = [int (cp, 16) for cp in line.split (';')[0].strip ().split (' ')] - expect (2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)) - script = scripts[constraint[0]] - if script in self.constraints: - self.constraints[script].add (constraint) - else: - self.constraints[script] = ConstraintSet (constraint) - expect (self.constraints, 'No constraints found') - -use_parser = USESpecParser () -use_parser.parse (sys.argv[1]) +constraints = {} +with io.open (sys.argv[1], encoding='utf-8') as f: + constraints_header = [f.readline ().strip () for i in range (2)] + for line in f: + j = line.find ('#') + if j >= 0: + line = line[:j] + constraint = [int (cp, 16) for cp in line.split (';')[0].split ()] + if not constraint: continue + assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint) + script = scripts[constraint[0]] + if script in 
constraints: + constraints[script].add (constraint) + else: + constraints[script] = ConstraintSet (constraint) + assert constraints, 'No constraints found' print ('/* == Start of generated functions == */') print ('/*') @@ -219,15 +151,15 @@ print (' * %s use Scripts.txt' % sys.argv[0]) print (' *') print (' * on files with these headers:') print (' *') -print (' * %s' % use_parser.header.strip ()) +for line in constraints_header: + print (' * %s' % line.strip ()) +print (' *') for line in scripts_header: print (' * %s' % line.strip ()) print (' */') print () -print ('#ifndef HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH') -print ('#define HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH') +print ('#include "hb-ot-shape-complex-vowel-constraints.hh"') print () - print ('static void') print ('_output_with_dotted_circle (hb_buffer_t *buffer)') print ('{') @@ -238,10 +170,10 @@ print (' buffer->next_glyph ();') print ('}') print () -print ('static void') -print ('preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,') -print ('\t\t\t\t hb_buffer_t *buffer,') -print ('\t\t\t\t hb_font_t *font)') +print ('void') +print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan,') +print ('\t\t\t\t hb_buffer_t *buffer,') +print ('\t\t\t\t hb_font_t *font)') print ('{') print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of') print (' * vowel-sequences that look like another vowel. 
Data for each script') @@ -255,7 +187,7 @@ print (' unsigned int count = buffer->len;') print (' switch ((unsigned) buffer->props.script)') print (' {') -for script, constraints in sorted (use_parser.constraints.items (), key=lambda s_c: script_order[s_c[0]]): +for script, constraints in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]): print (' case HB_SCRIPT_{}:'.format (script.upper ())) print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)') print (' {') @@ -280,7 +212,5 @@ print (' buffer->swap_buffers ();') print (' }') print ('}') -print () -print ('#endif /* HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH */') print () print ('/* == End of generated functions == */') diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc index 092ac6846..3babbfec6 100644 --- a/src/hb-ot-shape-complex-indic.cc +++ b/src/hb-ot-shape-complex-indic.cc @@ -1517,6 +1517,14 @@ clear_syllables (const hb_ot_shape_plan_t *plan HB_UNUSED, } +static void +preprocess_text_indic (const hb_ot_shape_plan_t *plan, + hb_buffer_t *buffer, + hb_font_t *font) +{ + _hb_preprocess_text_vowel_constraints (plan, buffer, font); +} + static bool decompose_indic (const hb_ot_shape_normalize_context_t *c, hb_codepoint_t ab, @@ -1616,7 +1624,7 @@ const hb_ot_complex_shaper_t _hb_ot_complex_shaper_indic = override_features_indic, data_create_indic, data_destroy_indic, - preprocess_text_vowel_constraints, + preprocess_text_indic, nullptr, /* postprocess_glyphs */ HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT, decompose_indic, diff --git a/src/hb-ot-shape-complex-use.cc b/src/hb-ot-shape-complex-use.cc index 8c44fe016..addfb87ba 100644 --- a/src/hb-ot-shape-complex-use.cc +++ b/src/hb-ot-shape-complex-use.cc @@ -572,6 +572,15 @@ reorder (const hb_ot_shape_plan_t *plan, HB_BUFFER_DEALLOCATE_VAR (buffer, use_category); } + +static void +preprocess_text_use (const hb_ot_shape_plan_t *plan, + hb_buffer_t *buffer, + hb_font_t 
*font) +{ + _hb_preprocess_text_vowel_constraints (plan, buffer, font); +} + static bool compose_use (const hb_ot_shape_normalize_context_t *c, hb_codepoint_t a, @@ -592,7 +601,7 @@ const hb_ot_complex_shaper_t _hb_ot_complex_shaper_use = nullptr, /* override_features */ data_create_use, data_destroy_use, - preprocess_text_vowel_constraints, + preprocess_text_use, nullptr, /* postprocess_glyphs */ HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT, nullptr, /* decompose */ diff --git a/src/hb-ot-shape-complex-vowel-constraints.cc b/src/hb-ot-shape-complex-vowel-constraints.cc new file mode 100644 index 000000000..e50233924 --- /dev/null +++ b/src/hb-ot-shape-complex-vowel-constraints.cc @@ -0,0 +1,433 @@ +/* == Start of generated functions == */ +/* + * The following functions are generated by running: + * + * ./gen-vowel-constraints.py use Scripts.txt + * + * on files with these headers: + * + * # Copied from https://docs.microsoft.com/en-us/typography/script-development/use + * # On October 23, 2018; with documentd dated 02/07/2018. + * + * # Scripts-11.0.0.txt + * # Date: 2018-02-21, 05:34:31 GMT + */ + +#include "hb-ot-shape-complex-vowel-constraints.hh" + +static void +_output_with_dotted_circle (hb_buffer_t *buffer) +{ + hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu); + _hb_glyph_info_reset_continuation (&dottedcircle); + + buffer->next_glyph (); +} + +void +_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan, + hb_buffer_t *buffer, + hb_font_t *font) +{ + /* UGLY UGLY UGLY business of adding dotted-circle in the middle of + * vowel-sequences that look like another vowel. Data for each script + * collected from the USE script development spec. 
+ * + * https://github.com/harfbuzz/harfbuzz/issues/1019 + */ + bool processed = false; + buffer->clear_output (); + unsigned int count = buffer->len; + switch ((unsigned) buffer->props.script) + { + case HB_SCRIPT_DEVANAGARI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0905u: + switch (buffer->cur (1).codepoint) + { + case 0x093Au: case 0x093Bu: case 0x093Eu: case 0x0945u: + case 0x0946u: case 0x0949u: case 0x094Au: case 0x094Bu: + case 0x094Cu: case 0x094Fu: case 0x0956u: case 0x0957u: + matched = true; + break; + } + break; + case 0x0906u: + switch (buffer->cur (1).codepoint) + { + case 0x093Au: case 0x0945u: case 0x0946u: case 0x0947u: + case 0x0948u: + matched = true; + break; + } + break; + case 0x0909u: + matched = 0x0941u == buffer->cur (1).codepoint; + break; + case 0x090Fu: + switch (buffer->cur (1).codepoint) + { + case 0x0945u: case 0x0946u: case 0x0947u: + matched = true; + break; + } + break; + case 0x0930u: + if (0x094Du == buffer->cur (1).codepoint && + buffer->idx + 2 < count && + 0x0907u == buffer->cur (2).codepoint) + { + buffer->next_glyph (); + buffer->next_glyph (); + buffer->output_glyph (0x25CCu); + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_BENGALI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0985u: + matched = 0x09BEu == buffer->cur (1).codepoint; + break; + case 0x098Bu: + matched = 0x09C3u == buffer->cur (1).codepoint; + break; + case 0x098Cu: + matched = 0x09E2u == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_GURMUKHI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = 
false; + switch (buffer->cur ().codepoint) + { + case 0x0A05u: + switch (buffer->cur (1).codepoint) + { + case 0x0A3Eu: case 0x0A48u: case 0x0A4Cu: + matched = true; + break; + } + break; + case 0x0A72u: + switch (buffer->cur (1).codepoint) + { + case 0x0A3Fu: case 0x0A40u: case 0x0A47u: + matched = true; + break; + } + break; + case 0x0A73u: + switch (buffer->cur (1).codepoint) + { + case 0x0A41u: case 0x0A42u: case 0x0A4Bu: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_GUJARATI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0A85u: + switch (buffer->cur (1).codepoint) + { + case 0x0ABEu: case 0x0AC5u: case 0x0AC7u: case 0x0AC8u: + case 0x0AC9u: case 0x0ACBu: case 0x0ACCu: + matched = true; + break; + } + break; + case 0x0AC5u: + matched = 0x0ABEu == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_ORIYA: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0B05u: + matched = 0x0B3Eu == buffer->cur (1).codepoint; + break; + case 0x0B0Fu: case 0x0B13u: + matched = 0x0B57u == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_TELUGU: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0C12u: + switch (buffer->cur (1).codepoint) + { + case 0x0C4Cu: case 0x0C55u: + matched = true; + break; + } + break; + case 0x0C3Fu: case 0x0C46u: case 0x0C4Au: + matched = 0x0C55u == buffer->cur (1).codepoint; + break; + } + 
buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_KANNADA: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0C89u: case 0x0C8Bu: + matched = 0x0CBEu == buffer->cur (1).codepoint; + break; + case 0x0C92u: + matched = 0x0CCCu == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_MALAYALAM: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0D07u: case 0x0D09u: + matched = 0x0D57u == buffer->cur (1).codepoint; + break; + case 0x0D0Eu: + matched = 0x0D46u == buffer->cur (1).codepoint; + break; + case 0x0D12u: + switch (buffer->cur (1).codepoint) + { + case 0x0D3Eu: case 0x0D57u: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_SINHALA: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x0D85u: + switch (buffer->cur (1).codepoint) + { + case 0x0DCFu: case 0x0DD0u: case 0x0DD1u: + matched = true; + break; + } + break; + case 0x0D8Bu: case 0x0D8Fu: case 0x0D94u: + matched = 0x0DDFu == buffer->cur (1).codepoint; + break; + case 0x0D8Du: + matched = 0x0DD8u == buffer->cur (1).codepoint; + break; + case 0x0D91u: + switch (buffer->cur (1).codepoint) + { + case 0x0DCAu: case 0x0DD9u: case 0x0DDAu: case 0x0DDCu: + case 0x0DDDu: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_BRAHMI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) 
+ { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x11005u: + matched = 0x11038u == buffer->cur (1).codepoint; + break; + case 0x1100Bu: + matched = 0x1103Eu == buffer->cur (1).codepoint; + break; + case 0x1100Fu: + matched = 0x11042u == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_KHUDAWADI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x112B0u: + switch (buffer->cur (1).codepoint) + { + case 0x112E0u: case 0x112E5u: case 0x112E6u: case 0x112E7u: + case 0x112E8u: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_TIRHUTA: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x11481u: + matched = 0x114B0u == buffer->cur (1).codepoint; + break; + case 0x1148Bu: case 0x1148Du: + matched = 0x114BAu == buffer->cur (1).codepoint; + break; + case 0x114AAu: + switch (buffer->cur (1).codepoint) + { + case 0x114B5u: case 0x114B6u: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_MODI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x11600u: case 0x11601u: + switch (buffer->cur (1).codepoint) + { + case 0x11639u: case 0x1163Au: + matched = true; + break; + } + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + case HB_SCRIPT_TAKRI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = 
false; + switch (buffer->cur ().codepoint) + { + case 0x11680u: + switch (buffer->cur (1).codepoint) + { + case 0x116ADu: case 0x116B4u: case 0x116B5u: + matched = true; + break; + } + break; + case 0x11686u: + matched = 0x116B2u == buffer->cur (1).codepoint; + break; + } + buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + processed = true; + break; + + default: + break; + } + if (processed) + { + if (buffer->idx < count) + buffer->next_glyph (); + if (likely (buffer->successful)) + buffer->swap_buffers (); + } +} + +/* == End of generated functions == */ diff --git a/src/hb-ot-shape-complex-vowel-constraints.hh b/src/hb-ot-shape-complex-vowel-constraints.hh index 1b07c2f40..d9082d4ea 100644 --- a/src/hb-ot-shape-complex-vowel-constraints.hh +++ b/src/hb-ot-shape-complex-vowel-constraints.hh @@ -1,434 +1,39 @@ -/* == Start of generated functions == */ /* - * The following functions are generated by running: + * Copyright © 2018 Google, Inc. * - * ./gen-vowel-constraints.py use Scripts.txt + * This is part of HarfBuzz, a text shaping library. * - * on files with these headers: + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. * - * - * # Scripts-11.0.0.txt - * # Date: 2018-02-21, 05:34:31 GMT + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + * + * Google Author(s): Behdad Esfahbod */ #ifndef HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH #define HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH -static void -_output_with_dotted_circle (hb_buffer_t *buffer) -{ - hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu); - _hb_glyph_info_reset_continuation (&dottedcircle); +#include "hb.hh" - buffer->next_glyph (); -} +#include "hb-ot-shape-complex.hh" -static void -preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan, - hb_buffer_t *buffer, - hb_font_t *font) -{ - /* UGLY UGLY UGLY business of adding dotted-circle in the middle of - * vowel-sequences that look like another vowel. Data for each script - * collected from the USE script development spec. - * - * https://github.com/harfbuzz/harfbuzz/issues/1019 - */ - bool processed = false; - buffer->clear_output (); - unsigned int count = buffer->len; - switch ((unsigned) buffer->props.script) - { - case HB_SCRIPT_DEVANAGARI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0905u: - switch (buffer->cur (1).codepoint) - { - case 0x093Au: case 0x093Bu: case 0x093Eu: case 0x0945u: - case 0x0946u: case 0x0949u: case 0x094Au: case 0x094Bu: - case 0x094Cu: case 0x094Fu: case 0x0956u: case 0x0957u: - matched = true; - break; - } - break; - case 0x0906u: - switch (buffer->cur (1).codepoint) - { - case 0x093Au: case 0x0945u: case 0x0946u: case 0x0947u: - case 0x0948u: - matched = true; - break; - } - break; - case 0x0909u: - matched = 0x0941u == buffer->cur (1).codepoint; - break; - case 0x090Fu: - switch (buffer->cur (1).codepoint) - { - case 0x0945u: case 0x0946u: case 0x0947u: - matched = true; - break; - } - break; - case 0x0930u: - if (0x094Du == buffer->cur (1).codepoint && - 
buffer->idx + 2 < count && - 0x0907u == buffer->cur (2).codepoint) - { - buffer->next_glyph (); - buffer->next_glyph (); - buffer->output_glyph (0x25CCu); - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_BENGALI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0985u: - matched = 0x09BEu == buffer->cur (1).codepoint; - break; - case 0x098Bu: - matched = 0x09C3u == buffer->cur (1).codepoint; - break; - case 0x098Cu: - matched = 0x09E2u == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_GURMUKHI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0A05u: - switch (buffer->cur (1).codepoint) - { - case 0x0A3Eu: case 0x0A48u: case 0x0A4Cu: - matched = true; - break; - } - break; - case 0x0A72u: - switch (buffer->cur (1).codepoint) - { - case 0x0A3Fu: case 0x0A40u: case 0x0A47u: - matched = true; - break; - } - break; - case 0x0A73u: - switch (buffer->cur (1).codepoint) - { - case 0x0A41u: case 0x0A42u: case 0x0A4Bu: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_GUJARATI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0A85u: - switch (buffer->cur (1).codepoint) - { - case 0x0ABEu: case 0x0AC5u: case 0x0AC7u: case 0x0AC8u: - case 0x0AC9u: case 0x0ACBu: case 0x0ACCu: - matched = true; - break; - } - break; - case 0x0AC5u: - matched = 0x0ABEu == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) 
_output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_ORIYA: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0B05u: - matched = 0x0B3Eu == buffer->cur (1).codepoint; - break; - case 0x0B0Fu: case 0x0B13u: - matched = 0x0B57u == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_TELUGU: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0C12u: - switch (buffer->cur (1).codepoint) - { - case 0x0C4Cu: case 0x0C55u: - matched = true; - break; - } - break; - case 0x0C3Fu: case 0x0C46u: case 0x0C4Au: - matched = 0x0C55u == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_KANNADA: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0C89u: case 0x0C8Bu: - matched = 0x0CBEu == buffer->cur (1).codepoint; - break; - case 0x0C92u: - matched = 0x0CCCu == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_MALAYALAM: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0D07u: case 0x0D09u: - matched = 0x0D57u == buffer->cur (1).codepoint; - break; - case 0x0D0Eu: - matched = 0x0D46u == buffer->cur (1).codepoint; - break; - case 0x0D12u: - switch (buffer->cur (1).codepoint) - { - case 0x0D3Eu: case 0x0D57u: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) 
_output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_SINHALA: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x0D85u: - switch (buffer->cur (1).codepoint) - { - case 0x0DCFu: case 0x0DD0u: case 0x0DD1u: - matched = true; - break; - } - break; - case 0x0D8Bu: case 0x0D8Fu: case 0x0D94u: - matched = 0x0DDFu == buffer->cur (1).codepoint; - break; - case 0x0D8Du: - matched = 0x0DD8u == buffer->cur (1).codepoint; - break; - case 0x0D91u: - switch (buffer->cur (1).codepoint) - { - case 0x0DCAu: case 0x0DD9u: case 0x0DDAu: case 0x0DDCu: - case 0x0DDDu: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_BRAHMI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x11005u: - matched = 0x11038u == buffer->cur (1).codepoint; - break; - case 0x1100Bu: - matched = 0x1103Eu == buffer->cur (1).codepoint; - break; - case 0x1100Fu: - matched = 0x11042u == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_KHUDAWADI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x112B0u: - switch (buffer->cur (1).codepoint) - { - case 0x112E0u: case 0x112E5u: case 0x112E6u: case 0x112E7u: - case 0x112E8u: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_TIRHUTA: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x11481u: 
- matched = 0x114B0u == buffer->cur (1).codepoint; - break; - case 0x1148Bu: case 0x1148Du: - matched = 0x114BAu == buffer->cur (1).codepoint; - break; - case 0x114AAu: - switch (buffer->cur (1).codepoint) - { - case 0x114B5u: case 0x114B6u: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_MODI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x11600u: case 0x11601u: - switch (buffer->cur (1).codepoint) - { - case 0x11639u: case 0x1163Au: - matched = true; - break; - } - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - case HB_SCRIPT_TAKRI: - for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) - { - bool matched = false; - switch (buffer->cur ().codepoint) - { - case 0x11680u: - switch (buffer->cur (1).codepoint) - { - case 0x116ADu: case 0x116B4u: case 0x116B5u: - matched = true; - break; - } - break; - case 0x11686u: - matched = 0x116B2u == buffer->cur (1).codepoint; - break; - } - buffer->next_glyph (); - if (matched) _output_with_dotted_circle (buffer); - } - processed = true; - break; - - default: - break; - } - if (processed) - { - if (buffer->idx < count) - buffer->next_glyph (); - if (likely (buffer->successful)) - buffer->swap_buffers (); - } -} +HB_INTERNAL void +_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan, + hb_buffer_t *buffer, + hb_font_t *font); #endif /* HB_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS_HH */ - -/* == End of generated functions == */