From 397ed53e55b9450742867a43d164b498ec735f50 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Tue, 13 Feb 2018 18:54:26 -0800 Subject: [PATCH] [indic] Add print-indic-table --- src/Makefile.am | 14 +- src/hb-ot-shape-complex-indic-private.hh | 185 ++++++++++++++++++++++ src/hb-ot-shape-complex-indic.cc | 187 ----------------------- src/print-indic-table.cc | 43 ++++++ 4 files changed, 239 insertions(+), 190 deletions(-) create mode 100644 src/print-indic-table.cc diff --git a/src/Makefile.am b/src/Makefile.am index ee8e1d561..7616ac6e1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -9,6 +9,8 @@ CLEANFILES = DISTCLEANFILES = MAINTAINERCLEANFILES = DISTCHECK_CONFIGURE_FLAGS = --enable-introspection +TESTS = +check_PROGRAMS = # The following warning options are useful for debugging: -Wpadded #AM_CXXFLAGS = @@ -341,6 +343,7 @@ dist_check_SCRIPTS = \ check-static-inits.sh \ check-symbols.sh \ $(NULL) +TESTS += $(dist_check_SCRIPTS) if !WITH_LIBSTDCXX dist_check_SCRIPTS += \ @@ -348,14 +351,19 @@ dist_check_SCRIPTS += \ $(NULL) endif -check_PROGRAMS = \ - test-ot-tag \ +check_PROGRAMS += \ + print-indic-table \ $(NULL) +print_indic_table_SOURCES = print-indic-table.cc hb-ot-shape-complex-indic-table.cc +print_indic_table_CPPFLAGS = $(HBCFLAGS) -DMAIN +print_indic_table_LDADD = libharfbuzz.la $(HBLIBS) + +check_PROGRAMS += test-ot-tag +TESTS += test-ot-tag test_ot_tag_SOURCES = hb-ot-tag.cc test_ot_tag_CPPFLAGS = $(HBCFLAGS) -DMAIN test_ot_tag_LDADD = libharfbuzz.la $(HBLIBS) -TESTS = $(dist_check_SCRIPTS) $(check_PROGRAMS) TESTS_ENVIRONMENT = \ srcdir="$(srcdir)" \ MAKE="$(MAKE) $(AM_MAKEFLAGS)" \ diff --git a/src/hb-ot-shape-complex-indic-private.hh b/src/hb-ot-shape-complex-indic-private.hh index a64e9d039..4d700f74e 100644 --- a/src/hb-ot-shape-complex-indic-private.hh +++ b/src/hb-ot-shape-complex-indic-private.hh @@ -186,4 +186,189 @@ enum indic_matra_category_t { HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE hb_indic_get_categories (hb_codepoint_t u); +/* buffer var allocations */ +#define indic_category() complex_var_u8_0() /* indic_category_t */ +#define indic_position() complex_var_u8_1() /* indic_position_t */ + + +#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7Fu) == (Base)) + +#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900u)) +#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980u)) +#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00u)) +#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80u)) +#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00u)) +#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80u)) +#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00u)) +#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80u)) +#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00u)) +#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80u)) + + +#define MATRA_POS_LEFT(u) POS_PRE_M +#define MATRA_POS_RIGHT(u) ( \ + IS_DEVA(u) ? POS_AFTER_SUB : \ + IS_BENG(u) ? POS_AFTER_POST : \ + IS_GURU(u) ? POS_AFTER_POST : \ + IS_GUJR(u) ? POS_AFTER_POST : \ + IS_ORYA(u) ? POS_AFTER_POST : \ + IS_TAML(u) ? POS_AFTER_POST : \ + IS_TELU(u) ? (u <= 0x0C42u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \ + IS_KNDA(u) ? (u < 0x0CC3u || u > 0xCD6u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \ + IS_MLYM(u) ? POS_AFTER_POST : \ + IS_SINH(u) ? POS_AFTER_SUB : \ + /*default*/ POS_AFTER_SUB \ + ) +#define MATRA_POS_TOP(u) ( /* BENG and MLYM don't have top matras. */ \ + IS_DEVA(u) ? POS_AFTER_SUB : \ + IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \ + IS_GUJR(u) ? POS_AFTER_SUB : \ + IS_ORYA(u) ? POS_AFTER_MAIN : \ + IS_TAML(u) ? POS_AFTER_SUB : \ + IS_TELU(u) ? POS_BEFORE_SUB : \ + IS_KNDA(u) ? POS_BEFORE_SUB : \ + IS_SINH(u) ? POS_AFTER_SUB : \ + /*default*/ POS_AFTER_SUB \ + ) +#define MATRA_POS_BOTTOM(u) ( \ + IS_DEVA(u) ? POS_AFTER_SUB : \ + IS_BENG(u) ? POS_AFTER_SUB : \ + IS_GURU(u) ? POS_AFTER_POST : \ + IS_GUJR(u) ? POS_AFTER_POST : \ + IS_ORYA(u) ? POS_AFTER_SUB : \ + IS_TAML(u) ? POS_AFTER_POST : \ + IS_TELU(u) ? POS_BEFORE_SUB : \ + IS_KNDA(u) ? POS_BEFORE_SUB : \ + IS_MLYM(u) ? POS_AFTER_POST : \ + IS_SINH(u) ? POS_AFTER_SUB : \ + /*default*/ POS_AFTER_SUB \ + ) + +static inline indic_position_t +matra_position (hb_codepoint_t u, indic_position_t side) +{ + switch ((int) side) + { + case POS_PRE_C: return MATRA_POS_LEFT (u); + case POS_POST_C: return MATRA_POS_RIGHT (u); + case POS_ABOVE_C: return MATRA_POS_TOP (u); + case POS_BELOW_C: return MATRA_POS_BOTTOM (u); + }; + return side; +} + +/* XXX + * This is a hack for now. We should move this data into the main Indic table. + * Or completely remove it and just check in the tables. + */ +static const hb_codepoint_t ra_chars[] = { + 0x0930u, /* Devanagari */ + 0x09B0u, /* Bengali */ + 0x09F0u, /* Bengali */ + 0x0A30u, /* Gurmukhi */ /* No Reph */ + 0x0AB0u, /* Gujarati */ + 0x0B30u, /* Oriya */ + 0x0BB0u, /* Tamil */ /* No Reph */ + 0x0C30u, /* Telugu */ /* Reph formed only with ZWJ */ + 0x0CB0u, /* Kannada */ + 0x0D30u, /* Malayalam */ /* No Reph, Logical Repha */ + + 0x0DBBu, /* Sinhala */ /* Reph formed only with ZWJ */ +}; + +static inline bool +is_ra (hb_codepoint_t u) +{ + for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++) + if (u == ra_chars[i]) + return true; + return false; +} + +static inline void +set_indic_properties (hb_glyph_info_t &info) +{ + hb_codepoint_t u = info.codepoint; + unsigned int type = hb_indic_get_categories (u); + indic_category_t cat = (indic_category_t) (type & 0x7Fu); + indic_position_t pos = (indic_position_t) (type >> 8); + + + /* + * Re-assign category + */ + + /* The following act more like the Bindus. */ + if (unlikely (hb_in_range (u, 0x0953u, 0x0954u))) + cat = OT_SM; + /* The following act like consonants. */ + else if (unlikely (hb_in_ranges (u, 0x0A72u, 0x0A73u, + 0x1CF5u, 0x1CF6u))) + cat = OT_C; + /* TODO: The following should only be allowed after a Visarga. + * For now, just treat them like regular tone marks. */ + else if (unlikely (hb_in_range (u, 0x1CE2u, 0x1CE8u))) + cat = OT_A; + /* TODO: The following should only be allowed after some of + * the nasalization marks, maybe only for U+1CE9..U+1CF1. + * For now, just treat them like tone marks. */ + else if (unlikely (u == 0x1CEDu)) + cat = OT_A; + /* The following take marks in standalone clusters, similar to Avagraha. */ + else if (unlikely (hb_in_ranges (u, 0xA8F2u, 0xA8F7u, + 0x1CE9u, 0x1CECu, + 0x1CEEu, 0x1CF1u))) + { + cat = OT_Symbol; + static_assert (((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol), ""); + } + else if (unlikely (u == 0x0A51u)) + { + /* https://github.com/harfbuzz/harfbuzz/issues/524 */ + cat = OT_M; + pos = POS_BELOW_C; + } + + /* According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil, + * so the Indic shaper needs to know their categories. */ + else if (unlikely (u == 0x11301u || u == 0x11303u)) cat = OT_SM; + else if (unlikely (u == 0x1133cu)) cat = OT_N; + + else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */ + + else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */ + else if (unlikely (u == 0x0C80u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/623 */ + else if (unlikely (hb_in_range (u, 0x2010u, 0x2011u))) + cat = OT_PLACEHOLDER; + else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE; + + + /* + * Re-assign position. + */ + + if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS)) + { + pos = POS_BASE_C; + if (is_ra (u)) + cat = OT_Ra; + } + else if (cat == OT_M) + { + pos = matra_position (u, pos); + } + else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) /* | FLAG (OT_VD) */ | FLAG (OT_A) | FLAG (OT_Symbol)))) + { + pos = POS_SMVD; + } + + if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */ + + + + info.indic_category() = cat; + info.indic_position() = pos; +} + + #endif /* HB_OT_SHAPE_COMPLEX_INDIC_PRIVATE_HH */ diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc index 9d150e0d3..5594f8bde 100644 --- a/src/hb-ot-shape-complex-indic.cc +++ b/src/hb-ot-shape-complex-indic.cc @@ -27,110 +27,12 @@ #include "hb-ot-shape-complex-indic-private.hh" #include "hb-ot-layout-private.hh" -/* buffer var allocations */ -#define indic_category() complex_var_u8_0() /* indic_category_t */ -#define indic_position() complex_var_u8_1() /* indic_position_t */ - /* * Indic shaper. */ -#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7Fu) == (Base)) - -#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900u)) -#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980u)) -#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00u)) -#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80u)) -#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00u)) -#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80u)) -#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00u)) -#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80u)) -#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00u)) -#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80u)) - - -#define MATRA_POS_LEFT(u) POS_PRE_M -#define MATRA_POS_RIGHT(u) ( \ - IS_DEVA(u) ? POS_AFTER_SUB : \ - IS_BENG(u) ? POS_AFTER_POST : \ - IS_GURU(u) ? POS_AFTER_POST : \ - IS_GUJR(u) ? POS_AFTER_POST : \ - IS_ORYA(u) ? POS_AFTER_POST : \ - IS_TAML(u) ? POS_AFTER_POST : \ - IS_TELU(u) ? (u <= 0x0C42u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \ - IS_KNDA(u) ? (u < 0x0CC3u || u > 0xCD6u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \ - IS_MLYM(u) ? POS_AFTER_POST : \ - IS_SINH(u) ? POS_AFTER_SUB : \ - /*default*/ POS_AFTER_SUB \ - ) -#define MATRA_POS_TOP(u) ( /* BENG and MLYM don't have top matras. */ \ - IS_DEVA(u) ? POS_AFTER_SUB : \ - IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \ - IS_GUJR(u) ? POS_AFTER_SUB : \ - IS_ORYA(u) ? POS_AFTER_MAIN : \ - IS_TAML(u) ? POS_AFTER_SUB : \ - IS_TELU(u) ? POS_BEFORE_SUB : \ - IS_KNDA(u) ? POS_BEFORE_SUB : \ - IS_SINH(u) ? POS_AFTER_SUB : \ - /*default*/ POS_AFTER_SUB \ - ) -#define MATRA_POS_BOTTOM(u) ( \ - IS_DEVA(u) ? POS_AFTER_SUB : \ - IS_BENG(u) ? POS_AFTER_SUB : \ - IS_GURU(u) ? POS_AFTER_POST : \ - IS_GUJR(u) ? POS_AFTER_POST : \ - IS_ORYA(u) ? POS_AFTER_SUB : \ - IS_TAML(u) ? POS_AFTER_POST : \ - IS_TELU(u) ? POS_BEFORE_SUB : \ - IS_KNDA(u) ? POS_BEFORE_SUB : \ - IS_MLYM(u) ? POS_AFTER_POST : \ - IS_SINH(u) ? POS_AFTER_SUB : \ - /*default*/ POS_AFTER_SUB \ - ) - -static inline indic_position_t -matra_position (hb_codepoint_t u, indic_position_t side) -{ - switch ((int) side) - { - case POS_PRE_C: return MATRA_POS_LEFT (u); - case POS_POST_C: return MATRA_POS_RIGHT (u); - case POS_ABOVE_C: return MATRA_POS_TOP (u); - case POS_BELOW_C: return MATRA_POS_BOTTOM (u); - }; - return side; -} - -/* XXX - * This is a hack for now. We should move this data into the main Indic table. - * Or completely remove it and just check in the tables. - */ -static const hb_codepoint_t ra_chars[] = { - 0x0930u, /* Devanagari */ - 0x09B0u, /* Bengali */ - 0x09F0u, /* Bengali */ - 0x0A30u, /* Gurmukhi */ /* No Reph */ - 0x0AB0u, /* Gujarati */ - 0x0B30u, /* Oriya */ - 0x0BB0u, /* Tamil */ /* No Reph */ - 0x0C30u, /* Telugu */ /* Reph formed only with ZWJ */ - 0x0CB0u, /* Kannada */ - 0x0D30u, /* Malayalam */ /* No Reph, Logical Repha */ - - 0x0DBBu, /* Sinhala */ /* Reph formed only with ZWJ */ -}; - -static inline bool -is_ra (hb_codepoint_t u) -{ - for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++) - if (u == ra_chars[i]) - return true; - return false; -} - static inline bool is_one_of (const hb_glyph_info_t &info, unsigned int flags) { @@ -157,95 +59,6 @@ is_halant (const hb_glyph_info_t &info) return is_one_of (info, FLAG (OT_H)); } -static inline void -set_indic_properties (hb_glyph_info_t &info) -{ - hb_codepoint_t u = info.codepoint; - unsigned int type = hb_indic_get_categories (u); - indic_category_t cat = (indic_category_t) (type & 0x7Fu); - indic_position_t pos = (indic_position_t) (type >> 8); - - - /* - * Re-assign category - */ - - /* The following act more like the Bindus. */ - if (unlikely (hb_in_range (u, 0x0953u, 0x0954u))) - cat = OT_SM; - /* The following act like consonants. */ - else if (unlikely (hb_in_ranges (u, 0x0A72u, 0x0A73u, - 0x1CF5u, 0x1CF6u))) - cat = OT_C; - /* TODO: The following should only be allowed after a Visarga. - * For now, just treat them like regular tone marks. */ - else if (unlikely (hb_in_range (u, 0x1CE2u, 0x1CE8u))) - cat = OT_A; - /* TODO: The following should only be allowed after some of - * the nasalization marks, maybe only for U+1CE9..U+1CF1. - * For now, just treat them like tone marks. */ - else if (unlikely (u == 0x1CEDu)) - cat = OT_A; - /* The following take marks in standalone clusters, similar to Avagraha. */ - else if (unlikely (hb_in_ranges (u, 0xA8F2u, 0xA8F7u, - 0x1CE9u, 0x1CECu, - 0x1CEEu, 0x1CF1u))) - { - cat = OT_Symbol; - static_assert (((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol), ""); - } - else if (unlikely (u == 0x0A51u)) - { - /* https://github.com/harfbuzz/harfbuzz/issues/524 */ - cat = OT_M; - pos = POS_BELOW_C; - } - - /* According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil, - * so the Indic shaper needs to know their categories. */ - else if (unlikely (u == 0x11301u || u == 0x11303u)) cat = OT_SM; - else if (unlikely (u == 0x1133cu)) cat = OT_N; - - else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */ - - else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */ - else if (unlikely (u == 0x0C80u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/623 */ - else if (unlikely (hb_in_range (u, 0x2010u, 0x2011u))) - cat = OT_PLACEHOLDER; - else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE; - - - /* - * Re-assign position. - */ - - if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS)) - { - pos = POS_BASE_C; - if (is_ra (u)) - cat = OT_Ra; - } - else if (cat == OT_M) - { - pos = matra_position (u, pos); - } - else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) /* | FLAG (OT_VD) */ | FLAG (OT_A) | FLAG (OT_Symbol)))) - { - pos = POS_SMVD; - } - - if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */ - - - - info.indic_category() = cat; - info.indic_position() = pos; -} - -/* - * Things above this line should ideally be moved to the Indic table itself. - */ - /* * Indic configurations. Note that we do not want to keep every single script-specific diff --git a/src/print-indic-table.cc b/src/print-indic-table.cc new file mode 100644 index 000000000..d57413884 --- /dev/null +++ b/src/print-indic-table.cc @@ -0,0 +1,43 @@ +/* + * Copyright © 2018 Google, Inc. + * + * This is part of HarfBuzz, a text shaping library. + * + * Permission is hereby granted, without written agreement and without + * license or royalty fees, to use, copy, modify, and distribute this + * software and its documentation for any purpose, provided that the + * above copyright notice and the following two paragraphs appear in + * all copies of this software. + * + * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN + * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + * + * Google Author(s): Behdad Esfahbod + */ + +#include "hb-ot-shape-complex-indic-private.hh" + +int +main (void) +{ + for (hb_codepoint_t u = 0; u <= 0x10FFFF; u++) + { + hb_glyph_info_t info; + info.codepoint = u; + set_indic_properties (info); + if (info.indic_category() != INDIC_SYLLABIC_CATEGORY_OTHER || + info.indic_position() != INDIC_MATRA_CATEGORY_NOT_APPLICABLE) + printf("U+%04X %u %u\n", u, + info.indic_category(), + info.indic_position()); + } +}