From 20e246e674155d5fb6527722fc3ef3accf2413df Mon Sep 17 00:00:00 2001
From: Behdad Esfahbod
Date: Mon, 20 Jul 2015 15:56:19 +0100
Subject: [PATCH] [USE] Start moving Unicode-to-USE mapping into Python code

---
 src/Makefile.am                      |   2 +-
 src/gen-use-table.py                 | 176 +++++++++++++++++++++++++--
 src/hb-ot-shape-complex-use-table.cc |   3 +-
 3 files changed, 169 insertions(+), 12 deletions(-)

Review notes (this free-text area sits before the first diff header and is
not part of the commit message):

 1. Line breaks and indentation were restored from a whitespace-flattened
    copy of this patch. Tab indentation is assumed, and runs of spaces
    inside a line (string literals, comment text) are reproduced as single
    spaces, exactly as they appear in that copy. Confirm against the
    repository before applying.
 2. gen-use-table.py now takes UnicodeData.txt as its third input, so the
    Blocks.txt value moves from index 2 to index 3 of each per-codepoint
    record (defaults tuple, BLACKLISTED_BLOCKS filter, table emitter).
 3. The is_* predicates and use_mapping are only defined by this change;
    the map_to_use() call is still commented out.
 4. Four added lines differ from the original commit. Each change stays on
    its own line, so hunk line counts and the diffstat above still hold:
      a. the two asserts in the property_values loop test `name` (the key
         that is inserted) instead of the freshly built PropertyValue
         object, which could never already be a key;
      b. is_Reserved compares against the Cn property object, like every
         other predicate, instead of the string 'Cn';
      c. is_CONS_FINAL_MOD and is_SYM no longer reference
         Consonant_Final_Modifier and Symbol_Letter, which property_names
         never defines and which would raise NameError when called.
    Because of (4), the post-image blob id on the gen-use-table.py index
    line and the commit id on the first line describe the original commit,
    not this payload.

diff --git a/src/Makefile.am b/src/Makefile.am
index 44de2b694..947d279ac 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -294,7 +294,7 @@ indic-table: gen-indic-table.py IndicSyllabicCategory-7.0.0.txt IndicMatraCatego
 	$(AM_V_GEN) $(builddir)/$^ > hb-ot-shape-complex-indic-table.cc \
 	|| ($(RM) hb-ot-shape-complex-indic-table.cc; false)
 
-use-table: gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
+use-table: gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt
 	$(AM_V_GEN) $(builddir)/$^ > hb-ot-shape-complex-use-table.cc \
 	|| ($(RM) hb-ot-shape-complex-use-table.cc; false)
 
diff --git a/src/gen-use-table.py b/src/gen-use-table.py
index 73aa379d5..a79becb72 100755
--- a/src/gen-use-table.py
+++ b/src/gen-use-table.py
@@ -2,15 +2,16 @@
 
 import sys
 
-if len (sys.argv) != 4:
-	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt"
+if len (sys.argv) != 5:
+	print >>sys.stderr, "usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 	sys.exit (1)
 
 BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]
 
 files = [file (x) for x in sys.argv[1:]]
 
-headers = [[f.readline () for i in range (2)] for f in files]
+headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2]
+headers.append (["UnicodeData.txt does not have a header."])
 
 data = [{} for f in files]
 values = [{} for f in files]
@@ -32,29 +33,184 @@ for i, f in enumerate (files):
 		else:
 			end = int (uu[1], 16)
 
-		t = fields[1]
+		t = fields[1 if i != 2 else 2]
 
 		for u in range (start, end + 1):
 			data[i][u] = t
 		values[i][t] = values[i].get (t, 0) + end - start + 1
 
 # Merge data into one dict:
-defaults = ('Other', 'Not_Applicable', 'No_Block')
+defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block')
 for i,v in enumerate (defaults):
 	values[i][v] = values[i].get (v, 0) + 1
 combined = {}
 for i,d in enumerate (data):
 	for u,v in d.items ():
-		if i == 2 and not u in combined:
+		if i >= 2 and not u in combined:
 			continue
 		if not u in combined:
 			combined[u] = list (defaults)
 		combined[u][i] = v
-combined = {k:v for k,v in combined.items() if v[2] not in BLACKLISTED_BLOCKS}
+combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS}
 data = combined
 del combined
 num = len (data)
 
+
+property_names = [
+	# General_Category
+	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
+	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
+	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
+	# Indic_Syllabic_Category
+	'Bindu',
+	'Visarga',
+	'Avagraha',
+	'Nukta',
+	'Virama',
+	'Pure_Killer',
+	'Invisible_Stacker',
+	'Vowel_Independent',
+	'Vowel_Dependent',
+	'Vowel',
+	'Consonant_Placeholder',
+	'Consonant',
+	'Consonant_Dead',
+	'Consonant_With_Stacker',
+	'Consonant_Prefixed',
+	'Consonant_Preceding_Repha',
+	'Consonant_Succeeding_Repha',
+	'Consonant_Subjoined',
+	'Consonant_Medial',
+	'Consonant_Final',
+	'Consonant_Head_Letter',
+	'Modifying_Letter',
+	'Tone_Letter',
+	'Tone_Mark',
+	'Gemination_Mark',
+	'Cantillation_Mark',
+	'Register_Shifter',
+	'Syllable_Modifier',
+	'Consonant_Killer',
+	'Non_Joiner',
+	'Joiner',
+	'Number_Joiner',
+	'Number',
+	'Brahmi_Joining_Number',
+	# Indic_Positional_Category
+	'Right',
+	'Left',
+	'Visual_Order_Left',
+	'Left_And_Right',
+	'Top',
+	'Bottom',
+	'Top_And_Bottom',
+	'Top_And_Right',
+	'Top_And_Left',
+	'Top_And_Left_And_Right',
+	'Bottom_And_Right',
+	'Top_And_Bottom_And_Right',
+	'Overstruck',
+]
+
+class PropertyValue(object):
+	def __init__(self, name_):
+		self.name = name_
+
+property_values = {}
+
+for name in property_names:
+	value = PropertyValue(name)
+	assert name not in property_values # each property name may appear only once
+	assert name not in globals() # must not clobber an existing module-level name
+	property_values[name] = value
+globals().update(property_values)
+
+
+def is_BASE(U, UISC, UGC):
+	return (UISC in [Number, Consonant, Consonant_Head_Letter, Consonant_Placeholder, Tone_Letter] or
+		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
+					Consonant_Subjoined, Vowel, Vowel_Dependent]))
+def is_BASE_VOWEL(U, UISC, UGC):
+	return UISC == Vowel_Independent
+def is_BASE_IND(U, UISC, UGC):
+	return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po)
+def is_BASE_NUM(U, UISC, UGC):
+	return UISC == Brahmi_Joining_Number
+def is_BASE_OTHER(U, UISC, UGC):
+	return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC,
+		     0x25FB, 0x25FC, 0x25FD, 0x25FE]
+def is_CGJ(U, UISC, UGC):
+	return U == 0x034F
+def is_CONS_FINAL(U, UISC, UGC):
+	return ((UISC == Consonant_Final and UGC != Lo) or
+		UISC == Consonant_Succeeding_Repha)
+def is_CONS_FINAL_MOD(U, UISC, UGC):
+	return UISC == Syllable_Modifier # Consonant_Final_Modifier is not defined in property_names
+def is_CONS_MED(U, UISC, UGC):
+	return UISC == Consonant_Medial and UGC != Lo
+def is_CONS_MOD(U, UISC, UGC):
+	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
+def is_CONS_SUB(U, UISC, UGC):
+	return UISC == Consonant_Subjoined
+def is_HALANT(U, UISC, UGC):
+	return UISC in [Virama, Invisible_Stacker]
+def is_HALANT_NUM(U, UISC, UGC):
+	return UISC == Number_Joiner
+def is_ZWNJ(U, UISC, UGC):
+	return UISC == Non_Joiner
+def is_ZWJ(U, UISC, UGC):
+	return UISC == Joiner
+def is_Word_Joiner(U, UISC, UGC):
+	return U == 0x2060
+def is_OTHER(U, UISC, UGC):
+	return UGC == Zs # or any other SCRIPT_COMMON characters
+def is_Reserved(U, UISC, UGC):
+	return UGC == Cn # compare against the property object, like the other predicates
+def is_REPHA(U, UISC, UGC):
+	return UISC == Consonant_Preceding_Repha
+def is_SYM(U, UISC, UGC):
+	return UGC in [So, Sc] # Symbol_Letter is not defined in property_names
+def is_SYM_MOD(U, UISC, UGC):
+	return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73]
+def is_VARIATION_SELECTOR(U, UISC, UGC):
+	return 0xFE00 <= U <= 0xFE0F
+def is_VOWEL(U, UISC, UGC):
+	return (UISC == Pure_Killer or
+		(UGC != Lo and UISC in [Vowel, Vowel_Dependent]))
+def is_VOWEL_MOD(U, UISC, UGC):
+	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
+		(UGC != Lo and UISC == Bindu))
+
+use_mapping = {
+	'B': is_BASE,
+	'IV': is_BASE_VOWEL,
+	'IND': is_BASE_IND,
+	'N': is_BASE_NUM,
+	'GB': is_BASE_OTHER,
+	'CGJ': is_CGJ,
+	'F': is_CONS_FINAL,
+	'FM': is_CONS_FINAL_MOD,
+	'M': is_CONS_MED,
+	'CM': is_CONS_MOD,
+	'SUB': is_CONS_SUB,
+	'H': is_HALANT,
+	'HN': is_HALANT_NUM,
+	'ZWNJ': is_ZWNJ,
+	'ZWJ': is_ZWJ,
+	'WJ': is_Word_Joiner,
+	'O': is_OTHER,
+	'Rsv': is_Reserved,
+	'R': is_REPHA,
+	'S': is_SYM,
+	'SM': is_SYM_MOD,
+	'VS': is_VARIATION_SELECTOR,
+	'V': is_VOWEL,
+	'VM': is_VOWEL_MOD,
+}
+
+#data = map_to_use(data)
+
 # Remove the outliers
 singles = {}
 for u in [0x25CC, 0x1107F]:
@@ -65,7 +221,7 @@ print "/* == Start of generated table == */"
 print "/*"
 print " * The following table is generated by running:"
 print " *"
-print " * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt"
+print " * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt"
 print " *"
 print " * on files with these headers:"
 print " *"
@@ -164,11 +320,11 @@ print "static const USE_TABLE_ELEMENT_TYPE use_table[] = {"
 for u in uu:
 	if u <= last:
 		continue
-	block = data[u][2]
+	block = data[u][3]
 
 	start = u//8*8
 	end = start+1
-	while end in uu and block == data[end][2]:
+	while end in uu and block == data[end][3]:
 		end += 1
 	end = (end-1)//8*8 + 7
 
diff --git a/src/hb-ot-shape-complex-use-table.cc b/src/hb-ot-shape-complex-use-table.cc
index 0ca4cc49f..8f5234c60 100644
--- a/src/hb-ot-shape-complex-use-table.cc
+++ b/src/hb-ot-shape-complex-use-table.cc
@@ -2,7 +2,7 @@
 /*
  * The following table is generated by running:
  *
- * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
+ * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt
  *
  * on files with these headers:
  *
@@ -12,6 +12,7 @@
  * # Date: 2015-05-12, 10:00:00 GMT [RP, KW, LI]
  * # Blocks-8.0.0.txt
  * # Date: 2014-11-10, 23:04:00 GMT [KW]
+ * UnicodeData.txt does not have a header.
  */
 
 #include "hb-ot-shape-complex-use-private.hh"