diff --git a/src/gen-use-table.py b/src/gen-use-table.py index 55eff1de4..34540ca6d 100755 --- a/src/gen-use-table.py +++ b/src/gen-use-table.py @@ -1,34 +1,37 @@ #!/usr/bin/env python3 # flake8: noqa: F821 -"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt DerivedCoreProperties.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt +"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt Input files: * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt +* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt -* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt +* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt * ms-use/IndicSyllabicCategory-Additional.txt * ms-use/IndicPositionalCategory-Additional.txt """ import sys -if len (sys.argv) != 9: +if len (sys.argv) != 10: sys.exit (__doc__) -DISABLED_BLOCKS = [ - 'Samaritan', - 'Thai', +DISABLED_SCRIPTS = { + 'Arabic', 'Lao', -] + 'Samaritan', + 'Syriac', + 'Thai', +} files = [open (x, encoding='utf-8') for x in sys.argv[1:]] -headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 3] -for j in range(6, 8): +headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4] +for j in range(7, 9): for line in files[j]: line = line.rstrip() if not line: @@ -56,27 +59,26 @@ for i, f in enumerate (files): else: end = int (uu[1], 16) - t = fields[1 if i not in [3, 4] else 2] + t = fields[1 if i not in [2, 4] else 2] - if i == 2 and t != 'Default_Ignorable_Code_Point': - continue - elif i == 4: + if i == 2: t = 'jt_' + t - elif i == 6 and t == 'Consonant_Final_Modifier': + elif i == 3 and t != 'Default_Ignorable_Code_Point': + continue + elif i == 7 and t == 'Consonant_Final_Modifier': # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336 t = 'Syllable_Modifier' - elif i == 7 and t == 'NA': + elif i == 8 and t == 'NA': t = 'Not_Applicable' - i0 = i if i < 6 else i - 6 + i0 = i if i < 7 else i - 7 for u in range (start, end + 1): data[i0][u] = t values[i0][t] = values[i0].get (t, 0) + end - start + 1 -defaults = ('Other', 'Not_Applicable', '', 'Cn', 'jt_X', 'No_Block') +defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown') # TODO Characters that are not in Unicode Indic files, but used in USE -data[0][0x0640] = defaults[0] data[0][0x1B61] = defaults[0] data[0][0x1B63] = defaults[0] data[0][0x1B64] = defaults[0] @@ -86,29 +88,6 @@ data[0][0x1B67] = defaults[0] data[0][0x1B69] = defaults[0] data[0][0x1B6A] = defaults[0] data[0][0x2060] = defaults[0] -for u in range (0x07CA, 0x07EA + 1): - data[0][u] = defaults[0] -data[0][0x07FA] = defaults[0] -for u in range (0x0840, 0x0858 + 1): - data[0][u] = defaults[0] -for u in range (0x1887, 0x18A8 + 1): - data[0][u] = defaults[0] -data[0][0x18AA] = defaults[0] -for u in range (0xA840, 0xA872 + 1): - data[0][u] = defaults[0] -for u in range (0x10B80, 0x10B91 + 1): - data[0][u] = defaults[0] -for u in range (0x10BA9, 0x10BAE + 1): - data[0][u] = defaults[0] -data[0][0x10FB0] = defaults[0] -for u in range (0x10FB2, 0x10FB6 + 1): - data[0][u] = defaults[0] -for u in range (0x10FB8, 0x10FBF + 1): - data[0][u] = defaults[0] -for u in range (0x10FC1, 0x10FC4 + 1): - data[0][u] = defaults[0] -for u in range (0x10FC9, 0x10FCB + 1): - data[0][u] = defaults[0] # TODO https://github.com/harfbuzz/harfbuzz/pull/1685 data[0][0x1B5B] = 'Consonant_Placeholder' data[0][0x1B5C] = 'Consonant_Placeholder' @@ -127,12 +106,12 @@ for i,v in enumerate (defaults): combined = {} for i,d in enumerate (data): for u,v in d.items (): - if i >= 3 and not u in combined: - continue if not u in combined: + if i >= 4: + continue combined[u] = list (defaults) combined[u][i] = v -combined = {k: v for k, v in combined.items() if v[5] not in DISABLED_BLOCKS} +combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS} data = combined del combined @@ -383,7 +362,7 @@ use_positions = { def map_to_use(data): out = {} items = use_mapping.items() - for U, (UISC, UIPC, UDI, UGC, AJT, UBlock) in data.items(): + for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items(): # Resolve Indic_Syllabic_Category @@ -444,7 +423,7 @@ print ("/* == Start of generated table == */") print ("/*") print (" * The following table is generated by running:") print (" *") -print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt DerivedCoreProperties.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0])) +print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0])) print (" *") print (" * on files with these headers:") print (" *") diff --git a/src/hb-ot-shape-complex-use-table.hh b/src/hb-ot-shape-complex-use-table.hh index ce18df939..7a3a995c8 100644 --- a/src/hb-ot-shape-complex-use-table.hh +++ b/src/hb-ot-shape-complex-use-table.hh @@ -2,7 +2,7 @@ /* * The following table is generated by running: * - * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt DerivedCoreProperties.txt UnicodeData.txt ArabicShaping.txt Blocks.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt + * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt * * on files with these headers: * @@ -10,12 +10,14 @@ * # Date: 2021-05-22, 01:01:00 GMT [KW, RP] * # IndicPositionalCategory-14.0.0.txt * # Date: 2021-05-22, 01:01:00 GMT [KW, RP] - * # DerivedCoreProperties-14.0.0.txt - * # Date: 2021-08-12, 23:12:53 GMT * # ArabicShaping-14.0.0.txt * # Date: 2021-05-21, 01:54:00 GMT [KW, RP] + * # DerivedCoreProperties-14.0.0.txt + * # Date: 2021-08-12, 23:12:53 GMT * # Blocks-14.0.0.txt * # Date: 2021-01-22, 23:29:00 GMT [KW] + * # Scripts-14.0.0.txt + * # Date: 2021-07-10, 00:35:31 GMT * # Override values For Indic_Syllabic_Category * # Not derivable * # Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17 @@ -598,9 +600,9 @@ static const uint8_t use_table[] = { /* 10AC0 */ B, B, B, B, B, B, B, B, O, B, B, B, B, B, B, B, /* 10AD0 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, - /* 10AE0 */ B, B, B, B, B, CMBlw, CMBlw, O, + /* 10AE0 */ B, B, B, B, B, CMBlw, CMBlw, O, O, O, O, B, B, B, B, B, -#define use_offset_0x10b80u 4232 +#define use_offset_0x10b80u 4240 /* Psalter Pahlavi */ @@ -609,7 +611,7 @@ static const uint8_t use_table[] = { /* 10B90 */ B, B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 10BA0 */ O, O, O, O, O, O, O, O, O, B, B, B, B, B, B, O, -#define use_offset_0x10d00u 4280 +#define use_offset_0x10d00u 4288 /* Hanifi Rohingya */ @@ -619,7 +621,7 @@ static const uint8_t use_table[] = { /* 10D20 */ B, B, B, B, VMAbv, VMAbv, VMAbv, CMAbv, O, O, O, O, O, O, O, O, /* 10D30 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0x10e80u 4344 +#define use_offset_0x10e80u 4352 /* Yezidi */ @@ -629,7 +631,7 @@ static const uint8_t use_table[] = { /* 10EA0 */ B, B, B, B, B, B, B, B, B, B, O, VAbv, VAbv, O, O, O, /* 10EB0 */ B, B, O, O, O, O, O, O, -#define use_offset_0x10f30u 4400 +#define use_offset_0x10f30u 4408 /* Sogdian */ @@ -638,11 +640,11 @@ static const uint8_t use_table[] = { /* 10F40 */ B, B, B, B, B, B, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, VMBlw, /* 10F50 */ VMBlw, B, B, B, B, O, O, O, O, O, O, O, O, O, O, O, /* 10F60 */ O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, - /* 10F70 */ O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* Old Uyghur */ - /* 10F80 */ O, O, CMBlw, CMBlw, CMBlw, CMBlw, O, O, O, O, O, O, O, O, O, O, + /* 10F70 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, + /* 10F80 */ B, B, CMBlw, CMBlw, CMBlw, CMBlw, O, O, O, O, O, O, O, O, O, O, /* 10F90 */ O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, /* 10FA0 */ O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, @@ -673,7 +675,7 @@ static const uint8_t use_table[] = { /* 110B0 */ VPst, VPre, VPst, VBlw, VBlw, VAbv, VAbv, VPst, VPst, H, CMBlw, O, O, O, O, O, /* 110C0 */ O, O, VBlw, O, O, O, O, O, -#define use_offset_0x11100u 4808 +#define use_offset_0x11100u 4816 /* Chakma */ @@ -711,7 +713,7 @@ static const uint8_t use_table[] = { /* 11220 */ B, B, B, B, B, B, B, B, B, B, B, B, VPst, VPst, VPst, VBlw, /* 11230 */ VAbv, VAbv, VAbv, VAbv, VMAbv, H, CMAbv, CMAbv, O, O, O, O, O, O, VMAbv, O, -#define use_offset_0x11280u 5128 +#define use_offset_0x11280u 5136 /* Multani */ @@ -739,7 +741,7 @@ static const uint8_t use_table[] = { /* 11360 */ B, B, VPst, VPst, O, O, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, O, O, O, /* 11370 */ VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, O, O, O, -#define use_offset_0x11400u 5376 +#define use_offset_0x11400u 5384 /* Newa */ @@ -762,7 +764,7 @@ static const uint8_t use_table[] = { /* 114C0 */ VMAbv, VMAbv, H, CMBlw, B, O, O, O, O, O, O, O, O, O, O, O, /* 114D0 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0x11580u 5600 +#define use_offset_0x11580u 5608 /* Siddham */ @@ -806,7 +808,7 @@ static const uint8_t use_table[] = { /* 11730 */ B, B, B, B, B, B, B, B, B, B, B, B, O, O, O, O, /* 11740 */ B, B, B, B, B, B, B, O, -#define use_offset_0x11800u 6056 +#define use_offset_0x11800u 6064 /* Dogra */ @@ -816,7 +818,7 @@ static const uint8_t use_table[] = { /* 11820 */ B, B, B, B, B, B, B, B, B, B, B, B, VPst, VPre, VPst, VBlw, /* 11830 */ VBlw, VBlw, VBlw, VAbv, VAbv, VAbv, VAbv, VMAbv, VMPst, H, CMBlw, O, O, O, O, O, -#define use_offset_0x11900u 6120 +#define use_offset_0x11900u 6128 /* Dives Akuru */ @@ -828,7 +830,7 @@ static const uint8_t use_table[] = { /* 11940 */ MPst, R, MPst, CMBlw, O, O, O, O, O, O, O, O, O, O, O, O, /* 11950 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0x119a0u 6216 +#define use_offset_0x119a0u 6224 /* Nandinagari */ @@ -856,7 +858,7 @@ static const uint8_t use_table[] = { /* 11A80 */ B, B, B, B, R, R, R, R, R, R, FBlw, FBlw, FBlw, FBlw, FBlw, FBlw, /* 11A90 */ FBlw, FBlw, FBlw, FBlw, FBlw, FBlw, VMAbv, VMPst, CMAbv, H, O, O, O, B, O, O, -#define use_offset_0x11c00u 6472 +#define use_offset_0x11c00u 6480 /* Bhaiksuki */ @@ -877,7 +879,7 @@ static const uint8_t use_table[] = { /* 11CA0 */ SUB, SUB, SUB, SUB, SUB, SUB, SUB, SUB, O, SUB, SUB, SUB, SUB, SUB, SUB, SUB, /* 11CB0 */ VBlw, VPre, VBlw, VAbv, VPst, VMAbv, VMAbv, O, -#define use_offset_0x11d00u 6656 +#define use_offset_0x11d00u 6664 /* Masaram Gondi */ @@ -897,7 +899,7 @@ static const uint8_t use_table[] = { /* 11D90 */ VAbv, VAbv, O, VPst, VPst, VMAbv, VMPst, H, O, O, O, O, O, O, O, O, /* 11DA0 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0x11ee0u 6832 +#define use_offset_0x11ee0u 6840 /* Makasar */ @@ -905,7 +907,7 @@ static const uint8_t use_table[] = { /* 11EE0 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, /* 11EF0 */ B, B, GB, VAbv, VBlw, VPre, VPst, O, -#define use_offset_0x13000u 6856 +#define use_offset_0x13000u 6864 /* Egyptian Hieroglyphs */ @@ -982,7 +984,7 @@ static const uint8_t use_table[] = { /* 13430 */ H, H, H, H, H, H, H, B, B, O, O, O, O, O, O, O, -#define use_offset_0x16ac0u 7944 +#define use_offset_0x16ac0u 7952 /* Tangsa */ @@ -999,7 +1001,7 @@ static const uint8_t use_table[] = { /* 16B20 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, /* 16B30 */ VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, O, -#define use_offset_0x16f00u 8064 +#define use_offset_0x16f00u 8072 /* Miao */ @@ -1015,14 +1017,14 @@ static const uint8_t use_table[] = { /* 16F80 */ VBlw, VBlw, VBlw, VBlw, VBlw, VBlw, VBlw, VBlw, O, O, O, O, O, O, O, VMBlw, /* 16F90 */ VMBlw, VMBlw, VMBlw, O, O, O, O, O, -#define use_offset_0x16fe0u 8216 +#define use_offset_0x16fe0u 8224 /* Ideographic Symbols and Punctuation */ /* 16FE0 */ O, O, O, O, B, O, O, O, -#define use_offset_0x18b00u 8224 +#define use_offset_0x18b00u 8232 /* Khitan Small Script */ @@ -1058,7 +1060,7 @@ static const uint8_t use_table[] = { /* 18CC0 */ B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, B, /* 18CD0 */ B, B, B, B, B, B, O, O, -#define use_offset_0x1bc00u 8696 +#define use_offset_0x1bc00u 8704 /* Duployan */ @@ -1074,7 +1076,7 @@ static const uint8_t use_table[] = { /* 1BC80 */ B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, O, /* 1BC90 */ B, B, B, B, B, B, B, B, B, B, O, O, O, CMBlw, CMBlw, O, -#define use_offset_0x1e100u 8856 +#define use_offset_0x1e100u 8864 /* Nyiakeng Puachue Hmong */ @@ -1085,7 +1087,7 @@ static const uint8_t use_table[] = { /* 1E130 */ VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, VMAbv, B, B, B, B, B, B, B, O, O, /* 1E140 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, B, B, -#define use_offset_0x1e290u 8936 +#define use_offset_0x1e290u 8944 /* Toto */ @@ -1101,7 +1103,7 @@ static const uint8_t use_table[] = { /* 1E2E0 */ B, B, B, B, B, B, B, B, B, B, B, B, VMAbv, VMAbv, VMAbv, VMAbv, /* 1E2F0 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0x1e900u 9048 +#define use_offset_0x1e900u 9056 /* Adlam */ @@ -1113,7 +1115,7 @@ static const uint8_t use_table[] = { /* 1E940 */ B, B, B, B, CMAbv, CMAbv, CMAbv, CMAbv, CMAbv, CMAbv, CMAbv, B, O, O, O, O, /* 1E950 */ B, B, B, B, B, B, B, B, B, B, O, O, O, O, O, O, -#define use_offset_0xe0100u 9144 +#define use_offset_0xe0100u 9152 /* Variation Selectors Supplement */ @@ -1134,7 +1136,7 @@ static const uint8_t use_table[] = { /* E01D0 */ CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, /* E01E0 */ CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, CGJ, -}; /* Table items: 9384; occupancy: 78% */ +}; /* Table items: 9392; occupancy: 79% */ static inline uint8_t hb_use_get_category (hb_codepoint_t u) @@ -1181,7 +1183,7 @@ hb_use_get_category (hb_codepoint_t u) case 0x10u: if (hb_in_range (u, 0x10570u, 0x105BFu)) return use_table[u - 0x10570u + use_offset_0x10570u]; if (hb_in_range (u, 0x10A00u, 0x10A4Fu)) return use_table[u - 0x10A00u + use_offset_0x10a00u]; - if (hb_in_range (u, 0x10AC0u, 0x10AE7u)) return use_table[u - 0x10AC0u + use_offset_0x10ac0u]; + if (hb_in_range (u, 0x10AC0u, 0x10AEFu)) return use_table[u - 0x10AC0u + use_offset_0x10ac0u]; if (hb_in_range (u, 0x10B80u, 0x10BAFu)) return use_table[u - 0x10B80u + use_offset_0x10b80u]; if (hb_in_range (u, 0x10D00u, 0x10D3Fu)) return use_table[u - 0x10D00u + use_offset_0x10d00u]; if (hb_in_range (u, 0x10E80u, 0x10EB7u)) return use_table[u - 0x10E80u + use_offset_0x10e80u]; diff --git a/src/update-unicode-tables.make b/src/update-unicode-tables.make index 23d595d28..8c2eaa418 100755 --- a/src/update-unicode-tables.make +++ b/src/update-unicode-tables.make @@ -21,7 +21,7 @@ hb-ot-tag-table.hh: gen-tag-table.py languagetags language-subtag-registry ./$^ > $@ || ($(RM) $@; false) hb-ucd-table.hh: gen-ucd-table.py ucd.nounihan.grouped.zip hb-common.h ./$^ > $@ || ($(RM) $@; false) -hb-ot-shape-complex-use-table.hh: gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt DerivedCoreProperties.txt UnicodeData.txt ArabicShaping.txt Blocks.txt ms-use/IndicSyllabicCategory-Additional.txt ms-use/IndicPositionalCategory-Additional.txt +hb-ot-shape-complex-use-table.hh: gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt ms-use/IndicSyllabicCategory-Additional.txt ms-use/IndicPositionalCategory-Additional.txt ./$^ > $@ || ($(RM) $@; false) hb-ot-shape-complex-vowel-constraints.cc: gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt ./$^ > $@ || ($(RM) $@; false)