[indic-generator] Move category overrides to generator

This commit is contained in:
Behdad Esfahbod 2022-06-09 06:33:51 -06:00
parent 58eeb3a180
commit 4907514026
3 changed files with 172 additions and 126 deletions

View File

@ -42,7 +42,6 @@ files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
headers = [[f.readline () for i in range (2)] for f in files] headers = [[f.readline () for i in range (2)] for f in files]
data = [{} for _ in files] data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files): for i, f in enumerate (files):
for line in f: for line in f:
@ -65,12 +64,9 @@ for i, f in enumerate (files):
for u in range (start, end + 1): for u in range (start, end + 1):
data[i][u] = t data[i][u] = t
values[i][t] = values[i].get (t, 0) + end - start + 1
# Merge data into one dict: # Merge data into one dict:
defaults = ('Other', 'Not_Applicable', 'No_Block') defaults = ('Other', 'Not_Applicable', 'No_Block')
for i,v in enumerate (defaults):
values[i][v] = values[i].get (v, 0) + 1
combined = {} combined = {}
for i,d in enumerate (data): for i,d in enumerate (data):
for u,v in d.items (): for u,v in d.items ():
@ -84,7 +80,7 @@ data = combined
del combined del combined
# Convert data # Convert categories & positions types
category_map = { category_map = {
'Other' : 'X', 'Other' : 'X',
@ -123,33 +119,94 @@ category_map = {
'Vowel' : 'V', 'Vowel' : 'V',
'Vowel_Dependent' : 'M', 'Vowel_Dependent' : 'M',
'Vowel_Independent' : 'V', 'Vowel_Independent' : 'V',
'Dotted_Circle' : 'DOTTEDCIRCLE', # Ours, not Unicode's
}
category_overrides = {
# The following act more like the Bindus.
0x0953: 'SM',
0x0954: 'SM',
# The following act like consonants.
0x0A72: 'C',
0x0A73: 'C',
0x1CF5: 'C',
0x1CF6: 'C',
# TODO: The following should only be allowed after a Visarga.
# For now, just treat them like regular tone marks.
0x1CE2: 'A',
0x1CE3: 'A',
0x1CE4: 'A',
0x1CE5: 'A',
0x1CE6: 'A',
0x1CE7: 'A',
0x1CE8: 'A',
# TODO: The following should only be allowed after some of
# the nasalization marks, maybe only for U+1CE9..U+1CF1.
# For now, just treat them like tone marks.
0x1CED: 'A',
# The following take marks in standalone clusters, similar to Avagraha.
0xA8F2: 'Symbol',
0xA8F3: 'Symbol',
0xA8F4: 'Symbol',
0xA8F5: 'Symbol',
0xA8F6: 'Symbol',
0xA8F7: 'Symbol',
0x1CE9: 'Symbol',
0x1CEA: 'Symbol',
0x1CEB: 'Symbol',
0x1CEC: 'Symbol',
0x1CEE: 'Symbol',
0x1CEF: 'Symbol',
0x1CF0: 'Symbol',
0x1CF1: 'Symbol',
0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524
# According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
# so the Indic shaper needs to know their categories.
0x11301: 'SM',
0x11302: 'SM',
0x11303: 'SM',
0x1133B: 'N',
0x1133C: 'N',
0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552
0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849
0x0980: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/538
0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613
0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623
0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511
0x2010: 'PLACEHOLDER',
0x2011: 'PLACEHOLDER',
0x25CC: 'DOTTEDCIRCLE',
} }
new_values0 = {}
for k,v in values[0].items():
new_values0[category_map[k]] = new_values0.get(category_map[k], 0) + v
values[0] = new_values0
defaults = (category_map[defaults[0]], defaults[1], defaults[2]) defaults = (category_map[defaults[0]], defaults[1], defaults[2])
new_data = {} new_data = {}
for key, (cat, pos, block) in data.items(): for key, (cat, pos, block) in data.items():
cat = category_map[cat] cat = category_map[cat]
new_data[key] = (cat, pos, block) new_data[key] = (cat, pos, block)
data = new_data data = new_data
# Apply the hand-maintained category overrides on top of the Unicode data.
# Only the category is replaced; each code point keeps its own position and
# block, and code points absent from the data fall back to `defaults`.
for k,new_cat in category_overrides.items():
  # This must be an unpacking assignment.  Writing `in` here instead of `=`
  # turns the line into a discarded membership test, leaving `pos` and
  # `block` holding stale values from the preceding loop over data.items().
  (_, pos, block) = data.get(k, defaults)
  data[k] = (new_cat, pos, block)
# Rebuild the per-field usage counters from the final data.  There is one
# counter dict per tuple field, each seeded with that field's default value
# at a count of 1.
values = [{default: 1} for default in defaults]
for fields in data.values():
  for idx, field in enumerate(fields):
    counter = values[idx]
    counter[field] = counter.get (field, 0) + 1
@ -184,6 +241,7 @@ print ()
short = [{ short = [{
"Coeng": 'Co', "Coeng": 'Co',
"PLACEHOLDER": 'GB', "PLACEHOLDER": 'GB',
"DOTTEDCIRCLE": 'DC',
},{ },{
"Not_Applicable": 'x', "Not_Applicable": 'x',
}] }]

View File

@ -23,40 +23,37 @@
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-macros" #pragma GCC diagnostic ignored "-Wunused-macros"
#define ISC_A OT_A /* 59 chars; A */ #define ISC_A OT_A /* 51 chars; A */
#define ISC_C OT_C /* 2226 chars; C */ #define ISC_C OT_C /* 532 chars; C */
#define ISC_CM OT_CM /* 196 chars; CM */ #define ISC_CM OT_CM /* 10 chars; CM */
#define ISC_CS OT_CS /* 8 chars; CS */ #define ISC_CS OT_CS /* 2 chars; CS */
#define ISC_Co OT_Coeng /* 12 chars; Coeng */ #define ISC_Co OT_Coeng /* 2 chars; Coeng */
#define ISC_H OT_H /* 27 chars; H */ #define ISC_DC OT_DOTTEDCIRCLE /* 1 chars; DOTTEDCIRCLE */
#define ISC_M OT_M /* 713 chars; M */ #define ISC_H OT_H /* 10 chars; H */
#define ISC_N OT_N /* 74 chars; N */ #define ISC_M OT_M /* 209 chars; M */
#define ISC_GB OT_PLACEHOLDER /* 534 chars; PLACEHOLDER */ #define ISC_N OT_N /* 35 chars; N */
#define ISC_RS OT_RS /* 2 chars; RS */ #define ISC_GB OT_PLACEHOLDER /* 168 chars; PLACEHOLDER */
#define ISC_R OT_Repha /* 3 chars; Repha */ #define ISC_RS OT_RS /* 2 chars; RS */
#define ISC_SM OT_SM /* 154 chars; SM */ #define ISC_R OT_Repha /* 1 chars; Repha */
#define ISC_S OT_Symbol /* 17 chars; Symbol */ #define ISC_SM OT_SM /* 56 chars; SM */
#define ISC_V OT_V /* 516 chars; V */ #define ISC_S OT_Symbol /* 22 chars; Symbol */
#define ISC_X OT_X /* 19 chars; X */ #define ISC_V OT_V /* 190 chars; V */
#define ISC_ZWJ OT_ZWJ /* 1 chars; ZWJ */ #define ISC_X OT_X /* 2 chars; X */
#define ISC_ZWNJ OT_ZWNJ /* 1 chars; ZWNJ */ #define ISC_ZWJ OT_ZWJ /* 1 chars; ZWJ */
#define ISC_ZWNJ OT_ZWNJ /* 1 chars; ZWNJ */
#define IMC_B INDIC_MATRA_CATEGORY_BOTTOM /* 352 chars; Bottom */ #define IMC_B INDIC_MATRA_CATEGORY_BOTTOM /* 77 chars; Bottom */
#define IMC_BL INDIC_MATRA_CATEGORY_BOTTOM_AND_LEFT /* 1 chars; Bottom_And_Left */ #define IMC_L INDIC_MATRA_CATEGORY_LEFT /* 21 chars; Left */
#define IMC_BR INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT /* 4 chars; Bottom_And_Right */ #define IMC_LR INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT /* 14 chars; Left_And_Right */
#define IMC_L INDIC_MATRA_CATEGORY_LEFT /* 64 chars; Left */ #define IMC_x INDIC_MATRA_CATEGORY_NOT_APPLICABLE /* 904 chars; Not_Applicable */
#define IMC_LR INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT /* 22 chars; Left_And_Right */ #define IMC_O INDIC_MATRA_CATEGORY_OVERSTRUCK /* 44 chars; Overstruck */
#define IMC_x INDIC_MATRA_CATEGORY_NOT_APPLICABLE /* 1 chars; Not_Applicable */ #define IMC_R INDIC_MATRA_CATEGORY_RIGHT /* 98 chars; Right */
#define IMC_O INDIC_MATRA_CATEGORY_OVERSTRUCK /* 10 chars; Overstruck */ #define IMC_T INDIC_MATRA_CATEGORY_TOP /* 122 chars; Top */
#define IMC_R INDIC_MATRA_CATEGORY_RIGHT /* 290 chars; Right */ #define IMC_TB INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM /* 1 chars; Top_And_Bottom */
#define IMC_T INDIC_MATRA_CATEGORY_TOP /* 418 chars; Top */ #define IMC_TBL INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_LEFT /* 1 chars; Top_And_Bottom_And_Left */
#define IMC_TB INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM /* 10 chars; Top_And_Bottom */ #define IMC_TL INDIC_MATRA_CATEGORY_TOP_AND_LEFT /* 3 chars; Top_And_Left */
#define IMC_TBL INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_LEFT /* 2 chars; Top_And_Bottom_And_Left */ #define IMC_TLR INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT /* 3 chars; Top_And_Left_And_Right */
#define IMC_TBR INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT /* 1 chars; Top_And_Bottom_And_Right */ #define IMC_TR INDIC_MATRA_CATEGORY_TOP_AND_RIGHT /* 7 chars; Top_And_Right */
#define IMC_TL INDIC_MATRA_CATEGORY_TOP_AND_LEFT /* 6 chars; Top_And_Left */
#define IMC_TLR INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT /* 4 chars; Top_And_Left_And_Right */
#define IMC_TR INDIC_MATRA_CATEGORY_TOP_AND_RIGHT /* 13 chars; Top_And_Right */
#define IMC_VOL INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT /* 19 chars; Visual_Order_Left */
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
@ -101,16 +98,19 @@ static const uint16_t indic_table[] = {
/* 0938 */ _(C,x), _(C,x), _(M,T), _(M,R), _(N,B), _(S,x), _(M,R), _(M,L), /* 0938 */ _(C,x), _(C,x), _(M,T), _(M,R), _(N,B), _(S,x), _(M,R), _(M,L),
/* 0940 */ _(M,R), _(M,B), _(M,B), _(M,B), _(M,B), _(M,T), _(M,T), _(M,T), /* 0940 */ _(M,R), _(M,B), _(M,B), _(M,B), _(M,B), _(M,T), _(M,T), _(M,T),
/* 0948 */ _(M,T), _(M,R), _(M,R), _(M,R), _(M,R), _(H,B), _(M,L), _(M,R), /* 0948 */ _(M,T), _(M,R), _(M,R), _(M,R), _(M,R), _(H,B), _(M,L), _(M,R),
/* 0950 */ _(X,x), _(A,T), _(A,B), _(X,T), _(X,T), _(M,T), _(M,B), _(M,B), /* 0950 */ _(X,x), _(A,T), _(A,B), _(SM,O), _(SM,O), _(M,T), _(M,B), _(M,B),
/* 0958 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), /* 0958 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x),
/* 0960 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x), /* 0960 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x),
/* 0968 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), /* 0968 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x),
/* 0970 */ _(X,x), _(X,x), _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), /* 0970 */ _(X,x), _(X,x), _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(V,x),
/* 0978 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), /* 0978 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x),
/* Vedic Extensions */
/* 0980 */ _(GB,O), _(SM,T), _(SM,R), _(SM,R), _(X,x), _(V,x), _(V,x), _(V,x),
/* Bengali */ /* Bengali */
/* 0980 */ _(GB,x), _(SM,T), _(SM,R), _(SM,R), _(X,x), _(V,x), _(V,x), _(V,x),
/* 0988 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(X,x), _(V,x), /* 0988 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(X,x), _(V,x),
/* 0990 */ _(V,x), _(X,x), _(X,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x), /* 0990 */ _(V,x), _(X,x), _(X,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x),
/* 0998 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), /* 0998 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x),
@ -125,7 +125,10 @@ static const uint16_t indic_table[] = {
/* 09E0 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x), /* 09E0 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x),
/* 09E8 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), /* 09E8 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x),
/* 09F0 */ _(C,x), _(C,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 09F0 */ _(C,x), _(C,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 09F8 */ _(X,x), _(X,x), _(X,x), _(X,x), _(SM,x), _(X,x), _(SM,T), _(X,x),
/* Vedic Extensions */
/* 09F8 */ _(X,x), _(X,x), _(X,x), _(X,x), _(GB,O), _(X,x), _(SM,T), _(X,x),
/* Gurmukhi */ /* Gurmukhi */
@ -139,11 +142,17 @@ static const uint16_t indic_table[] = {
/* 0A38 */ _(C,x), _(C,x), _(X,x), _(X,x), _(N,B), _(X,x), _(M,R), _(M,L), /* 0A38 */ _(C,x), _(C,x), _(X,x), _(X,x), _(N,B), _(X,x), _(M,R), _(M,L),
/* 0A40 */ _(M,R), _(M,B), _(M,B), _(X,x), _(X,x), _(X,x), _(X,x), _(M,T), /* 0A40 */ _(M,R), _(M,B), _(M,B), _(X,x), _(X,x), _(X,x), _(X,x), _(M,T),
/* 0A48 */ _(M,T), _(X,x), _(X,x), _(M,T), _(M,T), _(H,B), _(X,x), _(X,x), /* 0A48 */ _(M,T), _(X,x), _(X,x), _(M,T), _(M,T), _(H,B), _(X,x), _(X,x),
/* 0A50 */ _(X,x), _(A,B), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* Vedic Extensions */
/* 0A50 */ _(X,x), _(M,O), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* Gurmukhi */
/* 0A58 */ _(X,x), _(C,x), _(C,x), _(C,x), _(C,x), _(X,x), _(C,x), _(X,x), /* 0A58 */ _(X,x), _(C,x), _(C,x), _(C,x), _(C,x), _(X,x), _(C,x), _(X,x),
/* 0A60 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(GB,x), _(GB,x), /* 0A60 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(GB,x), _(GB,x),
/* 0A68 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), /* 0A68 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x),
/* 0A70 */ _(SM,T), _(SM,T), _(GB,x), _(GB,x), _(X,x), _(CM,B), _(X,x), _(X,x), /* 0A70 */ _(SM,T), _(SM,T), _(C,O), _(C,O), _(X,x), _(CM,B), _(X,x), _(X,x),
/* 0A78 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 0A78 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* Gujarati */ /* Gujarati */
@ -163,7 +172,7 @@ static const uint16_t indic_table[] = {
/* 0AE0 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x), /* 0AE0 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x),
/* 0AE8 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), /* 0AE8 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x),
/* 0AF0 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 0AF0 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 0AF8 */ _(X,x), _(C,x), _(A,T), _(A,T), _(A,T), _(N,T), _(N,T), _(N,T), /* 0AF8 */ _(X,x), _(C,x), _(A,T), _(N,O), _(A,T), _(N,T), _(N,T), _(N,T),
/* Oriya */ /* Oriya */
@ -177,7 +186,13 @@ static const uint16_t indic_table[] = {
/* 0B38 */ _(C,x), _(C,x), _(X,x), _(X,x), _(N,B), _(S,x), _(M,R), _(M,T), /* 0B38 */ _(C,x), _(C,x), _(X,x), _(X,x), _(N,B), _(S,x), _(M,R), _(M,T),
/* 0B40 */ _(M,R), _(M,B), _(M,B), _(M,B), _(M,B), _(X,x), _(X,x), _(M,L), /* 0B40 */ _(M,R), _(M,B), _(M,B), _(M,B), _(M,B), _(X,x), _(X,x), _(M,L),
/* 0B48 */ _(M,TL), _(X,x), _(X,x), _(M,LR),_(M,TLR), _(H,B), _(X,x), _(X,x), /* 0B48 */ _(M,TL), _(X,x), _(X,x), _(M,LR),_(M,TLR), _(H,B), _(X,x), _(X,x),
/* 0B50 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(M,T), _(M,T), _(M,TR),
/* Vedic Extensions */
/* 0B50 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(N,O), _(M,T), _(M,TR),
/* Oriya */
/* 0B58 */ _(X,x), _(X,x), _(X,x), _(X,x), _(C,x), _(C,x), _(X,x), _(C,x), /* 0B58 */ _(X,x), _(X,x), _(X,x), _(X,x), _(C,x), _(C,x), _(X,x), _(C,x),
/* 0B60 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x), /* 0B60 */ _(V,x), _(V,x), _(M,B), _(M,B), _(X,x), _(X,x), _(GB,x), _(GB,x),
/* 0B68 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), /* 0B68 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x),
@ -222,9 +237,12 @@ static const uint16_t indic_table[] = {
/* 0C70 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 0C70 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 0C78 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 0C78 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* Vedic Extensions */
/* 0C80 */ _(GB,O), _(SM,T), _(SM,R), _(SM,R), _(X,x), _(V,x), _(V,x), _(V,x),
/* Kannada */ /* Kannada */
/* 0C80 */ _(SM,x), _(SM,T), _(SM,R), _(SM,R), _(X,x), _(V,x), _(V,x), _(V,x),
/* 0C88 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(V,x), _(V,x), /* 0C88 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(V,x), _(V,x),
/* 0C90 */ _(V,x), _(X,x), _(V,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x), /* 0C90 */ _(V,x), _(X,x), _(V,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x),
/* 0C98 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), /* 0C98 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x),
@ -243,7 +261,7 @@ static const uint16_t indic_table[] = {
/* Malayalam */ /* Malayalam */
/* 0D00 */ _(SM,T), _(SM,T), _(SM,R), _(SM,R), _(SM,x), _(V,x), _(V,x), _(V,x), /* 0D00 */ _(SM,T), _(SM,T), _(SM,R), _(SM,R), _(GB,O), _(V,x), _(V,x), _(V,x),
/* 0D08 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(V,x), _(V,x), /* 0D08 */ _(V,x), _(V,x), _(V,x), _(V,x), _(V,x), _(X,x), _(V,x), _(V,x),
/* 0D10 */ _(V,x), _(X,x), _(V,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x), /* 0D10 */ _(V,x), _(X,x), _(V,x), _(V,x), _(V,x), _(C,x), _(C,x), _(C,x),
/* 0D18 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), /* 0D18 */ _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x), _(C,x),
@ -331,9 +349,9 @@ static const uint16_t indic_table[] = {
/* 1CD0 */ _(A,T), _(A,T), _(A,T), _(X,x), _(A,O), _(A,B), _(A,B), _(A,B), /* 1CD0 */ _(A,T), _(A,T), _(A,T), _(X,x), _(A,O), _(A,B), _(A,B), _(A,B),
/* 1CD8 */ _(A,B), _(A,B), _(A,T), _(A,T), _(A,B), _(A,B), _(A,B), _(A,B), /* 1CD8 */ _(A,B), _(A,B), _(A,T), _(A,T), _(A,B), _(A,B), _(A,B), _(A,B),
/* 1CE0 */ _(A,T), _(A,R), _(X,O), _(X,O), _(X,O), _(X,O), _(X,O), _(X,O), /* 1CE0 */ _(A,T), _(A,R), _(A,O), _(A,O), _(A,O), _(A,O), _(A,O), _(A,O),
/* 1CE8 */ _(X,O), _(X,x), _(X,x), _(X,x), _(X,x), _(X,B), _(X,x), _(X,x), /* 1CE8 */ _(A,O), _(S,O), _(S,O), _(S,O), _(S,O), _(A,O), _(S,O), _(S,O),
/* 1CF0 */ _(X,x), _(X,x), _(C,x), _(C,x), _(A,T), _(CS,x), _(CS,x), _(A,R), /* 1CF0 */ _(S,O), _(S,O), _(C,x), _(C,x), _(A,T), _(C,O), _(C,O), _(A,R),
/* 1CF8 */ _(A,x), _(A,x), _(GB,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), /* 1CF8 */ _(A,x), _(A,x), _(GB,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
#define indic_offset_0x2008u 1656 #define indic_offset_0x2008u 1656
@ -342,7 +360,10 @@ static const uint16_t indic_table[] = {
/* General Punctuation */ /* General Punctuation */
/* 2008 */ _(X,x), _(X,x), _(X,x), _(X,x),_(ZWNJ,x),_(ZWJ,x), _(X,x), _(X,x), /* 2008 */ _(X,x), _(X,x), _(X,x), _(X,x),_(ZWNJ,x),_(ZWJ,x), _(X,x), _(X,x),
/* 2010 */ _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(GB,x), _(X,x), _(X,x), _(X,x),
/* Vedic Extensions */
/* 2010 */ _(GB,O), _(GB,O), _(GB,x), _(GB,x), _(GB,x), _(X,x), _(X,x), _(X,x),
#define indic_offset_0x2070u 1672 #define indic_offset_0x2070u 1672
@ -360,7 +381,7 @@ static const uint16_t indic_table[] = {
/* A8E0 */ _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), /* A8E0 */ _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T),
/* A8E8 */ _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), /* A8E8 */ _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T), _(A,T),
/* A8F0 */ _(A,T), _(A,T), _(SM,x), _(SM,x), _(X,x), _(X,x), _(X,x), _(X,x), /* A8F0 */ _(A,T), _(A,T), _(S,O), _(S,O), _(S,O), _(S,O), _(S,O), _(S,O),
/* A8F8 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(V,x), _(M,T), /* A8F8 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(V,x), _(M,T),
#define indic_offset_0xa9e0u 1728 #define indic_offset_0xa9e0u 1728
@ -383,7 +404,21 @@ static const uint16_t indic_table[] = {
/* AA70 */ _(X,x), _(C,x), _(C,x), _(C,x), _(GB,x), _(GB,x), _(GB,x), _(X,x), /* AA70 */ _(X,x), _(C,x), _(C,x), _(C,x), _(GB,x), _(GB,x), _(GB,x), _(X,x),
/* AA78 */ _(X,x), _(X,x), _(C,x), _(N,R), _(N,T), _(N,R), _(C,x), _(C,x), /* AA78 */ _(X,x), _(X,x), _(C,x), _(N,R), _(N,T), _(N,R), _(C,x), _(C,x),
}; /* Table items: 1792; occupancy: 71% */ #define indic_offset_0x11300u 1792
/* Vedic Extensions */
/* 11300 */ _(X,x), _(SM,O), _(SM,O), _(SM,O), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11308 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11310 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11318 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11320 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11328 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11330 */ _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x), _(X,x),
/* 11338 */ _(X,x), _(X,x), _(X,x), _(N,O), _(N,O), _(X,x), _(X,x), _(X,x),
}; /* Table items: 1856; occupancy: 69% */
uint16_t uint16_t
hb_indic_get_categories (hb_codepoint_t u) hb_indic_get_categories (hb_codepoint_t u)
@ -404,7 +439,7 @@ hb_indic_get_categories (hb_codepoint_t u)
break; break;
case 0x2u: case 0x2u:
if (unlikely (u == 0x25CCu)) return _(GB,x); if (unlikely (u == 0x25CCu)) return _(DC,O);
if (hb_in_range<hb_codepoint_t> (u, 0x2008u, 0x2017u)) return indic_table[u - 0x2008u + indic_offset_0x2008u]; if (hb_in_range<hb_codepoint_t> (u, 0x2008u, 0x2017u)) return indic_table[u - 0x2008u + indic_offset_0x2008u];
if (hb_in_range<hb_codepoint_t> (u, 0x2070u, 0x2087u)) return indic_table[u - 0x2070u + indic_offset_0x2070u]; if (hb_in_range<hb_codepoint_t> (u, 0x2070u, 0x2087u)) return indic_table[u - 0x2070u + indic_offset_0x2070u];
break; break;
@ -415,6 +450,10 @@ hb_indic_get_categories (hb_codepoint_t u)
if (hb_in_range<hb_codepoint_t> (u, 0xAA60u, 0xAA7Fu)) return indic_table[u - 0xAA60u + indic_offset_0xaa60u]; if (hb_in_range<hb_codepoint_t> (u, 0xAA60u, 0xAA7Fu)) return indic_table[u - 0xAA60u + indic_offset_0xaa60u];
break; break;
case 0x11u:
if (hb_in_range<hb_codepoint_t> (u, 0x11300u, 0x1133Fu)) return indic_table[u - 0x11300u + indic_offset_0x11300u];
break;
default: default:
break; break;
} }
@ -428,6 +467,7 @@ hb_indic_get_categories (hb_codepoint_t u)
#undef ISC_CM #undef ISC_CM
#undef ISC_CS #undef ISC_CS
#undef ISC_Co #undef ISC_Co
#undef ISC_DC
#undef ISC_H #undef ISC_H
#undef ISC_M #undef ISC_M
#undef ISC_N #undef ISC_N
@ -442,8 +482,6 @@ hb_indic_get_categories (hb_codepoint_t u)
#undef ISC_ZWNJ #undef ISC_ZWNJ
#undef IMC_B #undef IMC_B
#undef IMC_BL
#undef IMC_BR
#undef IMC_L #undef IMC_L
#undef IMC_LR #undef IMC_LR
#undef IMC_x #undef IMC_x
@ -452,11 +490,9 @@ hb_indic_get_categories (hb_codepoint_t u)
#undef IMC_T #undef IMC_T
#undef IMC_TB #undef IMC_TB
#undef IMC_TBL #undef IMC_TBL
#undef IMC_TBR
#undef IMC_TL #undef IMC_TL
#undef IMC_TLR #undef IMC_TLR
#undef IMC_TR #undef IMC_TR
#undef IMC_VOL
#endif #endif

View File

@ -265,63 +265,16 @@ set_indic_properties (hb_glyph_info_t &info)
indic_category_t cat = (indic_category_t) (type & 0xFFu); indic_category_t cat = (indic_category_t) (type & 0xFFu);
indic_position_t pos = (indic_position_t) (type >> 8); indic_position_t pos = (indic_position_t) (type >> 8);
/*
* Re-assign category
*/
/* The following act more like the Bindus. */
if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0953u, 0x0954u)))
cat = OT_SM;
/* The following act like consonants. */
else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0x0A72u, 0x0A73u,
0x1CF5u, 0x1CF6u)))
cat = OT_C;
/* TODO: The following should only be allowed after a Visarga.
* For now, just treat them like regular tone marks. */
else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x1CE2u, 0x1CE8u)))
cat = OT_A;
/* TODO: The following should only be allowed after some of
* the nasalization marks, maybe only for U+1CE9..U+1CF1.
* For now, just treat them like tone marks. */
else if (unlikely (u == 0x1CEDu))
cat = OT_A;
/* The following take marks in standalone clusters, similar to Avagraha. */
else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0xA8F2u, 0xA8F7u,
0x1CE9u, 0x1CECu,
0x1CEEu, 0x1CF1u)))
{
cat = OT_Symbol;
//static_assert (((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol), "");
}
else if (unlikely (u == 0x0A51u))
{
/* https://github.com/harfbuzz/harfbuzz/issues/524 */
cat = OT_M;
pos = POS_BELOW_C;
}
/* According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
* so the Indic shaper needs to know their categories. */
else if (unlikely (u == 0x11301u || u == 0x11303u)) cat = OT_SM;
else if (unlikely (u == 0x1133Bu || u == 0x1133Cu)) cat = OT_N;
else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */
else if (unlikely (u == 0x0B55u)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/2849 */
else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */
else if (unlikely (u == 0x09FCu)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/1613 */
else if (unlikely (u == 0x0C80u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/623 */
else if (unlikely (u == 0x0D04u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/3511 */
else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u)))
cat = OT_PLACEHOLDER;
else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;
/* /*
* Re-assign position. * Re-assign position.
*/ */
if (unlikely (u == 0x0A51u))
{
/* https://github.com/harfbuzz/harfbuzz/issues/524 */
pos = POS_BELOW_C;
}
if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS)) if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS))
{ {
pos = POS_BASE_C; pos = POS_BASE_C;
@ -340,7 +293,6 @@ set_indic_properties (hb_glyph_info_t &info)
if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */ if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */
info.indic_category() = cat; info.indic_category() = cat;
info.indic_position() = pos; info.indic_position() = pos;
} }