[USE] Treat all gc=Cn as independent clusters
This commit is contained in:
parent
e497a8f142
commit
c33468d48e
|
@ -39,7 +39,7 @@ for j in range(7, 9):
|
||||||
headers[j - 1].append(line)
|
headers[j - 1].append(line)
|
||||||
headers.append (["UnicodeData.txt does not have a header."])
|
headers.append (["UnicodeData.txt does not have a header."])
|
||||||
|
|
||||||
data = [{} for _ in files]
|
unicode_data = [{} for _ in files]
|
||||||
values = [{} for _ in files]
|
values = [{} for _ in files]
|
||||||
for i, f in enumerate (files):
|
for i, f in enumerate (files):
|
||||||
for line in f:
|
for line in f:
|
||||||
|
@ -73,27 +73,27 @@ for i, f in enumerate (files):
|
||||||
|
|
||||||
i0 = i if i < 7 else i - 7
|
i0 = i if i < 7 else i - 7
|
||||||
for u in range (start, end + 1):
|
for u in range (start, end + 1):
|
||||||
data[i0][u] = t
|
unicode_data[i0][u] = t
|
||||||
values[i0][t] = values[i0].get (t, 0) + end - start + 1
|
values[i0][t] = values[i0].get (t, 0) + end - start + 1
|
||||||
|
|
||||||
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
|
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
|
||||||
|
|
||||||
# TODO Characters that are not in Unicode Indic files, but used in USE
|
# TODO Characters that are not in Unicode Indic files, but used in USE
|
||||||
data[0][0x1B61] = defaults[0]
|
unicode_data[0][0x1B61] = defaults[0]
|
||||||
data[0][0x1B63] = defaults[0]
|
unicode_data[0][0x1B63] = defaults[0]
|
||||||
data[0][0x1B64] = defaults[0]
|
unicode_data[0][0x1B64] = defaults[0]
|
||||||
data[0][0x1B65] = defaults[0]
|
unicode_data[0][0x1B65] = defaults[0]
|
||||||
data[0][0x1B66] = defaults[0]
|
unicode_data[0][0x1B66] = defaults[0]
|
||||||
data[0][0x1B67] = defaults[0]
|
unicode_data[0][0x1B67] = defaults[0]
|
||||||
data[0][0x1B69] = defaults[0]
|
unicode_data[0][0x1B69] = defaults[0]
|
||||||
data[0][0x1B6A] = defaults[0]
|
unicode_data[0][0x1B6A] = defaults[0]
|
||||||
data[0][0x2060] = defaults[0]
|
unicode_data[0][0x2060] = defaults[0]
|
||||||
|
|
||||||
# Merge data into one dict:
|
# Merge data into one dict:
|
||||||
for i,v in enumerate (defaults):
|
for i,v in enumerate (defaults):
|
||||||
values[i][v] = values[i].get (v, 0) + 1
|
values[i][v] = values[i].get (v, 0) + 1
|
||||||
combined = {}
|
combined = {}
|
||||||
for i,d in enumerate (data):
|
for i,d in enumerate (unicode_data):
|
||||||
for u,v in d.items ():
|
for u,v in d.items ():
|
||||||
if not u in combined:
|
if not u in combined:
|
||||||
if i >= 4:
|
if i >= 4:
|
||||||
|
@ -101,8 +101,6 @@ for i,d in enumerate (data):
|
||||||
combined[u] = list (defaults)
|
combined[u] = list (defaults)
|
||||||
combined[u][i] = v
|
combined[u][i] = v
|
||||||
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
|
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
|
||||||
data = combined
|
|
||||||
del combined
|
|
||||||
|
|
||||||
|
|
||||||
property_names = [
|
property_names = [
|
||||||
|
@ -254,8 +252,8 @@ def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
|
||||||
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
|
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
|
||||||
return UISC == Non_Joiner
|
return UISC == Non_Joiner
|
||||||
def is_OTHER(U, UISC, UDI, UGC, AJT):
|
def is_OTHER(U, UISC, UDI, UGC, AJT):
|
||||||
# Also includes BASE_IND, Rsv, and SYM
|
# Also includes BASE_IND and SYM
|
||||||
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
|
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
|
||||||
and not is_BASE(U, UISC, UDI, UGC, AJT)
|
and not is_BASE(U, UISC, UDI, UGC, AJT)
|
||||||
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
|
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
|
||||||
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
||||||
|
@ -278,10 +276,11 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
|
||||||
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
|
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
|
||||||
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
|
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
|
||||||
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
|
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
|
||||||
|
# Also includes Rsv
|
||||||
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
|
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
|
||||||
and UISC == Other
|
and UISC == Other
|
||||||
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
||||||
)
|
) or UGC == Cn
|
||||||
|
|
||||||
use_mapping = {
|
use_mapping = {
|
||||||
'B': is_BASE,
|
'B': is_BASE,
|
||||||
|
@ -412,8 +411,7 @@ def map_to_use(data):
|
||||||
out[U] = (USE, UBlock)
|
out[U] = (USE, UBlock)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
defaults = ('O', 'No_Block')
|
use_data = map_to_use(combined)
|
||||||
data = map_to_use(data)
|
|
||||||
|
|
||||||
print ("/* == Start of generated table == */")
|
print ("/* == Start of generated table == */")
|
||||||
print ("/*")
|
print ("/*")
|
||||||
|
@ -439,7 +437,7 @@ print ()
|
||||||
total = 0
|
total = 0
|
||||||
used = 0
|
used = 0
|
||||||
last_block = None
|
last_block = None
|
||||||
def print_block (block, start, end, data):
|
def print_block (block, start, end, use_data):
|
||||||
global total, used, last_block
|
global total, used, last_block
|
||||||
if block and block != last_block:
|
if block and block != last_block:
|
||||||
print ()
|
print ()
|
||||||
|
@ -454,17 +452,23 @@ def print_block (block, start, end, data):
|
||||||
if u % 16 == 0:
|
if u % 16 == 0:
|
||||||
print ()
|
print ()
|
||||||
print (" /* %04X */" % u, end='')
|
print (" /* %04X */" % u, end='')
|
||||||
if u in data:
|
if u in use_data:
|
||||||
num += 1
|
num += 1
|
||||||
d = data.get (u, defaults)
|
d = use_data.get (u)
|
||||||
print ("%6s," % d[0], end='')
|
if d is not None:
|
||||||
|
d = d[0]
|
||||||
|
elif u in unicode_data[4]:
|
||||||
|
d = 'O'
|
||||||
|
else:
|
||||||
|
d = 'WJ'
|
||||||
|
print ("%6s," % d, end='')
|
||||||
|
|
||||||
total += end - start + 1
|
total += end - start + 1
|
||||||
used += num
|
used += num
|
||||||
if block:
|
if block:
|
||||||
last_block = block
|
last_block = block
|
||||||
|
|
||||||
uu = sorted (data.keys ())
|
uu = sorted (use_data.keys ())
|
||||||
|
|
||||||
last = -100000
|
last = -100000
|
||||||
num = 0
|
num = 0
|
||||||
|
@ -487,19 +491,19 @@ print ("static const uint8_t use_table[] = {")
|
||||||
for u in uu:
|
for u in uu:
|
||||||
if u <= last:
|
if u <= last:
|
||||||
continue
|
continue
|
||||||
if data[u][0] == 'O':
|
if use_data[u][0] == 'O':
|
||||||
continue
|
continue
|
||||||
block = data[u][1]
|
block = use_data[u][1]
|
||||||
|
|
||||||
start = u//8*8
|
start = u//8*8
|
||||||
end = start+1
|
end = start+1
|
||||||
while end in uu and block == data[end][1]:
|
while end in uu and block == use_data[end][1]:
|
||||||
end += 1
|
end += 1
|
||||||
end = (end-1)//8*8 + 7
|
end = (end-1)//8*8 + 7
|
||||||
|
|
||||||
if start != last + 1:
|
if start != last + 1:
|
||||||
if start - last <= 1+16*3:
|
if start - last <= 1+16*3:
|
||||||
print_block (None, last+1, start-1, data)
|
print_block (None, last+1, start-1, use_data)
|
||||||
else:
|
else:
|
||||||
if last >= 0:
|
if last >= 0:
|
||||||
ends.append (last + 1)
|
ends.append (last + 1)
|
||||||
|
@ -509,7 +513,7 @@ for u in uu:
|
||||||
print ("#define use_offset_0x%04xu %d" % (start, offset))
|
print ("#define use_offset_0x%04xu %d" % (start, offset))
|
||||||
starts.append (start)
|
starts.append (start)
|
||||||
|
|
||||||
print_block (block, start, end, data)
|
print_block (block, start, end, use_data)
|
||||||
last = end
|
last = end
|
||||||
ends.append (last + 1)
|
ends.append (last + 1)
|
||||||
offset += ends[-1] - starts[-1]
|
offset += ends[-1] - starts[-1]
|
||||||
|
@ -520,8 +524,9 @@ page_bits = 12
|
||||||
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
|
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
|
||||||
print ()
|
print ()
|
||||||
print ("static inline uint8_t")
|
print ("static inline uint8_t")
|
||||||
print ("hb_use_get_category (hb_codepoint_t u)")
|
print ("hb_use_get_category (hb_glyph_info_t info)")
|
||||||
print ("{")
|
print ("{")
|
||||||
|
print (" hb_codepoint_t u = info.codepoint;")
|
||||||
print (" switch (u >> %d)" % page_bits)
|
print (" switch (u >> %d)" % page_bits)
|
||||||
print (" {")
|
print (" {")
|
||||||
pages = set([u>>page_bits for u in starts+ends])
|
pages = set([u>>page_bits for u in starts+ends])
|
||||||
|
@ -536,7 +541,9 @@ for p in sorted(pages):
|
||||||
print (" default:")
|
print (" default:")
|
||||||
print (" break;")
|
print (" break;")
|
||||||
print (" }")
|
print (" }")
|
||||||
print (" return USE(O);")
|
print (" if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED)")
|
||||||
|
print (" return WJ;")
|
||||||
|
print (" return O;")
|
||||||
print ("}")
|
print ("}")
|
||||||
print ()
|
print ()
|
||||||
for k in sorted(use_mapping.keys()):
|
for k in sorted(use_mapping.keys()):
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -206,7 +206,7 @@ setup_masks_use (const hb_ot_shape_plan_t *plan,
|
||||||
unsigned int count = buffer->len;
|
unsigned int count = buffer->len;
|
||||||
hb_glyph_info_t *info = buffer->info;
|
hb_glyph_info_t *info = buffer->info;
|
||||||
for (unsigned int i = 0; i < count; i++)
|
for (unsigned int i = 0; i < count; i++)
|
||||||
info[i].use_category() = hb_use_get_category (info[i].codepoint);
|
info[i].use_category() = hb_use_get_category (info[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
Loading…
Reference in New Issue