[USE] Treat all gc=Cn as independent clusters
This commit is contained in:
parent
e497a8f142
commit
c33468d48e
|
@ -39,7 +39,7 @@ for j in range(7, 9):
|
|||
headers[j - 1].append(line)
|
||||
headers.append (["UnicodeData.txt does not have a header."])
|
||||
|
||||
data = [{} for _ in files]
|
||||
unicode_data = [{} for _ in files]
|
||||
values = [{} for _ in files]
|
||||
for i, f in enumerate (files):
|
||||
for line in f:
|
||||
|
@ -73,27 +73,27 @@ for i, f in enumerate (files):
|
|||
|
||||
i0 = i if i < 7 else i - 7
|
||||
for u in range (start, end + 1):
|
||||
data[i0][u] = t
|
||||
unicode_data[i0][u] = t
|
||||
values[i0][t] = values[i0].get (t, 0) + end - start + 1
|
||||
|
||||
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
|
||||
|
||||
# TODO Characters that are not in Unicode Indic files, but used in USE
|
||||
data[0][0x1B61] = defaults[0]
|
||||
data[0][0x1B63] = defaults[0]
|
||||
data[0][0x1B64] = defaults[0]
|
||||
data[0][0x1B65] = defaults[0]
|
||||
data[0][0x1B66] = defaults[0]
|
||||
data[0][0x1B67] = defaults[0]
|
||||
data[0][0x1B69] = defaults[0]
|
||||
data[0][0x1B6A] = defaults[0]
|
||||
data[0][0x2060] = defaults[0]
|
||||
unicode_data[0][0x1B61] = defaults[0]
|
||||
unicode_data[0][0x1B63] = defaults[0]
|
||||
unicode_data[0][0x1B64] = defaults[0]
|
||||
unicode_data[0][0x1B65] = defaults[0]
|
||||
unicode_data[0][0x1B66] = defaults[0]
|
||||
unicode_data[0][0x1B67] = defaults[0]
|
||||
unicode_data[0][0x1B69] = defaults[0]
|
||||
unicode_data[0][0x1B6A] = defaults[0]
|
||||
unicode_data[0][0x2060] = defaults[0]
|
||||
|
||||
# Merge data into one dict:
|
||||
for i,v in enumerate (defaults):
|
||||
values[i][v] = values[i].get (v, 0) + 1
|
||||
combined = {}
|
||||
for i,d in enumerate (data):
|
||||
for i,d in enumerate (unicode_data):
|
||||
for u,v in d.items ():
|
||||
if not u in combined:
|
||||
if i >= 4:
|
||||
|
@ -101,8 +101,6 @@ for i,d in enumerate (data):
|
|||
combined[u] = list (defaults)
|
||||
combined[u][i] = v
|
||||
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
|
||||
data = combined
|
||||
del combined
|
||||
|
||||
|
||||
property_names = [
|
||||
|
@ -254,8 +252,8 @@ def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
|
|||
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
|
||||
return UISC == Non_Joiner
|
||||
def is_OTHER(U, UISC, UDI, UGC, AJT):
|
||||
# Also includes BASE_IND, Rsv, and SYM
|
||||
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
|
||||
# Also includes BASE_IND and SYM
|
||||
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
|
||||
and not is_BASE(U, UISC, UDI, UGC, AJT)
|
||||
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
|
||||
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
||||
|
@ -278,10 +276,11 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
|
|||
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
|
||||
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
|
||||
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
|
||||
# Also includes Rsv
|
||||
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
|
||||
and UISC == Other
|
||||
and not is_CGJ(U, UISC, UDI, UGC, AJT)
|
||||
)
|
||||
) or UGC == Cn
|
||||
|
||||
use_mapping = {
|
||||
'B': is_BASE,
|
||||
|
@ -412,8 +411,7 @@ def map_to_use(data):
|
|||
out[U] = (USE, UBlock)
|
||||
return out
|
||||
|
||||
defaults = ('O', 'No_Block')
|
||||
data = map_to_use(data)
|
||||
use_data = map_to_use(combined)
|
||||
|
||||
print ("/* == Start of generated table == */")
|
||||
print ("/*")
|
||||
|
@ -439,7 +437,7 @@ print ()
|
|||
total = 0
|
||||
used = 0
|
||||
last_block = None
|
||||
def print_block (block, start, end, data):
|
||||
def print_block (block, start, end, use_data):
|
||||
global total, used, last_block
|
||||
if block and block != last_block:
|
||||
print ()
|
||||
|
@ -454,17 +452,23 @@ def print_block (block, start, end, data):
|
|||
if u % 16 == 0:
|
||||
print ()
|
||||
print (" /* %04X */" % u, end='')
|
||||
if u in data:
|
||||
if u in use_data:
|
||||
num += 1
|
||||
d = data.get (u, defaults)
|
||||
print ("%6s," % d[0], end='')
|
||||
d = use_data.get (u)
|
||||
if d is not None:
|
||||
d = d[0]
|
||||
elif u in unicode_data[4]:
|
||||
d = 'O'
|
||||
else:
|
||||
d = 'WJ'
|
||||
print ("%6s," % d, end='')
|
||||
|
||||
total += end - start + 1
|
||||
used += num
|
||||
if block:
|
||||
last_block = block
|
||||
|
||||
uu = sorted (data.keys ())
|
||||
uu = sorted (use_data.keys ())
|
||||
|
||||
last = -100000
|
||||
num = 0
|
||||
|
@ -487,19 +491,19 @@ print ("static const uint8_t use_table[] = {")
|
|||
for u in uu:
|
||||
if u <= last:
|
||||
continue
|
||||
if data[u][0] == 'O':
|
||||
if use_data[u][0] == 'O':
|
||||
continue
|
||||
block = data[u][1]
|
||||
block = use_data[u][1]
|
||||
|
||||
start = u//8*8
|
||||
end = start+1
|
||||
while end in uu and block == data[end][1]:
|
||||
while end in uu and block == use_data[end][1]:
|
||||
end += 1
|
||||
end = (end-1)//8*8 + 7
|
||||
|
||||
if start != last + 1:
|
||||
if start - last <= 1+16*3:
|
||||
print_block (None, last+1, start-1, data)
|
||||
print_block (None, last+1, start-1, use_data)
|
||||
else:
|
||||
if last >= 0:
|
||||
ends.append (last + 1)
|
||||
|
@ -509,7 +513,7 @@ for u in uu:
|
|||
print ("#define use_offset_0x%04xu %d" % (start, offset))
|
||||
starts.append (start)
|
||||
|
||||
print_block (block, start, end, data)
|
||||
print_block (block, start, end, use_data)
|
||||
last = end
|
||||
ends.append (last + 1)
|
||||
offset += ends[-1] - starts[-1]
|
||||
|
@ -520,8 +524,9 @@ page_bits = 12
|
|||
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
|
||||
print ()
|
||||
print ("static inline uint8_t")
|
||||
print ("hb_use_get_category (hb_codepoint_t u)")
|
||||
print ("hb_use_get_category (hb_glyph_info_t info)")
|
||||
print ("{")
|
||||
print (" hb_codepoint_t u = info.codepoint;")
|
||||
print (" switch (u >> %d)" % page_bits)
|
||||
print (" {")
|
||||
pages = set([u>>page_bits for u in starts+ends])
|
||||
|
@ -536,7 +541,9 @@ for p in sorted(pages):
|
|||
print (" default:")
|
||||
print (" break;")
|
||||
print (" }")
|
||||
print (" return USE(O);")
|
||||
print (" if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED)")
|
||||
print (" return WJ;")
|
||||
print (" return O;")
|
||||
print ("}")
|
||||
print ()
|
||||
for k in sorted(use_mapping.keys()):
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -206,7 +206,7 @@ setup_masks_use (const hb_ot_shape_plan_t *plan,
|
|||
unsigned int count = buffer->len;
|
||||
hb_glyph_info_t *info = buffer->info;
|
||||
for (unsigned int i = 0; i < count; i++)
|
||||
info[i].use_category() = hb_use_get_category (info[i].codepoint);
|
||||
info[i].use_category() = hb_use_get_category (info[i]);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
Loading…
Reference in New Issue