[USE] Treat all gc=Cn as independent clusters

This commit is contained in:
David Corbett 2022-03-06 12:26:37 -05:00 committed by Behdad Esfahbod
parent e497a8f142
commit c33468d48e
3 changed files with 290 additions and 280 deletions

View File

@ -39,7 +39,7 @@ for j in range(7, 9):
headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])
data = [{} for _ in files]
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
for line in f:
@ -73,27 +73,27 @@ for i, f in enumerate (files):
i0 = i if i < 7 else i - 7
for u in range (start, end + 1):
data[i0][u] = t
unicode_data[i0][u] = t
values[i0][t] = values[i0].get (t, 0) + end - start + 1
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')
# TODO Characters that are not in Unicode Indic files, but used in USE
data[0][0x1B61] = defaults[0]
data[0][0x1B63] = defaults[0]
data[0][0x1B64] = defaults[0]
data[0][0x1B65] = defaults[0]
data[0][0x1B66] = defaults[0]
data[0][0x1B67] = defaults[0]
data[0][0x1B69] = defaults[0]
data[0][0x1B6A] = defaults[0]
data[0][0x2060] = defaults[0]
unicode_data[0][0x1B61] = defaults[0]
unicode_data[0][0x1B63] = defaults[0]
unicode_data[0][0x1B64] = defaults[0]
unicode_data[0][0x1B65] = defaults[0]
unicode_data[0][0x1B66] = defaults[0]
unicode_data[0][0x1B67] = defaults[0]
unicode_data[0][0x1B69] = defaults[0]
unicode_data[0][0x1B6A] = defaults[0]
unicode_data[0][0x2060] = defaults[0]
# Merge data into one dict:
for i,v in enumerate (defaults):
values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
for i,d in enumerate (unicode_data):
for u,v in d.items ():
if not u in combined:
if i >= 4:
@ -101,8 +101,6 @@ for i,d in enumerate (data):
combined[u] = list (defaults)
combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
data = combined
del combined
property_names = [
@ -254,8 +252,8 @@ def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
# ZWNJ predicate: true exactly when UISC equals Non_Joiner.
# U, UDI, UGC and AJT are not read here; the parameter list matches the
# other is_* predicates (e.g. is_OTHER, is_Word_Joiner).
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
# Also includes BASE_IND, Rsv, and SYM
return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
# Also includes BASE_IND and SYM
return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
and not is_BASE(U, UISC, UDI, UGC, AJT)
and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
and not is_CGJ(U, UISC, UDI, UGC, AJT)
@ -278,10 +276,11 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
# Also includes Rsv
return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
and UISC == Other
and not is_CGJ(U, UISC, UDI, UGC, AJT)
)
) or UGC == Cn
use_mapping = {
'B': is_BASE,
@ -412,8 +411,7 @@ def map_to_use(data):
out[U] = (USE, UBlock)
return out
defaults = ('O', 'No_Block')
data = map_to_use(data)
use_data = map_to_use(combined)
print ("/* == Start of generated table == */")
print ("/*")
@ -439,7 +437,7 @@ print ()
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
def print_block (block, start, end, use_data):
global total, used, last_block
if block and block != last_block:
print ()
@ -454,17 +452,23 @@ def print_block (block, start, end, data):
if u % 16 == 0:
print ()
print (" /* %04X */" % u, end='')
if u in data:
if u in use_data:
num += 1
d = data.get (u, defaults)
print ("%6s," % d[0], end='')
d = use_data.get (u)
if d is not None:
d = d[0]
elif u in unicode_data[4]:
d = 'O'
else:
d = 'WJ'
print ("%6s," % d, end='')
total += end - start + 1
used += num
if block:
last_block = block
uu = sorted (data.keys ())
uu = sorted (use_data.keys ())
last = -100000
num = 0
@ -487,19 +491,19 @@ print ("static const uint8_t use_table[] = {")
for u in uu:
if u <= last:
continue
if data[u][0] == 'O':
if use_data[u][0] == 'O':
continue
block = data[u][1]
block = use_data[u][1]
start = u//8*8
end = start+1
while end in uu and block == data[end][1]:
while end in uu and block == use_data[end][1]:
end += 1
end = (end-1)//8*8 + 7
if start != last + 1:
if start - last <= 1+16*3:
print_block (None, last+1, start-1, data)
print_block (None, last+1, start-1, use_data)
else:
if last >= 0:
ends.append (last + 1)
@ -509,7 +513,7 @@ for u in uu:
print ("#define use_offset_0x%04xu %d" % (start, offset))
starts.append (start)
print_block (block, start, end, data)
print_block (block, start, end, use_data)
last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
@ -520,8 +524,9 @@ page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
print ("static inline uint8_t")
print ("hb_use_get_category (hb_codepoint_t u)")
print ("hb_use_get_category (hb_glyph_info_t info)")
print ("{")
print (" hb_codepoint_t u = info.codepoint;")
print (" switch (u >> %d)" % page_bits)
print (" {")
pages = set([u>>page_bits for u in starts+ends])
@ -536,7 +541,9 @@ for p in sorted(pages):
print (" default:")
print (" break;")
print (" }")
print (" return USE(O);")
print (" if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED)")
print (" return WJ;")
print (" return O;")
print ("}")
print ()
for k in sorted(use_mapping.keys()):

File diff suppressed because it is too large. (Load Diff)

View File

@ -206,7 +206,7 @@ setup_masks_use (const hb_ot_shape_plan_t *plan,
unsigned int count = buffer->len;
hb_glyph_info_t *info = buffer->info;
for (unsigned int i = 0; i < count; i++)
info[i].use_category() = hb_use_get_category (info[i].codepoint);
info[i].use_category() = hb_use_get_category (info[i]);
}
static void