[USE] Treat all gc=Cn as independent clusters

2022-03-06 12:26:37 -05:00 · 2022-03-06 12:26:37 -05:00 · c33468d48e
parent e497a8f142
commit c33468d48e
3 changed files with 290 additions and 280 deletions
--- a/src/gen-use-table.py
+++ b/src/gen-use-table.py
@ -39,7 +39,7 @@ for j in range(7, 9):
 		headers[j - 1].append(line)
 headers.append (["UnicodeData.txt does not have a header."])

-data = [{} for _ in files]
+unicode_data = [{} for _ in files]
 values = [{} for _ in files]
 for i, f in enumerate (files):
 	for line in f:
@ -73,27 +73,27 @@ for i, f in enumerate (files):

 		i0 = i if i < 7 else i - 7
 		for u in range (start, end + 1):
-			data[i0][u] = t
+			unicode_data[i0][u] = t
 		values[i0][t] = values[i0].get (t, 0) + end - start + 1

 defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

 # TODO Characters that are not in Unicode Indic files, but used in USE
-data[0][0x1B61] = defaults[0]
-data[0][0x1B63] = defaults[0]
-data[0][0x1B64] = defaults[0]
-data[0][0x1B65] = defaults[0]
-data[0][0x1B66] = defaults[0]
-data[0][0x1B67] = defaults[0]
-data[0][0x1B69] = defaults[0]
-data[0][0x1B6A] = defaults[0]
-data[0][0x2060] = defaults[0]
+unicode_data[0][0x1B61] = defaults[0]
+unicode_data[0][0x1B63] = defaults[0]
+unicode_data[0][0x1B64] = defaults[0]
+unicode_data[0][0x1B65] = defaults[0]
+unicode_data[0][0x1B66] = defaults[0]
+unicode_data[0][0x1B67] = defaults[0]
+unicode_data[0][0x1B69] = defaults[0]
+unicode_data[0][0x1B6A] = defaults[0]
+unicode_data[0][0x2060] = defaults[0]

 # Merge data into one dict:
 for i,v in enumerate (defaults):
 	values[i][v] = values[i].get (v, 0) + 1
 combined = {}
-for i,d in enumerate (data):
+for i,d in enumerate (unicode_data):
 	for u,v in d.items ():
 		if not u in combined:
 			if i >= 4:
@ -101,8 +101,6 @@ for i,d in enumerate (data):
 			combined[u] = list (defaults)
 		combined[u][i] = v
 combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
-data = combined
-del combined


 property_names = [
@ -254,8 +252,8 @@ def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
 def is_ZWNJ(U, UISC, UDI, UGC, AJT):
 	return UISC == Non_Joiner
 def is_OTHER(U, UISC, UDI, UGC, AJT):
-	# Also includes BASE_IND, Rsv, and SYM
-	return ((UGC in [Cn, Po] or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
+	# Also includes BASE_IND and SYM
+	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
 		and not is_BASE(U, UISC, UDI, UGC, AJT)
 		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
 		and not is_CGJ(U, UISC, UDI, UGC, AJT)
@ -278,10 +276,11 @@ def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
 	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
 		(UGC != Lo and (UISC == Bindu or U in [0xAA29])))
 def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
+	# Also includes Rsv
 	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
 		and UISC == Other
 		and not is_CGJ(U, UISC, UDI, UGC, AJT)
-	)
+	) or UGC == Cn

 use_mapping = {
 	'B':	is_BASE,
@ -412,8 +411,7 @@ def map_to_use(data):
 		out[U] = (USE, UBlock)
 	return out

-defaults = ('O', 'No_Block')
-data = map_to_use(data)
+use_data = map_to_use(combined)

 print ("/* == Start of generated table == */")
 print ("/*")
@ -439,7 +437,7 @@ print ()
 total = 0
 used = 0
 last_block = None
-def print_block (block, start, end, data):
+def print_block (block, start, end, use_data):
 	global total, used, last_block
 	if block and block != last_block:
 		print ()
@ -454,17 +452,23 @@ def print_block (block, start, end, data):
 		if u % 16 == 0:
 			print ()
 			print ("  /* %04X */" % u, end='')
-		if u in data:
+		if u in use_data:
 			num += 1
-		d = data.get (u, defaults)
-		print ("%6s," % d[0], end='')
+		d = use_data.get (u)
+		if d is not None:
+			d = d[0]
+		elif u in unicode_data[4]:
+			d = 'O'
+		else:
+			d = 'WJ'
+		print ("%6s," % d, end='')

 	total += end - start + 1
 	used += num
 	if block:
 		last_block = block

-uu = sorted (data.keys ())
+uu = sorted (use_data.keys ())

 last = -100000
 num = 0
@ -487,19 +491,19 @@ print ("static const uint8_t use_table[] = {")
 for u in uu:
 	if u <= last:
 		continue
-	if data[u][0] == 'O':
+	if use_data[u][0] == 'O':
 		continue
-	block = data[u][1]
+	block = use_data[u][1]

 	start = u//8*8
 	end = start+1
-	while end in uu and block == data[end][1]:
+	while end in uu and block == use_data[end][1]:
 		end += 1
 	end = (end-1)//8*8 + 7

 	if start != last + 1:
 		if start - last <= 1+16*3:
-			print_block (None, last+1, start-1, data)
+			print_block (None, last+1, start-1, use_data)
 		else:
 			if last >= 0:
 				ends.append (last + 1)
@ -509,7 +513,7 @@ for u in uu:
 			print ("#define use_offset_0x%04xu %d" % (start, offset))
 			starts.append (start)

-	print_block (block, start, end, data)
+	print_block (block, start, end, use_data)
 	last = end
 ends.append (last + 1)
 offset += ends[-1] - starts[-1]
@ -520,8 +524,9 @@ page_bits = 12
 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
 print ()
 print ("static inline uint8_t")
-print ("hb_use_get_category (hb_codepoint_t u)")
+print ("hb_use_get_category (hb_glyph_info_t info)")
 print ("{")
+print ("  hb_codepoint_t u = info.codepoint;")
 print ("  switch (u >> %d)" % page_bits)
 print ("  {")
 pages = set([u>>page_bits for u in starts+ends])
@ -536,7 +541,9 @@ for p in sorted(pages):
 print ("    default:")
 print ("      break;")
 print ("  }")
-print ("  return USE(O);")
+print ("  if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED)")
+print ("    return WJ;")
+print ("  return O;")
 print ("}")
 print ()
 for k in sorted(use_mapping.keys()):
--- a/src/hb-ot-shape-complex-use-table.hh
+++ b/src/hb-ot-shape-complex-use-table.hh
--- a/src/hb-ot-shape-complex-use.cc
+++ b/src/hb-ot-shape-complex-use.cc
@ -206,7 +206,7 @@ setup_masks_use (const hb_ot_shape_plan_t *plan,
  unsigned int count = buffer->len;
  hb_glyph_info_t *info = buffer->info;
  for (unsigned int i = 0; i < count; i++)
-    info[i].use_category() = hb_use_get_category (info[i].codepoint);
+    info[i].use_category() = hb_use_get_category (info[i]);
 }

 static void