Update Arabic joining table to include Mandaic

Mandaic was added in Unicode 6.0, but the joining data was not updated.
The draft ArabicShaping.txt from 6.1 includes the joining data for Mandaic.
Use that.
This commit is contained in:
Behdad Esfahbod 2010-11-17 16:52:58 -05:00
parent 43bf2f7f1e
commit 14d784116b
2 changed files with 137 additions and 31 deletions

View File

@ -11,22 +11,39 @@ for line in sys.stdin:
fields = [x.strip() for x in line.split(';')] fields = [x.strip() for x in line.split(';')]
u = int(fields[0], 16) u = int(fields[0], 16)
if u < 0x0600 or (u > 0x07FF and u != 0x200C and u != 0x200D): if u == 0x200C or u == 0x200D:
continue
if u < 0x0600:
raise Exception ("Ooops, unexpected unicode character: ", fields) raise Exception ("Ooops, unexpected unicode character: ", fields)
dic[u] = fields dic[u] = fields
print " /*" v = dic.keys()
print " * The following table is generated by running:" v.sort()
print " *" min_u, max_u = v[0], v[-1]
print " * ./gen-arabic-joining-table.py < ArabicShaping.txt" occupancy = len(v) * 100 / (max_u - min_u + 1)
print " *"
print " * on the ArabicShaping.txt file with the header:" # Maintain at least 40% occupancy in the table
print " *" if occupancy < 40:
raise Exception ("Table too sparse, please investigate: ", occupancy)
print "/* == Start of generated table == */"
print "/*"
print " * The following table is generated by running:"
print " *"
print " * ./gen-arabic-joining-table.py < ArabicShaping.txt"
print " *"
print " * on the ArabicShaping.txt file with the header:"
print " *"
for line in header: for line in header:
print " * %s" % (line.strip()) print " * %s" % (line.strip())
print " */" print " */"
print " /* == Start of generated table == */"
for i in range(0x0600, 0x0800): print "#define JOINING_TABLE_FIRST 0x%04x" % min_u
print "#define JOINING_TABLE_LAST 0x%04x" % max_u
print "static const uint8_t joining_table[JOINING_TABLE_LAST-JOINING_TABLE_FIRST+2] ="
print "{"
for i in range(min_u, max_u + 1):
if i not in dic: if i not in dic:
print " JOINING_TYPE_X, /* %04X */" % i print " JOINING_TYPE_X, /* %04X */" % i
else: else:
@ -36,4 +53,6 @@ for i in range(0x0600, 0x0800):
else: else:
value = "JOINING_TYPE_" + entry[2] value = "JOINING_TYPE_" + entry[2]
print " %s, /* %s */" % (value, '; '.join(entry)) print " %s, /* %s */" % (value, '; '.join(entry))
print " /* == End of generated table == */" print " JOINING_TYPE_X /* dummy */"
print "};"
print "/* == End of generated table == */"

View File

@ -56,23 +56,21 @@ enum {
*/ */
/* == Start of generated table == */
/* /*
* Main joining-type table, covering U+0600..U+07FF. * The following table is generated by running:
* Includes Arabic, Syriac, and N'ko. *
* ./gen-arabic-joining-table.py < ArabicShaping.txt
*
* on the ArabicShaping.txt file with the header:
*
* # ArabicShaping-6.1.0.txt
* # Date: 2010-11-09, 12:10:00 PST [KW]
*/ */
static const uint8_t arabic_syriac_nko_joining_types[0x0800 - 0x0600 + 1] = #define JOINING_TABLE_FIRST 0x0600
#define JOINING_TABLE_LAST 0x0858
static const uint8_t joining_table[JOINING_TABLE_LAST-JOINING_TABLE_FIRST+2] =
{ {
/*
* The following table is generated by running:
*
* ./gen-arabic-joining-table.py < ArabicShaping.txt
*
* on the ArabicShaping.txt file with the header:
*
* # ArabicShaping-6.0.0.txt
* # Date: 2010-04-30, 13:47:00 PDT [KW]
*/
/* == Start of generated table == */
JOINING_TYPE_U, /* 0600; ARABIC NUMBER SIGN; U; No_Joining_Group */ JOINING_TYPE_U, /* 0600; ARABIC NUMBER SIGN; U; No_Joining_Group */
JOINING_TYPE_U, /* 0601; ARABIC SIGN SANAH; U; No_Joining_Group */ JOINING_TYPE_U, /* 0601; ARABIC SIGN SANAH; U; No_Joining_Group */
JOINING_TYPE_U, /* 0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group */ JOINING_TYPE_U, /* 0602; ARABIC FOOTNOTE MARKER; U; No_Joining_Group */
@ -585,16 +583,105 @@ static const uint8_t arabic_syriac_nko_joining_types[0x0800 - 0x0600 + 1] =
JOINING_TYPE_X, /* 07FD */ JOINING_TYPE_X, /* 07FD */
JOINING_TYPE_X, /* 07FE */ JOINING_TYPE_X, /* 07FE */
JOINING_TYPE_X, /* 07FF */ JOINING_TYPE_X, /* 07FF */
/* == End of generated table == */ JOINING_TYPE_X, /* 0800 */
JOINING_TYPE_X JOINING_TYPE_X, /* 0801 */
JOINING_TYPE_X, /* 0802 */
JOINING_TYPE_X, /* 0803 */
JOINING_TYPE_X, /* 0804 */
JOINING_TYPE_X, /* 0805 */
JOINING_TYPE_X, /* 0806 */
JOINING_TYPE_X, /* 0807 */
JOINING_TYPE_X, /* 0808 */
JOINING_TYPE_X, /* 0809 */
JOINING_TYPE_X, /* 080A */
JOINING_TYPE_X, /* 080B */
JOINING_TYPE_X, /* 080C */
JOINING_TYPE_X, /* 080D */
JOINING_TYPE_X, /* 080E */
JOINING_TYPE_X, /* 080F */
JOINING_TYPE_X, /* 0810 */
JOINING_TYPE_X, /* 0811 */
JOINING_TYPE_X, /* 0812 */
JOINING_TYPE_X, /* 0813 */
JOINING_TYPE_X, /* 0814 */
JOINING_TYPE_X, /* 0815 */
JOINING_TYPE_X, /* 0816 */
JOINING_TYPE_X, /* 0817 */
JOINING_TYPE_X, /* 0818 */
JOINING_TYPE_X, /* 0819 */
JOINING_TYPE_X, /* 081A */
JOINING_TYPE_X, /* 081B */
JOINING_TYPE_X, /* 081C */
JOINING_TYPE_X, /* 081D */
JOINING_TYPE_X, /* 081E */
JOINING_TYPE_X, /* 081F */
JOINING_TYPE_X, /* 0820 */
JOINING_TYPE_X, /* 0821 */
JOINING_TYPE_X, /* 0822 */
JOINING_TYPE_X, /* 0823 */
JOINING_TYPE_X, /* 0824 */
JOINING_TYPE_X, /* 0825 */
JOINING_TYPE_X, /* 0826 */
JOINING_TYPE_X, /* 0827 */
JOINING_TYPE_X, /* 0828 */
JOINING_TYPE_X, /* 0829 */
JOINING_TYPE_X, /* 082A */
JOINING_TYPE_X, /* 082B */
JOINING_TYPE_X, /* 082C */
JOINING_TYPE_X, /* 082D */
JOINING_TYPE_X, /* 082E */
JOINING_TYPE_X, /* 082F */
JOINING_TYPE_X, /* 0830 */
JOINING_TYPE_X, /* 0831 */
JOINING_TYPE_X, /* 0832 */
JOINING_TYPE_X, /* 0833 */
JOINING_TYPE_X, /* 0834 */
JOINING_TYPE_X, /* 0835 */
JOINING_TYPE_X, /* 0836 */
JOINING_TYPE_X, /* 0837 */
JOINING_TYPE_X, /* 0838 */
JOINING_TYPE_X, /* 0839 */
JOINING_TYPE_X, /* 083A */
JOINING_TYPE_X, /* 083B */
JOINING_TYPE_X, /* 083C */
JOINING_TYPE_X, /* 083D */
JOINING_TYPE_X, /* 083E */
JOINING_TYPE_X, /* 083F */
JOINING_TYPE_R, /* 0840; MANDAIC HALQA; R; No_Joining_Group */
JOINING_TYPE_D, /* 0841; MANDAIC AB; D; No_Joining_Group */
JOINING_TYPE_D, /* 0842; MANDAIC AG; D; No_Joining_Group */
JOINING_TYPE_D, /* 0843; MANDAIC AD; D; No_Joining_Group */
JOINING_TYPE_D, /* 0844; MANDAIC AH; D; No_Joining_Group */
JOINING_TYPE_D, /* 0845; MANDAIC USHENNA; D; No_Joining_Group */
JOINING_TYPE_R, /* 0846; MANDAIC AZ; R; No_Joining_Group */
JOINING_TYPE_D, /* 0847; MANDAIC IT; D; No_Joining_Group */
JOINING_TYPE_D, /* 0848; MANDAIC ATT; D; No_Joining_Group */
JOINING_TYPE_R, /* 0849; MANDAIC AKSA; R; No_Joining_Group */
JOINING_TYPE_D, /* 084A; MANDAIC AK; D; No_Joining_Group */
JOINING_TYPE_D, /* 084B; MANDAIC AL; D; No_Joining_Group */
JOINING_TYPE_D, /* 084C; MANDAIC AM; D; No_Joining_Group */
JOINING_TYPE_D, /* 084D; MANDAIC AN; D; No_Joining_Group */
JOINING_TYPE_D, /* 084E; MANDAIC AS; D; No_Joining_Group */
JOINING_TYPE_R, /* 084F; MANDAIC IN; R; No_Joining_Group */
JOINING_TYPE_D, /* 0850; MANDAIC AP; D; No_Joining_Group */
JOINING_TYPE_D, /* 0851; MANDAIC ASZ; D; No_Joining_Group */
JOINING_TYPE_D, /* 0852; MANDAIC AQ; D; No_Joining_Group */
JOINING_TYPE_D, /* 0853; MANDAIC AR; D; No_Joining_Group */
JOINING_TYPE_R, /* 0854; MANDAIC ASH; R; No_Joining_Group */
JOINING_TYPE_D, /* 0855; MANDAIC AT; D; No_Joining_Group */
JOINING_TYPE_U, /* 0856; MANDAIC DUSHENNA; U; No_Joining_Group */
JOINING_TYPE_U, /* 0857; MANDAIC KAD; U; No_Joining_Group */
JOINING_TYPE_U, /* 0858; MANDAIC AIN; U; No_Joining_Group */
JOINING_TYPE_X /* dummy */
}; };
/* == End of generated table == */
static unsigned int get_joining_type (hb_codepoint_t u, hb_category_t gen_cat) static unsigned int get_joining_type (hb_codepoint_t u, hb_category_t gen_cat)
{ {
/* TODO Macroize the magic bit operations */ /* TODO Macroize the magic bit operations */
if (likely ((u & ~(0x0600^0x07FF)) == 0x0600)) { if (likely (JOINING_TABLE_FIRST <= u && u <= JOINING_TABLE_LAST)) {
unsigned int j_type = arabic_syriac_nko_joining_types[u - 0x0600]; unsigned int j_type = joining_table[u - JOINING_TABLE_FIRST];
if (likely (j_type != JOINING_TYPE_X)) if (likely (j_type != JOINING_TYPE_X))
return j_type; return j_type;
} }