[khmer] Rewrite grammar completely

Based on experimenting with Uniscribe to extract grammar and categories.

Failures down from 44 to 35:

KHMER: 299089 out of 299124 tests passed. 35 failed (0.0117008%)

We still don't enforce the one-matra rule before decomposition, but we do
enforce a fixed order and one matra per position after decomposition.

https://github.com/harfbuzz/harfbuzz/issues/667
This commit is contained in:
Behdad Esfahbod 2018-10-01 19:09:58 +02:00
parent aaaa65baa7
commit 5143654716
5 changed files with 266 additions and 173 deletions

View File

@ -125,7 +125,7 @@ enum indic_syllabic_category_t {
INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA = OT_Repha, INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA = OT_Repha,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED = OT_X, /* Don't care. */ INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED = OT_X, /* Don't care. */
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM, INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_N, INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER = OT_CS, INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER = OT_CS,
INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK = OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */ INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK = OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */
INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_Coeng, INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_Coeng,

View File

@ -34,130 +34,200 @@
#line 36 "hb-ot-shape-complex-khmer-machine.hh" #line 36 "hb-ot-shape-complex-khmer-machine.hh"
static const unsigned char _khmer_syllable_machine_trans_keys[] = { static const unsigned char _khmer_syllable_machine_trans_keys[] = {
7u, 7u, 1u, 16u, 13u, 13u, 1u, 16u, 7u, 13u, 7u, 7u, 1u, 16u, 13u, 13u, 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u,
1u, 16u, 7u, 13u, 1u, 16u, 3u, 14u, 3u, 14u, 5u, 14u, 3u, 14u, 5u, 14u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u, 5u, 26u, 5u, 21u, 1u, 16u, 5u, 21u,
8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u, 3u, 14u, 5u, 14u, 5u, 26u, 5u, 21u, 5u, 26u, 5u, 21u, 5u, 26u, 1u, 16u, 1u, 29u, 5u, 29u,
3u, 14u, 5u, 14u, 8u, 8u, 3u, 13u, 3u, 8u, 8u, 8u, 3u, 8u, 3u, 14u, 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 5u, 29u, 5u, 26u,
3u, 14u, 7u, 13u, 7u, 7u, 1u, 16u, 0 5u, 29u, 5u, 29u, 22u, 22u, 5u, 22u, 5u, 29u, 5u, 29u, 1u, 16u, 5u, 29u,
5u, 29u, 0
}; };
static const char _khmer_syllable_machine_key_spans[] = { static const char _khmer_syllable_machine_key_spans[] = {
1, 16, 1, 16, 7, 1, 16, 1, 22, 17, 22, 17, 16, 17, 22, 17,
16, 7, 16, 12, 12, 10, 12, 10, 22, 17, 16, 17, 22, 17, 16, 17,
1, 11, 6, 1, 6, 12, 12, 10, 22, 17, 22, 17, 22, 16, 29, 25,
12, 10, 1, 11, 6, 1, 6, 12, 25, 25, 1, 18, 25, 25, 25, 22,
12, 7, 1, 16 25, 25, 1, 18, 25, 25, 16, 25,
25
}; };
static const short _khmer_syllable_machine_index_offsets[] = { static const short _khmer_syllable_machine_index_offsets[] = {
0, 2, 19, 21, 38, 46, 48, 65, 0, 23, 41, 64, 82, 99, 117, 140,
67, 84, 92, 109, 122, 135, 146, 159, 158, 181, 199, 216, 234, 257, 275, 292,
170, 172, 184, 191, 193, 200, 213, 226, 310, 333, 351, 374, 392, 415, 432, 462,
237, 250, 261, 263, 275, 282, 284, 291, 488, 514, 540, 542, 561, 587, 613, 639,
304, 317, 325, 327 662, 688, 714, 716, 735, 761, 787, 804,
830
}; };
static const char _khmer_syllable_machine_indicies[] = { static const char _khmer_syllable_machine_indicies[] = {
1, 0, 2, 2, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2,
3, 0, 0, 0, 0, 4, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3,
0, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 2, 0, 3, 0, 4, 4, 0, 0, 3, 0, 0, 0, 0, 4, 0,
5, 5, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 4, 0, 1, 0, 4, 0, 6, 6, 0, 0, 0, 0,
0, 0, 0, 0, 5, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0,
8, 8, 6, 6, 6, 6, 6, 6, 0, 6, 0, 7, 7, 0, 0, 0,
6, 6, 6, 6, 6, 6, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0,
6, 9, 6, 10, 10, 6, 6, 6, 0, 0, 0, 8, 0, 9, 9, 0,
6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0,
6, 6, 10, 6, 7, 6, 6, 6, 0, 0, 0, 0, 0, 10, 0, 0,
6, 6, 11, 6, 4, 4, 13, 12, 0, 0, 4, 0, 9, 9, 0, 0,
14, 15, 7, 16, 12, 12, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0,
11, 17, 12, 4, 12, 19, 18, 20, 0, 0, 0, 0, 10, 0, 11, 11,
21, 1, 22, 18, 18, 18, 18, 5, 0, 0, 0, 0, 0, 0, 0, 0,
23, 18, 24, 18, 21, 21, 1, 22, 0, 0, 0, 0, 0, 0, 12, 0,
18, 18, 18, 18, 18, 23, 18, 21, 0, 0, 0, 4, 0, 11, 11, 0,
21, 1, 22, 18, 18, 18, 18, 18, 0, 0, 0, 0, 0, 0, 0, 0,
23, 18, 25, 18, 21, 21, 1, 22, 0, 0, 0, 0, 0, 12, 0, 13,
18, 18, 18, 18, 18, 26, 18, 21, 13, 0, 0, 0, 0, 0, 0, 0,
21, 1, 22, 18, 18, 18, 18, 18, 0, 0, 0, 0, 0, 0, 13, 0,
26, 18, 27, 18, 28, 18, 29, 18, 15, 15, 14, 14, 14, 14, 14, 14,
18, 22, 18, 18, 18, 18, 3, 18, 14, 14, 14, 14, 14, 14, 14, 14,
30, 18, 18, 18, 18, 22, 18, 22, 16, 14, 15, 15, 17, 17, 17, 17,
18, 28, 18, 18, 18, 18, 22, 18, 17, 17, 17, 17, 17, 17, 17, 17,
19, 18, 21, 21, 1, 22, 18, 18, 17, 17, 16, 17, 17, 17, 17, 18,
18, 18, 18, 23, 18, 32, 31, 33, 17, 19, 19, 17, 17, 17, 17, 17,
33, 7, 16, 31, 31, 31, 31, 31, 17, 17, 17, 17, 17, 17, 17, 17,
34, 31, 33, 33, 7, 16, 31, 31, 17, 18, 17, 20, 20, 17, 17, 17,
31, 31, 31, 34, 31, 35, 31, 33, 17, 17, 17, 17, 17, 17, 17, 17,
33, 7, 16, 31, 31, 31, 31, 31, 17, 17, 20, 17, 21, 21, 17, 17,
36, 31, 33, 33, 7, 16, 31, 31, 17, 17, 17, 17, 17, 17, 17, 17,
31, 31, 31, 36, 31, 37, 31, 38, 17, 17, 17, 17, 22, 17, 23, 23,
31, 39, 31, 31, 16, 31, 31, 31, 17, 17, 17, 17, 17, 17, 17, 17,
31, 9, 31, 40, 31, 31, 31, 31, 17, 17, 17, 17, 17, 17, 24, 17,
16, 31, 16, 31, 38, 31, 31, 31, 17, 17, 17, 18, 17, 23, 23, 17,
31, 16, 31, 13, 31, 41, 33, 7, 17, 17, 17, 17, 17, 17, 17, 17,
16, 31, 31, 31, 31, 11, 34, 31, 17, 17, 17, 17, 17, 24, 17, 25,
13, 31, 33, 33, 7, 16, 31, 31, 25, 17, 17, 17, 17, 17, 17, 17,
31, 31, 31, 34, 31, 7, 42, 42, 17, 17, 17, 17, 17, 17, 17, 26,
42, 42, 42, 11, 42, 7, 42, 10, 17, 17, 17, 17, 18, 17, 25, 25,
10, 42, 42, 42, 42, 42, 42, 42, 17, 17, 17, 17, 17, 17, 17, 17,
42, 42, 42, 42, 42, 42, 10, 42, 17, 17, 17, 17, 17, 17, 26, 17,
15, 15, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 27,
16, 17, 17, 17, 17, 18, 17, 28,
28, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 28, 17,
13, 13, 29, 29, 30, 30, 29, 29,
29, 29, 2, 2, 29, 31, 29, 13,
29, 29, 29, 29, 16, 20, 29, 29,
29, 18, 24, 26, 22, 29, 33, 33,
32, 32, 32, 32, 32, 32, 32, 34,
32, 32, 32, 32, 32, 2, 3, 6,
32, 32, 32, 4, 10, 12, 8, 32,
35, 35, 32, 32, 32, 32, 32, 32,
32, 36, 32, 32, 32, 32, 32, 32,
3, 6, 32, 32, 32, 4, 10, 12,
8, 32, 5, 5, 32, 32, 32, 32,
32, 32, 32, 36, 32, 32, 32, 32,
32, 32, 4, 6, 32, 32, 32, 32,
32, 32, 8, 32, 6, 32, 7, 7,
32, 32, 32, 32, 32, 32, 32, 36,
32, 32, 32, 32, 32, 32, 8, 6,
32, 37, 37, 32, 32, 32, 32, 32,
32, 32, 36, 32, 32, 32, 32, 32,
32, 10, 6, 32, 32, 32, 4, 32,
32, 8, 32, 38, 38, 32, 32, 32,
32, 32, 32, 32, 36, 32, 32, 32,
32, 32, 32, 12, 6, 32, 32, 32,
4, 10, 32, 8, 32, 35, 35, 32,
32, 32, 32, 32, 32, 32, 34, 32,
32, 32, 32, 32, 32, 3, 6, 32,
32, 32, 4, 10, 12, 8, 32, 15,
15, 39, 39, 39, 39, 39, 39, 39,
39, 39, 39, 39, 39, 39, 39, 16,
39, 39, 39, 39, 18, 39, 41, 41,
40, 40, 40, 40, 40, 40, 40, 42,
40, 40, 40, 40, 40, 40, 16, 20,
40, 40, 40, 18, 24, 26, 22, 40,
19, 19, 40, 40, 40, 40, 40, 40,
40, 42, 40, 40, 40, 40, 40, 40,
18, 20, 40, 40, 40, 40, 40, 40,
22, 40, 20, 40, 21, 21, 40, 40,
40, 40, 40, 40, 40, 42, 40, 40,
40, 40, 40, 40, 22, 20, 40, 43,
43, 40, 40, 40, 40, 40, 40, 40,
42, 40, 40, 40, 40, 40, 40, 24,
20, 40, 40, 40, 18, 40, 40, 22,
40, 44, 44, 40, 40, 40, 40, 40,
40, 40, 42, 40, 40, 40, 40, 40,
40, 26, 20, 40, 40, 40, 18, 24,
40, 22, 40, 28, 28, 39, 39, 39,
39, 39, 39, 39, 39, 39, 39, 39,
39, 39, 28, 39, 45, 45, 40, 40,
40, 40, 40, 40, 40, 46, 40, 40,
40, 40, 40, 27, 16, 20, 40, 40,
40, 18, 24, 26, 22, 40, 41, 41,
40, 40, 40, 40, 40, 40, 40, 46,
40, 40, 40, 40, 40, 40, 16, 20,
40, 40, 40, 18, 24, 26, 22, 40,
0 0
}; };
static const char _khmer_syllable_machine_trans_targs[] = { static const char _khmer_syllable_machine_trans_targs[] = {
10, 14, 17, 20, 11, 21, 10, 24, 22, 1, 30, 24, 25, 3, 26, 5,
27, 30, 31, 32, 10, 22, 33, 34, 27, 7, 28, 9, 29, 23, 22, 11,
26, 35, 10, 12, 4, 0, 16, 3, 32, 22, 33, 13, 34, 15, 35, 17,
13, 15, 1, 10, 18, 2, 19, 10, 36, 19, 37, 40, 39, 22, 31, 38,
23, 5, 8, 25, 6, 10, 28, 7, 22, 0, 10, 2, 4, 6, 8, 22,
29, 9, 10 22, 12, 14, 16, 18, 20, 21
}; };
static const char _khmer_syllable_machine_trans_actions[] = { static const char _khmer_syllable_machine_trans_actions[] = {
1, 2, 2, 0, 2, 2, 3, 2, 1, 0, 2, 2, 2, 0, 0, 0,
2, 0, 2, 2, 6, 2, 0, 0, 2, 0, 2, 0, 2, 2, 3, 0,
0, 0, 7, 2, 0, 0, 0, 0, 4, 5, 2, 0, 0, 0, 2, 0,
2, 2, 0, 8, 0, 0, 0, 9, 2, 0, 2, 4, 4, 8, 9, 0,
2, 0, 0, 2, 0, 10, 0, 0, 10, 0, 0, 0, 0, 0, 0, 11,
0, 0, 11 12, 0, 0, 0, 0, 0, 0
}; };
static const char _khmer_syllable_machine_to_state_actions[] = { static const char _khmer_syllable_machine_to_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 6, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0 0
}; };
static const char _khmer_syllable_machine_from_state_actions[] = { static const char _khmer_syllable_machine_from_state_actions[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 7, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0 0
}; };
static const unsigned char _khmer_syllable_machine_eof_trans[] = { static const unsigned char _khmer_syllable_machine_eof_trans[] = {
1, 1, 1, 1, 1, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1,
7, 7, 0, 19, 19, 19, 19, 19, 1, 1, 1, 15, 18, 18, 18, 18,
19, 19, 19, 19, 19, 19, 32, 32, 18, 18, 18, 18, 18, 18, 0, 33,
32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 40,
32, 43, 43, 43 41, 41, 41, 41, 41, 41, 40, 41,
41
}; };
static const int khmer_syllable_machine_start = 10; static const int khmer_syllable_machine_start = 22;
static const int khmer_syllable_machine_first_final = 10; static const int khmer_syllable_machine_first_final = 22;
static const int khmer_syllable_machine_error = -1; static const int khmer_syllable_machine_error = -1;
static const int khmer_syllable_machine_en_main = 10; static const int khmer_syllable_machine_en_main = 22;
#line 36 "hb-ot-shape-complex-khmer-machine.rl" #line 36 "hb-ot-shape-complex-khmer-machine.rl"
#line 74 "hb-ot-shape-complex-khmer-machine.rl" #line 80 "hb-ot-shape-complex-khmer-machine.rl"
#define found_syllable(syllable_type) \ #define found_syllable(syllable_type) \
@ -177,7 +247,7 @@ find_syllables (hb_buffer_t *buffer)
int cs; int cs;
hb_glyph_info_t *info = buffer->info; hb_glyph_info_t *info = buffer->info;
#line 181 "hb-ot-shape-complex-khmer-machine.hh" #line 251 "hb-ot-shape-complex-khmer-machine.hh"
{ {
cs = khmer_syllable_machine_start; cs = khmer_syllable_machine_start;
ts = 0; ts = 0;
@ -185,7 +255,7 @@ find_syllables (hb_buffer_t *buffer)
act = 0; act = 0;
} }
#line 95 "hb-ot-shape-complex-khmer-machine.rl" #line 101 "hb-ot-shape-complex-khmer-machine.rl"
p = 0; p = 0;
@ -194,7 +264,7 @@ find_syllables (hb_buffer_t *buffer)
unsigned int last = 0; unsigned int last = 0;
unsigned int syllable_serial = 1; unsigned int syllable_serial = 1;
#line 198 "hb-ot-shape-complex-khmer-machine.hh" #line 268 "hb-ot-shape-complex-khmer-machine.hh"
{ {
int _slen; int _slen;
int _trans; int _trans;
@ -204,11 +274,11 @@ find_syllables (hb_buffer_t *buffer)
goto _test_eof; goto _test_eof;
_resume: _resume:
switch ( _khmer_syllable_machine_from_state_actions[cs] ) { switch ( _khmer_syllable_machine_from_state_actions[cs] ) {
case 5: case 7:
#line 1 "NONE" #line 1 "NONE"
{ts = p;} {ts = p;}
break; break;
#line 212 "hb-ot-shape-complex-khmer-machine.hh" #line 282 "hb-ot-shape-complex-khmer-machine.hh"
} }
_keys = _khmer_syllable_machine_trans_keys + (cs<<1); _keys = _khmer_syllable_machine_trans_keys + (cs<<1);
@ -231,47 +301,63 @@ _eof_trans:
{te = p+1;} {te = p+1;}
break; break;
case 8: case 8:
#line 68 "hb-ot-shape-complex-khmer-machine.rl" #line 76 "hb-ot-shape-complex-khmer-machine.rl"
{te = p+1;{ found_syllable (consonant_syllable); }}
break;
case 10:
#line 69 "hb-ot-shape-complex-khmer-machine.rl"
{te = p+1;{ found_syllable (broken_cluster); }}
break;
case 6:
#line 70 "hb-ot-shape-complex-khmer-machine.rl"
{te = p+1;{ found_syllable (non_khmer_cluster); }} {te = p+1;{ found_syllable (non_khmer_cluster); }}
break; break;
case 7: case 10:
#line 68 "hb-ot-shape-complex-khmer-machine.rl" #line 74 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (consonant_syllable); }} {te = p;p--;{ found_syllable (consonant_syllable); }}
break; break;
case 9: case 12:
#line 69 "hb-ot-shape-complex-khmer-machine.rl" #line 75 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (broken_cluster); }} {te = p;p--;{ found_syllable (broken_cluster); }}
break; break;
case 11: case 11:
#line 70 "hb-ot-shape-complex-khmer-machine.rl" #line 76 "hb-ot-shape-complex-khmer-machine.rl"
{te = p;p--;{ found_syllable (non_khmer_cluster); }} {te = p;p--;{ found_syllable (non_khmer_cluster); }}
break; break;
case 1: case 1:
#line 68 "hb-ot-shape-complex-khmer-machine.rl" #line 74 "hb-ot-shape-complex-khmer-machine.rl"
{{p = ((te))-1;}{ found_syllable (consonant_syllable); }} {{p = ((te))-1;}{ found_syllable (consonant_syllable); }}
break; break;
case 3: case 5:
#line 69 "hb-ot-shape-complex-khmer-machine.rl" #line 75 "hb-ot-shape-complex-khmer-machine.rl"
{{p = ((te))-1;}{ found_syllable (broken_cluster); }} {{p = ((te))-1;}{ found_syllable (broken_cluster); }}
break; break;
#line 266 "hb-ot-shape-complex-khmer-machine.hh" case 3:
#line 1 "NONE"
{ switch( act ) {
case 2:
{{p = ((te))-1;} found_syllable (broken_cluster); }
break;
case 3:
{{p = ((te))-1;} found_syllable (non_khmer_cluster); }
break;
}
}
break;
case 4:
#line 1 "NONE"
{te = p+1;}
#line 75 "hb-ot-shape-complex-khmer-machine.rl"
{act = 2;}
break;
case 9:
#line 1 "NONE"
{te = p+1;}
#line 76 "hb-ot-shape-complex-khmer-machine.rl"
{act = 3;}
break;
#line 352 "hb-ot-shape-complex-khmer-machine.hh"
} }
_again: _again:
switch ( _khmer_syllable_machine_to_state_actions[cs] ) { switch ( _khmer_syllable_machine_to_state_actions[cs] ) {
case 4: case 6:
#line 1 "NONE" #line 1 "NONE"
{ts = 0;} {ts = 0;}
break; break;
#line 275 "hb-ot-shape-complex-khmer-machine.hh" #line 361 "hb-ot-shape-complex-khmer-machine.hh"
} }
if ( ++p != pe ) if ( ++p != pe )
@ -287,7 +373,7 @@ _again:
} }
#line 104 "hb-ot-shape-complex-khmer-machine.rl" #line 110 "hb-ot-shape-complex-khmer-machine.rl"
} }

View File

@ -40,28 +40,34 @@
# Same order as enum khmer_category_t. Not sure how to avoid duplication. # Same order as enum khmer_category_t. Not sure how to avoid duplication.
C = 1; C = 1;
V = 2; V = 2;
N = 3;
ZWNJ = 5; ZWNJ = 5;
ZWJ = 6; ZWJ = 6;
M = 7;
SM = 8;
PLACEHOLDER = 11; PLACEHOLDER = 11;
DOTTEDCIRCLE = 12; DOTTEDCIRCLE = 12;
RS = 13; Coeng= 14;
Coeng = 14; Ra = 16;
Ra = 16; Robatic = 20;
Xgroup = 21;
Ygroup = 22;
VAbv = 26;
VBlw = 27;
VPre = 28;
VPst = 29;
c = (C | Ra | V); # is_consonant c = (C | Ra | V);
n = ((ZWNJ?.RS)? (N.N?)?); # is_consonant_modifier cn = c.((ZWJ|ZWNJ)?.Robatic)?;
z = ZWJ|ZWNJ; # is_joiner joiner = (ZWJ | ZWNJ);
xgroup = (joiner*.Xgroup)*;
ygroup = Ygroup*;
cn = c.n?; # This grammar was experimentally extracted from what Uniscribe allows.
matra_group = z?.M.N?;
syllable_tail = (SM.SM?)?; matra_group = VPre? xgroup VBlw? xgroup (joiner?.VAbv)? xgroup VPst?;
syllable_tail = xgroup matra_group xgroup (Coeng.c)? ygroup;
broken_cluster = n? (Coeng.cn)* matra_group* (Coeng.cn)? syllable_tail; broken_cluster = (Coeng.cn)* syllable_tail;
consonant_syllable = (c|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster; consonant_syllable = (cn|PLACEHOLDER|DOTTEDCIRCLE) broken_cluster;
other = any; other = any;
main := |* main := |*

View File

@ -241,7 +241,6 @@ setup_masks_khmer (const hb_ot_shape_plan_t *plan HB_UNUSED,
hb_font_t *font HB_UNUSED) hb_font_t *font HB_UNUSED)
{ {
HB_BUFFER_ALLOCATE_VAR (buffer, khmer_category); HB_BUFFER_ALLOCATE_VAR (buffer, khmer_category);
HB_BUFFER_ALLOCATE_VAR (buffer, khmer_position);
/* We cannot setup masks here. We save information about characters /* We cannot setup masks here. We save information about characters
* and setup masks later on in a pause-callback. */ * and setup masks later on in a pause-callback. */
@ -330,7 +329,7 @@ reorder_consonant_syllable (const hb_ot_shape_plan_t *plan,
} }
/* Reorder left matra piece. */ /* Reorder left matra piece. */
else if (info[i].khmer_position() == POS_PRE_M) else if (info[i].khmer_category() == OT_VPre)
{ {
/* Move to the start. */ /* Move to the start. */
buffer->merge_clusters (start, i + 1); buffer->merge_clusters (start, i + 1);
@ -432,7 +431,6 @@ reorder (const hb_ot_shape_plan_t *plan,
initial_reordering_syllable (plan, font->face, buffer, start, end); initial_reordering_syllable (plan, font->face, buffer, start, end);
HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category); HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_category);
HB_BUFFER_DEALLOCATE_VAR (buffer, khmer_position);
} }
static void static void

View File

@ -34,30 +34,22 @@
/* buffer var allocations */ /* buffer var allocations */
#define khmer_category() indic_category() /* khmer_category_t */ #define khmer_category() indic_category() /* khmer_category_t */
#define khmer_position() indic_position() /* khmer_position_t */ #define khmer_position() indic_position() /* indic_position_t */
typedef indic_category_t khmer_category_t; /* Note: This enum is duplicated in the -machine.rl source file.
typedef indic_position_t khmer_position_t; * Not sure how to avoid duplication. */
enum khmer_category_t
static inline khmer_position_t
matra_position_khmer (khmer_position_t side)
{ {
switch ((int) side) OT_Robatic = 20,
{ OT_Xgroup = 21,
case POS_PRE_C: OT_Ygroup = 22,
return POS_PRE_M;
case POS_POST_C: OT_VAbv = 26,
case POS_ABOVE_C: OT_VBlw = 27,
case POS_BELOW_C: OT_VPre = 28,
return POS_AFTER_POST; OT_VPst = 29,
};
default:
return side;
};
}
static inline void static inline void
set_khmer_properties (hb_glyph_info_t &info) set_khmer_properties (hb_glyph_info_t &info)
@ -65,47 +57,58 @@ set_khmer_properties (hb_glyph_info_t &info)
hb_codepoint_t u = info.codepoint; hb_codepoint_t u = info.codepoint;
unsigned int type = hb_indic_get_categories (u); unsigned int type = hb_indic_get_categories (u);
khmer_category_t cat = (khmer_category_t) (type & 0x7Fu); khmer_category_t cat = (khmer_category_t) (type & 0x7Fu);
khmer_position_t pos = (khmer_position_t) (type >> 8); indic_position_t pos = (indic_position_t) (type >> 8);
/* /*
* Re-assign category * Re-assign category
*
* These categories are experimentally extracted from what Uniscribe allows.
*/ */
switch (u)
if (unlikely (u == 0x17C6u)) cat = OT_N; /* Khmer Bindu doesn't like to be repositioned. */
else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CDu, 0x17D1u) ||
u == 0x17CBu || u == 0x17D3u || u == 0x17DDu)) /* Khmer Various signs */
{ {
/* These can occur mid-syllable (eg. before matras), even though Unicode marks them as Syllable_Modifier. case 0x179Au:
* https://github.com/roozbehp/unicode-data/issues/5 */ cat = (khmer_category_t) OT_Ra;
cat = OT_M; break;
pos = POS_ABOVE_C;
}
else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u))) cat = OT_PLACEHOLDER;
else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;
case 0x17CCu:
case 0x17C9u:
case 0x17CAu:
cat = OT_Robatic;
break;
case 0x17C6u:
case 0x17CBu:
case 0x17CDu:
case 0x17CEu:
case 0x17CFu:
case 0x17D0u:
case 0x17D1u:
cat = OT_Xgroup;
break;
case 0x17C7u:
case 0x17C8u:
case 0x17DDu:
case 0x17D3u: /* Just guessing. Uniscribe doesn't categorize it. */
cat = OT_Ygroup;
break;
}
/* /*
* Re-assign position. * Re-assign position.
*/ */
if (cat == (khmer_category_t) OT_M)
if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS)) switch ((int) pos)
{ {
pos = POS_BASE_C; case POS_PRE_C: cat = OT_VPre; break;
if (u == 0x179Au) case POS_BELOW_C: cat = OT_VBlw; break;
cat = OT_Ra; case POS_ABOVE_C: cat = OT_VAbv; break;
} case POS_POST_C: cat = OT_VPst; break;
else if (cat == OT_M) default: assert (0);
{ };
pos = matra_position_khmer (pos);
}
else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) | FLAG (OT_A) | FLAG (OT_Symbol))))
{
pos = POS_SMVD;
}
info.khmer_category() = cat; info.khmer_category() = cat;
info.khmer_position() = pos;
} }