[indic] Misc harmless fixes!

First, we were abusing OT_VD instead of OT_A.  Fix that
by moving OT_A in the grammar where it belongs (which
is different from what the spec says).

Also, only allow medial consonants after all other
consonants.  This doesn't affect any current character.

Finally, fix Halant attachment in presence of medial
consonants.  Again, this currently doesn't affect any
sequence.

I lied.  There's Gurmukhi U+0A75 which is Consonant_Medial.
Uniscribe allows one of those in each of these positions:
before matras, after matras and before syllable modifiers,
and after syllable modifiers!  We currently just allow
unlimited numbers of it, before matras.
This commit is contained in:
Behdad Esfahbod 2013-10-16 19:06:29 +02:00
parent c52ddab72e
commit 3756efaf4e
3 changed files with 11 additions and 10 deletions

View File

@ -58,7 +58,7 @@ Ra = 16;
CM = 17;
Avag = 18;
c = (C | Ra)CM*; # is_consonant
c = (C | Ra); # is_consonant
n = ((ZWNJ?.RS)? (N.N?)?); # is_consonant_modifier
z = ZWJ|ZWNJ; # is_joiner
h = H | Coeng; # is_halant_or_coeng
@ -67,14 +67,14 @@ reph = (Ra H | Repha); # possible reph
cn = c.ZWJ?.n?;
forced_rakar = ZWJ H ZWJ Ra;
matra_group = z{0,3}.M.N?.(H | forced_rakar)?;
syllable_tail = (Coeng (cn|V))? (Avag.N?)? (SM.SM?.ZWNJ?)? (VD.VD?)?;
syllable_tail = (Coeng (cn|V))? (Avag.N?)? (SM.SM?.ZWNJ?)? (A.A?)? VD?;
place_holder = NBSP | DOTTEDCIRCLE;
halant_group = (z?.h.(ZWJ.N?)?);
final_halant_group = halant_group | h.ZWNJ;
halant_or_matra_group = (final_halant_group | (h.ZWJ)? matra_group{0,4});
halant_or_matra_group = (CM.CM* | final_halant_group | (h.ZWJ)? matra_group{0,4});
consonant_syllable = Repha? (cn.halant_group){0,4} cn A? halant_or_matra_group? syllable_tail;
consonant_syllable = Repha? (cn.halant_group){0,4} cn halant_or_matra_group? syllable_tail;
vowel_syllable = reph? V.n? (ZWJ | (halant_group.cn){0,4} halant_or_matra_group? syllable_tail);
standalone_cluster = reph? place_holder.n? (halant_group.cn){0,4} halant_or_matra_group? syllable_tail;
avagraha_cluster = Avag.N? (SM.ZWNJ?)? (VD VD?)?;

View File

@ -102,7 +102,7 @@ enum indic_syllabic_category_t {
INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER = OT_C,
INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER = OT_NBSP,
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_C,
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_REPHA = OT_Repha,
INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER = OT_X,
INDIC_SYLLABIC_CATEGORY_NUKTA = OT_N,

View File

@ -194,15 +194,15 @@ set_indic_properties (hb_glyph_info_t &info)
/* The spec says U+0952 is OT_A. However, testing shows that Uniscribe
* treats U+0951..U+0952 all as OT_VD.
* treats U+0951..U+0954 all similarly.
* TESTS:
* U+092E,U+0947,U+0952
* U+092E,U+0952,U+0947
* U+092E,U+0947,U+0951
* U+092E,U+0951,U+0947
* */
*/
if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0951, 0x0954)))
cat = OT_VD;
cat = OT_A;
if (unlikely (u == 0x17D1))
cat = OT_X;
@ -220,7 +220,7 @@ set_indic_properties (hb_glyph_info_t &info)
else if (unlikely (u == 0x200C)) cat = OT_ZWNJ;
else if (unlikely (u == 0x200D)) cat = OT_ZWJ;
else if (unlikely (u == 0x25CC)) cat = OT_DOTTEDCIRCLE;
else if (unlikely (u == 0x0A71)) cat = OT_SM; /* GURMUKHI ADDAK. More like consonant medial. like 0A75. */
else if (unlikely (u == 0x0A71)) cat = OT_SM; /* GURMUKHI ADDAK. Move it to the end. */
if (cat == OT_Repha) {
/* There are two kinds of characters marked as Repha:
@ -249,7 +249,7 @@ set_indic_properties (hb_glyph_info_t &info)
{
pos = matra_position (u, pos);
}
else if (cat == OT_SM || cat == OT_VD || cat == OT_Avag)
else if ((FLAG (cat) & (FLAG (OT_SM) | FLAG (OT_VD) | FLAG (OT_A) | FLAG (OT_Avag))))
{
pos = POS_SMVD;
}
@ -933,6 +933,7 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan,
for (unsigned int j = last_halant; j < i; j++)
if (info[j].indic_position() != POS_SMVD)
info[j].indic_position() = info[i].indic_position();
last_halant = end;
}
}