From 3ca5fbda76098cf74a9ba0c55feea611e48b0b5c Mon Sep 17 00:00:00 2001 From: David Corbett Date: Wed, 21 Sep 2022 18:13:17 -0400 Subject: [PATCH] [USE] Update the data files This uses the data files from and closes #3817. --- src/gen-use-table.py | 23 +- src/hb-ot-shaper-use-table.hh | 120 ++++++----- src/hb-ot-shaper-vowel-constraints.cc | 34 +++ .../IndicPositionalCategory-Additional.txt | 15 +- src/ms-use/IndicShapingInvalidCluster.txt | 198 +++++++++--------- .../IndicSyllabicCategory-Additional.txt | 91 ++++++-- 6 files changed, 284 insertions(+), 197 deletions(-) diff --git a/src/gen-use-table.py b/src/gen-use-table.py index 5e6a3b02f..0600a3388 100755 --- a/src/gen-use-table.py +++ b/src/gen-use-table.py @@ -134,6 +134,7 @@ property_names = [ 'Number_Joiner', 'Number', 'Brahmi_Joining_Number', + 'Symbol_Modifier', 'Hieroglyph', 'Hieroglyph_Joiner', 'Hieroglyph_Segment_Begin', @@ -214,8 +215,7 @@ def is_CONS_MED(U, UISC, UDI, UGC, AJT): return (UISC == Consonant_Medial and UGC != Lo or UISC == Consonant_Initial_Postfixed) def is_CONS_MOD(U, UISC, UDI, UGC, AJT): - return (UISC in [Nukta, Gemination_Mark, Consonant_Killer] and - not is_SYM_MOD(U, UISC, UDI, UGC, AJT)) + return UISC in [Nukta, Gemination_Mark, Consonant_Killer] def is_CONS_SUB(U, UISC, UDI, UGC, AJT): return UISC == Consonant_Subjoined and UGC != Lo def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT): @@ -257,7 +257,7 @@ def is_SAKOT(U, UISC, UDI, UGC, AJT): # Split off of HALANT return U == 0x1A60 def is_SYM_MOD(U, UISC, UDI, UGC, AJT): - return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73] + return UISC == Symbol_Modifier def is_VOWEL(U, UISC, UDI, UGC, AJT): return (UISC == Pure_Killer or UGC != Lo and UISC in [Vowel, Vowel_Dependent]) @@ -359,9 +359,6 @@ def map_to_use(data): # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent - # TODO: https://github.com/harfbuzz/harfbuzz/pull/627 - if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom - # TODO: U+1CED should only be allowed after some of # the nasalization marks, maybe only for U+1CE9..U+1CF1. if U == 0x1CED: UISC = Tone_Mark @@ -372,23 +369,9 @@ def map_to_use(data): # Resolve Indic_Positional_Category - # TODO: These should die, but have UIPC in Unicode 13.0.0 - if U in [0x953, 0x954]: UIPC = Not_Applicable - - # TODO: These are not in USE's override list that we have, nor are they in Unicode 13.0.0 - if 0xA926 <= U <= 0xA92A: UIPC = Top # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037 # and https://github.com/harfbuzz/harfbuzz/issues/1631 if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top - if 0x1CF8 <= U <= 0x1CF9: UIPC = Top - - # TODO: https://github.com/harfbuzz/harfbuzz/issues/3550 - if U == 0x10A38: UIPC = Bottom - - # TODO: https://github.com/harfbuzz/harfbuzz/pull/982 - # also https://github.com/harfbuzz/harfbuzz/issues/1012 - if 0x1112A <= U <= 0x1112B: UIPC = Top - if 0x11131 <= U <= 0x11132: UIPC = Top assert (UIPC in [Not_Applicable, Visual_Order_Left] or U == 0x0F7F or USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT) diff --git a/src/hb-ot-shaper-use-table.hh b/src/hb-ot-shaper-use-table.hh index 6395d689a..9833fb55d 100644 --- a/src/hb-ot-shaper-use-table.hh +++ b/src/hb-ot-shaper-use-table.hh @@ -25,6 +25,7 @@ * # Updated for Unicode 12.1 by Andrew Glass 2019-05-24 * # Updated for Unicode 13.0 by Andrew Glass 2020-07-28 * # Updated for Unicode 14.0 by Andrew Glass 2021-09-25 + * # Updated for Unicode 15.0 by Andrew Glass 2022-09-16 * # Override values For Indic_Positional_Category * # Not derivable * # Initial version based on Unicode 7.0 by Andrew Glass 2014-03-17 @@ -34,6 +35,7 @@ * # Updated for Unicode 12.1 by Andrew Glass 2019-05-30 * # Updated for Unicode 13.0 by Andrew Glass 2020-07-28 * # Updated for Unicode 14.0 by Andrew Glass 2021-09-28 + * # Updated for Unicode 15.0 by Andrew Glass 2022-09-16 * UnicodeData.txt does not have a header. */ @@ -90,7 +92,7 @@ #pragma GCC diagnostic pop static const uint8_t -hb_use_u8[3115] = +hb_use_u8[3141] = { 16, 50, 51, 51, 51, 52, 51, 83, 118, 131, 51, 57, 58, 179, 195, 61, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, @@ -125,11 +127,11 @@ hb_use_u8[3115] = 2, 2, 2, 2, 2, 2, 2, 2, 2, 88, 89, 2, 2, 2, 2, 2, 2, 2, 2, 90, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 91, 2, 2, 92, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 93, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 94, 94, 95, 96, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, - 94, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 2, 2, 2, 91, 2, 2, 92, 2, 2, 2, 93, 2, 2, 2, 2, 2, + 2, 2, 2, 94, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 95, 95, 96, 97, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, + 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, + 95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0, 0, 0, 0, @@ -226,70 +228,72 @@ hb_use_u8[3115] = 0, 9, 47, 2, 2, 2, 2, 2, 2, 2, 2, 2, 125, 18, 20, 151, 20, 19, 152, 153, 2, 2, 2, 2, 2, 0, 0, 63, 154, 0, 0, 0, 0, 2, 11, 0, 0, 0, 0, 0, 0, 2, 63, 23, 18, 18, 18, 20, - 20, 106, 155, 0, 0, 156, 157, 29, 158, 28, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 21, 17, 20, 20, 159, 42, 0, 0, 0, + 20, 106, 155, 0, 0, 54, 156, 29, 157, 28, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 21, 17, 20, 20, 158, 42, 0, 0, 0, 47, 125, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 7, 7, 2, 2, 28, 2, 2, 2, 2, 2, 2, 2, 28, 2, 2, 2, 2, 2, 2, 2, - 8, 16, 17, 19, 20, 160, 29, 0, 0, 9, 9, 28, 2, 2, 2, 7, + 8, 16, 17, 19, 20, 159, 29, 0, 0, 9, 9, 28, 2, 2, 2, 7, 28, 7, 2, 28, 2, 2, 56, 15, 21, 14, 21, 45, 30, 31, 30, 32, 0, 0, 0, 0, 33, 0, 0, 0, 2, 2, 21, 0, 9, 9, 9, 44, 0, 9, 9, 44, 0, 0, 0, 0, 0, 2, 2, 63, 23, 18, 18, 18, 20, 21, 123, 13, 15, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, - 161, 162, 0, 0, 0, 0, 0, 0, 0, 16, 17, 18, 18, 64, 97, 23, - 158, 9, 163, 7, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, - 63, 23, 18, 18, 0, 46, 46, 9, 164, 35, 0, 0, 0, 0, 0, 0, + 160, 161, 0, 0, 0, 0, 0, 0, 0, 16, 17, 18, 18, 64, 97, 23, + 157, 9, 162, 7, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, + 63, 23, 18, 18, 0, 46, 46, 9, 163, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 18, 0, 21, 17, 18, 18, 19, 14, 80, - 164, 36, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 8, 165, - 23, 18, 20, 20, 163, 7, 0, 0, 0, 2, 2, 2, 2, 2, 7, 41, + 163, 36, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 8, 164, + 23, 18, 20, 20, 162, 7, 0, 0, 0, 2, 2, 2, 2, 2, 7, 41, 133, 21, 20, 18, 74, 19, 20, 0, 0, 2, 2, 2, 7, 0, 0, 0, - 0, 2, 2, 2, 2, 2, 2, 16, 17, 18, 19, 20, 103, 164, 35, 0, + 0, 2, 2, 2, 2, 2, 2, 16, 17, 18, 19, 20, 103, 163, 35, 0, 0, 2, 2, 2, 7, 28, 0, 2, 2, 2, 2, 28, 7, 2, 2, 2, - 2, 21, 21, 16, 30, 31, 10, 166, 167, 168, 169, 0, 0, 0, 0, 0, + 2, 21, 21, 16, 30, 31, 10, 165, 166, 167, 168, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 63, 23, 18, 18, 0, 20, 21, 27, 106, 0, 31, 0, 0, 0, 0, 0, 50, 18, 20, 20, 20, 137, 2, - 2, 2, 170, 171, 9, 13, 172, 70, 173, 0, 0, 1, 144, 0, 0, 0, - 0, 50, 18, 20, 14, 17, 18, 2, 2, 2, 2, 155, 155, 155, 174, 174, - 174, 174, 174, 174, 13, 175, 0, 28, 0, 20, 18, 18, 29, 20, 20, 9, - 164, 0, 59, 59, 59, 59, 59, 59, 59, 64, 19, 80, 44, 0, 0, 0, + 2, 2, 169, 170, 9, 13, 171, 70, 172, 0, 0, 1, 144, 0, 0, 0, + 0, 50, 18, 20, 14, 17, 18, 2, 2, 2, 2, 155, 155, 155, 173, 173, + 173, 173, 173, 173, 13, 174, 0, 28, 0, 20, 18, 18, 29, 20, 20, 9, + 163, 0, 59, 59, 59, 59, 59, 59, 59, 64, 19, 80, 44, 0, 0, 0, 0, 2, 2, 2, 7, 2, 28, 2, 2, 50, 20, 20, 29, 0, 36, 20, - 25, 9, 157, 176, 172, 0, 0, 0, 0, 2, 2, 2, 28, 7, 2, 2, + 25, 9, 156, 175, 171, 0, 0, 0, 0, 2, 2, 2, 28, 7, 2, 2, 2, 2, 2, 2, 2, 2, 21, 21, 45, 20, 33, 80, 66, 0, 0, 0, - 0, 2, 177, 64, 45, 0, 0, 0, 0, 9, 178, 2, 2, 2, 2, 2, + 0, 2, 176, 64, 45, 0, 0, 0, 0, 9, 177, 2, 2, 2, 2, 2, 2, 2, 2, 21, 20, 18, 29, 0, 46, 14, 140, 0, 0, 0, 0, 0, - 0, 179, 179, 179, 106, 7, 0, 0, 0, 9, 9, 9, 44, 0, 0, 0, - 0, 2, 2, 2, 2, 2, 7, 0, 56, 180, 18, 18, 18, 18, 18, 18, + 0, 178, 178, 178, 106, 179, 178, 0, 0, 145, 2, 2, 180, 114, 114, 114, + 114, 114, 114, 114, 0, 0, 0, 0, 0, 9, 9, 9, 44, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 7, 0, 56, 181, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 0, 0, 0, 38, 114, 24, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 56, 35, 0, 4, 118, 118, 118, 119, 0, 0, 9, 9, 9, 47, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, - 44, 2, 2, 2, 2, 2, 2, 9, 9, 2, 2, 42, 42, 42, 90, 0, - 0, O, O, O, GB, B, B, GB, O, O, WJ,FMPst,FMPst, O, CGJ, B, - O, B,VMAbv,VMAbv,VMAbv, O,VMAbv, B,CMBlw,CMBlw,CMBlw,VMAbv,VMPst, VAbv, VPst,CMBlw, - B, VPst, VPre, VPst, VBlw, VBlw, VBlw, VBlw, VAbv, VAbv, VAbv, VPst, VPst, VPst, H, VPre, - VPst,VMBlw, O, O, VAbv, GB,VMAbv,VMPst,VMPst, O, B, VBlw, O, O, VPre, VPre, - O, VPre, H, O, VPst,FMAbv, O,CMBlw, O, VAbv, O, VAbv, H, O,VMBlw,VMAbv, - CMAbv, GB, GB, O, MBlw,CMAbv,CMAbv, VPst, VAbv,VMAbv, O, VPst, O, VPre, VPre,VMAbv, - B, O, CS, CS,VMPst, B, VAbv, VAbv, B, R, O, HVM, O, O, FBlw, O, - CMAbv, O,CMBlw, VAbv, VBlw, B, SUB, SUB, SUB, O, SUB, SUB, O, FBlw, O, B, - VPst, VBlw, VPre,VMAbv,VMBlw,VMPst, IS, VAbv, MPst, MPre, MBlw, MBlw, B, MBlw, MBlw, VPst, - VMPst,VMPst, B, MBlw, VPst, VPre, VAbv, VAbv,VMPst,VMPst,VMBlw, B,VMPst, VBlw, VPst, CGJ, - CGJ, VPst,VMAbv,VMAbv,FMAbv, FAbv,CMAbv,FMAbv,VMAbv,FMAbv, VAbv, IS,FMAbv, B,FMAbv, B, - CGJ, WJ, CGJ, GB,CMAbv,CMAbv, B, GB, B, VAbv, SUB, FPst, FPst,VMBlw, FPst, FPst, - FBlw,VMAbv,FMBlw, VAbv, VPre, B, MPre, MBlw, SUB, FAbv, FAbv, MAbv, SUB, Sk, VPst, VAbv, - VMAbv,VMAbv, FAbv,CMAbv, VPst, H, B, O,SMAbv,SMBlw,SMAbv,SMAbv,SMAbv, VPst, IS, VBlw, - FAbv,VMPre,VMPre,FMAbv,CMBlw,VMBlw,VMBlw,VMAbv, CS, O,FMAbv, ZWNJ, CGJ, WJ, WJ, WJ, - O,FMPst, O, O, H, MPst, VPst, H,VMAbv, VAbv,VMBlw, B, VBlw, FPst, VPst, FAbv, - VMPst, B,CMAbv, VAbv, MBlw, MPst, MBlw, H, O, VBlw, MPst, MPre, MAbv, MBlw, O, B, - FAbv, FAbv, FPst, VBlw, B, B, VPre, O,VMPst, IS, O,VMPst, VBlw, VPst,VMBlw,VMBlw, - VMAbv, O, IS,VMBlw, B,VMPst,VMAbv,VMPst, CS, CS, B, N, N, O, HN, VPre, - VBlw, VAbv, IS,CMAbv, O, VPst, B, R, R, O,FMBlw,CMBlw, VAbv, VPre,VMAbv,VMAbv, - H, VAbv,CMBlw,FMAbv, B, CS, CS, H,CMBlw,VMPst, H,VMPst, VAbv,VMAbv, VPst, IS, - R, MPst, R, MPst,CMBlw, B,FMBlw, VBlw,VMAbv, R, MBlw, MBlw, GB, FBlw, FBlw,CMAbv, - IS, VBlw, IS, GB, VAbv, R,VMPst, H, H, O, VBlw, + 44, 2, 2, 2, 2, 2, 2, 9, 9, 2, 2, 2, 2, 2, 2, 20, + 20, 2, 2, 42, 42, 42, 90, 0, 0, O, O, O, GB, B, B, GB, + O, O, WJ,FMPst,FMPst, O, CGJ, B, O, B,VMAbv,VMAbv,VMAbv, O,VMAbv, B, + CMBlw,CMBlw,CMBlw,VMAbv,VMPst, VAbv, VPst,CMBlw, B, VPst, VPre, VPst, VBlw, VBlw, VBlw, VBlw, + VAbv, VAbv, VAbv, VPst, VPst, VPst, H, VPre, VPst,VMBlw, O, O, VAbv, GB,VMAbv,VMPst, + VMPst, O, B, VBlw, O, O, VPre, VPre, O, VPre, H, O, VPst,FMAbv, O,CMBlw, + O, VAbv, O, VAbv, H, O,VMBlw,VMAbv,CMAbv, GB, GB, O, MBlw,CMAbv,CMAbv, VPst, + VAbv,VMAbv, O, VPst, O, VPre, VPre,VMAbv, B, O, CS, CS,VMPst, B, VAbv, VAbv, + B, R, O, HVM, O, O,FMBlw, O,CMAbv, O,CMBlw, VAbv, VBlw, B, SUB, SUB, + SUB, O, SUB, SUB, O,FMBlw, O, B, VPst, VBlw, VPre,VMAbv,VMBlw,VMPst, IS, VAbv, + MPst, MPre, MBlw, MBlw, B, MBlw, MBlw, VPst,VMPst,VMPst, B, MBlw, VPst, VPre, VAbv, VAbv, + VMPst,VMPst,VMBlw, B,VMPst, VBlw, VPst, CGJ, CGJ, VPst,VMAbv,VMAbv,FMAbv, FAbv,CMAbv,FMAbv, + VMAbv,FMAbv, VAbv, IS,FMAbv, B,FMAbv, B, CGJ, WJ, CGJ, GB,CMAbv,CMAbv, B, GB, + B, VAbv, SUB, FPst, FPst,VMBlw, FPst, FPst, FBlw,VMAbv,FMBlw, VAbv, VPre, B, MPre, MBlw, + SUB, FAbv, FAbv, MAbv, SUB, Sk, VPst, VAbv,VMAbv,VMAbv, FAbv,CMAbv, VPst, H, B, O, + SMAbv,SMBlw,SMAbv,SMAbv,SMAbv, VPst, IS, VBlw, FAbv,VMPre,VMPre,FMAbv,CMBlw,VMBlw,VMBlw,VMAbv, + CS, O,FMAbv, ZWNJ, CGJ, WJ, WJ, WJ, O,FMPst, O, O, H, MPst, VPst, H, + VMAbv, VAbv,VMBlw, B, VBlw, FPst, VPst, FAbv,VMPst, B,CMAbv, VAbv, MBlw, MPst, MBlw, H, + O, VBlw, MPst, MPre, MAbv, MBlw, O, B, FAbv, FAbv, FPst, VBlw, B, B, VPre, O, + VMPst, IS, O,VMPst, VBlw, VPst,VMBlw,VMBlw,VMAbv, O, IS,VMBlw, B,VMPst,VMAbv,VMPst, + CS, CS, B, N, N, O, HN, VPre, VBlw, VAbv, IS,CMAbv, O, VPst, B, R, + R,CMBlw, VAbv, VPre,VMAbv,VMAbv, H, VAbv,CMBlw,FMAbv, B, CS, CS, H,CMBlw,VMPst, + H,VMPst, VAbv,VMAbv, VPst, IS, R, MPst, R, MPst,CMBlw, B,FMBlw, VBlw,VMAbv, R, + MBlw, MBlw, GB, FBlw, FBlw,CMAbv, IS, VBlw, IS, GB, VAbv, R,VMPst, H, H, B, + H, B,VMBlw, O, VBlw, }; static const uint16_t -hb_use_u16[776] = +hb_use_u16[784] = { 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 3, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, @@ -332,14 +336,14 @@ hb_use_u16[776] = 9,242, 73,243, 0, 0, 0, 0,244, 9, 9,245,246, 2,247, 9, 248,249, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,250, 251, 48, 9,252,253, 2, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 98,254, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, - 9, 9, 9,255, 0, 0, 0, 0, 9, 9, 9, 9,256,257,258,258, - 259,260, 0, 0, 0, 0,261, 0, 9, 9, 9, 9, 9,262, 0, 0, - 9, 9, 9, 9, 9, 9,105, 70, 94,263, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0,264, 9, 9, 70,265,266, 0, 0, 0, - 0, 9,267, 0, 9, 9,268, 2, 9, 9, 9, 9,269, 2, 0, 0, - 129,129,129,129,129,129,129,129,160,160,160,160,160,160,160,160, - 160,160,160,160,160,160,160,129, + 9, 9, 9,254,255,256, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, + 9, 9, 9,257, 0, 0, 0, 0, 9, 9, 9, 9,258,259,260,260, + 261,262, 0, 0, 0, 0,263, 0, 9, 9, 9, 9, 9,264, 0, 0, + 9, 9, 9, 9, 9, 9,105, 70, 94,265, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0,266, 9, 9, 70,267,268, 0, 0, 0, + 0, 9,269, 0, 9, 9,270, 2, 0, 0, 0, 0, 0, 9,271, 2, + 9, 9, 9, 9,272, 2, 0, 0,129,129,129,129,129,129,129,129, + 160,160,160,160,160,160,160,160,160,160,160,160,160,160,160,129, }; static inline unsigned @@ -350,7 +354,7 @@ hb_use_b4 (const uint8_t* a, unsigned i) static inline uint_fast8_t hb_use_get_category (unsigned u) { - return u<921600u?hb_use_u8[2753+(((hb_use_u8[593+(((hb_use_u16[((hb_use_u8[113+(((hb_use_b4(hb_use_u8,u>>1>>3>>3>>5))<<5)+((u>>1>>3>>3)&31u))])<<3)+((u>>1>>3)&7u)])<<3)+((u>>1)&7u))])<<1)+((u)&1u))]:O; + return u<921600u?hb_use_u8[2777+(((hb_use_u8[593+(((hb_use_u16[((hb_use_u8[113+(((hb_use_b4(hb_use_u8,u>>1>>3>>3>>5))<<5)+((u>>1>>3>>3)&31u))])<<3)+((u>>1>>3)&7u)])<<3)+((u>>1)&7u))])<<1)+((u)&1u))]:O; } #undef B diff --git a/src/hb-ot-shaper-vowel-constraints.cc b/src/hb-ot-shaper-vowel-constraints.cc index cb4db4a8b..e76b554b0 100644 --- a/src/hb-ot-shaper-vowel-constraints.cc +++ b/src/hb-ot-shaper-vowel-constraints.cc @@ -342,6 +342,40 @@ _hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED, } break; + case HB_SCRIPT_KHOJKI: + for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) + { + bool matched = false; + switch (buffer->cur ().codepoint) + { + case 0x11200u: + switch (buffer->cur (1).codepoint) + { + case 0x1122Cu: case 0x11231u: case 0x11233u: + matched = true; + break; + } + break; + case 0x11206u: + matched = 0x1122Cu == buffer->cur (1).codepoint; + break; + case 0x1122Cu: + switch (buffer->cur (1).codepoint) + { + case 0x11230u: case 0x11231u: + matched = true; + break; + } + break; + case 0x11240u: + matched = 0x1122Eu == buffer->cur (1).codepoint; + break; + } + (void) buffer->next_glyph (); + if (matched) _output_with_dotted_circle (buffer); + } + break; + case HB_SCRIPT_KHUDAWADI: for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;) { diff --git a/src/ms-use/IndicPositionalCategory-Additional.txt b/src/ms-use/IndicPositionalCategory-Additional.txt index 83a164e49..884fee590 100644 --- a/src/ms-use/IndicPositionalCategory-Additional.txt +++ b/src/ms-use/IndicPositionalCategory-Additional.txt @@ -7,6 +7,7 @@ # Updated for Unicode 12.1 by Andrew Glass 2019-05-30 # Updated for Unicode 13.0 by Andrew Glass 2020-07-28 # Updated for Unicode 14.0 by Andrew Glass 2021-09-28 +# Updated for Unicode 15.0 by Andrew Glass 2022-09-16 # ================================================ # ================================================ @@ -19,9 +20,12 @@ 0F7A..0F7D ; Bottom # Mn [4] TIBETAN VOWEL SIGN E..TIBETAN VOWEL SIGN OO # Not really below, but need to override to fit into Universal model 0F80 ; Bottom # Mn TIBETAN VOWEL SIGN REVERSED I # Not really below, but need to override to fit into Universal model A9BF ; Bottom # Mc JAVANESE CONSONANT SIGN CAKRA +10A38 ; Bottom # Mn KHAROSHTHI SIGN BAR ABOVE # Overriden, ccc controls order USE issue #26 11127..11129 ; Bottom # Mn [3] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN II 1112D ; Bottom # Mn CHAKMA VOWEL SIGN AI 11130 ; Bottom # Mn CHAKMA VOWEL SIGN OI +1BF2..1BF3 ; Bottom # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN # see USE issue #20 + # ================================================ @@ -42,12 +46,15 @@ A9BE ; Right # Mc JAVANESE CONSONANT SIGN PENGKAL # Reduced from 0F74 ; Top # Mn TIBETAN VOWEL SIGN U # Not really above, but need to override to fit into Universal model 1A18 ; Top # Mn BUGINESE VOWEL SIGN U # Workaround to allow below to occur before above by treating all below marks as above AA35   ; Top # Mn       CHAM CONSONANT SIGN +1112A..1112B ; Top # Mn [2] CHAKMA VOWEL SIGN U..CHAKMA VOWEL SIGN UU # see USE issue #25 +11131..11132 ; Top # Mn [2] CHAKMA O MARK..CHAKMA AU MARK # see USE issue #25 +1E4EC..1E4EF ; Top # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH # 1E4EE is below, but made to for ccc # ================================================ # Indic_Positional_Category=Top_And_Right -0E33 ; Top_And_Right # Lo THAI CHARACTER SARA AM # IMC has Right, which seems to be a mistake. -0EB3 ; Top_And_Right # Lo LAO VOWEL SIGN AM # IMC has Right, which seems to be a mistake. +0E33 ; Top_And_Right # Lo THAI CHARACTER SARA AM # IPC has Right, which seems to be a mistake. +0EB3 ; Top_And_Right # Lo LAO VOWEL SIGN AM # IPC has Right, which seems to be a mistake. # ================================================ # ================================================ @@ -72,6 +79,9 @@ AA35   ; Top # Mn       CHAM CONSONANT SIGN 16F4F ; Bottom # Mn MIAO SIGN CONSONANT MODIFIER BAR 16F51..16F87 ; Bottom # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI 16F8F..16F92 ; Bottom # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW +#HIEROGLYPHS defined here while ISC is being used as a proxy for dedicated Hieroglyph cluster +13440 ; Bottom # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY +13447..13455 ; Bottom # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED # ================================================ @@ -84,6 +94,7 @@ AA35   ; Top # Mn       CHAM CONSONANT SIGN 07EB..07F3 ; Top # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE 07FD ; Top # Mn NKO DANTAYALAN # Not really top, but assigned here to allow ccc to control mark order 1885..1886 ; Top # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA +1CF8..1CF9 ; Top # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 10D24..10D27 ; Top # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI 10EAB..10EAC ; Top # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 16B30..16B36 ; Top # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM diff --git a/src/ms-use/IndicShapingInvalidCluster.txt b/src/ms-use/IndicShapingInvalidCluster.txt index c4efe1472..9e0edd34c 100644 --- a/src/ms-use/IndicShapingInvalidCluster.txt +++ b/src/ms-use/IndicShapingInvalidCluster.txt @@ -8,98 +8,106 @@ # # Scope: This file enumerates sequences of characters that should be treated as invalid clusters - 0905 0946 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT E - 0905 093E ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AA - 0930 094D 0907 ; # DEVANAGARI LETTER RA, DEVANAGARI SIGN VIRAMA, DEVANAGARI LETTER I - 0909 0941 ; # DEVANAGARI LETTER U, DEVANAGARI VOWEL SIGN U - 090F 0945 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN CANDRA E - 090F 0946 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN SHORT E - 090F 0947 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN E - 0905 0949 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA O - 0906 0945 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN CANDRA E - 0905 094A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT O - 0906 0946 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN SHORT E - 0905 094B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN O - 0906 0947 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN E - 0905 094C ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AU - 0906 0948 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN AI - 0905 0945 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA E - 0905 093A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OE - 0905 093B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OOE - 0906 093A ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN OE - 0905 094F ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AW - 0905 0956 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UE - 0905 0957 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UUE - 0985 09BE ; # BENGALI LETTER A, BENGALI VOWEL SIGN AA - 098B 09C3 ; # BENGALI LETTER VOCALIC R, BENGALI VOWEL SIGN VOCALIC R - 098C 09E2 ; # BENGALI LETTER VOCALIC L, BENGALI VOWEL SIGN VOCALIC L - 0A05 0A3E ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AA - 0A72 0A3F ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN I - 0A72 0A40 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN II - 0A73 0A41 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN U - 0A73 0A42 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN UU - 0A72 0A47 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN EE - 0A05 0A48 ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AI - 0A73 0A4B ; # GURMUKHI URA, GURMUKHI VOWEL SIGN OO - 0A05 0A4C ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AU - 0A85 0ABE ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA - 0A85 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA E - 0A85 0AC7 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN E - 0A85 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AI - 0A85 0AC9 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA O - 0A85 0ACB ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN O - 0A85 0ABE 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN CANDRA E - 0A85 0ACC ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AU - 0A85 0ABE 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN AI - 0AC5 0ABE ; # GUJARATI VOWEL SIGN CANDRA E, GUJARATI VOWEL SIGN AA - 0B05 0B3E ; # ORIYA LETTER A, ORIYA VOWEL SIGN AA - 0B0F 0B57 ; # ORIYA LETTER E, ORIYA AU LENGTH MARK - 0B13 0B57 ; # ORIYA LETTER O, ORIYA AU LENGTH MARK - 0B85 0BC2 ; # TAMIL LETTER A, TAMIL VOWEL SIGN UU - 0C12 0C55 ; # TELUGU LETTER O, TELUGU LENGTH MARK - 0C12 0C4C ; # TELUGU LETTER O, TELUGU VOWEL SIGN AU - 0C3F 0C55 ; # TELUGU VOWEL SIGN I, TELUGU LENGTH MARK - 0C46 0C55 ; # TELUGU VOWEL SIGN E, TELUGU LENGTH MARK - 0C4A 0C55 ; # TELUGU VOWEL SIGN O, TELUGU LENGTH MARK - 0C89 0CBE ; # KANNADA LETTER U, KANNADA VOWEL SIGN AA - 0C92 0CCC ; # KANNADA LETTER O, KANNADA VOWEL SIGN AU - 0C8B 0CBE ; # KANNADA LETTER VOCALIC R, KANNADA VOWEL SIGN AA - 0D07 0D57 ; # MALAYALAM LETTER I, MALAYALAM AU LENGTH MARK - 0D09 0D57 ; # MALAYALAM LETTER U, MALAYALAM AU LENGTH MARK - 0D0E 0D46 ; # MALAYALAM LETTER E, MALAYALAM VOWEL SIGN E - 0D12 0D3E ; # MALAYALAM LETTER O, MALAYALAM VOWEL SIGN AA - 0D12 0D57 ; # MALAYALAM LETTER O, MALAYALAM AU LENGTH MARK - 0D85 0DCF ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN AELA-PILLA - 0D85 0DD0 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN KETTI AEDA-PILLA - 0D85 0DD1 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN DIGA AEDA-PILLA - 0D8B 0DDF ; # SINHALA LETTER UYANNA, SINHALA VOWEL SIGN GAYANUKITTA - 0D8D 0DD8 ; # SINHALA LETTER IRUYANNA, SINHALA VOWEL SIGN GAETTA-PILLA - 0D8F 0DDF ; # SINHALA LETTER ILUYANNA, SINHALA VOWEL SIGN GAYANUKITTA - 0D91 0DCA ; # SINHALA LETTER EYANNA, SINHALA SIGN AL-LAKUNA - 0D91 0DD9 ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA - 0D91 0DDA ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN DIGA KOMBUVA - 0D91 0DDC ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA - 0D91 0DDD ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA - 0D91 0DDE ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA - 0D94 0DDF ; # SINHALA LETTER OYANNA, SINHALA VOWEL SIGN GAYANUKITTA - 11005 11038 ; # BRAHMI LETTER A, BRAHMI VOWEL SIGN AA - 1100B 1103E ; # BRAHMI LETTER VOCALIC R, BRAHMI VOWEL SIGN VOCALIC R - 1100F 11042 ; # BRAHMI LETTER E, BRAHMI VOWEL SIGN E - 11680 116AD ; # TAKRI LETTER A, TAKRI VOWEL SIGN AA - 11686 116B2 ; # TAKRI LETTER E, TAKRI VOWEL SIGN E - 11680 116B4 ; # TAKRI LETTER A, TAKRI VOWEL SIGN O - 11680 116B5 ; # TAKRI LETTER A, TAKRI VOWEL SIGN AU - 112B0 112E0 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AA - 112B0 112E5 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN E - 112B0 112E6 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AI - 112B0 112E7 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN O - 112B0 112E8 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AU - 11481 114B0 ; # TIRHUTA LETTER A, TIRHUTA VOWEL SIGN AA - 114AA 114B5 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC R - 114AA 114B6 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC RR - 1148B 114BA ; # TIRHUTA LETTER E, TIRHUTA VOWEL SIGN SHORT E - 1148D 114BA ; # TIRHUTA LETTER O, TIRHUTA VOWEL SIGN SHORT E - 11600 11639 ; # MODI LETTER A, MODI VOWEL SIGN E - 11600 1163A ; # MODI LETTER A, MODI VOWEL SIGN AI - 11601 11639 ; # MODI LETTER AA, MODI VOWEL SIGN E - 11601 1163A ; # MODI LETTER AA, MODI VOWEL SIGN AI + 0905 0946 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT E + 0905 093E ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AA + 0930 094D 0907 ; # DEVANAGARI LETTER RA, DEVANAGARI SIGN VIRAMA, DEVANAGARI LETTER I + 0909 0941 ; # DEVANAGARI LETTER U, DEVANAGARI VOWEL SIGN U + 090F 0945 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN CANDRA E + 090F 0946 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN SHORT E + 090F 0947 ; # DEVANAGARI LETTER E, DEVANAGARI VOWEL SIGN E + 0905 0949 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA O + 0906 0945 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN CANDRA E + 0905 094A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN SHORT O + 0906 0946 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN SHORT E + 0905 094B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN O + 0906 0947 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN E + 0905 094C ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AU + 0906 0948 ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN AI + 0905 0945 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA E + 0905 093A ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OE + 0905 093B ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN OOE + 0906 093A ; # DEVANAGARI LETTER AA, DEVANAGARI VOWEL SIGN OE + 0905 094F ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN AW + 0905 0956 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UE + 0905 0957 ; # DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN UUE + 0985 09BE ; # BENGALI LETTER A, BENGALI VOWEL SIGN AA + 098B 09C3 ; # BENGALI LETTER VOCALIC R, BENGALI VOWEL SIGN VOCALIC R + 098C 09E2 ; # BENGALI LETTER VOCALIC L, BENGALI VOWEL SIGN VOCALIC L + 0A05 0A3E ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AA + 0A72 0A3F ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN I + 0A72 0A40 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN II + 0A73 0A41 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN U + 0A73 0A42 ; # GURMUKHI URA, GURMUKHI VOWEL SIGN UU + 0A72 0A47 ; # GURMUKHI IRI, GURMUKHI VOWEL SIGN EE + 0A05 0A48 ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AI + 0A73 0A4B ; # GURMUKHI URA, GURMUKHI VOWEL SIGN OO + 0A05 0A4C ; # GURMUKHI LETTER A, GURMUKHI VOWEL SIGN AU + 0A85 0ABE ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA + 0A85 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA E + 0A85 0AC7 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN E + 0A85 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AI + 0A85 0AC9 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN CANDRA O + 0A85 0ACB ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN O + 0A85 0ABE 0AC5 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN CANDRA E + 0A85 0ACC ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AU + 0A85 0ABE 0AC8 ; # GUJARATI LETTER A, GUJARATI VOWEL SIGN AA, GUJARATI VOWEL SIGN AI + 0AC5 0ABE ; # GUJARATI VOWEL SIGN CANDRA E, GUJARATI VOWEL SIGN AA + 0B05 0B3E ; # ORIYA LETTER A, ORIYA VOWEL SIGN AA + 0B0F 0B57 ; # ORIYA LETTER E, ORIYA AU LENGTH MARK + 0B13 0B57 ; # ORIYA LETTER O, ORIYA AU LENGTH MARK + 0B85 0BC2 ; # TAMIL LETTER A, TAMIL VOWEL SIGN UU + 0C12 0C55 ; # TELUGU LETTER O, TELUGU LENGTH MARK + 0C12 0C4C ; # TELUGU LETTER O, TELUGU VOWEL SIGN AU + 0C3F 0C55 ; # TELUGU VOWEL SIGN I, TELUGU LENGTH MARK + 0C46 0C55 ; # TELUGU VOWEL SIGN E, TELUGU LENGTH MARK + 0C4A 0C55 ; # TELUGU VOWEL SIGN O, TELUGU LENGTH MARK + 0C89 0CBE ; # KANNADA LETTER U, KANNADA VOWEL SIGN AA + 0C92 0CCC ; # KANNADA LETTER O, KANNADA VOWEL SIGN AU + 0C8B 0CBE ; # KANNADA LETTER VOCALIC R, KANNADA VOWEL SIGN AA + 0D07 0D57 ; # MALAYALAM LETTER I, MALAYALAM AU LENGTH MARK + 0D09 0D57 ; # MALAYALAM LETTER U, MALAYALAM AU LENGTH MARK + 0D0E 0D46 ; # MALAYALAM LETTER E, MALAYALAM VOWEL SIGN E + 0D12 0D3E ; # MALAYALAM LETTER O, MALAYALAM VOWEL SIGN AA + 0D12 0D57 ; # MALAYALAM LETTER O, MALAYALAM AU LENGTH MARK + 0D85 0DCF ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN AELA-PILLA + 0D85 0DD0 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN KETTI AEDA-PILLA + 0D85 0DD1 ; # SINHALA LETTER AYANNA, SINHALA VOWEL SIGN DIGA AEDA-PILLA + 0D8B 0DDF ; # SINHALA LETTER UYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 0D8D 0DD8 ; # SINHALA LETTER IRUYANNA, SINHALA VOWEL SIGN GAETTA-PILLA + 0D8F 0DDF ; # SINHALA LETTER ILUYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 0D91 0DCA ; # SINHALA LETTER EYANNA, SINHALA SIGN AL-LAKUNA + 0D91 0DD9 ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA + 0D91 0DDA ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN DIGA KOMBUVA + 0D91 0DDC ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA + 0D91 0DDD ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA + 0D91 0DDE ; # SINHALA LETTER EYANNA, SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA + 0D94 0DDF ; # SINHALA LETTER OYANNA, SINHALA VOWEL SIGN GAYANUKITTA + 11005 11038 ; # BRAHMI LETTER A, BRAHMI VOWEL SIGN AA + 1100B 1103E ; # BRAHMI LETTER VOCALIC R, BRAHMI VOWEL SIGN VOCALIC R + 1100F 11042 ; # BRAHMI LETTER E, BRAHMI VOWEL SIGN E + 11680 116AD ; # TAKRI LETTER A, TAKRI VOWEL SIGN AA + 11686 116B2 ; # TAKRI LETTER E, TAKRI VOWEL SIGN E + 11680 116B4 ; # TAKRI LETTER A, TAKRI VOWEL SIGN O + 11680 116B5 ; # TAKRI LETTER A, TAKRI VOWEL SIGN AU + 11200 1122C ; # KHOJKI LETTER A, KHOJKI VOWEL SIGN AA + 11240 1122E ; # KHOJKI LETTER SHORT I, KHOJKI VOWEL SIGN II + 11206 1122C ; # KHOJKI LETTER O, KHOJKI VOWEL SIGN AA + 11200 11231 ; # KHOJKI LETTER A, KHOJKI VOWEL SIGN AI + 11200 11233 ; # KHOJKI LETTER A, KHOJKI VOWEL SIGN AU + 11200 1122C 11231 ; # KHOJKI LETTER A, KHOJKI VOWEL SIGN AA, KHOJKI VOWEL SIGN AI + 1122C 11230 ; # KHOJKI VOWEL SIGN AA, KHOJKI VOWEL SIGN E + 1122C 11231 ; # KHOJKI VOWEL SIGN AA, KHOJKI VOWEL SIGN AI + 112B0 112E0 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AA + 112B0 112E5 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN E + 112B0 112E6 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AI + 112B0 112E7 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN O + 112B0 112E8 ; # KHUDAWADI LETTER A, KHUDAWADI VOWEL SIGN AU + 11481 114B0 ; # TIRHUTA LETTER A, TIRHUTA VOWEL SIGN AA + 114AA 114B5 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC R + 114AA 114B6 ; # TIRHUTA LETTER LA, TIRHUTA VOWEL SIGN VOCALIC RR + 1148B 114BA ; # TIRHUTA LETTER E, TIRHUTA VOWEL SIGN SHORT E + 1148D 114BA ; # TIRHUTA LETTER O, TIRHUTA VOWEL SIGN SHORT E + 11600 11639 ; # MODI LETTER A, MODI VOWEL SIGN E + 11600 1163A ; # MODI LETTER A, MODI VOWEL SIGN AI + 11601 11639 ; # MODI LETTER AA, MODI VOWEL SIGN E + 11601 1163A ; # MODI LETTER AA, MODI VOWEL SIGN AI diff --git a/src/ms-use/IndicSyllabicCategory-Additional.txt b/src/ms-use/IndicSyllabicCategory-Additional.txt index 8bcada38a..0ca2aad10 100644 --- a/src/ms-use/IndicSyllabicCategory-Additional.txt +++ b/src/ms-use/IndicSyllabicCategory-Additional.txt @@ -5,6 +5,7 @@ # Updated for Unicode 12.1 by Andrew Glass 2019-05-24 # Updated for Unicode 13.0 by Andrew Glass 2020-07-28 # Updated for Unicode 14.0 by Andrew Glass 2021-09-25 +# Updated for Unicode 15.0 by Andrew Glass 2022-09-16 # ================================================ # OVERRIDES TO ASSIGNED VALUES @@ -18,23 +19,13 @@ AA29 ; Bindu # Mn  CHAM VOWEL SIGN AA # ================================================ # Indic_Syllabic_Category=Consonant -0840..0858 ; Consonant # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN -0F00..0F01 ; Consonant # Lo [2] TIBETAN SYLLABLE OM..TIBETAN MARK GTER YIG MGO TRUNCATED -0F04..0F06 ; Consonant # Po TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK CARET YIG MGO PHUR SHAD MA 19C1..19C7 ; Consonant # Lo [7] NEW TAI LUE LETTER FINAL V..NEW TAI LUE LETTER FINAL B # Reassigned to avoid clustering with a base consonant -25CC ; Consonant # So DOTTED CIRCLE +25CC ; Consonant # So DOTTED CIRCLE #Reassigned to allow it to cluster as a generic base # ================================================ # Indic_Syllabic_Category=Consonant_Dead -0F7F ; Consonant_Dead # Mc TIBETAN SIGN RNAM BCAD # reassigned so that visarga will form an independent cluster - -# ================================================ - -# Indic_Syllabic_Category=Consonant_Final -0F35 ; Consonant_Final # Mn TIBETAN MARK NGAS BZUNG NYI ZLA -0F37 ; Consonant_Final # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS -0FC6 ; Consonant_Final # Mn TIBETAN SYMBOL PADMA GDAN +0F7F ; Consonant_Dead # Mc TIBETAN SIGN RNAM BCAD # reassigned so that visarga can form an independent cluster, but see #19 # ================================================ @@ -49,8 +40,8 @@ AA29 ; Bindu # Mn  CHAM VOWEL SIGN AA # ================================================ # Indic_Syllabic_Category=Nukta -0F71 ; Nukta # Mn TIBETAN VOWEL SIGN AA # Reassigned to get this before an above vowel -10A38..10A3A ; Nukta # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW +0F71 ; Nukta # Mn TIBETAN VOWEL SIGN AA # Reassigned to get this before an above vowel, but see #22 +1BF2..1BF3 ; Nukta # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN # see USE issue #20 # ================================================ @@ -73,6 +64,9 @@ AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN # Indic_Syllabic_Category=Consonant 0800..0815 ; Consonant # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF +0840..0858 ; Consonant # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN +0F00..0F01 ; Consonant # Lo [2] TIBETAN SYLLABLE OM..TIBETAN MARK GTER YIG MGO TRUNCATED +0F04..0F06 ; Consonant # Po TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK CARET YIG MGO PHUR SHAD MA 1800 ; Consonant # Po MONGOLIAN BIRGA # Reassigned so that legacy Birga + MFVS sequences still work 1807 ; Consonant # Po MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER 180A ; Consonant # Po MONGOLIAN NIRUGU @@ -94,11 +88,13 @@ AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN 10E80..10EA9 ; Consonant # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET 10EB0..10EB1 ; Consonant # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE 10F30..10F45 ; Consonant # Lo [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN +10F70..10F81 ; Consonant # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH 111DA ; Consonant # Lo SHARADA EKAM #HIEROGLYPHS to be moved to new category -13000..1342E ; Consonant # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 +13000..1342F ; Consonant # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D #For the Begin and End segment to be handled fully correctly, the cluster model needs to be modified. 13437..13438 ; Consonant # Lo [2] EGYPTIAN HIEROGLYPH BEGIN SEGMENT..EGYPTIAN HIEROGLYPH END SEGMENT +13441..13446 ; Consonant # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..HIEROGLYPH WIDE LOST SIGN 16B00..16B2F ; Consonant # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU 16F00..16F4A ; Consonant # Lo [75] MIAO LETTER PA..MIAO LETTER RTE 16FE4 ; Consonant # Mn KHITAN SMALL SCRIPT FILLER # Avoids Mn pushing this into VOWEL class @@ -113,6 +109,8 @@ AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN 1E14F ; Consonant # So NYIAKENG PUACHUE HMONG CIRCLED CA 1E290..1E2AD ; Consonant # Lo [30] TOTO LETTER PA..TOTO LETTER A 1E2C0..1E2EB ; Consonant # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH +1E4D0..1E4EA ; Consonant # Lo [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL +1E4EB ; Consonant # Lm NAG MUNDARI SIGN OJOD 1E900..1E921 ; Consonant # Lu [34] ADLAM CAPITAL LETTER ALIF..ADLAM CAPITAL LETTER SHA 1E922..1E943 ; Consonant # Ll [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA 1E94B ; Consonant # Lm ADLAM NASALIZATION MARK @@ -140,7 +138,6 @@ FE00..FE0F ; Modifying_Letter # Mn [16] VARIATION SELECTOR-1..VARIATION SEL 0F39 ; Nukta # Mn TIBETAN MARK TSA -PHRU # NOW IN UNICODE 10.0 1885..1886 ; Nukta # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA 18A9 ; Nukta # Mn MONGOLIAN LETTER ALI GALI DAGALGA -1B6B..1B73 ; Nukta # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG 10AE5..10AE6 ; Nukta # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 16F4F ; Nukta # Mn MIAO SIGN CONSONANT MODIFIER BAR 1BC9D..1BC9E ; Nukta # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK @@ -155,6 +152,7 @@ FE00..FE0F ; Modifying_Letter # Mn [16] VARIATION SELECTOR-1..VARIATION SEL 16AC0..16AC9 ; Number # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE 1E140..1E149 ; Number # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE 1E2F0..1E2F9 ; Number # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE +1E4F0..1E4F9 ; Number # Nd [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE 1E950..1E959 ; Number # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE # ================================================ @@ -176,7 +174,9 @@ FE00..FE0F ; Modifying_Letter # Mn [16] VARIATION SELECTOR-1..VARIATION SEL # Indic_Syllabic_Category=Virama 2D7F ; Virama # Mn TIFINAGH CONSONANT JOINER +#HIEROGLYPHS to be moved to new category 13430..13436 ; Virama # Cf [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE +13439..1343B ; Virama # Cf [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM # ================================================ @@ -191,6 +191,21 @@ AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN 0B55 ; Vowel_Dependent # Mn ORIYA SIGN OVERLINE 10EAB..10EAC ; Vowel_Dependent # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK 16F51..16F87 ; Vowel_Dependent # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI +1E4EC..1E4EF ; Vowel_Dependent # Mn [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH + +# ================================================ + +# Indic_Syllabic_Category=Cantillation_Mark + +1CF8..1CF9 ; Cantillation_Mark # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE +#HIEROGLYPHS to be moved to new category +13440 ; Cantillation_Mark # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY +13447..13455 ; Cantillation_Mark # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED + +# ================================================ + +# Indic_Syllabic_Category=Symbol_Modifier +1B6B..1B73 ; Symbol_Modifier # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG # ================================================ # ================================================ @@ -198,24 +213,56 @@ AABD ; Vowel_Independent # Lo TAI VIET VOWEL AN # ================================================ # ================================================ -# USE_Syllabic_Category=Hieroglyph -# 13000..1342E ; Hieroglyph # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 +# USE, Extended_Syllabic_Category=Hieroglyph +# 13000..1342F ; Hieroglyph # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D +# 13441..13446 ; Hieroglyph # Lo [6] EGYPTIAN HIEROGLYPH FULL BLANK..HIEROGLYPH WIDE LOST SIGN # ================================================ -# USE_Syllabic_Category=Hieroglyph_Joiner -# 13430..13436 ; Hieroglyph_Joiner # Cf EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE +# USE, Extended_Syllabic_Category=Hieroglyph_Joiner +# 13430..13436 ; Hieroglyph_Joiner # Cf [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE +# 13439..1343B ; Hieroglyph_Joiner # Cf [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM # ================================================ -# USE_Syllabic_Category= Hieroglyph_Segment_Begin +# USE, Extended_Syllabic_Category=Hieroglyph_Mark_Begin +# 005B ; Hieroglyph_Mark_Begin # Ps LEFT SQUARE BRACKET +# 007B ; Hieroglyph_Mark_Begin # Ps LEFT CURLY BRACKET +# 27E6 ; Hieroglyph_Mark_Begin # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET +# 27E8 ; Hieroglyph_Mark_Begin # Ps MATHEMATICAL LEFT ANGLE BRACKET +# 2E22 ; Hieroglyph_Mark_Begin # Ps TOP LEFT HALF BRACKET +# 2E24 ; Hieroglyph_Mark_Begin # Ps BOTTOM LEFT HALF BRACKET + +# ================================================ + +# USE, Extended_Syllabic_Category=Hieroglyph_Mark_End +# 005D ; Hieroglyph_Mark_Begin # Pe RIGHT SQUARE BRACKET +# 007D ; Hieroglyph_Mark_Begin # Pe RIGHT CURLY BRACKET +# 27E7 ; Hieroglyph_Mark_Begin # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET +# 27E9 ; Hieroglyph_Mark_Begin # Pe MATHEMATICAL RIGHT ANGLE BRACKET +# 2E23 ; Hieroglyph_Mark_Begin # Pe TOP RIGHT HALF BRACKET +# 2E25 ; Hieroglyph_Mark_Begin # Pe BOTTOM RIGHT HALF BRACKET + +# ================================================ + +# USE, Extended_Syllabic_Category=Hieroglyph_Segment_Begin # 13437 ; Hieroglyph_Segment_Begin # Cf EGYPTIAN HIEROGLYPH BEGIN SEGMENT # ================================================ -# USE_Syllabic_Category= Hieroglyph_Segment_End +# USE, Extended_Syllabic_Category=Hieroglyph_Segment_End # 13438 ; Hieroglyph_Segment_End # Cf EGYPTIAN HIEROGLYPH END SEGMENT # ================================================ +# USE, Extended_Syllabic_Category=Hieroglyph_Mirror +# 13440 ; Hieroglyph_Mirror # Mn EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY + +# ================================================ + +# USE, Extended_Syllabic_Category=Hieroglyph_Modifier +# 13447..13455 ; Hieroglyph_Modifier # Mn [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED + +# ================================================ + # eof