Use a lookup table for modified_combining_class

2012-08-01 18:07:42 -04:00 · 2012-08-01 18:07:42 -04:00 · 6adf417bc1
parent 208f70f055
commit 6adf417bc1
3 changed files with 155 additions and 68 deletions
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@ -1,7 +1,7 @@
 /*
 * Copyright © 2009  Red Hat, Inc.
 * Copyright © 2011  Codethink Limited
- * Copyright © 2010,2011  Google, Inc.
+ * Copyright © 2010,2011,2012  Google, Inc.
 *
 *  This is part of HarfBuzz, a text shaping library.
 *
@ -37,6 +37,7 @@
 #include "hb-object-private.hh"
 extern HB_INTERNAL const uint8_t _hb_modified_combining_class[256];
 /*
 * hb_unicode_funcs_t
@ -143,8 +144,11 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE
  }
-  HB_INTERNAL unsigned int
+  unsigned int
-  modified_combining_class (hb_codepoint_t unicode);
+  modified_combining_class (hb_codepoint_t unicode)
  {
    return _hb_modified_combining_class[combining_class (unicode)];
  }
  inline hb_bool_t
  is_variation_selector (hb_codepoint_t unicode)
--- a/src/hb-unicode.cc
+++ b/src/hb-unicode.cc
@ -1,7 +1,7 @@
 /*
 * Copyright © 2009  Red Hat, Inc.
- * Copyright © 2011 Codethink Limited
+ * Copyright © 2011  Codethink Limited
- * Copyright © 2010,2011  Google, Inc.
+ * Copyright © 2010,2011,2012  Google, Inc.
 *
 *  This is part of HarfBuzz, a text shaping library.
 *
@ -287,69 +287,148 @@ hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs,
 }
-unsigned int
+const uint8_t
-hb_unicode_funcs_t::modified_combining_class (hb_codepoint_t unicode)
+_hb_modified_combining_class[256] =
 {
-  int c = combining_class (unicode);
+  0, /* HB_UNICODE_COMBINING_CLASS_NOT_REORDERED */
  1, /* HB_UNICODE_COMBINING_CLASS_OVERLAY */
  2, 3, 4, 5, 6,
  7, /* HB_UNICODE_COMBINING_CLASS_NUKTA */
  8, /* HB_UNICODE_COMBINING_CLASS_KANA_VOICING */
  9, /* HB_UNICODE_COMBINING_CLASS_VIRAMA */
-  if (unlikely (hb_in_range<int> (c, 27, 33)))
+  /* Hebrew */
  {
    /* Modify the combining-class to suit Arabic better.  See:
     * http://unicode.org/faq/normalization.html#8
     * http://unicode.org/faq/normalization.html#9
     */
    c = c == 33 ? 27 : c + 1;
  }
  else if (unlikely (hb_in_range<int> (c, 10, 26)))
  {
    /* The equivalent fix for Hebrew is more complex.
     *
     * We permute the "fixed-position" classes 10-26 into the order
     * described in the SBL Hebrew manual:
     *
     * http://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf
     *
     * (as recommended by:
     *  http://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering-t6751.0.html)
     *
     * More details here:
     * https://bugzilla.mozilla.org/show_bug.cgi?id=662055
     */
    static const int permuted_hebrew_classes[26 - 10 + 1] = {
      /* 10 sheva */        22,
      /* 11 hataf segol */  15,
      /* 12 hataf patah */  16,
      /* 13 hataf qamats */ 17,
      /* 14 hiriq */        23,
      /* 15 tsere */        18,
      /* 16 segol */        19,
      /* 17 patah */        20,
      /* 18 qamats */       21,
      /* 19 holam */        14,
      /* 20 qubuts */       24,
      /* 21 dagesh */       12,
      /* 22 meteg */        25,
      /* 23 rafe */         13,
      /* 24 shin dot */     10,
      /* 25 sin dot */      11,
      /* 26 point varika */ 26,
    };
    c = permuted_hebrew_classes[c - 10];
  }
  else if (unlikely (unicode == 0x0E3A)) /* THAI VOWEL SIGN PHINTHU */
  {
    /* Assign 104, so it reorders after the THAI ccc=103 marks.
     * Uniscribe does this. */
    c = 104;
  }
  else if (unlikely (hb_in_range<hb_codepoint_t> (unicode, 0x0C55, 0x0C56)))
  {
    /* Telugu length marks.
     * These are the only matras in the main Indic script range that have
     * a non-zero ccc.  That makes them reorder with the Halant that is
     * ccc=9.  Just zero them, we don't need them in our Indic shaper. */
    c = 0;
  }
-  return c;
+  /*
-}
+   * We permute the "fixed-position" classes 10-26 into the order
   * described in the SBL Hebrew manual:
   *
   * http://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf
   *
   * (as recommended by:
   *  http://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering-t6751.0.html)
   *
   * More details here:
   * https://bugzilla.mozilla.org/show_bug.cgi?id=662055
   */
  22, /* HB_UNICODE_COMBINING_CLASS_CCC10 sheva */
  15, /* HB_UNICODE_COMBINING_CLASS_CCC11 hataf segol */
  16, /* HB_UNICODE_COMBINING_CLASS_CCC12 hataf patah */
  17, /* HB_UNICODE_COMBINING_CLASS_CCC13 hataf qamats */
  23, /* HB_UNICODE_COMBINING_CLASS_CCC14 hiriq */
  18, /* HB_UNICODE_COMBINING_CLASS_CCC15 tsere */
  19, /* HB_UNICODE_COMBINING_CLASS_CCC16 segol */
  20, /* HB_UNICODE_COMBINING_CLASS_CCC17 patah */
  21, /* HB_UNICODE_COMBINING_CLASS_CCC18 qamats */
  14, /* HB_UNICODE_COMBINING_CLASS_CCC19 holam */
  24, /* HB_UNICODE_COMBINING_CLASS_CCC20 qubuts */
  12, /* HB_UNICODE_COMBINING_CLASS_CCC21 dagesh */
  25, /* HB_UNICODE_COMBINING_CLASS_CCC22 meteg */
  13, /* HB_UNICODE_COMBINING_CLASS_CCC23 rafe */
  10, /* HB_UNICODE_COMBINING_CLASS_CCC24 shin dot */
  11, /* HB_UNICODE_COMBINING_CLASS_CCC25 sin dot */
  26, /* HB_UNICODE_COMBINING_CLASS_CCC26 */
  /* Arabic */
  /*
   * Modify to move Shadda (ccc=33) before other marks.  See:
   * http://unicode.org/faq/normalization.html#8
   * http://unicode.org/faq/normalization.html#9
   */
  28, /* HB_UNICODE_COMBINING_CLASS_CCC27 */
  29, /* HB_UNICODE_COMBINING_CLASS_CCC28 */
  30, /* HB_UNICODE_COMBINING_CLASS_CCC29 */
  31, /* HB_UNICODE_COMBINING_CLASS_CCC30 */
  32, /* HB_UNICODE_COMBINING_CLASS_CCC31 */
  33, /* HB_UNICODE_COMBINING_CLASS_CCC32 */
  27, /* HB_UNICODE_COMBINING_CLASS_CCC33 shadda */
  34, /* HB_UNICODE_COMBINING_CLASS_CCC34 */
  35, /* HB_UNICODE_COMBINING_CLASS_CCC35 */
  /* Syriac */
  36, /* HB_UNICODE_COMBINING_CLASS_CCC36 */
  37, 38, 39,
  40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
  60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
  80, 81, 82, 83,
  /* Telugu */
  /*
   * Modify Telugu length marks (ccc=84, ccc=91).
   * These are the only matras in the main Indic scripts range that have
   * a non-zero ccc.  That makes them reorder with the Halant that is
   * ccc=9.  Just zero them, we don't need them in our Indic shaper.
   */
  0, /* HB_UNICODE_COMBINING_CLASS_CCC84 */
  85, 86, 87, 88, 89, 90,
  0, /* HB_UNICODE_COMBINING_CLASS_CCC91 */
  92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
  /* Thai */
  /*
   * Modify U+0E38 and U+0E39 (ccc=103) to be reordered before U+0E3A (ccc=9).
   * Uniscribe does this too.
   */
  3, /* HB_UNICODE_COMBINING_CLASS_CCC103 */
  104, 105, 106,
  107, /* HB_UNICODE_COMBINING_CLASS_CCC107 */
  108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
  /* Lao */
  118, /* HB_UNICODE_COMBINING_CLASS_CCC118 */
  119, 120, 121,
  122, /* HB_UNICODE_COMBINING_CLASS_CCC122 */
  123, 124, 125, 126, 127, 128,
  /* Tibetan */
  129, /* HB_UNICODE_COMBINING_CLASS_CCC129 */
  130, /* HB_UNICODE_COMBINING_CLASS_CCC130 */
  131,
  132, /* HB_UNICODE_COMBINING_CLASS_CCC132 */
  133, 134, 135, 136, 137, 138, 139,
  140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
  150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
  160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
  170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
  180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
  190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
  200, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_BELOW_LEFT */
  201,
  202, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_BELOW */
  203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
  214, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_ABOVE */
  215,
  216, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_ABOVE_RIGHT */
  217,
  218, /* HB_UNICODE_COMBINING_CLASS_BELOW_LEFT */
  219,
  220, /* HB_UNICODE_COMBINING_CLASS_BELOW */
  221,
  222, /* HB_UNICODE_COMBINING_CLASS_BELOW_RIGHT */
  223,
  224, /* HB_UNICODE_COMBINING_CLASS_LEFT */
  225,
  226, /* HB_UNICODE_COMBINING_CLASS_RIGHT */
  227,
  228, /* HB_UNICODE_COMBINING_CLASS_ABOVE_LEFT */
  229,
  230, /* HB_UNICODE_COMBINING_CLASS_ABOVE */
  231,
  232, /* HB_UNICODE_COMBINING_CLASS_ABOVE_RIGHT */
  233, /* HB_UNICODE_COMBINING_CLASS_DOUBLE_BELOW */
  234, /* HB_UNICODE_COMBINING_CLASS_DOUBLE_ABOVE */
  235, 236, 237, 238, 239,
  240, /* HB_UNICODE_COMBINING_CLASS_IOTA_SUBSCRIPT */
  241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
  255, /* HB_UNICODE_COMBINING_CLASS_INVALID */
 };
--- a/src/hb-unicode.h
+++ b/src/hb-unicode.h
@ -79,6 +79,10 @@ typedef enum
 /* hb_unicode_combining_class_t */
 /* Note: newer versions of Unicode may add new values.  Clients should be ready to handle
 * any value in the 0..254 range being returned from hb_unicode_combining_class().
 */
 /* Unicode Character Database property: Canonical_Combining_Class (ccc) */
 typedef enum
 {