From f144a8ea840c6452c1fece2fd988b42a8ea7c5a6 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Wed, 20 Apr 2011 02:54:42 -0400 Subject: [PATCH] [icu] Add two-way script conversion functions Also optimizes the common-direction script lookup. --- src/hb-common.c | 5 +- src/hb-icu.cc | 205 +++++++++++++++++++++++++--------------------- src/hb-icu.h | 9 ++ test/test-types.c | 2 +- 4 files changed, 124 insertions(+), 97 deletions(-) diff --git a/src/hb-common.c b/src/hb-common.c index 56537e479..c1c4c548a 100644 --- a/src/hb-common.c +++ b/src/hb-common.c @@ -37,7 +37,7 @@ hb_tag_from_string (const char *s) char tag[4]; unsigned int i; - if (!s) + if (!s || !*s) return HB_TAG_NONE; for (i = 0; i < 4 && s[i]; i++) @@ -146,6 +146,9 @@ hb_language_to_string (hb_language_t language) hb_script_t hb_script_from_iso15924_tag (hb_tag_t tag) { + if (unlikely (tag == HB_TAG_NONE)) + return HB_SCRIPT_INVALID; + /* Be lenient, adjust case (one capital letter followed by three small letters) */ tag = (tag & 0xDFDFDFDF) | 0x00202020; diff --git a/src/hb-icu.cc b/src/hb-icu.cc index dcfbade0a..8fc8c81ac 100644 --- a/src/hb-icu.cc +++ b/src/hb-icu.cc @@ -33,111 +33,27 @@ #include <unicode/uversion.h> #include <unicode/uchar.h> -#include <unicode/uscript.h> HB_BEGIN_DECLS -static unsigned int -hb_icu_get_combining_class (hb_unicode_funcs_t *ufuncs, - hb_codepoint_t unicode, - void *user_data) - +hb_script_t +hb_icu_script_to_script (UScriptCode script) { - return u_getCombiningClass (unicode); + return hb_script_from_string (uscript_getShortName (script)); } -static unsigned int -hb_icu_get_eastasian_width (hb_unicode_funcs_t *ufuncs, - hb_codepoint_t unicode, - void *user_data) +UScriptCode +hb_icu_script_from_script (hb_script_t script) { - switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) - { - case U_EA_WIDE: - case U_EA_FULLWIDTH: - return 2; - case U_EA_NEUTRAL: - case U_EA_AMBIGUOUS: - case U_EA_HALFWIDTH: - case U_EA_NARROW: - return 1; - } - return 1; -} - -static hb_unicode_general_category_t 
-hb_icu_get_general_category (hb_unicode_funcs_t *ufuncs, - hb_codepoint_t unicode, - void *user_data) -{ - switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) - { - case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; - - case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; - case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; - case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; - case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; - case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; - - case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; - case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; - case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_COMBINING_MARK; - - case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; - case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; - case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; - - case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; - case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; - case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; - - case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; - case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; - case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; - case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; - - - case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; - case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; - case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; - case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; - case U_OTHER_PUNCTUATION: return 
HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; - - case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; - case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; - case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; - case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; - - case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; - case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; - } - - return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; -} - -static hb_codepoint_t -hb_icu_get_mirroring (hb_unicode_funcs_t *ufuncs, - hb_codepoint_t unicode, - void *user_data) -{ - return u_charMirror(unicode); -} - -static hb_script_t -hb_icu_get_script (hb_unicode_funcs_t *ufuncs, - hb_codepoint_t unicode, - void *user_data) -{ - UErrorCode status = U_ZERO_ERROR; - UScriptCode scriptCode = uscript_getScript(unicode, &status); - switch ((int) scriptCode) + switch ((int) script) { #define CHECK_ICU_VERSION(major, minor) \ U_ICU_VERSION_MAJOR_NUM > (major) || (U_ICU_VERSION_MAJOR_NUM == (major) && U_ICU_VERSION_MINOR_NUM >= (minor)) -#define MATCH_SCRIPT(C) case USCRIPT_##C: return HB_SCRIPT_##C -#define MATCH_SCRIPT2(C1, C2) case USCRIPT_##C1: return HB_SCRIPT_##C2 +#define MATCH_SCRIPT(C) case HB_SCRIPT_##C: return USCRIPT_##C +#define MATCH_SCRIPT2(C1, C2) case HB_SCRIPT_##C2: return USCRIPT_##C1 + + MATCH_SCRIPT2(INVALID_CODE, INVALID); MATCH_SCRIPT (COMMON); MATCH_SCRIPT (INHERITED); @@ -259,7 +175,106 @@ hb_icu_get_script (hb_unicode_funcs_t *ufuncs, #undef MATCH_SCRIPT2 } - return HB_SCRIPT_UNKNOWN; + return USCRIPT_UNKNOWN; +} + + +static unsigned int +hb_icu_get_combining_class (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t unicode, + void *user_data) + +{ + return u_getCombiningClass (unicode); +} + +static unsigned int +hb_icu_get_eastasian_width (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t unicode, + void *user_data) +{ + switch 
(u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) + { + case U_EA_WIDE: + case U_EA_FULLWIDTH: + return 2; + case U_EA_NEUTRAL: + case U_EA_AMBIGUOUS: + case U_EA_HALFWIDTH: + case U_EA_NARROW: + return 1; + } + return 1; +} + +static hb_unicode_general_category_t +hb_icu_get_general_category (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t unicode, + void *user_data) +{ + switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY)) + { + case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; + + case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER; + case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER; + case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER; + case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER; + case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER; + + case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK; + case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK; + case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_COMBINING_MARK; + + case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER; + case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER; + case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER; + + case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR; + case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR; + case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR; + + case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL; + case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT; + case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE; + case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE; + + + case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION; + case 
U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION; + case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION; + case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION; + case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION; + + case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL; + case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL; + case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL; + case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL; + + case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION; + case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION; + } + + return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; +} + +static hb_codepoint_t +hb_icu_get_mirroring (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t unicode, + void *user_data) +{ + return u_charMirror(unicode); +} + +static hb_script_t +hb_icu_get_script (hb_unicode_funcs_t *ufuncs, + hb_codepoint_t unicode, + void *user_data) +{ + UErrorCode status = U_ZERO_ERROR; + UScriptCode scriptCode = uscript_getScript(unicode, &status); + + return hb_icu_script_to_script (scriptCode); } static hb_unicode_funcs_t icu_ufuncs = { diff --git a/src/hb-icu.h b/src/hb-icu.h index cc17af8bb..2e7f14604 100644 --- a/src/hb-icu.h +++ b/src/hb-icu.h @@ -28,10 +28,19 @@ #define HB_ICU_H #include "hb.h" +#include <unicode/uscript.h> + HB_BEGIN_DECLS +hb_script_t +hb_icu_script_to_script (UScriptCode script); + +UScriptCode +hb_icu_script_from_script (hb_script_t script); + + hb_unicode_funcs_t * hb_icu_get_unicode_funcs (void); diff --git a/test/test-types.c b/test/test-types.c index 5d7043ee5..daf275481 100644 --- a/test/test-types.c +++ b/test/test-types.c @@ -91,8 +91,8 @@ test_types_tag (void) g_assert_cmphex (hb_tag_from_string ("aBc"), ==, 0x61426320); g_assert_cmphex (hb_tag_from_string ("aB"), ==, 0x61422020); 
g_assert_cmphex (hb_tag_from_string ("a"), ==, 0x61202020); - g_assert_cmphex (hb_tag_from_string (""), ==, 0x20202020); + g_assert_cmphex (hb_tag_from_string (""), ==, HB_TAG_NONE); g_assert_cmphex (hb_tag_from_string (NULL), ==, HB_TAG_NONE); }