[icu] Add two-way script conversion functions

Also optimizes the common-direction script lookup.
This commit is contained in:
Behdad Esfahbod 2011-04-20 02:54:42 -04:00
parent 0809aadd4b
commit f144a8ea84
4 changed files with 124 additions and 97 deletions

View File

@ -37,7 +37,7 @@ hb_tag_from_string (const char *s)
char tag[4]; char tag[4];
unsigned int i; unsigned int i;
if (!s) if (!s || !*s)
return HB_TAG_NONE; return HB_TAG_NONE;
for (i = 0; i < 4 && s[i]; i++) for (i = 0; i < 4 && s[i]; i++)
@ -146,6 +146,9 @@ hb_language_to_string (hb_language_t language)
hb_script_t hb_script_t
hb_script_from_iso15924_tag (hb_tag_t tag) hb_script_from_iso15924_tag (hb_tag_t tag)
{ {
if (unlikely (tag == HB_TAG_NONE))
return HB_SCRIPT_INVALID;
/* Be lenient, adjust case (one capital letter followed by three small letters) */ /* Be lenient, adjust case (one capital letter followed by three small letters) */
tag = (tag & 0xDFDFDFDF) | 0x00202020; tag = (tag & 0xDFDFDFDF) | 0x00202020;

View File

@ -33,111 +33,27 @@
#include <unicode/uversion.h> #include <unicode/uversion.h>
#include <unicode/uchar.h> #include <unicode/uchar.h>
#include <unicode/uscript.h>
HB_BEGIN_DECLS HB_BEGIN_DECLS
static unsigned int hb_script_t
hb_icu_get_combining_class (hb_unicode_funcs_t *ufuncs, hb_icu_script_to_script (UScriptCode script)
hb_codepoint_t unicode,
void *user_data)
{ {
return u_getCombiningClass (unicode); return hb_script_from_string (uscript_getShortName (script));
} }
static unsigned int UScriptCode
hb_icu_get_eastasian_width (hb_unicode_funcs_t *ufuncs, hb_icu_script_from_script (hb_script_t script)
hb_codepoint_t unicode,
void *user_data)
{ {
switch (u_getIntPropertyValue(unicode, UCHAR_EAST_ASIAN_WIDTH)) switch ((int) script)
{
case U_EA_WIDE:
case U_EA_FULLWIDTH:
return 2;
case U_EA_NEUTRAL:
case U_EA_AMBIGUOUS:
case U_EA_HALFWIDTH:
case U_EA_NARROW:
return 1;
}
return 1;
}
static hb_unicode_general_category_t
hb_icu_get_general_category (hb_unicode_funcs_t *ufuncs,
hb_codepoint_t unicode,
void *user_data)
{
switch (u_getIntPropertyValue(unicode, UCHAR_GENERAL_CATEGORY))
{
case U_UNASSIGNED: return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
case U_UPPERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
case U_LOWERCASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
case U_TITLECASE_LETTER: return HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
case U_MODIFIER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
case U_OTHER_LETTER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
case U_NON_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
case U_ENCLOSING_MARK: return HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
case U_COMBINING_SPACING_MARK: return HB_UNICODE_GENERAL_CATEGORY_COMBINING_MARK;
case U_DECIMAL_DIGIT_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
case U_LETTER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
case U_OTHER_NUMBER: return HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
case U_SPACE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
case U_LINE_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
case U_PARAGRAPH_SEPARATOR: return HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
case U_CONTROL_CHAR: return HB_UNICODE_GENERAL_CATEGORY_CONTROL;
case U_FORMAT_CHAR: return HB_UNICODE_GENERAL_CATEGORY_FORMAT;
case U_PRIVATE_USE_CHAR: return HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
case U_SURROGATE: return HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
case U_DASH_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
case U_START_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
case U_END_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
case U_CONNECTOR_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
case U_OTHER_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
case U_MATH_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
case U_CURRENCY_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
case U_MODIFIER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
case U_OTHER_SYMBOL: return HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
case U_INITIAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
case U_FINAL_PUNCTUATION: return HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
}
return HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
}
static hb_codepoint_t
hb_icu_get_mirroring (hb_unicode_funcs_t *ufuncs,
hb_codepoint_t unicode,
void *user_data)
{
return u_charMirror(unicode);
}
static hb_script_t
hb_icu_get_script (hb_unicode_funcs_t *ufuncs,
hb_codepoint_t unicode,
void *user_data)
{
UErrorCode status = U_ZERO_ERROR;
UScriptCode scriptCode = uscript_getScript(unicode, &status);
switch ((int) scriptCode)
{ {
#define CHECK_ICU_VERSION(major, minor) \ #define CHECK_ICU_VERSION(major, minor) \
U_ICU_VERSION_MAJOR_NUM > (major) || (U_ICU_VERSION_MAJOR_NUM == (major) && U_ICU_VERSION_MINOR_NUM >= (minor)) U_ICU_VERSION_MAJOR_NUM > (major) || (U_ICU_VERSION_MAJOR_NUM == (major) && U_ICU_VERSION_MINOR_NUM >= (minor))
#define MATCH_SCRIPT(C) case USCRIPT_##C: return HB_SCRIPT_##C #define MATCH_SCRIPT(C) case HB_SCRIPT_##C: return USCRIPT_##C
#define MATCH_SCRIPT2(C1, C2) case USCRIPT_##C1: return HB_SCRIPT_##C2 #define MATCH_SCRIPT2(C1, C2) case HB_SCRIPT_##C2: return USCRIPT_##C1
MATCH_SCRIPT2(INVALID_CODE, INVALID);
MATCH_SCRIPT (COMMON); MATCH_SCRIPT (COMMON);
MATCH_SCRIPT (INHERITED); MATCH_SCRIPT (INHERITED);
@ -259,7 +175,106 @@ hb_icu_get_script (hb_unicode_funcs_t *ufuncs,
#undef MATCH_SCRIPT2 #undef MATCH_SCRIPT2
} }
return HB_SCRIPT_UNKNOWN; return USCRIPT_UNKNOWN;
}
/* Returns whatever ICU's u_getCombiningClass() reports for `unicode`.
 * `ufuncs` and `user_data` are accepted but not consulted here. */
static unsigned int
hb_icu_get_combining_class (hb_unicode_funcs_t *ufuncs,
                            hb_codepoint_t      unicode,
                            void               *user_data)
{
  const unsigned int combining_class = u_getCombiningClass (unicode);

  return combining_class;
}
/* Looks up ICU's UCHAR_EAST_ASIAN_WIDTH property for `unicode` and returns
 * 2 when it is U_EA_WIDE or U_EA_FULLWIDTH, and 1 for every other value.
 * `ufuncs` and `user_data` are accepted but not consulted here. */
static unsigned int
hb_icu_get_eastasian_width (hb_unicode_funcs_t *ufuncs,
                            hb_codepoint_t      unicode,
                            void               *user_data)
{
  const int32_t ea_width = u_getIntPropertyValue (unicode, UCHAR_EAST_ASIAN_WIDTH);

  if (ea_width == U_EA_WIDE || ea_width == U_EA_FULLWIDTH)
    return 2;

  /* U_EA_NEUTRAL, U_EA_AMBIGUOUS, U_EA_HALFWIDTH, U_EA_NARROW and any
   * value not listed above all yield 1. */
  return 1;
}
/* Translates ICU's UCHAR_GENERAL_CATEGORY property value for `unicode` into
 * the corresponding hb_unicode_general_category_t constant.  Any ICU value
 * without an explicit case below yields
 * HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED.
 * `ufuncs` and `user_data` are accepted but not consulted here. */
static hb_unicode_general_category_t
hb_icu_get_general_category (hb_unicode_funcs_t *ufuncs,
                             hb_codepoint_t      unicode,
                             void               *user_data)
{
  hb_unicode_general_category_t category = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;

  switch (u_getIntPropertyValue (unicode, UCHAR_GENERAL_CATEGORY))
  {
    case U_UNASSIGNED:
      category = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
      break;

    /* Letters */
    case U_UPPERCASE_LETTER:
      category = HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER;
      break;
    case U_LOWERCASE_LETTER:
      category = HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER;
      break;
    case U_TITLECASE_LETTER:
      category = HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER;
      break;
    case U_MODIFIER_LETTER:
      category = HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER;
      break;
    case U_OTHER_LETTER:
      category = HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER;
      break;

    /* Marks */
    case U_NON_SPACING_MARK:
      category = HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK;
      break;
    case U_ENCLOSING_MARK:
      category = HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK;
      break;
    case U_COMBINING_SPACING_MARK:
      category = HB_UNICODE_GENERAL_CATEGORY_COMBINING_MARK;
      break;

    /* Numbers */
    case U_DECIMAL_DIGIT_NUMBER:
      category = HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER;
      break;
    case U_LETTER_NUMBER:
      category = HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER;
      break;
    case U_OTHER_NUMBER:
      category = HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER;
      break;

    /* Separators */
    case U_SPACE_SEPARATOR:
      category = HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR;
      break;
    case U_LINE_SEPARATOR:
      category = HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR;
      break;
    case U_PARAGRAPH_SEPARATOR:
      category = HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR;
      break;

    /* Control, format, private use, surrogate */
    case U_CONTROL_CHAR:
      category = HB_UNICODE_GENERAL_CATEGORY_CONTROL;
      break;
    case U_FORMAT_CHAR:
      category = HB_UNICODE_GENERAL_CATEGORY_FORMAT;
      break;
    case U_PRIVATE_USE_CHAR:
      category = HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE;
      break;
    case U_SURROGATE:
      category = HB_UNICODE_GENERAL_CATEGORY_SURROGATE;
      break;

    /* Punctuation */
    case U_DASH_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION;
      break;
    case U_START_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION;
      break;
    case U_END_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION;
      break;
    case U_CONNECTOR_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION;
      break;
    case U_OTHER_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION;
      break;

    /* Symbols */
    case U_MATH_SYMBOL:
      category = HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL;
      break;
    case U_CURRENCY_SYMBOL:
      category = HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL;
      break;
    case U_MODIFIER_SYMBOL:
      category = HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL;
      break;
    case U_OTHER_SYMBOL:
      category = HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL;
      break;

    /* Quotation-style punctuation */
    case U_INITIAL_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION;
      break;
    case U_FINAL_PUNCTUATION:
      category = HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION;
      break;

    default:
      /* Keep the UNASSIGNED fallback set at declaration. */
      break;
  }

  return category;
}
/* Returns whatever ICU's u_charMirror() reports for `unicode`.
 * `ufuncs` and `user_data` are accepted but not consulted here. */
static hb_codepoint_t
hb_icu_get_mirroring (hb_unicode_funcs_t *ufuncs,
                      hb_codepoint_t      unicode,
                      void               *user_data)
{
  const hb_codepoint_t mirrored = u_charMirror (unicode);

  return mirrored;
}
static hb_script_t
hb_icu_get_script (hb_unicode_funcs_t *ufuncs,
hb_codepoint_t unicode,
void *user_data)
{
UErrorCode status = U_ZERO_ERROR;
UScriptCode scriptCode = uscript_getScript(unicode, &status);
return hb_icu_script_to_script (scriptCode);
} }
static hb_unicode_funcs_t icu_ufuncs = { static hb_unicode_funcs_t icu_ufuncs = {

View File

@ -28,10 +28,19 @@
#define HB_ICU_H #define HB_ICU_H
#include "hb.h" #include "hb.h"
#include <unicode/uscript.h>
HB_BEGIN_DECLS HB_BEGIN_DECLS
/* Converts an ICU UScriptCode to an hb_script_t.  The definition shown in
 * this commit does so by passing uscript_getShortName (script) to
 * hb_script_from_string (). */
hb_script_t
hb_icu_script_to_script (UScriptCode script);
/* Converts an hb_script_t to an ICU UScriptCode.  The definition shown in
 * this commit switches over the known HB_SCRIPT_* values and returns
 * USCRIPT_UNKNOWN when none of them matches. */
UScriptCode
hb_icu_script_from_script (hb_script_t script);
hb_unicode_funcs_t * hb_unicode_funcs_t *
hb_icu_get_unicode_funcs (void); hb_icu_get_unicode_funcs (void);

View File

@ -91,8 +91,8 @@ test_types_tag (void)
g_assert_cmphex (hb_tag_from_string ("aBc"), ==, 0x61426320); g_assert_cmphex (hb_tag_from_string ("aBc"), ==, 0x61426320);
g_assert_cmphex (hb_tag_from_string ("aB"), ==, 0x61422020); g_assert_cmphex (hb_tag_from_string ("aB"), ==, 0x61422020);
g_assert_cmphex (hb_tag_from_string ("a"), ==, 0x61202020); g_assert_cmphex (hb_tag_from_string ("a"), ==, 0x61202020);
g_assert_cmphex (hb_tag_from_string (""), ==, 0x20202020);
g_assert_cmphex (hb_tag_from_string (""), ==, HB_TAG_NONE);
g_assert_cmphex (hb_tag_from_string (NULL), ==, HB_TAG_NONE); g_assert_cmphex (hb_tag_from_string (NULL), ==, HB_TAG_NONE);
} }