[ot-tags] Speed up hb_ot_tags_from_language()

Part of https://github.com/harfbuzz/harfbuzz/issues/3591

"After that, bulk of the time I suppose is spent in binary-searching the
language table. I suggest we split the language table in 2-letter and
3-letter tags, to speed-up the vast majority of cases that are
2-letter."

benchmark-ot, before:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN        112 ns          111 ns      6286271
BM_hb_ot_tags_from_script_and_language/COMMON en_US       60.6 ns         60.4 ns     11671176
BM_hb_ot_tags_from_script_and_language/LATIN en_US        61.3 ns         61.1 ns     11442645
BM_hb_ot_tags_from_script_and_language/COMMON none        4.75 ns         4.74 ns    146997235
BM_hb_ot_tags_from_script_and_language/LATIN none         4.65 ns         4.64 ns    150938747

After:

----------------------------------------------------------------------------------------------
Benchmark                                                    Time             CPU   Iterations
----------------------------------------------------------------------------------------------
BM_hb_ot_tags_from_script_and_language/COMMON zh_CN       89.5 ns         89.2 ns      7747649
BM_hb_ot_tags_from_script_and_language/COMMON en_US       38.5 ns         38.4 ns     18199432
BM_hb_ot_tags_from_script_and_language/LATIN en_US        39.0 ns         38.9 ns     18049238
BM_hb_ot_tags_from_script_and_language/COMMON none        4.53 ns         4.52 ns    154895110
BM_hb_ot_tags_from_script_and_language/LATIN none         4.54 ns         4.52 ns    154762105
This commit is contained in:
Behdad Esfahbod 2022-05-17 14:28:28 -06:00
parent 9baccb9860
commit dd3c858f84
3 changed files with 270 additions and 239 deletions

View File

@ -894,7 +894,6 @@ print ()
print ('#ifndef HB_OT_TAG_TABLE_HH') print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH') print ('#define HB_OT_TAG_TABLE_HH')
print () print ()
print ('static const LangTag ot_languages[] = {')
def hb_tag (tag): def hb_tag (tag):
"""Convert a tag to ``HB_TAG`` form. """Convert a tag to ``HB_TAG`` form.
@ -944,9 +943,12 @@ def get_matching_language_name (intersection, candidates):
def same_tag (bcp_47_tag, ot_tags): def same_tag (bcp_47_tag, ot_tags):
return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower () return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
for language_len in (2, 3):
print ('static const LangTag ot_languages%d[] = {' % language_len)
for language, tags in sorted (ot.from_bcp_47.items ()): for language, tags in sorted (ot.from_bcp_47.items ()):
if language == '' or '-' in language: if language == '' or '-' in language:
continue continue
if len(language) != language_len: continue
commented_out = same_tag (language, tags) commented_out = same_tag (language, tags)
for i, tag in enumerate (tags, start=1): for i, tag in enumerate (tags, start=1):
print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='') print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
@ -968,7 +970,6 @@ for language, tags in sorted (ot.from_bcp_47.items ()):
bcp_47.names[language] = name bcp_47.names[language] = name
write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope)) write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
print (' */') print (' */')
print ('};') print ('};')
print () print ()

File diff suppressed because it is too large Load Diff

View File

@ -283,7 +283,21 @@ hb_ot_tags_from_language (const char *lang_str,
ISALPHA (s[1])) ISALPHA (s[1]))
lang_str = s + 1; lang_str = s + 1;
} }
if (hb_sorted_array (ot_languages).bfind (lang_str, &tag_idx)) const LangTag *ot_languages = nullptr;
unsigned ot_languages_len = 0;
const char *dash = strchr (lang_str, '-');
unsigned first_len = dash ? dash - lang_str : limit - lang_str;
if (first_len == 2)
{
ot_languages = ot_languages2;
ot_languages_len = ARRAY_LENGTH (ot_languages2);
}
else if (first_len == 3)
{
ot_languages = ot_languages3;
ot_languages_len = ARRAY_LENGTH (ot_languages3);
}
if (hb_sorted_array (ot_languages, ot_languages_len).bfind (lang_str, &tag_idx))
{ {
unsigned int i; unsigned int i;
while (tag_idx != 0 && while (tag_idx != 0 &&
@ -291,7 +305,7 @@ hb_ot_tags_from_language (const char *lang_str,
tag_idx--; tag_idx--;
for (i = 0; for (i = 0;
i < *count && i < *count &&
tag_idx + i < ARRAY_LENGTH (ot_languages) && tag_idx + i < ot_languages_len &&
ot_languages[tag_idx + i].tag != HB_TAG_NONE && ot_languages[tag_idx + i].tag != HB_TAG_NONE &&
0 == strcmp (ot_languages[tag_idx + i].language, ot_languages[tag_idx].language); 0 == strcmp (ot_languages[tag_idx + i].language, ot_languages[tag_idx].language);
i++) i++)
@ -459,9 +473,12 @@ hb_ot_tag_to_language (hb_tag_t tag)
return disambiguated_tag; return disambiguated_tag;
} }
for (i = 0; i < ARRAY_LENGTH (ot_languages); i++) for (i = 0; i < ARRAY_LENGTH (ot_languages2); i++)
if (ot_languages[i].tag == tag) if (ot_languages2[i].tag == tag)
return hb_language_from_string (ot_languages[i].language, -1); return hb_language_from_string (ot_languages2[i].language, -1);
for (i = 0; i < ARRAY_LENGTH (ot_languages3); i++)
if (ot_languages3[i].tag == tag)
return hb_language_from_string (ot_languages3[i].language, -1);
/* Return a custom language in the form of "x-hbot-AABBCCDD". /* Return a custom language in the form of "x-hbot-AABBCCDD".
* If it's three letters long, also guess it's ISO 639-3 and lower-case and * If it's three letters long, also guess it's ISO 639-3 and lower-case and
@ -557,13 +574,23 @@ hb_ot_tags_to_script_and_language (hb_tag_t script_tag,
static inline void static inline void
test_langs_sorted () test_langs_sorted ()
{ {
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages); i++) for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages2); i++)
{ {
int c = ot_languages[i].cmp (&ot_languages[i - 1]); int c = ot_languages2[i].cmp (&ot_languages2[i - 1]);
if (c > 0) if (c > 0)
{ {
fprintf (stderr, "ot_languages not sorted at index %d: %s %d %s\n", fprintf (stderr, "ot_languages2 not sorted at index %d: %s %d %s\n",
i, ot_languages[i-1].language, c, ot_languages[i].language); i, ot_languages2[i-1].language, c, ot_languages2[i].language);
abort();
}
}
for (unsigned int i = 1; i < ARRAY_LENGTH (ot_languages3); i++)
{
int c = ot_languages3[i].cmp (&ot_languages3[i - 1]);
if (c > 0)
{
fprintf (stderr, "ot_languages3 not sorted at index %d: %s %d %s\n",
i, ot_languages3[i-1].language, c, ot_languages3[i].language);
abort(); abort();
} }
} }